In [None]:
# For example, here are several helpful packages to load
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.notebook import tqdm #progress bars with support for jupyter notebooks
import datetime as dt
import keras as ks
import tensorflow as tf

import sklearn.preprocessing as sklpp
import gc
import collections
import os
import pickle

import os
# Show the entries of the ../input directory.
print(os.listdir("../input"))


def unique_union(x,y):
    """
    Return a list containing every distinct element of x and y exactly once.

    The result is built from a set union, so its order is arbitrary.
    """
    return list(set(x) | set(y))

# Load the event_id lookup table; test_converters queries it with .get() to
# encode the 'event_id' column.
# NOTE(review): pickle.load can execute arbitrary code from the file -- this is
# only acceptable if the pickle comes from our own preprocessing; confirm.
with open('../input/preprocessor-for-data-bowl-2019/event_ids_map.pkl', 'rb') as open_file:
    event_ids_map = pickle.load(open_file)

# Integer code of each world name.  The codes are hard-coded; they were once
# generated by enumerating a `worlds` list.
worlds_map = {
    'TREETOPCITY': 0,
    'MAGMAPEAK': 1,
    'NONE': 2,
    'CRYSTALCAVES': 3,
}

# Titles of the assessments.
assessment_titles = [
    'Chest Sorter (Assessment)',
    'Cart Balancer (Assessment)',
    'Cauldron Filler (Assessment)',
    'Mushroom Sorter (Assessment)',
    'Bird Measurer (Assessment)',
]

# Integer code of each activity type, followed by the type names in code order.
types_map = {
    'Clip': 0,
    'Activity': 1,
    'Game': 2,
    'Assessment': 3,
}
types = ['Clip', 'Activity', 'Game', 'Assessment']

# TODO: reduce runs
def times_to_numbers(data):
    """
    Replace the 'timestamp' column of `data` by a 'session_number' column.

    The rows come back ordered by 'timestamp'.  A row is flagged as the end
    of a session when the following row (in time order) is more than
    DROP_TIME seconds later; the flags are then cumulatively summed within
    each index label.  `data` itself is left untouched because sort_values
    returns a new frame.

    NOTE(review): the cumulative sum is grouped by index level 0, so with a
    unique index every row is its own group and 'session_number' is simply
    that row's own flag (0 or 1).  Confirm this matches the preprocessing the
    models were trained with before changing it.
    """
    # TODO: treat this as a hyperparameter!
    DROP_TIME = 900  # number of seconds between distinct sessions

    ordered = data.sort_values(by=['timestamp'])

    # time from each event to the next one; the last event has no successor,
    # so its gap is missing and the comparison below yields False for it
    gap_to_next = ordered['timestamp'].shift(periods=-1) - ordered['timestamp']
    gap_seconds = gap_to_next.map(lambda delta: delta.total_seconds())
    ordered['end_of_session'] = gap_seconds > DROP_TIME

    ordered['session_number'] = ordered.groupby(level=0)['end_of_session'].cumsum()

    # only the derived session number is kept
    return ordered.drop(columns=['timestamp', 'end_of_session'])

# Substring searched for in the raw 'event_data' text (see test_converters);
# its presence is encoded as 1 and its absence as -1.
pattern = '"correct":true'

def assessment_to_num(a_bool):
    """Encode a truth value as 1 (truthy) or -1 (falsy)."""
    return 1 if a_bool else -1

def scale_game_time(gt):
    """
    Scale a raw game_time value down by a constant factor of 1000.

    An alternative normalisation, gt * 114.0 / 306910249 (a maximum session
    number over a maximum game time), used to sit here as commented-out code
    and is not applied.
    """
    LIMEROBOT_NORM = 1000 # this is what limerobot used...
    return gt / LIMEROBOT_NORM

    
# Starts out empty; nothing in this part of the notebook fills it.
last_assessment_map = {}

# Names of the raw event columns of interest.
columns_to_keep = [
    'installation_id',
    'world',
    'type',
    'event_data',
    'event_id',
    'timestamp',
    'game_time',
    'title',
]

# Per-column converters from raw CSV text to numeric model inputs.  For the
# three lookup-based columns an unknown key gives 0 (.get(..., -1) + 1) and
# every known code is shifted up by one.
test_converters = {
    'world': lambda raw: worlds_map.get(raw, -1) + 1,
    'type': lambda raw: types_map.get(raw, -1) + 1,
    'event_id': lambda raw: event_ids_map.get(raw, -1) + 1,
    'event_data': lambda raw: assessment_to_num(pattern in raw),
    'game_time': lambda raw: scale_game_time(int(raw)) + 1,
    'timestamp': pd.to_datetime,
}

# Number of event rows handed to a model per installation.
SEQ_LENGTH = 2000

# Widths of the column groups an event row is cut into, and the indices at
# which np.split has to cut to obtain them.
# See https://discourse.julialang.org/t/reshape-a-1-d-array-into-an-array-of-different-size-arrays/25999
n = [3, 1, 1, 1]
split_points = np.cumsum(n[:-1])

# get the models
from tensorflow.keras import layers, Model, losses

# Assessment titles; one saved model exists per title.
assessments = [
    'Chest Sorter (Assessment)',
    'Cart Balancer (Assessment)',
    'Mushroom Sorter (Assessment)',
    'Bird Measurer (Assessment)',
    'Cauldron Filler (Assessment)',
]


def _load_assessment_model(assessment):
    """
    Load the saved .h5 model that belongs to one assessment title.

    A fresh loss object and optimizer are created on every call, so no two
    loaded models share them.
    """
    model_params = {
        'LEARNING_RATE': 0.001, #default is 0.001
        'LOSS_FN': tf.keras.losses.CategoricalCrossentropy(),
        'METRICS': ['categorical_accuracy'],
        'CLIP_NORM': 1,
        'DENSE_DROPOUT': 0.1,
    }
    my_optimizer = tf.keras.optimizers.Adam(
        learning_rate=.001,
        beta_1=0.9,
        beta_2=0.999,
        amsgrad=True,
        clipnorm=1,
    )
    # Names made available to Keras while it deserialises the model.
    # NOTE(review): presumably these are referenced from inside the saved
    # models -- verify against the training notebook.
    custom_objects = {
        'SEQ_LENGTH': SEQ_LENGTH,
        'model_params': model_params,
        'tf.keras.losses': tf.keras.losses,
        'my_optimizer': my_optimizer,
    }
    model_path = '../input/fork-of-data-science-bowl-model-1f2596/' + assessment + '.h5'
    return tf.keras.models.load_model(model_path, custom_objects=custom_objects)


models = {assessment: _load_assessment_model(assessment) for assessment in assessments}

def prepare_batch_for_prediction(batch, seq_length=None, split_at=None):
    """
    Convert per-installation event frames into fixed-length model inputs.

    Every frame is cut down to its last `seq_length` rows, cast to int and
    padded *after* the real events with all-zero rows until it has exactly
    `seq_length` rows.  The columns are then split at `split_at` into four
    groups.

    Parameters
    ----------
    batch : iterable of pandas.DataFrame
        One frame per installation.  Every value must be convertible to int.
    seq_length : int, optional
        Number of rows of each output array.  Defaults to the module-level
        SEQ_LENGTH.
    split_at : sequence of int, optional
        Column indices at which the rows are split.  Defaults to the
        module-level split_points.  The split must yield four groups, the
        last three of which are one column wide.

    Returns
    -------
    tuple of four lists (XX0, XX1, XX2, XX3), each with one entry per frame:
    XX0 holds float arrays of shape (seq_length, width of the first group),
    the other three hold int arrays of shape (seq_length,).
    """
    if seq_length is None:
        seq_length = SEQ_LENGTH
    if split_at is None:
        split_at = split_points

    def prepare_for_prediction(df):
        # keep only the most recent events, then convert them to integers
        events = df.to_numpy()[-seq_length:].astype(int)
        # zero rows after the real events act as padding
        padded = np.zeros((seq_length, events.shape[1]), dtype=int)
        padded[:events.shape[0]] = events
        X0, X1, X2, X3 = np.split(padded, split_at, axis=1)
        return [
            X0.astype(float),
            X1.reshape([seq_length]),
            X2.reshape([seq_length]),
            X3.reshape([seq_length]),
        ]

    XX0, XX1, XX2, XX3 = [], [], [], []
    for df in batch:
        X0, X1, X2, X3 = prepare_for_prediction(df)
        XX0.append(X0)
        XX1.append(X1)
        XX2.append(X2)
        XX3.append(X3)
    return XX0, XX1, XX2, XX3

import csv
def process_row(row):
    """
    Convert one raw CSV row and pick out the fields used for prediction.

    The cells at positions 0, 2, 3, 9 and 10 are replaced in place by the
    output of the 'event_id', 'timestamp', 'event_data', 'type' and 'world'
    converters, so the list passed in is modified.

    Returns a pair (kept, title): `kept` holds the cells at positions
    4, 7, 2, 3, 0, 9, 10 in that order and `title` is cell 8 as a string.
    """
    converted_cells = (
        (0, 'event_id'),
        (2, 'timestamp'),
        (3, 'event_data'),
        (9, 'type'),
        (10, 'world'),
    )
    for position, column in converted_cells:
        row[position] = test_converters[column](row[position])
    kept = [row[position] for position in (4, 7, 2, 3, 0, 9, 10)]
    return kept, str(row[8])
# Number of installations collected per assessment before predicting on them.
MAX_BATCH_SIZE = 32


def _rows_to_frame(rows, header):
    """Turn one installation's processed rows into (installation_id, frame)."""
    predict_ID = rows[0][0]
    features = [row[1:] for row in rows]  # drop 'installation_id'
    df = pd.DataFrame.from_records(features, columns=header)
    return predict_ID, times_to_numbers(df)


def _predict_and_write(assessment, entries):
    """Predict one batch with the assessment's model and append the results
    to 'unsorted_submission.csv' as 'installation_id,prediction' lines."""
    IDs = [pair[0] for pair in entries]
    inputs = prepare_batch_for_prediction([pair[1] for pair in entries])
    predictions_raw = models[assessment].predict_on_batch(inputs)
    predictions = [np.argmax(prediction) for prediction in predictions_raw]
    with open('unsorted_submission.csv', 'a') as submission_file:
        for installation_id, prediction in zip(IDs, predictions):
            submission_file.write(installation_id + ',' + str(prediction) + '\n')


# start the (unsorted) submission file with its header line
with open('unsorted_submission.csv', 'w') as submission_file:
     submission_file.write("installation_id,accuracy_group" + "\n")

with open('../input/data-science-bowl-2019/test.csv', 'r') as csvfile:
    csvreader = csv.reader(csvfile)
    next(csvreader)# header row
    # column names of the processed rows once 'installation_id' is dropped;
    # times_to_numbers later removes 'timestamp' and appends 'session_number'
    header = ['game_time', 'timestamp', 'correct_assessment', 'event_id', 'type','world']
    batch = {assessment: [] for assessment in assessments}

    # The loop relies on all rows of one installation being adjacent in the
    # file: a change of installation_id closes the previous installation.
    rows = []
    last_ID = None
    last_title = None
    for row in csvreader:
        the_row, the_title = process_row(row)
        ID = the_row[0]
        if rows and ID != last_ID:
            # The title of an installation's last event selects the model;
            # a title that is not in `assessments` raises KeyError here.
            assessment = last_title
            batch[assessment].append(_rows_to_frame(rows, header))
            if len(batch[assessment]) == MAX_BATCH_SIZE:
                _predict_and_write(assessment, batch[assessment])
                batch[assessment] = []
                gc.collect()  # release the frames of the finished batch
            rows = []
        rows.append(the_row)
        last_ID = ID
        last_title = the_title

    # the final installation is never followed by an ID change, so queue it
    # here; `rows` is empty only when the file has no data rows at all
    if rows:
        batch[last_title].append(_rows_to_frame(rows, header))

    # predict for those remaining in batch
    for assessment, entries in batch.items():
        if entries:
            _predict_and_write(assessment, entries)
            batch[assessment] = []

In [0]:
# Copy the header line unchanged, then write the remaining lines in sorted
# order to the final submission file.
with open('unsorted_submission.csv', 'r') as unsorted_submission, \
        open('submission.csv', 'w') as submission_file:
    header = next(unsorted_submission)
    submission_file.write(header)
    submission_file.writelines(sorted(unsorted_submission))