In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm #progress bars with support for jupyter notebooks
import datetime as dt

import matplotlib.pyplot as plt

tqdm.pandas(desc="my bar!")
import os

In [None]:
def unique_union(x,y):
    """
    Return every distinct element that occurs in `x` or in `y`, as a list.

    Both arguments are turned into sets first, so their elements must be
    hashable. The order of the returned list is unspecified.
    """
    merged = set(x) | set(y)
    return list(merged)

In [None]:
# right way to do this is probably three different functions with usecols argument...
def keep_cols(cols,col):
    """Return True when `col` is a member of `cols`, otherwise False."""
    is_wanted = col in cols
    return is_wanted

def read_data(data_dir='../input/data-science-bowl-2019'):
    """
    Load the competition CSV files this notebook works on.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing train.csv, test.csv and train_labels.csv.
        Defaults to the Kaggle input path the notebook was written against.

    Returns
    -------
    tuple
        (train, test, train_labels) as DataFrames. `test` contains only the
        'event_id' column; every column of the other two files is loaded.
    """
    def _load(file_name, **read_kwargs):
        # One place for the read-and-report pattern used for each file.
        print('Reading {}.'.format(file_name))
        frame = pd.read_csv(os.path.join(data_dir, file_name), **read_kwargs)
        print('Read {} with {} rows and {} columns.'.format(file_name, frame.shape[0], frame.shape[1]))
        return frame

    train = _load('train.csv')
    # Only 'event_id' is read from test.csv, which keeps the frame small.
    test = _load('test.csv', usecols=['event_id'])
    train_labels = _load('train_labels.csv')

    # specs.csv is intentionally not loaded.
    return train, test, train_labels
# Load the three frames once; the cells below rebind and mutate `train` and `train_labels`.
train, test, train_labels = read_data()

In [None]:

## Keep only the installations that logged at least one assessment attempt.
# An attempt is an event with code 4100 or 4110 whose 'type' is 'Assessment'.
# Checking 'event_code' alone is not enough, and 'type' alone is not either:
# a user can start an 'Assessment' activity without ever submitting an answer.
# The check is vectorised (boolean mask + isin) rather than a Python lambda run
# once per installation; it keeps the same rows, in the same order, with the
# same index labels.
is_attempt = train['event_code'].isin([4100, 4110]) & (train['type'] == 'Assessment')
ids_with_attempt = train.loc[is_attempt, 'installation_id'].unique()
train = train[train['installation_id'].isin(ids_with_attempt)].drop(columns=['event_code'])


## Flag events whose payload reports a correct answer.
# regex=False: the pattern is a plain substring, not a regular expression.
# na=False: a missing payload counts as "not correct". Without it the flag would
# be NaN, and NaN is truthy, so the later `1 if x else -1` encoding would turn
# it into 1.
pattern = '"correct":true'
train['correct_assessment'] = train['event_data'].str.contains(pattern, regex=False, na=False)
train = train.drop(columns=['event_data']) # no further use

## Double-check that 'True' only appears on assessments!
# (the original author marked this output with "!!!" -- inspect the titles shown)
train.loc[train['correct_assessment'], 'title'].unique()


In [None]:
## An example of feature engineering that we don't do:
# make a dict {event_id -> event_info entries to keep}!
# good project for next time!
# Show only the first rows; as the last expression of the cell this gets the
# rich table display instead of a plain-text dump of the frame.
train.head()

In [None]:
# Collect the distinct titles of all 'Assessment' rows before 'title' is dropped.
assessment_rows = train['type'] == 'Assessment'
assessment_titles = set(train.loc[assessment_rows, 'title'])
# 'title' is not needed on `train` after this point.
train.drop(columns=['title'], inplace=True)

In [None]:
# Vocabulary of event ids seen in train or test, and of worlds seen in train.
# event_ids is sorted so that the integer codes assigned below are the same on
# every run: iterating a set of strings can change order between interpreter
# sessions (string hash randomisation), which would silently renumber the codes.
# (sorted() assumes the values are all strings, i.e. no missing entries.)
event_ids = sorted(unique_union(train['event_id'], test['event_id']))
worlds = set(train['world'])

## recover memory
# TODO: it's principled but silly to bring in test for 'world' and 'type'...
del test


# Number the vocabularies: label -> integer code, starting at 0.
worlds_map = {world: code for code, world in enumerate(sorted(worlds))}

event_ids_map = {event_id: code for code, event_id in enumerate(event_ids)}

types_map = {'Clip': 0, 'Activity': 1, 'Game': 2, 'Assessment': 3}

In [None]:
# Swap the text labels of each categorical column for their integer codes.
# (one Series.map per column; see https://stackoverflow.com/a/44648068)
# TODO: could this be one pass?
for column, mapping in [('world', worlds_map), ('event_id', event_ids_map), ('type', types_map)]:
    train[column] = train[column].map(mapping)

# `test` is not encoded here -- it was deleted once its event ids had been collected.


In [None]:
## Goal: a 'session_number' column that reflects real (wall-clock) time since the
## installation started, not just time spent in-app.
## This seems better than 'game_session' which counts something like turning the app on and off
## This is the most active 'feature engineering' -- worth it?

# Convert 'timestamp' to datetimes so that consecutive events can be subtracted.
train['timestamp'] = pd.to_datetime(train['timestamp'])
#test['timestamp'] = pd.to_datetime(test['timestamp'])
# times_to_numbers (defined below) replaces 'timestamp' with 'session_number'


#TODO: make one pass?
def times_to_numbers(data, drop_time=900):
    """
    Replace 'timestamp' and 'game_session' with a per-installation 'session_number'.

    Events are ordered by time within each installation. A new session starts
    at every event that comes more than `drop_time` seconds after the previous
    event of the same installation; sessions are numbered from 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'installation_id', 'game_session' and a
        datetime-typed 'timestamp'. It is not modified.
    drop_time : float, optional
        Gap in seconds that separates two sessions (a hyperparameter).
        Defaults to 900.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by installation and then time, indexed by
        (installation_id, original row label). 'session_number' is added,
        'timestamp' and 'game_session' are removed, and 'installation_id' is
        also kept as a regular column.
    """
    # Stable sort: events sharing a timestamp keep their original relative order.
    ordered = data.sort_values(by=['installation_id', 'timestamp'], kind='mergesort')
    installation = ordered['installation_id']

    # Gap to the previous event of the SAME installation (NaT for its first
    # event), so a gap is never measured across two different installations.
    gap_seconds = ordered.groupby('installation_id')['timestamp'].diff().dt.total_seconds()

    # The event after a long gap opens the new session; the event before the
    # gap still belongs to the old one. NaN > drop_time is False, so the first
    # event of each installation starts at session 0.
    starts_session = (gap_seconds > drop_time).astype('int64')
    ordered['session_number'] = starts_session.groupby(installation).cumsum()

    # Outer index level = installation id, inner level = original row label,
    # so callers can fetch one installation's events with `.loc[installation_id]`.
    ordered.index = pd.MultiIndex.from_arrays(
        [installation, ordered.index], names=['installation_id', None])

    # Only the derived session number is kept.
    return ordered.drop(columns=['timestamp', 'game_session'])

# Rebind `train` to the session-numbered frame; later cells fetch one
# installation's events with `train.loc[installation_id]`.
train = times_to_numbers(train)

In [None]:
## TODO: move up
# 'event_count' is not used as a feature.
train = train.drop(columns='event_count')

## Need to scale down 'game_time'
# kind of makes sense for it to have a similar scale to 'session_number'?
# Print both maxima so the two ranges can be compared by eye.
max_game_time = train['game_time'].max()
print('max_game_time: ' + str(max_game_time))
max_session_num = train['session_number'].max()
print('max_session_num: ' + str(max_session_num))
### Shrink 'game_time' to a smaller numeric range.
# Usually one would subtract the minimum first, but the min is zero.
# everything will get shifted by 1 anyway...
# (dividing by the observed maximum was the alternative considered)
LIMEROBOT_NORM = 1000 # this is what limerobot used...
def scale_game_time(gt):
    """Return `gt` divided by LIMEROBOT_NORM."""
    scaled = gt / LIMEROBOT_NORM
    return scaled

## to support masking
# Shift every integer-coded column up by one so that 0 is never produced by a
# real event and stays available as the padding value.
# TODO: how important is this if we're not using an official Masking layer?
# Whole-column arithmetic gives the same values as the former element-wise
# lambdas without a Python call per row.
shifted_columns = ['event_id', 'type', 'world', 'session_number']
train[shifted_columns] = train[shifted_columns] + 1
train['game_time'] = scale_game_time(train['game_time'])
# truthy -> 1, falsy -> -1 (to avoid mask?  not sure about this one.)
train['correct_assessment'] = np.where(train['correct_assessment'], 1, -1)

## TODO: use this construction above!
train_ids = set(train['installation_id'].unique())


In [None]:
# Feature columns in their current order. 'installation_id' is removed by NAME
# (it is dropped before the frame is turned into numpy arrays), not by position,
# so a change in column order cannot make this drop the wrong column.
original_columns = [column for column in train.columns if column != 'installation_id']
print("Original columns:")
print(original_columns)

# Column order wanted in the numpy arrays.
wanted_columns = ['game_time', 'session_number', 'correct_assessment', 'event_id', 'type', 'world']
if sorted(original_columns) != sorted(wanted_columns):
    raise ValueError(
        'Unexpected feature columns {}; expected some ordering of {}.'.format(
            original_columns, wanted_columns))

# idx[k] is the position in original_columns of the k-th wanted column, so
# `array[:, idx]` reorders numpy columns into the wanted order.
# used below when we pipe pandas into numpy:
idx = np.array([original_columns.index(column) for column in wanted_columns])
# permutation written as [f(0),f(1),f(2),f(3),f(4),f(5)]: original column i
# ends up at position permutation[i] (the inverse of idx).
permutation = np.argsort(idx).tolist()

new_columns = [original_columns[i] for i in idx]
print("Want: \n['game_time', 'session_number', 'correct_assessment', 'event_id', 'type', 'world']")
print("New columns:")
print(new_columns)
## splits [0,1,2,3,4,5] into [[0,1,2],3,4,5]
# and https://discourse.julialang.org/t/reshape-a-1-d-array-into-an-array-of-different-size-arrays/25999
# n holds the width of each piece; split_points are the cut positions for np.split.
n = [3,1,1,1]
split_points = np.cumsum(n[0:-1])

In [None]:
## sequences need to be of the same length
# what should it be?
# note that we have already filtered out players who did not take any assessments!
# Number of events per installation, computed once and reused below.
events_per_installation = train['installation_id'].value_counts()

fig, ax = plt.subplots()
events_per_installation.hist(bins=60, ax=ax)
ax.set_xticks([0, 2000, 13000, 60000])
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel('events per installation')
ax.set_ylabel('number of installations')
ax.set_title('Sequence length per installation')

# One summary covering both sets of percentiles that were looked at.
print(events_per_installation.describe(percentiles=[.25, .33, .5, .66, .75, .9, .99]))

## Cuts off a lot of the long tail
SEQ_LENGTH = 2000
# originally had SEQ_LENGTH = 13000
# cutting to 2000 cuts training time by 75%, seems worth it
# another interesting hyperparameter to play with!

In [None]:
### time to work with train_labels
import feather
## two ways we could do this:
#   1) take the max accuracy group.  in other words, take the best result.
#   2) take the last accuracy group.  in other words, take the most recent result.
# I think 2) makes more sense since our goal is predict the next result, not the "average" result
# (Q: what does it mean for the app if these heavily diverge?)

# Keep one row per ('installation_id', 'title'): the last one in file order.
# NOTE(review): "last" is last in train_labels row order; this is the most
# recent session only if the file lists sessions chronologically -- confirm.
train_labels = train_labels.drop_duplicates(subset=['installation_id', 'title'], keep='last')
# exactly one label per (installation, title) from here on
assert not train_labels.duplicated(subset=['installation_id', 'title']).any()

# TODO: 3rd finish uses 'num_correct' and 'num_incorrect'
train_labels = train_labels.drop(columns=['game_session', 'num_correct', 'num_incorrect', 'accuracy'])

## assessment title -> list of 'installation_id's which took that assessment
took_assessment_ids_map = {
    activity: list(train_labels.loc[train_labels['title'] == activity, 'installation_id'])
    for activity in assessment_titles}

## fill in assessments not taken
# Every (installation, assessment title) pair gets a row; pairs with no label
# are filled with 0. Titles are sorted so the row order does not depend on set
# iteration order.
title_index = pd.MultiIndex.from_product(
    [train_labels['installation_id'].unique(), sorted(assessment_titles)],
    names=['installation_id', 'title'])
# TODO: can't be right to have set_index.reindex!
filled_train_labels = (train_labels
                       .set_index(['installation_id', 'title'])
                       .reindex(index=title_index, fill_value=0)
                       .reset_index())
feather.write_dataframe(filled_train_labels, "train_labels_processed.fth")

In [None]:
## We're done with train!  write it
# Relies on `feather` having been imported by the train_labels cell above.
# NOTE(review): `train` still carries the index built by times_to_numbers at
# this point -- confirm that feather accepts a non-default index.
feather.write_dataframe(train, "train_processed.fth")

In [None]:
# Persist the lookup tables that will be useful elsewhere.
import pickle
for file_name, payload in (('event_ids_map.pkl', event_ids_map),
                           ('took_assessments_map.pkl', took_assessment_ids_map)):
    with open(file_name, 'wb') as handle:
        pickle.dump(payload, handle)

In [None]:
### Make nice numpy arrays

## make sure the output directory exists
import os
# exist_ok=True accepts an already-existing directory. If 'data' exists but is
# not a directory, FileExistsError is still raised and stops everything.
os.makedirs('data', exist_ok=True)


# All-zero template for one record in split layout: a length-3 block followed by
# three length-1 blocks (object dtype because the pieces differ in length).
row = np.array([np.array([0,0,0]), np.array([0]), np.array([0]), np.array([0])], dtype=object)

# one-hot encodes 'accuracy_group'
def my_dumb_one_hot(num):
    """
    One-hot encode an accuracy group.

    Parameters
    ----------
    num : int
        Accuracy group; must equal 0, 1, 2 or 3.

    Returns
    -------
    numpy.ndarray
        Integer array of length 4 with a single 1 at position `num`.

    Raises
    ------
    ValueError
        If `num` is not one of 0, 1, 2, 3. Previously such a value fell through
        every branch and None was returned, which a float label array can
        absorb silently as NaN.
    """
    if num not in (0, 1, 2, 3):
        raise ValueError('accuracy group must be 0, 1, 2 or 3, got {!r}'.format(num))
    encoded = np.zeros(4, dtype=int)
    encoded[int(num)] = 1
    return encoded
   
# Labels keyed by (installation_id, title). Built once here instead of once per
# installation inside the loop; the first remaining column holds the label.
label_lookup = filled_train_labels.set_index(['installation_id','title']).iloc[:, 0]

for activity in tqdm(took_assessment_ids_map):
    relevant_ids = took_assessment_ids_map[activity]
    n_ids = len(relevant_ids)
    # Zero-initialised: positions after the end of a short sequence stay 0 (padding).
    X0 = np.zeros([n_ids, SEQ_LENGTH, 3])
    X1 = np.zeros([n_ids, SEQ_LENGTH])
    X2 = np.zeros([n_ids, SEQ_LENGTH])
    X3 = np.zeros([n_ids, SEQ_LENGTH])
    y = np.empty([n_ids, 4])
    for j, an_id in enumerate(tqdm(relevant_ids)):
        # Events of this installation as an int array, cut to the last SEQ_LENGTH rows.
        # NOTE(review): 'game_time' was divided by LIMEROBOT_NORM earlier, so
        # astype(int) truncates it to whole units -- confirm this is intended.
        # NOTE(review): the sequence is taken regardless of when the labelled
        # assessment happened -- confirm it cannot leak the label.
        id_array = train.loc[an_id].drop(columns = 'installation_id').to_numpy().astype(int)[-SEQ_LENGTH:]

        # permute columns to be correct
        id_array = id_array[:, idx]
        n_events = id_array.shape[0]

        # Cut the columns at split_points: one 3-wide block and three single columns,
        # i.e. [[x,y,z],...,[x,y,z]], [a...], [b...], [c...]
        block, col_a, col_b, col_c = np.split(id_array, split_points, axis=1)
        X0[j, :n_events] = block
        X1[j, :n_events] = col_a[:, 0]
        X2[j, :n_events] = col_b[:, 0]
        X3[j, :n_events] = col_c[:, 0]

        # now find the label and one-hot encode it
        y[j] = my_dumb_one_hot(label_lookup.loc[(an_id, activity)])

    np.savez_compressed(os.path.join('data', 'X_' + activity + '.npz'), x0=X0, x1=X1, x2=X2, x3=X3)

    np.save(os.path.join('data', 'Y_' + activity + '.npy'), y)

# need to do the same for test_labels, but we also need to compute test_labels!
# NOTE(review): as written this section cannot run on a fresh kernel. `test` was
# removed with `del test` in an earlier cell and was loaded with only the
# 'event_id' column; `activities_map` is not defined in any visible cell; and
# no visible cell creates 'session_number' or 'title_event_code' on `test`.


# Substrings that mark a correct / an incorrect answer in the event payload.
correct_pattern = r'"correct":true'
incorrect_pattern = r'"correct":false'
test['correct_assessment'] = test['event_data'].str.contains(correct_pattern)
# (mid-cell expression: its value is neither stored nor displayed)
test[test['correct_assessment'] == True]['title'].unique()
test['incorrect_assessment'] = test['event_data'].str.contains(incorrect_pattern)
# (mid-cell expression: its value is neither stored nor displayed)
test[test['incorrect_assessment'] == True]['title'].unique()


# Rows whose title contains 'Assessment' and whose event_code is 4100 or 4110,
# without the listed columns.
test_labels = test[test['title'].str.contains('Assessment')].query('event_code == 4100 | event_code == 4110').drop(columns=['type','world','event_count','game_time','event_id','event_code','title_event_code']).reset_index(drop=True)
test_labels['title'] = test_labels['title'].map(activities_map)



# Highest 'session_number' per (installation_id, title), repeated on every row of the group.
test_labels['max_session'] = test_labels.reset_index(drop=True).groupby(['installation_id','title'])['session_number'].transform(max)
## TODO: rewrite with get?
def fn(df):
    """
    Count the incorrect attempts that come before the first attempt not flagged incorrect.

    `df` needs a boolean 'incorrect_assessment' column. Returns the sentinel
    3000000 when no row has that flag set to False.
    """
    not_incorrect = df[df['incorrect_assessment'] == False].index
    if not_incorrect.empty:
        return 3000000 # TODO: do better
    first_not_incorrect = not_incorrect[0]
    # label-based slice: every row up to and including the first non-incorrect one
    attempts = df.loc[:first_not_incorrect, 'incorrect_assessment']
    return sum(attempts)

def fn2(ser):
    """Series adapter for `fn`: wrap `ser` as the 'incorrect_assessment' column."""
    return fn(ser.to_frame(name='incorrect_assessment'))
    
# NOTE(review): this section depends on `test` (deleted in an earlier cell) and
# on `assessment_codes`, which is not defined in any visible cell, so it cannot
# run as written.
# Per (installation, session, title): number of incorrect attempts up to the
# first attempt not flagged incorrect (fn2 yields 3000000 when there is none).
test_labels['until_correct'] = test_labels.groupby(['installation_id','session_number','title'])['incorrect_assessment'].transform(fn2)
# 0 incorrect attempts -> group 3, 1 -> group 2, sentinel 3000000 -> group 0
accuracy_groups_map = {0: 3, 1: 2, 3000000: 0}
# 'default' is 1 -- it counts for 3 or more tries, i.e. 2 or more incorrect attempts
test_labels['accuracy_group'] = test_labels['until_correct'].map(lambda x : accuracy_groups_map.get(x, 1) )
test_labels = test_labels.drop(columns=['event_data','incorrect_assessment','correct_assessment','until_correct'])
# keep only rows from the latest session of each (installation, title)
test_labels = test_labels[test_labels['session_number'] == test_labels['max_session']]
# actually this isn't enough -- one can take the same assessment multiple times in a session!
# so take the last remaining row per (installation, title)
test_labels = test_labels.groupby(['installation_id','title']).last().drop(columns =['session_number','max_session'])

# One row per (installation, assessment code); combinations with no label are filled with 0.
title_index  = pd.MultiIndex.from_product([test_labels.index.levels[0], assessment_codes], names=['installation_id', 'title'])
test_labels = test_labels.reindex(title_index, fill_value=0)
# NOTE(review): test_labels has a two-level index here -- confirm feather accepts it.
feather.write_dataframe(test_labels, "test_labels_processed.fth")



# NOTE(review): 'title_event_code' appears twice in this list.
test = test.drop(columns=['title', 'incorrect_assessment','title_event_code','event_code','title_event_code','event_count'])

# Same +1 shift as applied to train.
# NOTE(review): 'game_time' is shifted by 1 here, whereas train passes it
# through scale_game_time -- the two frames end up on different scales; confirm.
test['event_id'] = test['event_id'].transform(lambda x : x + 1)
test['type'] = test['type'].transform(lambda x : x + 1)
test['world'] = test['world'].transform(lambda x : x + 1)
test['game_time'] = test['game_time'].transform(lambda x : x + 1)
test['session_number'] = test['session_number'].transform(lambda x : x + 1)
test['correct_assessment'] = test['correct_assessment'].transform(lambda x : 1 if x else -1) # to avoid mask?  not sure about this one.


test_ids = set(test['installation_id'].unique())
test = test.reset_index(drop=True)
# One .npy file per test installation, holding its events in split layout.
for an_id in tqdm(test_ids):
    # NOTE(review): the groupby is rebuilt on every iteration; it could be created once before the loop.
    id_array = test.groupby('installation_id').get_group(an_id).drop(columns = ['event_data','installation_id']).to_numpy().astype(int)
    id_array[:] = id_array[:,idx] # idx defined when we did this to train_ids
    new_array = np.empty((id_array.shape[0],4), dtype=object) # TODO: understand whether dtype=object is fishy or not
    # could maybe use something fancy from numpy but let's just loop
    for i in np.arange(id_array.shape[0]):
        # each row becomes [first three values, then three single values]
        new_array[i,:] = np.split(id_array[i], split_points, axis=0)

    np.save(os.path.join('data','test_' + an_id + ".npy"),
            new_array
           )
feather.write_dataframe(test, "test_processed.fth")

import pickle

