# Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json 
import os
sns.set_style("whitegrid")
my_pal = sns.color_palette(n_colors=10)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Load Data (a sample)

## 1. Train 

In [2]:
path ='../Kaggle_DSB_data/train.csv'
train = pd.read_csv(path, nrows=1000000, parse_dates=['timestamp'])

In [3]:
train['installation_id'] = train['installation_id'].apply(lambda x: int(x,16))

In [4]:
train = train.set_index(['installation_id','game_session'])

In [5]:
train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,event_id,timestamp,event_data,event_count,event_code,game_time,title,type,world
installation_id,game_session,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
125199,45bb1e1b6b50c07b,27253bdc,2019-09-06 17:53:46.937,"{""event_code"": 2000, ""event_count"": 1}",1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
125199,17eeb7f223665f53,27253bdc,2019-09-06 17:54:17.519,"{""event_code"": 2000, ""event_count"": 1}",1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
125199,0848ef14a8dc6892,77261ab5,2019-09-06 17:54:56.302,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
125199,0848ef14a8dc6892,b2dba42b,2019-09-06 17:54:56.387,"{""description"":""Let's build a sandcastle! Firs...",2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
125199,0848ef14a8dc6892,1bb5fbdb,2019-09-06 17:55:03.253,"{""description"":""Let's build a sandcastle! Firs...",3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


Noise: game_time =0  and incorrect 

## 2. Train Labels

In [7]:
path_labels ='../Kaggle_DSB_data/train_labels.csv'
train_labels = pd.read_csv(path_labels, nrows=1000000)

In [8]:
train_labels['installation_id'] = train_labels['installation_id'].apply(lambda x: int(x,16))

In [9]:
train_labels = train_labels.set_index(['installation_id','game_session'])

In [10]:
train_labels.head();

## 3. Merge all Assessment train data

In [11]:
assessments = pd.merge(train[train.type == 'Assessment'],
                     train_labels, left_index=True, right_index=True, how='inner')

# clean df
assessments = assessments.drop(['title_y','type'],axis=1)\
                         .rename({'title_x':'title'},axis=1)

In [15]:
assessments.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,event_id,timestamp,event_data,event_count,event_code,game_time,title,world,num_correct,num_incorrect,accuracy,accuracy_group
installation_id,game_session,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
435871,6bdf9623adc94d89,3bfd1a65,2019-08-06 05:37:50.020,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",1,2000,0,Mushroom Sorter (Assessment),TREETOPCITY,1,0,1.0,3
435871,6bdf9623adc94d89,db02c830,2019-08-06 05:37:50.078,"{""event_count"":2,""game_time"":77,""event_code"":2...",2,2025,77,Mushroom Sorter (Assessment),TREETOPCITY,1,0,1.0,3
435871,6bdf9623adc94d89,a1e4395d,2019-08-06 05:37:50.082,"{""description"":""Pull three mushrooms out of th...",3,3010,77,Mushroom Sorter (Assessment),TREETOPCITY,1,0,1.0,3
435871,6bdf9623adc94d89,7da34a02,2019-08-06 05:37:52.799,"{""coordinates"":{""x"":199,""y"":484,""stage_width"":...",4,4070,2784,Mushroom Sorter (Assessment),TREETOPCITY,1,0,1.0,3
435871,6bdf9623adc94d89,28ed704e,2019-08-06 05:37:53.631,"{""height"":1,""coordinates"":{""x"":171,""y"":519,""st...",5,4025,3625,Mushroom Sorter (Assessment),TREETOPCITY,1,0,1.0,3


# Feature Engineering

Let's first try to construct "domain-like" features which might infleunce user performance. I'll divide these features into the following categories:

1. Level of game difficulty
2. User Expertise
3. User Focus 
4. Clarity of Game Instructions
5. Other



In [14]:
train.columns.tolist()

['event_id',
 'timestamp',
 'event_data',
 'event_count',
 'event_code',
 'game_time',
 'title',
 'type',
 'world']

## 1. Level of game difficulty

a) rate title based on user performance <br>
b) rate world based on user performance <br>



### a) Title

In [None]:
pct_corrects = lambda g: round(g.sum()/g.count(),3)
assess_title = assessments.groupby(['title'])[['num_correct']]\
                          .agg(['count','sum', pct_corrects])

In [None]:
# Clean df
assess_title.columns = ['num_assessments', 'num_correct', 'pct_correct']
assess_title['pct_incorrect'] =  1 - assess_title.pct_correct
assess_title.index = [title.split('(')[0].strip() for title in assess_title.index]

In [None]:
assess_title.loc[:,['pct_correct','pct_incorrect']]\
            .plot.barh(stacked=True,figsize=(10, 6), title='Correct Assessment (by title)')
plt.gca().legend(frameon=True,loc=(1.05,0.9))
plt.gca().axvline(x=0.5, ymin=0, ymax=1, color='black')

### b) World

In [None]:
assess_world = assessments.groupby(['world'])[['num_correct']]\
                          .agg(['count','sum', pct_corrects])

# Clean df
assess_world.columns = ['num_assessments', 'num_correct', 'pct_correct']
assess_world['pct_incorrect'] =  1 - assess_world.pct_correct
assess_world.index = [title.split('(')[0].strip() for title in assess_world.index]

assess_world.loc[:,['pct_correct','pct_incorrect']]\
            .plot.barh(stacked=True,figsize=(10, 6), title='Correct Assessment (by title)')
plt.gca().legend(frameon=True,loc=(1.05,0.9))
plt.gca().axvline(x=0.5, ymin=0, ymax=1, color='black')

Good Features: ```'world'``` and ```'title'```

## 2. User Expertise

a) measure net game time <br>
b) measure net game time per title <br>
c) measure game time per world <br>
d) measure how the performance changes with time <br>
e) measure time of completion of each task <br>

### a) Net Game Time

In [None]:
def net_game_times(df):
    
    # drop game sessions with zero time
    df = df[df.game_time!=0]
    
    # compute total game time and number of sessions per id
    game_times = df.groupby('installation_id')['game_time']\
                   .agg(['sum','count'])\
                   .rename({'sum': 'net_time', 'count': 'num_sessions'}, axis=1)
    
    # convert milliseconds to mins
    game_times['net_time'] = game_times['net_time'].apply(lambda x: round(x/(1000*60),2))
    
    # compute avg game time per session
    game_times['avg_session_time'] = (game_times.net_time/game_times.num_sessions).apply(lambda x: round(x,2))
    
    return game_times
    

In [None]:
df_game_times = net_game_times(train)

In [None]:
df_game_times.head()

In [None]:
less_than = 1
print('Number of ids with net_time less than {0} min: {1}/{2}'
      .format(less_than, 
              np.sum(df_game_times['net_time']<less_than), 
              len(df_game_times))
     )

In [None]:
_ = df_game_times['net_time'].apply(np.log1p)\
                             .plot(kind='hist' ,bins=50,edgecolor='white',
                                   figsize=(15,6),color=my_pal[1])
                                   
_ = plt.gcf().suptitle("Distribution of Log(net_time)",fontsize = 18)
_ = plt.ylabel('num_ids', fontsize=18)



In [None]:
_ =df_game_times['num_sessions'].apply(np.log1p)\
                                .plot(kind='hist' ,bins=50,edgecolor='white',
                                   figsize=(15,5),color=my_pal[1])

_ = plt.gcf().suptitle("Distribution of Log(num_sessions)",fontsize = 18)
_ = plt.ylabel('num_ids', fontsize=18)
                                  

Good Features: ```['net_time','num_sessions', 'avg_session_time']``` 

Comments: 
- perhaps use log of net_time and num_session
- treat differently ids with zero to less than a cut_off-value of game time


### b) Game Time per Title

In [None]:
def get_times_per_title(df):
    
    # drop game sessions with zero time
    df = df[df.game_time!=0]
    
    # compute total game time and number of sessions per id
    game_times = df.groupby(['installation_id','title'])['game_time']\
                   .agg(['sum','count'])\
                   .rename({'sum': 'net_time', 'count': 'num_sessions'}, axis=1)
    
    # convert milliseconds to mins
    game_times['net_time'] = game_times['net_time'].apply(lambda x: round(x/(1000*60),2))
    
    # compute avg game time per session
    game_times['avg_session_time'] = (game_times.net_time/game_times.num_sessions).apply(lambda x: round(x,2))
    
    return game_times
    

In [None]:
times_per_title = get_times_per_title(train)
times_per_title.head()

In [None]:
all_titles = list(set(times_per_title.index.get_level_values('title')))

In [None]:
times_per_title[times_per_title.index.get_level_values('title')==all_titles[0]]

Good Features: compute the net_time, num_sessions_, avg_session_time for each title

### c) Game Time per World

In [None]:
def get_times_per_world(df):
    
    # drop game sessions with zero time
    df = df[df.game_time!=0]
    
    # compute total game time and number of sessions per id
    game_times = df.groupby(['installation_id','world'])['game_time']\
                   .agg(['sum','count'])\
                   .rename({'sum': 'net_time', 'count': 'num_sessions'}, axis=1)
    
    # convert milliseconds to mins
    game_times['net_time'] = game_times['net_time'].apply(lambda x: round(x/(1000*60),2))
    
    # compute avg game time per session
    game_times['avg_session_time'] = (game_times.net_time/game_times.num_sessions).apply(lambda x: round(x,2))
    
    return game_times
    

In [None]:
times_per_world = get_times_per_world(train)
times_per_world.head()

In [None]:
all_worlds = list(set(times_per_world.index.get_level_values('world')))
all_worlds

In [None]:
times_per_world[times_per_world.index.get_level_values('world')==all_worlds[0]]

Good Features: compute the net_time, num_sessions_, avg_session_time for each world

### e) Measure time of completion of each task (Activity)

In [None]:
set(train.event_code)

In [None]:
train.type.unique()

In [None]:
train[train.event_code == 4100].event_id.unique().tolist()

In [None]:
event_data_test = train[train.type == "Activity"].event_data.apply(json.loads)

In [None]:
list(event_data_test.values)[45:50]

## 3. User focus

a) time of the day user signed in <br>
b) how much time spent per session <br>
c) boredom by playing too much?

### a) Hour of the day/ weekday signed in

In [72]:
train['date'] = train.timestamp.dt.date

In [None]:
train['hour'] = train.timestamp.dt.hour

In [25]:
train['weekday'] = train.timestamp.dt.weekday

installation_id  game_session    
125199           45bb1e1b6b50c07b    4
                 17eeb7f223665f53    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6892    4
                 0848ef14a8dc6

### b) Time spent per session

In [26]:
time_per_sess = train.groupby(['installation_id','game_session']).agg({'event_count':'count','game_time':'sum'})

### c) Boredom? How often do they sign in AND play?

In [44]:
index_to_use = time_per_sess[time_per_sess.event_count > 1].index 

In [95]:
df_test = train[train.event_count == 1].loc[index_to_use][['timestamp']]\
                                       .reset_index()\
                                       .sort_values(['timestamp'],axis=0)\
                                       .set_index(['installation_id','game_session'])

In [103]:
df_test['timestamp'].dt.date.diff().sort_values(ascending = False)

installation_id  game_session    
105577444        c590a9f5fb0424b0   1 days
383808598        ab216b23f5895389   1 days
197394209        30ad4f55c92ed296   1 days
322296085        deffa72a2416392a   1 days
6716296          1f646a88bbe785fd   1 days
60126850         95f7e55ff2e42cbe   1 days
247679358        a51cf96a4fca0409   1 days
104742402        f2886d13db864d54   1 days
71337387         5b1eee9fee74426d   1 days
92498238         582831c5f49dcc2a   1 days
29218592         284c96e23f38743a   1 days
257634555        5d36a1a32fcf29be   1 days
2252647          b2705e46e281e578   1 days
97159125         258d1eb53a97b3a4   1 days
104742402        292f4c931113fca0   1 days
32234582         5f1b43df00617876   1 days
320556311        1eeca4f6836e886e   1 days
251013805        81ca7b36eabc2537   1 days
60705479         8a8fa9f348dcda23   1 days
329073920        f7dfa0d45ba719c3   1 days
47054845         c46b76eee75b1ab2   1 days
187085588        43e308146155bcf8   1 days
383054822        549

In [107]:
df_test['timestamp'].diff().sort_values(ascending = False)

installation_id  game_session    
305807521        88b560cb8571fcbc   16:23:02.905000
378598818        a0434f0d02aac4d8   12:29:06.935000
81668782         a37e86e6faf80ea5   12:13:53.250000
320556311        1eeca4f6836e886e   11:43:38.840000
101899943        2c7ec7ae4144ed31   11:36:29.261000
13784938         a69e98a93df8850e   11:09:59.975000
22553105         056271c2a1229f61   11:08:34.651000
199496302        b2067d31a3ece773   10:50:52.077000
152615611        77b2abbfe7ab7edc   10:41:33.043000
348464122        cb883ccd3389163c   10:35:42.463000
84897028         0995da558675b5a6   10:30:48.349000
153706856        363e7ace880f7cd5   10:25:09.489000
379748121        fd5690a95dcde8c8   10:20:54.735000
100169250        78317aebb6d2288a   10:15:31.733000
94613089         1fee42a7b191e8be   10:12:17.882000
153706856        c3ef1b8879566f49   10:03:56.188000
305807521        2ac7c9c48b1bf242   09:58:59.799000
224135772        238fa066e6bfaf4b   09:57:42.379000
278547406        acd203553f86a

## 4. Clarity of game instructions

Game Clarity:<br>
a) rate hints based on user performance <br>
b) check if the user was doing something weird: from coords (e.g. false negatives) <br>
c) check if the user was doing something weird to familiarize himself/herself with the game (e.g. at the beginnig might press randomly)