In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from IPython.display import display

In [2]:
# report where the kernel is running and which files sit next to the notebook
# (a single print call; the '\n' separator reproduces the blank lines between the two messages)
print('current working directory: {}'.format(os.getcwd()),
      '\n',
      'file/directories in cwd: {}'.format(os.listdir()),
      sep='\n')

current working directory: C:\Users\agarw\Dropbox\Kaggle data-science-bowl 2019\Prarit-data-science-bowl-2019


file/directories in cwd: ['.git', '.ipynb_checkpoints', "Feature Engineering Speed Up Experiments (Joonho Kim's conflicted copy 2019-11-18).ipynb", 'Feature Engineering Speed Up Experiments.ipynb', 'Feature Engineering Speed Up Final.ipynb', 'Feature Engineering.ipynb', 'Initial EDA.ipynb', 'sample_submission.csv', 'specs.csv', 'test.csv', 'train.csv', 'train_features.csv', 'train_labels.csv']


In [3]:
# load the raw training event log from the working directory
train = pd.read_csv(filepath_or_buffer='train.csv')

In [4]:
# preview the first two events
train.iloc[:2]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK


In [5]:
# create representative data to test the functions being defined
# use the first 10 installation_id's to do this
ins_id=train.installation_id.unique()[:10]
# take an explicit copy rather than a slice of train: features() assigns a new
# 'datetime' column to the frame it is given, and doing that on a slice triggers
# SettingWithCopyWarning and leaves it unclear whether train itself is modified
rep_data=train.loc[train.installation_id.isin(ins_id)].copy()

In [6]:
# preview the first two events of the representative subset
rep_data.iloc[:2]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK


In [7]:
# function to extract features from data
def features(dataset):
    """Build one row of history features and one label per attempted assessment.

    A session counts as an attempted assessment when it has type 'Assessment' and
    contains at least one attempt event: event_code 4110 for
    'Bird Measurer (Assessment)', event_code 4100 for every other title.
    For each such session the function summarises what the same installation_id
    did before it: games/activities, earlier assessments and clips.

    Parameters
    ----------
    dataset : pd.DataFrame
        Event log. The columns read here are timestamp, game_session,
        installation_id, event_count, event_code, event_data, game_time,
        title and type.
        NOTE: a 'datetime' column (parsed from timestamp) is added to this
        frame in place.

    Returns
    -------
    features : pd.DataFrame
        Indexed by game_session of the assessment. Holds the assessment title
        followed by the history feature blocks, with missing values filled by 0.
        NOTE(review): several blocks are concatenated without renaming, so the
        per-type blocks share column labels and the Series blocks get integer
        column labels; kept as is so downstream column references keep working.
    labels : pd.DataFrame
        Indexed by game_session, single column accuracy_grp.

    Progress / sanity-check messages are printed along the way.
    """

    # convert timestamp to datetime (this writes into the caller's dataframe)
    dataset['datetime']=pd.to_datetime(dataset.timestamp)

    # timestamp can be replaced by datetime, so drop the column for timestamp
    # sort by event_count and group the dataset by game_session
    trngrp=(dataset.drop(columns=['timestamp'])
                   .sort_values('event_count')
                   .groupby('game_session', sort=False))

    # the first event in each session
    start=trngrp.first()
    print('The first row of each session has event_count 1: {}'.
          format((start.event_count==1).all()))
    # only the start time of each session is needed from here
    start=pd.DataFrame(start['datetime'])

    # the last event in each session
    # the record for this event contains almost all the interesting information about the session
    end=trngrp.last()
    print('The last row of each session has max event_count: {}'.
          format((end.event_count==trngrp.event_count.max()).all()))

    # attach the start time of every session to its last record
    # (columns become datetime_end / datetime_start) and
    # reset the index to regain game_session as a column
    end=end.merge(start, left_index=True, right_index=True, suffixes=('_end','_start')).reset_index()

    # remove sessions which were exited immediately after start i.e. their game_time = 0
    # note that since only the start event of clips was recorded, clips have game_time = 0
    # and are therefore removed here as well; clips are handled separately further down
    print('shape of dataset: {} before removing 0 game_time sessions: {}'.format('end', end.shape))
    print('number of sessions with 0 game_time: {}'.format(end.loc[end.game_time==0].shape[0]))
    end=end.loc[end.game_time>0]
    print('shape of dataset: {} after removing 0 game_time sessions: {}'.format('end', end.shape))

    # assessments which the player attempted to solve
    # the masks are parenthesised explicitly so that the type filter applies to
    # both the Bird Measurer branch and the branch for all other titles
    is_bird_measurer=dataset.title=='Bird Measurer (Assessment)'
    is_attempt=((~is_bird_measurer & (dataset.event_code==4100))
                | (is_bird_measurer & (dataset.event_code==4110)))
    attempts=dataset.loc[(dataset.type=='Assessment') & is_attempt].groupby('game_session')
    print('number of assessments with valid attempts: {}'.format(attempts.ngroups))

    # stats for each assessment solved by the player
    # 'first' keeps the column dtype (in particular the timezone of datetime),
    # which matters because datetime is compared with datetime_start below
    results=attempts.agg({'event_data': lambda x: assessment_stats(x),
                          'installation_id': 'first',
                          'datetime': 'first',
                          'title': 'first'})

    # the event_data column of results contains a tuple of stats for each session:
    # num_correct, num_incorrect, total_attempts, accuracy, accuracy_grp
    # spread the tuple over one column per entry
    stats=pd.DataFrame(results.event_data.tolist(),
                       columns=['num_correct', 'num_incorrect', 'total_attempts', 'accuracy', 'accuracy_grp'])
    # results is indexed by game_session while stats has a positional index;
    # pd.concat joins on index, so reset the index of results before joining
    # and drop event_data, which is now redundant
    results=pd.concat([results.reset_index(), stats], axis=1).drop(columns=['event_data'])

    # pair every attempted assessment with all the retained sessions of the same player
    # only game_session, installation_id and datetime of the assessment are needed for this
    session_pairs=results[['game_session','installation_id','datetime']].merge(
        end, left_on='installation_id', right_on='installation_id',
        suffixes=('_assessment','_other'))
    # pairs where the other session is itself an assessment are dealt with separately
    # (see the notebook 'Feature Engineering Speed Up Experiments')
    session_pairs=session_pairs.loc[session_pairs.type!='Assessment']

    # keep only sessions that started before the assessment's datetime and
    # group them by assessment, type and title of the earlier session
    history=session_pairs.loc[session_pairs.datetime>
                              session_pairs.datetime_start].groupby(['game_session_assessment',
                                                                     'type','title'], sort=False)
    print('number of groups in history: {}'.format(history.ngroups))

    # DataFrame.sum(level=0) was removed in pandas 2.0;
    # groupby(level=0, sort=False).sum() is the documented equivalent

    # Multiplicity of various titles in the history of an assessment
    mult_title=pd.DataFrame(history.size()).unstack().fillna(0).groupby(level=0, sort=False).sum()

    # Multiplicity of sessions of each type in the history of an assessment
    mult_type=pd.DataFrame(history.size()).unstack().fillna(0).sum(axis=1).unstack().fillna(0)

    # Cumulative time spent on each title before attempting an assessment
    cumm_time_title=(pd.DataFrame(history.agg({'game_time': 'sum'}))
                     .unstack().fillna(0).groupby(level=0, sort=False).sum())

    # Cumulative time spent in sessions of each type
    cumm_time_type=pd.DataFrame(history.agg({'game_time':'sum'})).unstack().fillna(0).sum(axis=1).unstack().fillna(0)

    # Average event_count in each title
    mn_evt_ct_title=(pd.DataFrame(history.agg({'event_count':'mean'}))
                     .unstack().fillna(0).groupby(level=0, sort=False).sum())

    # Event_count per type: per-title sums averaged over all (zero-filled) title columns
    # NOTE(review): this is total / number of title columns, not a per-session mean
    mn_evt_ct_type=pd.DataFrame(history.agg({'event_count':'sum'})).unstack().fillna(0).mean(axis=1).unstack().fillna(0)

    # Stats for previous assessments
    # for this we merge results with itself on installation_id
    assess_pairs=results[['game_session','installation_id','datetime']].merge(
        results, left_on='installation_id', right_on="installation_id",
        suffixes=('_current', '_previous'))
    # keep only assessments taken before the current one
    assess_pairs=assess_pairs.loc[assess_pairs.datetime_current>assess_pairs.datetime_previous]
    # game_time and event_count of each previous session come from 'end'
    assess_pairs=assess_pairs.merge(end[['game_session','game_time','event_count']],
                                    left_on='game_session_previous', right_on='game_session').drop(columns='game_session')
    assess_pairs=assess_pairs.groupby(['game_session_current','title'])

    # Multiplicity of previous assessment titles
    mult_assess_titles=pd.DataFrame(assess_pairs.size()).unstack().fillna(0)

    # Number of any assessments attempted before the current one
    num_assess=mult_assess_titles.sum(axis=1)

    # Total accumulated time in each assessment title
    assess_cumm_time_title=pd.DataFrame(assess_pairs.agg({'game_time':'sum'})).unstack().fillna(0)

    # Total time spent on all assessments in history
    assess_cumm_time=assess_cumm_time_title.sum(axis=1)

    # Average event count per assessment title
    mn_evt_ct_assess_title=pd.DataFrame(assess_pairs.agg({'event_count':'mean'})).unstack().fillna(0)

    # The three overall "averages" below are per-title sums averaged over the
    # (zero-filled) title columns
    # NOTE(review): i.e. total / number of title columns, not total / number of assessments

    # Event_count over all previous assessments
    mn_evt_ct_assess=pd.DataFrame(assess_pairs.agg({'event_count':'sum'})).unstack().fillna(0).mean(axis=1)

    # Average accuracy per assessment title
    mn_accuracy_title=pd.DataFrame(assess_pairs.accuracy.mean()).unstack().fillna(0)

    # Accuracy over all assessments in history
    mn_accuracy=pd.DataFrame(assess_pairs.accuracy.sum()).unstack().fillna(0).mean(axis=1)

    # Average accuracy group per assessment title
    mn_acc_grp_title=pd.DataFrame(assess_pairs.accuracy_grp.mean()).unstack().fillna(0)

    # Accuracy group over all assessments in history
    mn_acc_grp=pd.DataFrame(assess_pairs.accuracy_grp.sum()).unstack().fillna(0).mean(axis=1)

    # Data from watching clips
    # clips are taken from the raw dataset because they were removed from 'end'
    assess_clip_pairs= results[['game_session','installation_id','datetime']].merge(
        dataset.loc[dataset.type=='Clip',['installation_id','title','game_session','game_time','datetime']],
        left_on='installation_id', right_on='installation_id', suffixes=('_assess','_clip'))

    # pick only clips in the past
    assess_clip_pairs=assess_clip_pairs.loc[assess_clip_pairs.datetime_assess>
                                            assess_clip_pairs.datetime_clip].groupby(['game_session_assess','title'])

    # number of various titles watched before an assessment
    num_clip_titles=pd.DataFrame(assess_clip_pairs.size()).unstack().fillna(0)

    # total number of clips watched before an assessment
    num_clips=num_clip_titles.sum(axis=1)

    # title of current assessment
    curr_title=results[['game_session','title']].set_index('game_session')

    # column-wise concat aligns every block on game_session;
    # assessments with no history in a block get 0 for that block
    # (named feature_table so the local does not shadow this function's own name)
    feature_table=pd.concat([curr_title, mult_title, mult_type, cumm_time_title,
                             cumm_time_type, mn_evt_ct_title, mn_evt_ct_type,
                             mult_assess_titles, num_assess, assess_cumm_time_title,
                             assess_cumm_time,mn_evt_ct_assess_title, mn_evt_ct_assess,
                             mn_accuracy_title, mn_accuracy, mn_acc_grp_title,
                             mn_acc_grp, num_clip_titles, num_clips], axis=1, sort=False).fillna(0)

    labels=results[['game_session','accuracy_grp']].set_index('game_session')

    return feature_table, labels
    



# function to compute num_correct, num_incorrect, accuracy and accuracy group from the event_data of an assessment
def assessment_stats(event_data : pd.Series) -> tuple :
    """Summarise the attempt events of one assessment session.

    Parameters
    ----------
    event_data : pd.Series (any iterable of str is accepted)
        One event_data string per attempt. An attempt is counted as correct
        when its string contains the substring 'true'.

    Returns
    -------
    tuple
        (num_correct, num_incorrect, total_attempts, accuracy, accuracy_grp).
        accuracy is num_correct / total_attempts; accuracy_grp is looked up in
        acc_grp_dict and defaults to 1 for any accuracy that is not a key there.
        Empty input gives (0, 0, 0, 0.0, 0) instead of raising ZeroDivisionError.
    """

    # correct/incorrect attempts as 1/0
    # int(bool) turns True/False into 1/0, see
    # https://stackoverflow.com/questions/20840803/how-to-convert-false-to-0-and-true-to-1-in-python
    # `in` is used rather than `find(...) > 0`, which misses a match at position 0
    outcomes = [int('true' in record) for record in event_data]

    # total number of attempts
    total_attempts = len(outcomes)

    # no attempts at all: nothing was solved, and accuracy would be 0/0
    if total_attempts == 0:
        return 0, 0, 0, 0.0, 0

    # number of correct attempts
    num_correct = sum(outcomes)

    # number of incorrect attempts
    num_incorrect = total_attempts - num_correct

    # accuracy
    accuracy = num_correct / total_attempts

    # accuracy group
    # accuracy 0, 0.5 and 1 map to groups 0, 2 and 3 respectively via acc_grp_dict
    # dict.get() supplies the default group 1 for every other accuracy value
    accuracy_grp = acc_grp_dict.get(accuracy, 1)

    return num_correct, num_incorrect, total_attempts, accuracy, accuracy_grp

# dictionary to convert accuracy to accuracy group
acc_grp_dict={0:0, 1:3, 0.5:2}

In [8]:
%%time
# takes about 3 minutes on training dataset
train_features, train_labels=features(train)
print('shape of train_features: {}'.format(train_features.shape))
display(train_features.head())

The first row of each session has event_count 1: True
The last row of each session has max event_count: True
shape of dataset: end before removing 0 game_time sessions: (303319, 12)
number of sessions with 0 game_time: 186361
shape of dataset: end after removing 0 game_time sessions: (116958, 12)
number of assessments with valid attempts: 17690
number of groups in history: 164249
shape of features: (17690, 115)


Unnamed: 0,title,"(0, Chicken Balancer (Activity))","(0, Chow Time)","(0, Leaf Leader)","(0, Happy Camel)","(0, Scrub-A-Dub)","(0, All Star Sorting)","(0, Dino Drink)","(0, Fireworks (Activity))","(0, Watering Hole (Activity))",...,"(0, Ordering Spheres)","(0, Pirate's Tale)","(0, Rulers)","(0, Slop Problem)","(0, Treasure Map)","(0, Tree Top City - Level 1)","(0, Tree Top City - Level 2)","(0, Tree Top City - Level 3)","(0, Welcome to Lost Lagoon!)",5
00097cda27afb726,Mushroom Sorter (Assessment),1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,2.0,24.0
000f68cff32664ef,Chest Sorter (Assessment),1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,4.0,22.0
0014403daadf67aa,Bird Measurer (Assessment),3.0,3.0,2.0,2.0,3.0,2.0,2.0,2.0,3.0,...,2.0,2.0,2.0,3.0,2.0,2.0,2.0,2.0,1.0,40.0
0014daa1d3e26eb2,Mushroom Sorter (Assessment),0.0,0.0,0.0,0.0,0.0,2.0,0.0,2.0,0.0,...,2.0,2.0,1.0,0.0,1.0,2.0,2.0,1.0,2.0,21.0
001c49e9e9968dbe,Bird Measurer (Assessment),1.0,1.0,1.0,1.0,1.0,1.0,0.0,2.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,24.0


Wall time: 2min 17s


In [9]:
# preview the first two labels
train_labels.iloc[:2]

Unnamed: 0_level_0,accuracy_grp
game_session,Unnamed: 1_level_1
00097cda27afb726,3
000f68cff32664ef,0


# Extracting test features

In [18]:
# load the raw test event log and preview its first five events
test = pd.read_csv(filepath_or_buffer='test.csv')
test.head(n=5)

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,0ea9ecc81a565215,2019-09-10T16:50:24.910Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,c1ea43d8b8261d27,2019-09-10T16:50:55.503Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,27253bdc,7ed86c6b72e725e2,2019-09-10T16:51:51.805Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Magma Peak - Level 2,Clip,MAGMAPEAK
3,27253bdc,7e516ace50e7fe67,2019-09-10T16:53:12.825Z,"{""event_code"": 2000, ""event_count"": 1}",00abaee7,1,2000,0,Crystal Caves - Level 1,Clip,CRYSTALCAVES
4,7d093bf9,a022c3f60ba547e7,2019-09-10T16:54:12.115Z,"{""version"":""1.0"",""round"":0,""event_count"":1,""ga...",00abaee7,1,2000,0,Chow Time,Game,CRYSTALCAVES


In [19]:
# how many distinct players (installation_id) the test set contains
print('number of unique installation_id in test set: {}'.format(test['installation_id'].nunique()))

number of unique installation_id in test set: 1000


In [35]:
# choose the assessments to be used for prediction:
# count the rows of every assessment session and keep the sessions that have exactly one row
test_assess_size = test[test['type'] == 'Assessment'].groupby('game_session').size()
submission_assessments = np.array(test_assess_size[test_assess_size == 1].index)
print('number of submission assessments: {}'.format(len(submission_assessments)))

number of submission assessments: 1000


This agrees with the expectation that there should be one submission assessment per installation_id in the test data.

In [37]:
# rows of the test set that belong to the submission assessments
test[test['game_session'].isin(submission_assessments)]

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
867,90d848e0,348d7f09f96af313,2019-09-12T13:52:12.193Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",00abaee7,1,2000,0,Cauldron Filler (Assessment),Assessment,MAGMAPEAK
3586,7ad3efc6,1fef5d54cb4b775a,2019-10-09T20:23:16.209Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",01242218,1,2000,0,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
3736,3bfd1a65,4b165a330a0bdd6c,2019-09-21T11:28:21.757Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",017c5718,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
3970,3bfd1a65,be0b655ad1fee30c,2019-07-27T16:28:10.394Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",01a44906,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
4922,7ad3efc6,46e8bbed71df7520,2019-09-06T18:05:26.197Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",01bc6cb6,1,2000,0,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
5755,7ad3efc6,73cf848935e13a2e,2019-08-25T19:16:14.447Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",02256298,1,2000,0,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
6643,3bfd1a65,363c252fbb51ba5a,2019-10-01T18:02:11.838Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0267757a,1,2000,0,Mushroom Sorter (Assessment),Assessment,TREETOPCITY
8429,f56e0afc,3f92464665bbc7e8,2019-09-23T19:21:45.255Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",027e7ce5,1,2000,0,Bird Measurer (Assessment),Assessment,TREETOPCITY
10142,5b49460a,3d5b5a3897771980,2019-08-17T18:53:01.548Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",02a29f99,1,2000,0,Chest Sorter (Assessment),Assessment,CRYSTALCAVES
10166,7ad3efc6,b1e50bb61bf9a4d4,2019-09-01T22:06:38.480Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0300c576,1,2000,0,Cart Balancer (Assessment),Assessment,CRYSTALCAVES
