In [1]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Show where the notebook is running, since all data paths below are relative to it.
print('current working directory: ' + os.getcwd())

current working directory: C:\Users\agarw\Dropbox\Kaggle data-science-bowl 2019\Prarit-data-science-bowl-2019


In [3]:
# List the working directory to confirm the competition CSV files are present.
print('files in cwd: ' + str(os.listdir()))

files in cwd: ['.git', '.ipynb_checkpoints', 'Feature Engineering Speed Up Experiments.ipynb', 'Feature Engineering.ipynb', 'Initial EDA.ipynb', 'sample_submission.csv', 'specs.csv', 'test.csv', 'train.csv', 'train_features.csv', 'train_labels.csv']


In [4]:
# Raw event log, one row per in-app event; path is relative to the working directory.
trainpath = 'train.csv'
train = pd.read_csv(filepath_or_buffer=trainpath)

In [5]:
# Per-assessment labels (num_correct, num_incorrect, accuracy, accuracy_group).
trainlblspath = 'train_labels.csv'
trainlbls = pd.read_csv(filepath_or_buffer=trainlblspath)

In [6]:
# Report (rows, columns) for both frames in a single call; the printed text is unchanged.
print('shape of train: {}\nshape of trainlbls: {}'.format(train.shape, trainlbls.shape))

shape of train: (11341042, 11)
shape of trainlbls: (17690, 7)


In [7]:
# Each installation_id identifies one app installation, used here as a proxy for a player.
n_players = train['installation_id'].nunique()
print('number of unique installation ids: {}'.format(n_players))

number of unique installation ids: 17000


In [8]:
# Peek at the first two raw event rows to see which columns are available.
train.head(2)

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK


In [9]:
# Peek at the first two label rows (one row per assessment game_session).
trainlbls.head(2)

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group
0,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.0,0


In [10]:
# Parse the timestamp strings into a proper datetime column so that
# sessions can later be ordered and subtracted in time.
train['datetime'] = pd.to_datetime(train['timestamp'])
train.head(2)

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world,datetime
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,2019-09-06 17:53:46.937
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,2019-09-06 17:54:17.519


In [11]:
# The string timestamp is redundant now that 'datetime' exists; keep only the parsed column.
train = train.drop(columns=['timestamp'])
train.head(2)

Unnamed: 0,event_id,game_session,event_data,installation_id,event_count,event_code,game_time,title,type,world,datetime
0,27253bdc,45bb1e1b6b50c07b,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,2019-09-06 17:53:46.937
1,27253bdc,17eeb7f223665f53,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK,2019-09-06 17:54:17.519


In [12]:
# How many distinct game sessions does the raw event log contain?
print('The number of unique sessions in train are: {}'.format(
    train['game_session'].nunique()))

The number of unique sessions in train are: 303319


In [13]:
# Group the raw events by game_session (the only grouping key used here).
# For now we only need per-session summaries such as total time and number of events,
# so every row of train is kept and the relevant rows are picked out of each group later.
# The order of the session ids does not matter, so sort=False skips the default key sort
# and saves time.
# The key is passed as a plain column name rather than a one-element list: with a
# length-1 list recent pandas versions expect tuple keys in get_group(), while later
# cells look groups up by the bare session-id string.
trngrp=train.groupby('game_session', sort=False)

In [14]:
# game_time holds the milliseconds elapsed since the session started and event_count is a
# running counter, so the final event of a session carries both the total time spent and
# the total number of events. The opening event of a session gives its start time.

# start of each game_session (what GroupBy.first() returns for every column)
start = trngrp.first()

# end of each game_session (what GroupBy.last() returns for every column)
end = trngrp.last()

# If 'start' really is the opening event of every session, its event_count must be 1.
print('The first row in each group has event count == 1? :{}'.format(
    (start['event_count'] == 1).all()))

The first row in each group has event count == 1? :True


In [15]:
# Sanity check on 'end': if it holds the final event of every session, its event_count
# must equal the largest event_count seen in that session.
print('All entries in the DataFrame {} are equal to max event_count for their game?: {}'.format(
    'end',
    (end['event_count'] == trngrp['event_count'].max()).all()))

All entries in the DataFrame end are equal to max event_count for their game?: False


In [16]:
# Some rows of 'end' do not carry the maximum event_count of their session.
# Flag every session, keep the offending ones, and pick one to inspect by hand.
end['max_event_count'] = end['event_count'] == trngrp['event_count'].max()
not_max = end.loc[~end['max_event_count']]

# first offending session, used as the worked example below
weird_session_id = not_max.index[0]
print('An example of game_session when the dataframe end does not have max event_count:{}'.format(
    weird_session_id))

An example of game_session when the dataframe end does not have max event_count:fa7e513faec3d0d8


In [17]:
# A single group of a groupby object can be pulled out with get_group(); see
# https://stackoverflow.com/questions/22702486/pandas-how-to-get-a-particular-group-after-groupby
weird_session = trngrp.get_group(weird_session_id)

# Compare the true maximum of the session with the value recorded in 'end'.
print('max event_count in {} is {}'.format(
    weird_session_id, weird_session['event_count'].max()))
print('the corresponding event_count in the dataframe end is {}'.format(
    end.at[weird_session_id, 'event_count']))

max event_count in fa7e513faec3d0d8 is 63
the corresponding event_count in the dataframe end is 62


In [18]:
# Display the events of the example session to see in what order its rows are stored.
weird_session

Unnamed: 0,datetime,event_code,event_count,event_data,event_id,game_time,installation_id,title,type,world
4752,2019-08-29 16:17:08.865,2000,1,"{""version"":""1.0"",""level"":0,""round"":0,""event_co...",6d90d394,0,0006a69f,Scrub-A-Dub,Game,MAGMAPEAK
4753,2019-08-29 16:17:11.207,4010,2,"{""coordinates"":{""x"":1142,""y"":951,""stage_width""...",7040c096,2342,0006a69f,Scrub-A-Dub,Game,MAGMAPEAK
4754,2019-08-29 16:17:13.073,2080,3,"{""movie_id"":""scrubadub_intro"",""duration"":13766...",5a848010,4209,0006a69f,Scrub-A-Dub,Game,MAGMAPEAK
4755,2019-08-29 16:17:14.657,2081,4,"{""movie_id"":""scrubadub_intro"",""duration"":13766...",c1cac9a2,5793,0006a69f,Scrub-A-Dub,Game,MAGMAPEAK
4756,2019-08-29 16:17:16.582,3010,7,"{""description"":""First you need to match the an...",f71c4741,7718,0006a69f,Scrub-A-Dub,Game,MAGMAPEAK
4757,2019-08-29 16:17:16.582,2020,6,"{""round_target"":{""size"":2,""type"":""Tub"",""animal...",26fd2d99,7718,0006a69f,Scrub-A-Dub,Game,MAGMAPEAK
4758,2019-08-29 16:17:16.582,2040,5,"{""level"":1,""round"":0,""event_count"":5,""game_tim...",dcaede90,7718,0006a69f,Scrub-A-Dub,Game,MAGMAPEAK
4759,2019-08-29 16:17:17.765,3110,8,"{""description"":""First you need to match the an...",f7e47413,8901,0006a69f,Scrub-A-Dub,Game,MAGMAPEAK
4760,2019-08-29 16:17:17.766,4020,9,"{""size"":2,""item_type"":""Tub"",""position"":1,""anim...",5c3d2b2f,8901,0006a69f,Scrub-A-Dub,Game,MAGMAPEAK
4761,2019-08-29 16:17:18.947,4070,10,"{""coordinates"":{""x"":1374,""y"":319,""stage_width""...",cf82af56,10085,0006a69f,Scrub-A-Dub,Game,MAGMAPEAK


Looking at all the events in game_session == 'fa7e513faec3d0d8' (the session displayed above), we see that although the rows of most game_sessions are sorted by their event_count, this is not the case for many sessions: here the events with event_count 7, 6 and 5 share the same timestamp and are stored in reverse order. Thus, picking the last row of each game_session is not a reliable method to find the last event in that game_session. 

One way to circumvent this is to sort the rows of each group according to their event_count. A similar problem was discussed in [this](https://stackoverflow.com/questions/15705630/get-the-rows-which-have-the-max-value-in-groups-using-groupby) stackexchange post. This is very useful post and in particular the answers by user a) Zelazny7, b) landewednack and c) WeNYoBen look particularly useful and promising. I personally think the best answer is by WeNYoBen (though it does not have the most votes). Not only is it elegant, but someone has also commented that it is faster than the solutions by the other 2. WeNYoBen's solution follows the logic that we should first sort the dataframe itself by event_count and then groupby 'game_session'. In this way the rows in the resulting groupby object so created will automatically be ordered by their event_count.  We will follow this approach. 

In [19]:
# Order all events by event_count first, then group: within every session the rows are
# now in event order, so first()/last() pick the opening and the final event.
# NOTE(review): GroupBy.first()/last() return the first/last non-null value per column,
# which equals the first/last row only if the columns contain no missing values - confirm.
trngrp = (
    train
    .sort_values(by='event_count')
    .groupby('game_session', sort=False)
)
start = trngrp.first()
end = trngrp.last()

print('The first row in each group has event count == 1? :{}'.format(
    (start['event_count'] == 1).all()))
print('The last row in each group has maximum event count?: {}'.format(
    (end['event_count'] == trngrp['event_count'].max()).all()))

The first row in each group has event count == 1? :True
The last row in each group has maximum event count?: True


In [20]:
# Within assessment sessions, events with code 4100 or 4110 record the player's attempts
# at solving the assessment; collect them and group them per game_session.
attempts = (
    train
    .loc[(train['type'] == 'Assessment') & train['event_code'].isin([4100, 4110])]
    .groupby('game_session', sort=False)
)

In [21]:
# Number of distinct assessment sessions that contain at least one attempt event.
print('No. of different assessment sessions captured in the dataframe attempts {}'.format(
    attempts.ngroups))

No. of different assessment sessions captured in the dataframe attempts 17692


Recall that some assessments were attempted unsuccessfully and then abandoned before the player arrived at a correct solution. How do I find the assessments that were abandoned before completion? I looked at the last event of some completed assessments but could not find a property that holds for all completed assessments. 

For example, I looked at the game_session == '901acc108f55a5a1'. This was a completed assessment. The last event (row = 2232 of train) had an event_code = 2010, its event_data had a substring 'game_completed' and its event_id is 'a5be6304'.

In [22]:
# Show the final event recorded in 'end' for the example assessment session.
end.loc['901acc108f55a5a1', :]

event_id                                                    a5be6304
event_data         {"session_duration":39803,"exit_type":"game_co...
installation_id                                             0006a69f
event_count                                                       48
event_code                                                      2010
game_time                                                      39803
title                                   Mushroom Sorter (Assessment)
type                                                      Assessment
world                                                    TREETOPCITY
datetime                                  2019-08-06 05:22:41.147000
Name: 901acc108f55a5a1, dtype: object

In [23]:
# Show the full (untruncated) event_data string of that final event.
end.loc['901acc108f55a5a1'].event_data

'{"session_duration":39803,"exit_type":"game_completed","event_count":48,"game_time":39803,"event_code":2010}'

 By looking at the above result, we might think that all completed assessments might have 2010 for the event_code of their last event. Let's check if this is true.

In [24]:
# Count the assessment sessions that contain at least one event with event_code 2010.
print('No. of assessments containing an event with event code 2010: {}'.format(
    train.loc[(train['type'] == 'Assessment') & (train['event_code'] == 2010),
              'game_session'].nunique()))

No. of assessments containing an event with event code 2010: 11691


Recall that train_labels.csv has data of 17690 different assessments. Assuming that these all were assessments that were not abandoned prematurely, we see that there are only 11691 assessments that have an event with event_code 2010. This is far less than the assessments in train_labels and hence can not be a universal approach to finding completed assessments. 

Similarly, we can look for assessments whose last event has the phrase 'game completed' in its event_data

In [25]:
# For the final event of every assessment session, test whether its event_data contains
# the literal substring 'game_completed' (plain substring match, no regular expression).
completed = (
    end.loc[end['type'] == 'Assessment', 'event_data']
    .str.contains('game_completed', regex=False)
)
print('No. of assessments containing the phrase game_completed: {} '.format(
    completed.sum()))

No. of assessments containing the phrase game_completed: 2925 


Once again, we see that this number is far smaller than the number of sessions recorded in train_labels.csv.

Let us check if all completed assessments will have event_id == 'a5be6304'. 

In [26]:
# Assessment sessions whose final event carries the event_id seen in the example above.
completed = end[(end['type'] == 'Assessment') & (end['event_id'] == 'a5be6304')]
print('No. of assessments having event_id a5be6304: {}'.format(len(completed)))

No. of assessments having event_id a5be6304: 2926


This too is way less than the number of sessions in train_labels.csv

As a last resort, we notice that the event_data for the last event of a completed session always seems to contain information about the session duration, preceded by the phrase 'session_duration'. Let's check how many assessments contain this phrase.

In [27]:
# Same check as before, this time for the literal substring 'session_duration'.
completed = (
    end.loc[end['type'] == 'Assessment', 'event_data']
    .str.contains('session_duration', regex=False)
)
print('No. of assessments containing the phrase session_duration: {} '.format(
    completed.sum()))

No. of assessments containing the phrase session_duration: 11691 


This is the same as the number of assessments found above that contain an event with event_code 2010. At this point, I am unable to find a generic pattern that fits all completed sessions. Perhaps it is wrong to assume that all the sessions in train_labels.csv were completed. It is probably better to look at all sessions that were attempted at least once, whether successfully or unsuccessfully, and treat them as valid assessments. 

At this point, I have the following 3 dataframes containing various kinds of information extracted from train.csv:

  - start: This contains info about the starting event in each session
  
  - end: This contains info about the last event in each session
  
  - attempts: This contains the events with event_code 4100 or 4110 and corresponds to the players   attempts at solving the assessments


Note that the last event for each session already contains information about its event_count and session_duration i.e. this info is easily available from the dataframe 'end'. To look at the history of a player before they attempt a particular assessment, all I need is to add the session start_time for each session in 'end'. The session start_time can be obtained from the dataframe 'start'. Let's include the session start_time for each session in 'end'.

In [28]:
# For now the only thing needed from 'start' is each session's starting time,
# so reduce it to its 'datetime' column (kept as a one-column DataFrame).
start = start[['datetime']]

# Both frames are indexed by game_session, so join them index-to-index. The two
# 'datetime' columns are told apart by suffix: '_end' (last event) and '_begin' (first event).
# reset_index() then turns the game_session index back into a regular column.
end = (
    end
    .merge(start, left_index=True, right_index=True, suffixes=('_end', '_begin'))
    .reset_index()
)

end.head(2)

Unnamed: 0,game_session,event_id,event_data,installation_id,event_count,event_code,game_time,title,type,world,datetime_end,datetime_begin
0,45bb1e1b6b50c07b,27253bdc,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE,2019-09-06 17:53:46.937,2019-09-06 17:53:46.937
1,63609927192e83a2,27253bdc,"{""event_code"": 2000, ""event_count"": 1}",572c3ef7,1,2000,0,Crystal Caves - Level 2,Clip,CRYSTALCAVES,2019-09-28 20:59:42.368,2019-09-28 20:59:42.368


In [29]:
# Do any sessions end with a non-positive game_time? If so, list the distinct values.
end.loc[end['game_time'] <= 0, 'game_time'].unique()

array([0], dtype=int64)

In [30]:
# Remove game sessions that were exited immediately after starting, i.e. sessions whose
# final game_time is 0, and report the size of 'end' before and after.
# Every message needs a {} placeholder; without one, str.format() silently discards its
# argument and the shape is never printed.
print('shape of end before removing sessions with 0 game_time: {}'.format(end.shape))
print('number of sessions with 0 game_time: {}'.format(end.loc[end.game_time==0].shape[0]))
end = end.loc[end.game_time>0]
print('shape of end after removing sessions with 0 game_time: {}'.format(end.shape))

shape of end before removing sessions with 0 game_time: 
number of sessions with 0 game_time: 186361
shape of end after removing sessions with 0 game_time: (116958, 12)


In [31]:
# Peek at 'end' after the zero-game_time sessions were filtered out.
end.head(2)

Unnamed: 0,game_session,event_id,event_data,installation_id,event_count,event_code,game_time,title,type,world,datetime_end,datetime_begin
3,0382082774e38509,1325467d,"{""coordinates"":{""x"":2,""y"":546,""stage_width"":10...",572c3ef7,86,4070,2532634,Sandcastle Builder (Activity),Activity,MAGMAPEAK,2019-09-29 01:22:53.621,2019-09-29 00:36:46.713
4,fa63d1d1806f0895,1325467d,"{""coordinates"":{""x"":44,""y"":101,""stage_width"":1...",572c3ef7,74,4070,2612710,Sandcastle Builder (Activity),Activity,MAGMAPEAK,2019-09-29 02:06:57.526,2019-09-29 01:23:19.039


In [32]:
# Every row in 'attempts' is one attempt event, so the size of each group is the number
# of attempts made in that assessment session.
# Overview of what can be done with groupby objects:
# https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#group-by-split-apply-combine
n_attempts = attempts.size()

In [33]:
# 'attempts' covers 17692 assessment sessions but train_labels.csv only 17690, so any
# comparison has to be restricted to the sessions present in both.
# Build the two sets of session ids and intersect them.

set1 = set(n_attempts.index)
set2 = set(trainlbls['game_session'])
common = set1 & set2
print('The number of assessments common to attempts and trainlbls is: {}'.format(len(common)))

The number of assessments common to attempts and trainlbls is: 17690


We see that the number of assessments common to 'attempts' and 'trainlbls' is 17690. Since, trainlbls has exactly 17690 rows, this implies all the assessments included in trainlbls are there in 'attempts' too.

Earlier, we had conjectured that the number of rows for each game_session in 'attempts' corresponds to the number of attempts made during that assessment, since each row corresponds to an event with event_code 4100 or 4110. Let us quickly check that this is indeed true by comparing with the corresponding numbers in 'trainlbls'.

In [34]:
# Attach the per-session attempt counts to trainlbls. merge() needs a DataFrame, so the
# n_attempts Series is first turned into a two-column frame (game_session, n_attempts).
temp_dat = n_attempts.reset_index(name='n_attempts')

trainlbls = trainlbls.merge(temp_dat, how='left', on='game_session')

# According to the labels, the total number of attempts is num_correct + num_incorrect.
trainlbls['num'] = trainlbls['num_correct'] + trainlbls['num_incorrect']

# Do the counted attempt events agree with the labels for every session?
print('n_attempts is equal to total number of attempts for all assessments: {}'.format(
    (trainlbls['num'] == trainlbls['n_attempts']).all()))

n_attempts is equal to total number of attempts for all assessments: False


In [35]:
# Sessions where the counted attempt events disagree with the labelled number of attempts.

trainlbls[trainlbls['num'] != trainlbls['n_attempts']].head()

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group,n_attempts,num
4,a9ef3ecb3d1acc6a,0006a69f,Bird Measurer (Assessment),1,0,1.0,3,2,1
6,957406a905d59afd,0006c192,Bird Measurer (Assessment),1,1,0.5,2,3,2
8,ae691ec5ad5652cf,00129856,Bird Measurer (Assessment),1,0,1.0,3,2,1
20,5025f22d6e944533,002db7e3,Bird Measurer (Assessment),1,4,0.2,1,6,5
22,8f50f2fd987cade4,002db7e3,Bird Measurer (Assessment),1,1,0.5,2,3,2


In [36]:
# From the previous cell, session 'a9ef3ecb3d1acc6a' is one assessment where n_attempts
# differs from the labelled number of attempts (2 attempt events vs. 1 labelled attempt).
# Display all of its rows in 'attempts' to find out why.
attempts.get_group('a9ef3ecb3d1acc6a')

Unnamed: 0,event_id,game_session,event_data,installation_id,event_count,event_code,game_time,title,type,world,datetime
4137,17113b36,a9ef3ecb3d1acc6a,"{""correct"":true,""caterpillars"":[4,8,5],""event_...",0006a69f,14,4110,13050,Bird Measurer (Assessment),Assessment,TREETOPCITY,2019-08-06 20:50:12.115
4153,070a5291,a9ef3ecb3d1acc6a,"{""correct"":true,""hats"":[4,8,5],""event_count"":3...",0006a69f,30,4100,34209,Bird Measurer (Assessment),Assessment,TREETOPCITY,2019-08-06 20:50:33.283


We see that this was an assessment titled 'Bird Measurer'. Recall that attempts in 'Bird Measurer' are assigned the event code 4110. The events with code 4100 in 'Bird Measurer (Assessment)' are therefore not to be interpreted as attempts. Since 'attempts' contains all the events with code 4100 or 4110, for 'Bird Measurer (Assessment)' it can contain more rows than there were attempts, as was the case with session 'a9ef3ecb3d1acc6a'. 

Let us check what other assessments titles have a discrepancy between n_attempts and number of attempts. 

In [37]:
# Which assessment titles show a mismatch between counted and labelled attempts?
trainlbls.loc[trainlbls['n_attempts'] != trainlbls['num'], 'title'].unique()

array(['Bird Measurer (Assessment)'], dtype=object)

We see that the discrepancy arises only for 'Bird Measurer (Assessment)'. It must therefore be caused by the events with event_code 4100. Let's recompute 'attempts' with this caveat in mind. 

In [38]:
# Attempt events use code 4110 in 'Bird Measurer (Assessment)' and code 4100 in every
# other assessment; pick the expected code per row and keep only the matching events.
attempts = (
    train
    .loc[(train['type'] == 'Assessment')
         & (train['event_code'] == np.where(train['title'] == 'Bird Measurer (Assessment)',
                                            4110, 4100))]
    .groupby('game_session', sort=False)
)

In [39]:
# Recount the attempts per session with the corrected filter, as a two-column frame.
n_attempts = attempts.size().reset_index(name='n_attempts_corrected')
n_attempts.head(2)

Unnamed: 0,game_session,n_attempts_corrected
0,901acc108f55a5a1,1
1,77b8ee947eb84b4e,11


In [40]:
# Replace the earlier (over-counted) n_attempts column by the corrected count:
# drop the old column, then left-join the recalculated counts on game_session.
trainlbls = (
    trainlbls
    .drop(columns=['n_attempts'])
    .merge(n_attempts, how='left', on='game_session')
)

trainlbls.head(2)

Unnamed: 0,game_session,installation_id,title,num_correct,num_incorrect,accuracy,accuracy_group,num,n_attempts_corrected
0,6bdf9623adc94d89,0006a69f,Mushroom Sorter (Assessment),1,0,1.0,3,1,1
1,77b8ee947eb84b4e,0006a69f,Bird Measurer (Assessment),0,11,0.0,0,11,11


In [41]:
# Verify that the recalculated count matches the labelled number of attempts everywhere.
print('n_attempts captures the correct number of attempts for all assessments: {}'.format(
    (trainlbls['num'] == trainlbls['n_attempts_corrected']).all()))

n_attempts captures the correct number of attempts for all assessments: True


In [42]:
# Number of assessment sessions left in 'attempts' after correcting the event-code filter.
print('No. of different assessment sessions captured in the dataframe attempts: {}'.format(
    attempts.ngroups))

No. of different assessment sessions captured in the dataframe attempts: 17690


Note that this also cures a previous issue with 'attempts' that it seemed to contain 2 more assessment sessions that were not there in train_labels.csv

Next, we will look at the various events in each session and compute num_correct, num_incorrect, accuracy and accuracy_group of each session. While for the training data, this information can be easily obtained from train_labels.csv, for the test data we have to extract this info from the raw data itself. It is therefore helpful to do this already. 

To collect the aforementioned statistics, we can use the [Groupby.agg()](https://pandas.pydata.org/pandas-docs/version/0.22/generated/pandas.core.groupby.DataFrameGroupBy.agg.html) method which applies the function passed to it to each group in the groupby object

We will therefore define a function called 'num_correct' which accepts the event_data of each group in the 'attempts' dataframe. Note that the event_data of each group is a pandas.Series object whose entries correspond to the event data of each attempt event in that session. We will then check each entry for whether the attempt was correct (the event data records this as `true`); applying this check to every entry, for example with the [map()](https://www.geeksforgeeks.org/python-map-function/) function, gives us the total number of correct attempts. From it we similarly get num_incorrect, total_attempts, accuracy and accuracy_group.

In [43]:
# Per-session attempt statistics: number of correct/incorrect attempts, accuracy and
# accuracy group for one assessment session.

# Accuracy values with a dedicated accuracy group; any other accuracy maps to group 1.
accuracy_dict={0:0, 1:3, 0.5:2}


def _attempt_is_correct(raw_event) -> bool:
    """Return True when one attempt's event_data marks the attempt as correct.

    Only the "correct" field is consulted, so a payload that merely contains the text
    "true" somewhere else (another boolean field, a description) is not miscounted.
    """
    try:
        payload = json.loads(raw_event)
    except (TypeError, ValueError):
        # Not parseable as JSON (e.g. truncated text): fall back to a textual check
        # that is still anchored on the "correct" field.
        return '"correct":true' in str(raw_event).replace(' ', '')
    return isinstance(payload, dict) and payload.get('correct') is True


def num_correct(event_data: pd.Series):
    """Summarise the attempts of a single assessment session.

    Parameters
    ----------
    event_data : pd.Series
        The event_data JSON strings of the session's attempt events, one per attempt.

    Returns
    -------
    tuple
        (num_correct, num_incorrect, total_attempts, accuracy, accuracy_group), where
        accuracy is num_correct / total_attempts and accuracy_group is 0, 3 or 2 for an
        accuracy of exactly 0, 1 or 0.5, and 1 for any other accuracy.
        A session without any attempts yields (0, 0, 0, 0.0, 0) instead of dividing by zero.
    """
    outcomes=[_attempt_is_correct(raw_event) for raw_event in event_data.values]
    total_attempts=len(outcomes)
    n_right=sum(outcomes)
    n_wrong=total_attempts-n_right

    if total_attempts==0:
        # no attempts at all: nothing was solved
        return 0, 0, 0, 0.0, 0

    accuracy=n_right/total_attempts

    # dict.get() supplies the default group 1 for every accuracy that is not a key of
    # accuracy_dict (0, 1 and 0.5 are exactly representable, so the lookup is safe).
    accuracy_group=accuracy_dict.get(accuracy, 1)

    return n_right, n_wrong, total_attempts, accuracy, accuracy_group

In [44]:
# Collapse every assessment session (one group of attempt events) into a single row:
#   - event_data: the tuple returned by num_correct for that session's attempts
#   - installation_id, datetime, title: the value of the group's first row
# NOTE(review): "first row" is positional. 'attempts' was built from 'train' without
# sorting by event_count, and an earlier cell showed train is not always in event order,
# so 'datetime' is not guaranteed to be the earliest attempt - confirm.
# reset_index() turns the game_session group key back into a regular column.
results=attempts.agg({'event_data': num_correct,
                      'installation_id':lambda x: x.values[0],
                      'datetime': lambda x: x.values[0],
                      'title': lambda x: x.values[0]}).reset_index()
results.head()

Unnamed: 0,game_session,event_data,installation_id,datetime,title
0,901acc108f55a5a1,"(1, 0, 1, 1.0, 3)",0006a69f,2019-08-06 05:22:32.357,Mushroom Sorter (Assessment)
1,77b8ee947eb84b4e,"(0, 11, 11, 0.0, 0)",0006a69f,2019-08-06 05:35:54.898,Bird Measurer (Assessment)
2,6bdf9623adc94d89,"(1, 0, 1, 1.0, 3)",0006a69f,2019-08-06 05:38:08.036,Mushroom Sorter (Assessment)
3,9501794defd84e4d,"(1, 1, 2, 0.5, 2)",0006a69f,2019-08-06 20:35:12.290,Mushroom Sorter (Assessment)
4,a9ef3ecb3d1acc6a,"(1, 0, 1, 1.0, 3)",0006a69f,2019-08-06 20:50:12.115,Bird Measurer (Assessment)


In [45]:
# Every entry of results.event_data is a 5-tuple; spread it over five named columns,
# carry over the identifying columns from 'results' (same row order, same default index)
# and sort by game_session so the rows can later be compared against trainlbls.
results_dat = (
    pd.DataFrame(results['event_data'].tolist(),
                 columns=['n_correct', 'n_wrong', 'n_total', 'accuracy', 'accuracy_group'])
    .assign(game_session=results['game_session'],
            installation_id=results['installation_id'],
            datetime=results['datetime'],
            title=results['title'])
    .sort_values('game_session')
)
results_dat.head()

Unnamed: 0,n_correct,n_wrong,n_total,accuracy,accuracy_group,game_session,installation_id,datetime,title
110,1,0,1,1.0,3,00097cda27afb726,01bdd720,2019-09-15 16:37:00.433,Mushroom Sorter (Assessment)
4736,0,6,6,0.0,0,000f68cff32664ef,3f0dca37,2019-10-12 23:07:44.119,Chest Sorter (Assessment)
3117,1,3,4,0.25,1,0014403daadf67aa,29d1aaee,2019-08-03 02:43:41.573,Bird Measurer (Assessment)
6517,1,2,3,0.333333,1,0014daa1d3e26eb2,55fdf49f,2019-09-24 22:29:11.121,Mushroom Sorter (Assessment)
1949,1,1,2,0.5,2,001c49e9e9968dbe,19d4b097,2019-07-26 15:50:46.687,Bird Measurer (Assessment)


In [46]:
# Validate results_dat against train_labels.csv.
# Both frames are ordered by game_session, so row i of each describes the same assessment
# session and the columns can be compared position by position through their raw arrays.
trainlbls = trainlbls.sort_values(by='game_session')

print('The num of correct attempts for each assessment session is same: {}'.format(
    (results_dat['n_correct'].values == trainlbls['num_correct'].values).all()))

print('The num of incorrect attempts for each assessment session is same: {}'.format(
    (results_dat['n_wrong'].values == trainlbls['num_incorrect'].values).all()))

print('The accuracy group for each assessment session is same: {}'.format(
    (results_dat['accuracy_group'].values == trainlbls['accuracy_group'].values).all()))

print('Corresponding rows contain the same session_id: {}'.format(
    (results_dat['game_session'].values == trainlbls['game_session'].values).all()))

print('Corresponding rows contain the same installation_id: {}'.format(
    (results_dat['installation_id'].values == trainlbls['installation_id'].values).all()))

The num of correct attempts for each assessment session is same: True
The num of incorrect attempts for each assessment session is same: True
The accuracy group for each assessment session is same: True
Corresponding rows contain the same session_id: True
Corresponding rows contain the same installation_id: True


Note that at this point, results_dat has all the info we wanted for each assessment session: game_session, installation_id, datetime, title, n_correct, n_wrong, n_total, accuracy and accuracy_group. 
We therefore don't need attempts for the timebeing and can delete it if the need to save memory arises.

In [47]:
# One row per assessment session; expected to match the 17690 rows of train_labels.csv.
results_dat.shape

(17690, 9)

We will now merge 'results_dat' with 'end' on their installation_id. For each installation_id, this will create a dataframe containing pairs formed by pairing every assessment of that player with all the game_sessions of that player.  

In [48]:
# Pair every assessment of a player with every game session of the same player by
# left-joining on installation_id. Clashing column names get the suffix '_assessment'
# (from results_dat) or '_y' (from end).
session_pairs = results_dat.merge(
    end,
    how='left',
    on='installation_id',
    suffixes=('_assessment', '_y'),
)
session_pairs.head(2)

Unnamed: 0,n_correct,n_wrong,n_total,accuracy,accuracy_group,game_session_assessment,installation_id,datetime,title_assessment,game_session_y,event_id,event_data,event_count,event_code,game_time,title_y,type,world,datetime_end,datetime_begin
0,1,0,1,1.0,3,00097cda27afb726,01bdd720,2019-09-15 16:37:00.433,Mushroom Sorter (Assessment),12384d925d55856b,cdd22e43,"{""object"":""chicken"",""layout"":{""left"":{""chicken...",219,4035,243052,Chicken Balancer (Activity),Activity,CRYSTALCAVES,2019-09-15 00:20:15.130,2019-09-15 00:16:12.108
1,1,0,1,1.0,3,00097cda27afb726,01bdd720,2019-09-15 16:37:00.433,Mushroom Sorter (Assessment),284c96e23f38743a,7ec0c298,"{""description"":""It's Chow Time! We have some V...",120,3010,129121,Chow Time,Game,CRYSTALCAVES,2019-09-15 00:12:18.338,2019-09-15 00:10:09.282


In [49]:
# Size of the (assessment, session) pair table produced by the merge.
session_pairs.shape

(1047509, 20)

In [50]:
# Check how many distinct assessment sessions are present after the merge.
session_pairs.game_session_assessment.nunique()

17690

In [51]:
# We are interested in the history leading up to each assessment, so only the pairs in
# which the assessment happened after the other session started are of interest.

# Time difference between the assessment's datetime and the other session's start,
# stored as integer nanoseconds (the same unit the former Timedelta.delta attribute gave).
# The subtraction and the cast are vectorised: no per-row Python loop, and no reliance on
# Timedelta.delta, which no longer exists in pandas 2.x.
# NOTE(review): a missing datetime_begin (NaT) would become the minimum int64 value here;
# presumably every assessment has at least one matching session - confirm.
session_pairs['timedelta']=(
    (session_pairs['datetime']-session_pairs['datetime_begin'])
    .values
    .astype('timedelta64[ns]')
    .astype('int64')
)

In [52]:
# Display the first two rows of session_pairs.
session_pairs.head(2)

Unnamed: 0,n_correct,n_wrong,n_total,accuracy,accuracy_group,game_session_assessment,installation_id,datetime,title_assessment,game_session_y,...,event_data,event_count,event_code,game_time,title_y,type,world,datetime_end,datetime_begin,timedelta
0,1,0,1,1.0,3,00097cda27afb726,01bdd720,2019-09-15 16:37:00.433,Mushroom Sorter (Assessment),12384d925d55856b,...,"{""object"":""chicken"",""layout"":{""left"":{""chicken...",219,4035,243052,Chicken Balancer (Activity),Activity,CRYSTALCAVES,2019-09-15 00:20:15.130,2019-09-15 00:16:12.108,58848325000000
1,1,0,1,1.0,3,00097cda27afb726,01bdd720,2019-09-15 16:37:00.433,Mushroom Sorter (Assessment),284c96e23f38743a,...,"{""description"":""It's Chow Time! We have some V...",120,3010,129121,Chow Time,Game,CRYSTALCAVES,2019-09-15 00:12:18.338,2019-09-15 00:10:09.282,59211151000000


Note that the assessment time entry under the datetime column of session_pairs corresponds to the time of the first attempt at solving that session. Meanwhile, the entry under the datetime_begin column corresponds to the start of the corresponding game_session. Thus, even when both members of a game_session pair correspond to the same assessment, the value of datetime-datetime_begin is non-zero. In fact, it will be positive.

In [53]:
# Show every pair whose timedelta is exactly zero.
session_pairs[session_pairs['timedelta'].eq(0)]

Unnamed: 0,n_correct,n_wrong,n_total,accuracy,accuracy_group,game_session_assessment,installation_id,datetime,title_assessment,game_session_y,...,event_data,event_count,event_code,game_time,title_y,type,world,datetime_end,datetime_begin,timedelta


In [54]:
# Self-pairs are the rows in which the assessment is paired with its own
# game_session. Verify that datetime - datetime_begin is strictly positive
# for every one of them.
assess = session_pairs[
    session_pairs['game_session_assessment'] == session_pairs['game_session_y']
]
print(' datetime-timebegin is positive for all pairs when both members are the same assessment: {}'.format(
    assess['timedelta'].gt(0).all()
))

 datetime-timebegin is positive for all pairs when both members are the same assessment: True


In [55]:
# The history of an assessment is made of the rows whose timedelta is > 0.
# Keep in mind that this selection still contains the self-pairs, i.e. rows
# where game_session_assessment == game_session_y.
# One could additionally require game_session_assessment != game_session_y,
# but that is not preferable here: some assessments have no other activity in
# their history, so their only row in session_pairs is the self-pair, and
# filtering self-pairs out would drop those assessments entirely.

history = session_pairs[session_pairs['timedelta'].gt(0)]
history.head(2)

Unnamed: 0,n_correct,n_wrong,n_total,accuracy,accuracy_group,game_session_assessment,installation_id,datetime,title_assessment,game_session_y,...,event_data,event_count,event_code,game_time,title_y,type,world,datetime_end,datetime_begin,timedelta
0,1,0,1,1.0,3,00097cda27afb726,01bdd720,2019-09-15 16:37:00.433,Mushroom Sorter (Assessment),12384d925d55856b,...,"{""object"":""chicken"",""layout"":{""left"":{""chicken...",219,4035,243052,Chicken Balancer (Activity),Activity,CRYSTALCAVES,2019-09-15 00:20:15.130,2019-09-15 00:16:12.108,58848325000000
1,1,0,1,1.0,3,00097cda27afb726,01bdd720,2019-09-15 16:37:00.433,Mushroom Sorter (Assessment),284c96e23f38743a,...,"{""description"":""It's Chow Time! We have some V...",120,3010,129121,Chow Time,Game,CRYSTALCAVES,2019-09-15 00:12:18.338,2019-09-15 00:10:09.282,59211151000000


In [56]:
# Display the (rows, columns) shape of history.
history.shape

(555285, 21)

In [68]:
# hs1: history grouped per assessment session.
# hs2: history grouped per assessment session, session type and session title.
# sort=False skips sorting of the group keys.
hs1 = history.groupby('game_session_assessment', sort=False)
hs2 = history.groupby(['game_session_assessment', 'type', 'title_y'], sort=False)

In [69]:
# Report how many (game_session_assessment, type, title_y) groups hs2 contains.
print('No. of groups in hs2:{}'.format(hs2.ngroups))

No. of groups in hs2:218062


In [71]:
# Collect the key of every group in hs2 (the keys of the hs2.groups mapping).
# Based on the following Stack Overflow post:
# https://stackoverflow.com/questions/42513049/get-all-keys-from-groupby-object-in-pandas
grp_names=hs2.groups.keys()
# The number of collected keys should agree with hs2.ngroups.
print('no. of group index collected: {}'.format(len(grp_names)))

no. of group index collected: 218062


In [65]:
# Count the assessment groups in hs1 that contain exactly one row; such a
# history presumably holds only the assessment's own self-pair.
print('No. of assessments which have no other activity in their history: {}'.format(
    hs1.size().eq(1).sum()
))

No. of assessments which have no other activity in their history: 466


In [60]:
# I looked for a way to apply agg at different levels of a multi-index groupby object, but could not find one.
# The following tutorials, though not useful for my current case, might still be interesting for other purposes.
# For a brief tutorial on Multi-index objects see:
#    a) https://www.datacamp.com/community/tutorials/pandas-multi-index for tutorial on multi-index 
#    b) https://stackoverflow.com/questions/35244623/how-to-find-the-number-of-groups-in-multi-index-groupby-object-in-pandas

# for a very brief tutorial on Data aggregation with Multi-index objects see:
# https://jakevdp.github.io/PythonDataScienceHandbook/03.05-hierarchical-indexing.html#Data-Aggregations-on-Multi-Indices

In [77]:
# function to collect the statistics of each group in the groupby object 'hs2'

In [81]:
# Mean event_count for every (game_session_assessment, type, title_y) group.
# The built-in groupby mean gives the same result as agg(lambda x: x.mean())
# but avoids calling a Python lambda once per group.
hs2['event_count'].mean()

game_session_assessment  type        title_y                      
00097cda27afb726         Activity    Chicken Balancer (Activity)      219.000000
                         Game        Chow Time                        120.000000
                         Assessment  Cart Balancer (Assessment)        18.500000
                         Game        Leaf Leader                      135.000000
                                     Happy Camel                       57.000000
                                     Scrub-A-Dub                      489.000000
                                     All Star Sorting                  57.000000
                                     Dino Drink                       159.000000
                         Activity    Fireworks (Activity)              81.000000
                         Assessment  Mushroom Sorter (Assessment)      36.000000
                         Activity    Watering Hole (Activity)         129.000000
                                     Flowe