In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Reading Data

In [0]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import warnings
warnings.filterwarnings("ignore")


import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter
from scipy import stats

from sklearn import metrics
from itertools import product
import copy
import time

In [0]:
%%time
train = pd.read_csv('/content/drive/My Drive/introml/data-science-bowl-2019/train.csv')
train_labels = pd.read_csv('/content/drive/My Drive/introml/data-science-bowl-2019/train_labels.csv')
specs = pd.read_csv('/content/drive/My Drive/introml/data-science-bowl-2019/specs.csv')
sample_submission = pd.read_csv('/content/drive/My Drive/introml/data-science-bowl-2019/sample_submission.csv')

CPU times: user 53.4 s, sys: 6.28 s, total: 59.7 s
Wall time: 1min 37s


In [0]:
train.head()

Unnamed: 0,event_id,game_session,timestamp,event_data,installation_id,event_count,event_code,game_time,title,type,world
0,27253bdc,45bb1e1b6b50c07b,2019-09-06T17:53:46.937Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Welcome to Lost Lagoon!,Clip,NONE
1,27253bdc,17eeb7f223665f53,2019-09-06T17:54:17.519Z,"{""event_code"": 2000, ""event_count"": 1}",0001e90f,1,2000,0,Magma Peak - Level 1,Clip,MAGMAPEAK
2,77261ab5,0848ef14a8dc6892,2019-09-06T17:54:56.302Z,"{""version"":""1.0"",""event_count"":1,""game_time"":0...",0001e90f,1,2000,0,Sandcastle Builder (Activity),Activity,MAGMAPEAK
3,b2dba42b,0848ef14a8dc6892,2019-09-06T17:54:56.387Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,2,3010,53,Sandcastle Builder (Activity),Activity,MAGMAPEAK
4,1bb5fbdb,0848ef14a8dc6892,2019-09-06T17:55:03.253Z,"{""description"":""Let's build a sandcastle! Firs...",0001e90f,3,3110,6972,Sandcastle Builder (Activity),Activity,MAGMAPEAK


In [0]:
train.shape

(11341042, 11)

## Pre-processing

In [0]:
#dropping 
# 1. title --> welcome to lost lagoon is a start screen (world == None) (can drop)
# 2. some take the assessment but are not in train_labels 
# 3. some didn’t take assessment
# 4. Event data (its already out into individual columns) 

#drop world == NONE
train = train[train['world'] != 'NONE']

#drop rows that are not found inside train_label data --> never take assessment
assessment_data_id = list(train_labels['installation_id'].unique())
train = train[train['installation_id'].isin(assessment_data_id)]

#drop event_data 
train.drop(columns='event_data')

#change timestamp to datetime
# train['timestamp'] = pd.to_datetime(train['timestamp'])

print(train.shape)

In [0]:
print(f'Number of rows in train_labels: {train_labels.shape[0]}')
print(f'Number of unique game_sessions in train_labels: {train_labels.game_session.nunique()}')

Number of rows in train_labels: 17690
Number of unique game_sessions in train_labels: 17690


## Feature Engineering

### Encoding and Compiling of Event Data

In [0]:
def encode_titles(train, test, train_labels):
  train['edited_title'] = train['title'].astype(str).str.replace(' ', '')
  train['title_event_code'] = train['edited_title'].astype(str) + '_' + train['event_code'].astype(str)
  test['title_event_code'] = test['title'].astype(str) + '_' +  test['event_code'].astype(str)

  train['type_world'] = train['type'] + '_' + train['world']
  test['type_world'] = test['type'] + '_' + test['world']

  #make a list of all the unique values of columns that are not numbers
  list_of_all_titles = train['title'].unique()
  list_of_all_title_event_code = train['title_event_code'].unique()
  list_of_all_type_world = train['type_world'].unique()
  list_of_all_event_id = train['event_id'].unique()
  list_of_all_event_code = train['event_code'].unique()
  list_of_all_assessment_title = train[train['type'] == 'Assessment']['title'].unique()
  list_of_all_world = train['world'].unique()

  list_of_all_installation_id = list(set(train['installation_id'].unique()).union(test['installation_id'].unique()))
  list_of_all_game_session = list(set(train['game_session'].unique()).union(test['game_session'].unique()))

  #making a dictionary of all the list with keys for mapping later:
  dict_of_all_titles = dict(zip(list_of_all_titles, range(len(list_of_all_titles))))
  dict_of_all_labels = dict(zip(range(len(list_of_all_titles)), list_of_all_titles))
  # dict_of_all_type_worlds = dict(zip(list_of_all_type_world, range(len(list_of_all_type_world))))
  dict_of_all_world_labels = dict(zip(range(len(list_of_all_world)), list_of_all_world))
  dict_of_all_worlds = dict(zip(list_of_all_world, range(len(list_of_all_world))))

  dict_of_all_installation_ids = dict(zip(list_of_all_installation_id, range(len(list_of_all_installation_id))))
  dict_of_all_game_session = dict(zip(list_of_all_game_session, range(len(list_of_all_game_session))))


  #replace text with numbers from the dict above:
  train['title'] = train['title'].map(dict_of_all_titles)
  test['title'] = test['title'].map(dict_of_all_titles)

  train['world'] = train['world'].map(dict_of_all_worlds)
  test['world'] = test['world'].map(dict_of_all_worlds)

  train_labels['title'] = train_labels['title'].map(dict_of_all_worlds)

  #get the dict of win code 4100 but for bird measurer is 4110
  win_code = dict(zip(dict_of_all_titles.values(), (4100 * np.ones(len(dict_of_all_titles))).astype('int')))
  win_code[dict_of_all_titles['Bird Measurer (Assessment)']] = 4110

  #converting to datetime
  train['timestamp'] = pd.to_datetime(train['timestamp'])
  test['timestamp'] = pd.to_datetime(test['timestamp'])

  #putting all the list in to a dictionary
  event_data = {}
  event_data['win_code'] = win_code
  event_data['list_of_titles'] = list_of_all_titles
  event_data['list_of_all_title_event_code'] = list_of_all_title_event_code
  event_data['list_of_all_type_world'] = list_of_all_type_world
  event_data['list_of_all_event_id'] = list_of_all_event_id
  event_data['list_of_all_event_code'] = list_of_all_event_code
  event_data['list_of_all_assessment_title'] = list_of_all_assessment_title
  event_data['list_of_all_world'] = list_of_all_world
  event_data['dict_of_all_titles'] = dict_of_all_titles
  # event_data['dict_of_all_type_worlds'] = dict_of_all_type_worlds
  event_data['dict_of_all_worlds'] = dict_of_all_worlds
  event_data['dict_of_all_labels'] = dict_of_all_labels
  event_data['dict_of_all_world_labels'] = dict_of_all_world_labels
  event_data['dict_of_all_installation_ids'] = dict_of_all_installation_ids
  event_data['dict_of_all_game_session'] = dict_of_all_game_session
 
  #dropping of useless columns:
  train.drop(['edited_title'], axis=1, inplace=True)

  return train, test, event_data

### Aggregation of Rows + Assessment Features

In [0]:
## This function was adapted from the following links with some features added/removed:
## https://www.kaggle.com/mhviraf/a-new-baseline-for-dsb-2019-catboost-model
## https://www.kaggle.com/artgor/quick-and-dirty-regression


def get_data(user_sample, test_set=False):
    last_activity = 0
    
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    
    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy = 0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0
    accumulated_actions = 0
    counter = 0

    count_of_title = {title: 0 for title in event_data['list_of_titles']}
    count_of_event_id = {eid: 0 for eid in event_data['list_of_all_event_id']}
    count_of_event_code = {code: 0 for code in event_data['list_of_all_event_code']}
    last_accuracy_title = {'acc_' + title: -1 for title in event_data['list_of_all_assessment_title']}
    title_event_code_count: Dict[str, int] = {t_eve: 0 for t_eve in event_data['list_of_all_title_event_code']}
    
    for i, session in user_sample.groupby('game_session', sort=False):

        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]
                    
        # Engineering features for assessments
        if (session_type == 'Assessment') & (test_set or len(session)>1):
            features = user_activities_count.copy()
            features.update(last_accuracy_title.copy())
            features.update(count_of_event_code.copy())
            features.update(count_of_event_id.copy())
            features.update(count_of_title.copy())
            features.update(title_event_code_count.copy())
            
            features['installation_id'] = session['installation_id'].iloc[-1]
            features['session_title'] = session['title'].iloc[0]

            # accuracy features
            # get all attempts for assessments
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            # calculating correct and incorrect attempts
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts

            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
            # store the accuracy_group for the latest assessment taken for a particular assessment type
            last_accuracy_title['acc_' + session_title_text] = accuracy

            # based on competition guidelines
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1

            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            features['accumulated_actions'] = accumulated_actions
            
            if test_set:
                all_assessments.append(features)
            elif true_attempts+false_attempts > 0:
                all_assessments.append(features)
                
            counter += 1
        
        # helper function to count the titles, event code and event ids
        def update_counters(counter: dict, col: str):
                num_of_session_count = Counter(session[col])
                for k in num_of_session_count.keys():
                    x = k
                    if col == 'title':
                        x = activities_labels[k]
                    counter[x] += num_of_session_count[k]
                return counter
            
        count_of_event_code = update_counters(count_of_event_code, "event_code")
        count_of_event_id = update_counters(count_of_event_id, "event_id")
        count_of_title = update_counters(count_of_title, 'title')
        title_event_code_count = update_counters(title_event_code_count, 'title_event_code')

        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type 
                        
    if test_set: # only store the row corresponding to the last assessment for the test dataset
        return all_assessments[-1]
    return all_assessments # for train, store all rows

In [0]:
train, test, event_data = encode_titles(train,test,train_labels)

In [0]:
#Credits go to Massoud Hosseinali

compiled_data = []
for i, (ins_id, user_sample) in tqdm(enumerate(train.groupby('installation_id', sort=False)), total=train.installation_id.nunique(), desc='Installation_id', position=0):
    compiled_data += get_data(user_sample)

In [0]:
reduce_train = pd.DataFrame(compiled_data)
del compiled_data # reduce space
reduce_train.shape

(17690, 103)

In [0]:
reduce_train.head()

Unnamed: 0,Clip,Activity,Assessment,Game,Mushroom Sorter (Assessment),Bird Measurer (Assessment),"Heavy, Heavier, Heaviest",Pirate's Tale,Bottle Filler (Activity),Magma Peak - Level 1,Bubble Bath,Crystal Caves - Level 2,Chicken Balancer (Activity),Flower Waterer (Activity),Crystal Caves - Level 3,Tree Top City - Level 2,Treasure Map,Ordering Spheres,Balancing Act,Sandcastle Builder (Activity),Tree Top City - Level 1,Tree Top City - Level 3,Dino Drink,Leaf Leader,Welcome to Lost Lagoon!,Watering Hole (Activity),Air Show,All Star Sorting,Cauldron Filler (Assessment),Magma Peak - Level 2,Dino Dive,Lifting Heavy Things,Costume Box,Fireworks (Activity),Chow Time,Honey Cake,Cart Balancer (Assessment),Crystals Rule,12 Monkeys,Rulers,Crystal Caves - Level 1,Bug Measurer (Activity),Chest Sorter (Assessment),Egg Dropper (Activity),Pan Balance,Happy Camel,Scrub-A-Dub,Slop Problem,2050,4100,4230,5000,4235,2060,4110,5010,2070,2075,2080,2081,2083,3110,4010,3120,3121,4020,4021,4022,4025,4030,4031,3010,4035,4040,3020,3021,4045,2000,4050,2010,2020,4070,2025,2030,4080,2035,2040,4090,4220,4095,installation_id,session_title,accumulated_correct_attempts,accumulated_uncorrect_attempts,duration_mean,accumulated_accuracy,accuracy_group,0,1,2,3,accumulated_accuracy_group,accumulated_actions
0,11,3,0,4,0,0,0,0,0,0,0,0,0,164,0,0,0,0,0,89,0,0,0,0,0,0,0,163,0,0,0,0,0,91,0,0,0,0,0,0,0,0,0,0,0,0,115,0,6,0,0,0,0,0,0,0,0,0,4,1,2,77,4,7,9,92,14,31,19,121,0,79,1,0,7,9,0,18,0,0,20,94,4,18,0,0,6,4,0,0,0006a69f,0,0,0,0.0,0.0,3,0,0,0,0,0.0,647
1,14,4,1,6,0,0,0,0,0,0,0,0,0,164,0,0,0,0,0,89,0,0,0,0,0,0,193,163,0,0,0,0,0,91,0,0,0,78,0,0,0,104,0,0,0,0,115,0,6,5,0,0,0,1,2,0,1,0,4,1,2,223,6,11,16,127,14,31,37,149,0,226,6,2,11,16,0,25,0,1,26,156,5,22,0,1,6,4,0,0,0006a69f,1,1,0,39.0,1.0,0,0,0,0,1,3.0,1143
2,14,4,2,6,0,0,0,0,0,0,0,0,0,164,0,0,0,0,0,89,0,0,0,0,0,0,193,163,0,0,0,0,0,91,0,0,0,78,0,0,0,104,0,0,0,0,115,0,6,5,0,0,0,1,13,0,1,0,4,1,2,225,6,22,16,127,14,31,59,171,0,228,6,2,22,16,0,26,0,1,27,160,5,22,0,1,6,4,0,0,0006a69f,0,1,11,65.5,0.5,3,1,0,0,1,1.5,1230
3,24,9,4,10,0,0,0,0,165,0,133,0,0,254,0,0,0,0,0,126,0,0,110,0,0,80,193,229,0,0,0,0,0,1611,0,0,0,78,0,0,0,104,0,0,0,0,195,0,9,6,0,5,0,2,13,5,2,0,8,2,5,336,10,25,40,243,29,45,93,314,6,341,14,9,25,40,2,47,0,2,52,348,9,43,0,5,10,4,9,1,0006a69f,0,2,11,41.25,0.5,2,2,0,0,2,1.5,2159
4,28,10,5,13,0,0,0,0,165,0,133,0,0,254,0,0,0,0,0,126,0,0,110,0,0,80,336,229,0,0,0,0,0,1611,0,0,0,310,0,0,0,184,0,0,0,0,195,0,9,12,0,5,0,3,13,5,2,1,8,2,5,457,12,30,53,277,29,45,105,331,6,463,15,10,30,53,2,56,0,3,64,387,10,53,0,6,10,4,9,1,0006a69f,1,3,12,39.2,0.5,3,2,0,1,2,1.6,2586


In [0]:
import feather
path2 = '/content/drive/My Drive/introml/fe/fe2/fe2_train.feather'
feather.write_dataframe(reduce_train, path2)