In [1]:
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/data-science-bowl-2019/sample_submission.csv
/kaggle/input/data-science-bowl-2019/specs.csv
/kaggle/input/data-science-bowl-2019/train_labels.csv
/kaggle/input/data-science-bowl-2019/test.csv
/kaggle/input/data-science-bowl-2019/train.csv


# Importing libraries

In [2]:
import pandas as pd
import numpy as np
from time import time
import datetime as dt
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier

# Getting data

In [5]:
# removing 'event_data' from loading data to save RAM

load_columns = ['event_id',
                'game_session',
                'timestamp',                
                'installation_id',
                'event_count',
                'event_code',
                'game_time',
                'title',
                'type',
                'world']

path = '/kaggle/input/data-science-bowl-2019/' # create url path to the datasets

t0 = time()

print('Loading datasets...')
X_train = pd.read_csv(path + 'train.csv', usecols = load_columns) # all users events 
X_labels = pd.read_csv(path + 'train_labels.csv') # all users labels
# specs = pd.read_csv(path + 'specs.csv') # save memory
X_test = pd.read_csv(path + 'test.csv', usecols = load_columns)
submission = pd.read_csv(path + 'sample_submission.csv')
print("Datasets loaded successfully! \nLoading time:", round(time()-t0, 3), "s")

Loading datasets...
Datasets loaded successfully! 
Loading time: 55.04 s


# Data preprocessing

###### **<font color='blue'>(!!!) Reducing train df with users having accuracy scores **</font>

In [6]:
# Updating X_train with data about users who have accuracy score
X_train = X_train[X_train['installation_id'].isin(set(X_labels.installation_id))]

###### **<font color='blue'>(optional) collect garbage to reduce 11.7GB RAM usage-> **</font>

In [7]:
# import gc
# gc.collect()

0

###### **<font color='blue'>(!!!) Encoding categorical features of train & test**</font>

In [8]:
# Kaggle crashes at 16GB RAM usage
# This Kernel will have less features vs Colab version

# 1. Cast categorical variables into pandas categorical data type
X_train['world'] = pd.Categorical(X_train['world'])
# X_train['event_code'] = pd.Categorical(X_train['event_code']) # removing event_code from encoding to save memory
X_train['title'] = pd.Categorical(X_train['title'])
X_train['type'] = pd.Categorical(X_train['type'])

X_test['world'] = pd.Categorical(X_test['world'])
# X_test['event_code'] = pd.Categorical(X_test['event_code']) # removing event_code from encoding to save memory
X_test['title'] = pd.Categorical(X_test['title'])
X_test['type'] = pd.Categorical(X_test['type'])

In [9]:
# 2. Convert categorical variables into dummy/indicator variables
# Also drop the encoded columns
X_train = pd.get_dummies(data = X_train, columns=['world', 'title', 'type'], prefix = 'dummy')
X_test = pd.get_dummies(data = X_test, columns=['world', 'title', 'type'], prefix = 'dummy')

###### **<font color='blue'>(!!!) sum/last train & test dfs game_time and event_count**</font>

In [10]:
X_train['game_time_total'] = X_train.groupby('installation_id')['game_time'].transform('last') # not sure if game_time accumulates or separate for each game session
X_test['game_time_total'] = X_test.groupby('installation_id')['game_time'].transform('last') # not sure if game_time accumulates or separate for each game session

In [None]:
X_train['event_count_total'] = X_train.groupby('installation_id')['event_count'].transform('sum')
X_test['event_count_total'] = X_test.groupby('installation_id')['event_count'].transform('sum')

###### **<font color='blue'>(!!!) preparing train_labels**</font>

In [11]:
# To match test df, additional data from train_labels is not used for training
#X_labels['correct_total'] = X_labels.groupby('installation_id')['num_correct'].transform('sum')
#X_labels['incorrect_total'] = X_labels.groupby('installation_id')['num_incorrect'].transform('sum')
#X_labels['accuracy_mean'] = X_labels.groupby('installation_id')['accuracy'].transform('mean')
#X_labels['accuracy_group_mean'] = X_labels.groupby('installation_id')['num_incorrect'].transform('mean')
X_labels['Y_target'] = X_labels.groupby('installation_id')['accuracy_group'].transform('last')

In [None]:
X_labels.head()

###### **<font color='blue'>(!!! v2) preparing train_labels with just Y_target last value**</font>

In [12]:
X_labels['Y_target'] = X_labels.groupby('installation_id')['accuracy_group'].transform('last')

In [None]:
X_labels.head(2)

###### **<font color='blue'>(!) dropping unusused columns in train, test & train_labels**</font>[](http://)

In [None]:
X_train.head()

In [13]:
# v2: event data was not even loaded to save memory
X_train = X_train.drop(['event_id', 'game_session', 'timestamp', 'event_count', 'event_code', 'game_time'], axis=1) # dp: event_code dropped just for Kaggle to save memory
X_test = X_test.drop(['event_id', 'game_session', 'timestamp', 'event_count',  'event_code', 'game_time'], axis=1) # dp: event_code dropped just for Kaggle to save memory
X_labels = X_labels.drop(['game_session', 'title', 'num_correct', 'num_incorrect', 'accuracy', 'accuracy_group'], axis=1) # havent included separate title (Sorter, etc) correct/incorrect/accuracy 

In [14]:
X_train.shape, X_test.shape, X_labels.shape

((7734558, 54), (1156414, 54), (17690, 2))

In [15]:
X_train.head(2)

Unnamed: 0,installation_id,dummy_CRYSTALCAVES,dummy_MAGMAPEAK,dummy_NONE,dummy_TREETOPCITY,dummy_12 Monkeys,dummy_Air Show,dummy_All Star Sorting,dummy_Balancing Act,dummy_Bird Measurer (Assessment),...,dummy_Tree Top City - Level 1,dummy_Tree Top City - Level 2,dummy_Tree Top City - Level 3,dummy_Watering Hole (Activity),dummy_Welcome to Lost Lagoon!,dummy_Activity,dummy_Assessment,dummy_Clip,dummy_Game,event_count_total
1538,0006a69f,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,226162
1539,0006a69f,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,226162


###### **<font color='blue'>(!!!) Grouping train and test dfs**</font>

In [16]:
X_train = X_train.groupby('installation_id').agg(sum)

In [17]:
X_test = X_test.groupby('installation_id').agg(sum) 

In [None]:
X_test.head(2)

###### **<font color='blue'>(!) Grouping users in X_labels by the last assessment accuracy group**</font>

In [18]:
X_labels = X_labels.groupby('installation_id').last() 

In [None]:
X_labels.head()

###### **<font color='blue'>(!!!) merge train with train_labels**</font>

In [19]:
X_train = pd.merge(X_train, X_labels, how = 'left', on = ['installation_id'])

In [20]:
X_train.shape

(3614, 54)

In [21]:
X_train.head()

Unnamed: 0_level_0,dummy_CRYSTALCAVES,dummy_MAGMAPEAK,dummy_NONE,dummy_TREETOPCITY,dummy_12 Monkeys,dummy_Air Show,dummy_All Star Sorting,dummy_Balancing Act,dummy_Bird Measurer (Assessment),dummy_Bottle Filler (Activity),...,dummy_Tree Top City - Level 2,dummy_Tree Top City - Level 3,dummy_Watering Hole (Activity),dummy_Welcome to Lost Lagoon!,dummy_Activity,dummy_Assessment,dummy_Clip,dummy_Game,event_count_total,Y_target
installation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0006a69f,0.0,1910.0,4.0,1887.0,2.0,295.0,203.0,0.0,119.0,293.0,...,2.0,2.0,148.0,4.0,1771.0,261.0,37.0,1732.0,859641762,3
0006c192,207.0,1343.0,4.0,670.0,1.0,0.0,0.0,2.0,200.0,250.0,...,1.0,2.0,43.0,4.0,1206.0,343.0,32.0,643.0,434685248,0
00129856,201.0,319.0,0.0,310.0,0.0,0.0,0.0,1.0,40.0,219.0,...,0.0,0.0,0.0,0.0,786.0,43.0,1.0,0.0,59203070,3
001d0ed0,373.0,281.0,1.0,373.0,0.0,0.0,0.0,2.0,51.0,0.0,...,2.0,2.0,0.0,1.0,76.0,202.0,38.0,712.0,70143524,3
00225f67,316.0,0.0,1.0,640.0,1.0,37.0,122.0,1.0,37.0,0.0,...,1.0,1.0,0.0,1.0,291.0,65.0,15.0,586.0,66915354,0


# Model

In [22]:
X_train.columns.shape, X_test.columns.shape

((54,), (53,))

In [23]:
# Setting target & features
y = X_train.Y_target
feature_names = X_train.columns.drop('Y_target')
X = X_train[feature_names]

In [24]:
# RandomForest

from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

forest_model = RandomForestClassifier(random_state = 1)
forest_model.fit(train_X, train_y)
rfc_preds = forest_model.predict(val_X)

# Save RandomForest accuracy
rfc_score = accuracy_score(val_y, rfc_preds)
print(accuracy_score(val_y, rfc_preds))

0.4668141592920354




In [25]:
# XGBoost

from sklearn.model_selection import train_test_split
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state = 0)

import xgboost as xgb
from sklearn.metrics import accuracy_score

xgb_model = XGBClassifier(max_depth = 5, learning_rate = 0.01, n_estimators = 2000, n_jobs = -1, colsample_bytree = 0.1)
xgb_model.fit(train_X, train_y)
xgb_preds = xgb_model.predict(val_X)

# Save XGBoost accuracy
xgb_score = accuracy_score(val_y, xgb_preds)
print(accuracy_score(val_y, xgb_preds))

0.5331858407079646


# Submission

In [26]:
submission.head()

Unnamed: 0,installation_id,accuracy_group
0,00abaee7,3
1,01242218,3
2,017c5718,3
3,01a44906,3
4,01bc6cb6,3


In [27]:
xgb_preds = xgb_model.predict(X_test)
submission['accuracy_group'] = xgb_preds
submission.to_csv("submission.csv", index = False)
submission.head(2)

Unnamed: 0,installation_id,accuracy_group
0,00abaee7,3
1,01242218,3
2,017c5718,3
3,01a44906,3
4,01bc6cb6,3
...,...,...
995,fee254cf,3
996,ff57e602,3
997,ffc73fb2,3
998,ffe00ca8,3


# Summary

XGBoost is a marginal winner
Gives LB value of 0.125 of QWK