In [None]:
# Presettings (imports and so on)
import numpy as np
import pandas as pd
import seaborn as sns
import pandas as pd

from sklearn import preprocessing
# Single shared LabelEncoder instance; the cells below refit it for every column they encode
le = preprocessing.LabelEncoder()

import os
# Print the full path of every file found under the Kaggle input directory
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# All competition CSVs are read from one input folder; keeping the folder in a
# single constant means the location only has to be changed in one place.
DATA_DIR = "../input/data-science-bowl-2019"

sample_submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
specs = pd.read_csv(os.path.join(DATA_DIR, "specs.csv"))
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
train_labels = pd.read_csv(os.path.join(DATA_DIR, "train_labels.csv"))

In [None]:
# Print a summary of the labels frame
train_labels.info()

In [None]:
# Right join on game_session: keeps every row of train_labels and attaches the matching train events.
# Columns present in both frames get suffixes: title_x / installation_id_x come from `train`,
# title_y / installation_id_y from `train_labels` (these suffixed names are used by later cells).
full_train = train.merge(train_labels, on='game_session', how='right')

# DATA EXPLORATION

### * - Train*

In [None]:
# Preview the first five raw event rows
train.head(5)

In [None]:
# Print a summary of the raw train frame
train.info()

In [None]:
# Descriptive statistics of the raw train frame
train.describe()

> ### event_id

*  Type: object
*  Correlation (target) : ???
*  Domain knowledge: randomly generated unique identifier for the event type. Maps to event_id column in specs table
*  Verdict: map to specs column and do a research [Feature engineering]

> ### game_session

*  Type: object
*  Correlation (target) : ???
*  Domain knowledge: Randomly generated unique identifier grouping events within a single game or video play session.
*  Verdict: drop for now, because we can't extract any information from this feature yet

> ### timestamp 

*  Type: object
*  Correlation (target) : ???
*  Domain knowledge: Client-generated datetime
*  Verdict: extract the time of day, then use bin encoding for further correlation exploration [Feature engineering]

> ### event_data  

*  Type: object
*  Correlation (target) : ???
*  Domain knowledge: Semi-structured JSON formatted string containing the events parameters. Default fields are: event_count, event_code, and game_time; otherwise fields are determined by the event type
*  Verdict: remove at the moment

>  ### installation_id

*  Type: object
*  Correlation (target) : random feature
*  Domain knowledge: randomly generated unique identifier grouping game sessions within a single installed application instance
*  Verdict: remove from dataset at the moment

>  ### event_count

In [None]:
# Distribution plot of event_count over the labelled training events
sns.distplot(full_train['event_count']);

In [None]:
# Line plot of accuracy_group against event_count
sns.relplot(x="event_count", y="accuracy_group", kind="line", data=full_train);

In [None]:
# Correlation between the target and event_count, rounded to two decimals
event_count_corr = full_train['accuracy_group'].corr(full_train['event_count'])
print('Correlation on full dataset:', round(event_count_corr, 2))

*  Type: numeric
*  Correlation (target) : is present
*  Domain knowledge: Incremental counter of events within a game session (offset at 1). Extracted from event_data.
*  Verdict: use in ML with outliers dropping

>  ### event_code

In [None]:
# Distribution plot of the raw event_code values
sns.distplot(full_train['event_code']);

In [None]:
# Line plot of accuracy_group against event_code
sns.relplot(x="event_code", y="accuracy_group", kind="line", data=full_train);

In [None]:
# Correlation between the target and event_code, rounded to two decimals
event_code_corr = full_train['accuracy_group'].corr(full_train['event_code'])
print('Correlation on full dataset:', round(event_code_corr, 2))

*  Type: numeric
*  Correlation (target) : is present
*  Domain knowledge: Identifier of the event 'class'. Unique per game, but may be duplicated across games. E.g. event code '2000' always identifies the 'Start Game' event for all games. Extracted from event_data.
*  Verdict: use in ML with bins encoding (3 bins overall)

>  ### game_time

In [None]:
# Distribution plot of game_time
sns.distplot(full_train['game_time']);

In [None]:
# Box plot of game_time. The series is passed explicitly as `x=`: newer seaborn
# releases interpret the first positional argument as `data`, which would change
# the orientation of the plot, while `x=` is understood by old and new versions alike.
sns.boxplot(x=full_train['game_time']);

In [None]:
# Correlation between the target and game_time, rounded to two decimals
game_time_corr = full_train['accuracy_group'].corr(full_train['game_time'])
print('Correlation on full dataset:', round(game_time_corr, 2))

*  Type: numeric
*  Correlation (target) : is not present
*  Domain knowledge: Time in milliseconds since the start of the game session. Extracted from event_data.
*  Verdict: remove from dataset at the moment

>  ### title 

In [None]:
# Count the distinct titles (train-side column title_x)
n_titles = full_train['title_x'].nunique()
print('Num of unique values in "Title" column:', n_titles)

In [None]:
# Bar plot of accuracy_group per title, with the tick labels rotated by 30 degrees
ax = sns.barplot(x=full_train['title_x'], y=full_train['accuracy_group'])
title_tick_labels = ax.get_xticklabels()
ax.set_xticklabels(title_tick_labels, rotation=30)

In [None]:
# Label-encode the titles so that a correlation with the target can be computed.
# Note: title_x is overwritten in place, and `le` is refit by later cells,
# so the title -> integer mapping learned here is not retained.
encoded_titles = le.fit_transform(full_train['title_x'])
full_train['title_x'] = encoded_titles
title_corr = full_train['accuracy_group'].corr(full_train['title_x'])
print('Correlation on full dataset:', round(title_corr, 2))

*  Type: object
*  Correlation (target) : is present
*  Domain knowledge: Title of the game or video.
*  Verdict: use in ML

>  ### type

In [None]:
# Count the distinct media types
n_types = full_train['type'].nunique()
print('Num of unique values in "Type" column:', n_types)

*  Type: object
*  Correlation (target) : ???
*  Domain knowledge: Media type of the game or video. Possible values are: 'Game', 'Assessment', 'Activity', 'Clip'. There is only one name in selected data.
*  Verdict: use in ML at the moment with label encoding

>  ### world 

In [None]:
# Count the distinct worlds
n_worlds = full_train['world'].nunique()
print('Num of unique values in "World" column:', n_worlds)

In [None]:
# Bar plot of accuracy_group per world, with the tick labels rotated by 30 degrees
ax = sns.barplot(x=full_train['world'], y=full_train['accuracy_group'])
world_tick_labels = ax.get_xticklabels()
ax.set_xticklabels(world_tick_labels, rotation=30)

*  Type: object
*  Correlation (target) : is present
*  Domain knowledge: The section of the application the game or video belongs to. Helpful to identify the educational curriculum goals of the media. Possible values are: 'NONE' (at the app's start screen), 'TREETOPCITY' (Length/Height), 'MAGMAPEAK' (Capacity/Displacement), 'CRYSTALCAVES' (Weight). There are only three names in the selected data.
*  Verdict: use in ML with label encoding

# FEATURE ENGINEERING

### * - Train*

> ### event_id

In [None]:
# Attach the specs columns to every training event, matching on event_id
specs_by_event = specs.set_index('event_id')
train_with_specs = full_train.join(specs_by_event, on='event_id')
for specs_column in ('info', 'args'):
    print(f'Num of unique values in "{specs_column}" column:', train_with_specs[specs_column].nunique())

I'm going to remove "info" and continue working only with "args", because they carry the same information for our purposes

In [None]:
# Drop 'info' and replace the 'args' strings by integer labels
train_with_specs = train_with_specs.drop(columns='info')
encoded_args = le.fit_transform(train_with_specs['args'])
train_with_specs['args'] = encoded_args
train_with_specs.head(2)

In [None]:
# Distribution plot of the label-encoded args
sns.distplot(train_with_specs['args'])

In [None]:
# Bar plot of accuracy_group per encoded args value; labels rotated to stay readable
ax = sns.barplot(data=train_with_specs, x='args', y='accuracy_group', color="salmon", saturation=.6)
args_tick_labels = ax.get_xticklabels()
p = ax.set_xticklabels(args_tick_labels, rotation=90)

In [None]:
# Correlation between the target and the encoded args, rounded to two decimals
args_corr = train_with_specs['accuracy_group'].corr(train_with_specs['args'])
print('Correlation on full dataset:', round(args_corr, 2))

Finally, remove unnecessary "event_id" column

In [None]:
# event_id is no longer needed once the specs columns have been joined
train_with_specs = train_with_specs.drop(columns='event_id')

> ### game_session

In [None]:
# Remove the session identifier from the feature frame
train_with_specs = train_with_specs.drop(columns='game_session')

> ### timestamp

In [None]:
# Parse the timestamp once, then derive weekday and hour-of-day from it
event_time = pd.to_datetime(train_with_specs['timestamp'])
train_with_specs['time'] = event_time
train_with_specs['weekday'] = train_with_specs['time'].dt.weekday
train_with_specs['hours'] = train_with_specs['time'].dt.hour

In [None]:
# Distribution plot of the weekday feature
sns.distplot(train_with_specs['weekday'])

In [None]:
# Bar plot of accuracy_group per weekday
ax = sns.barplot(x='weekday', y='accuracy_group', data=train_with_specs)

In [None]:
# Correlation between the target and weekday, rounded to two decimals
weekday_corr = train_with_specs['accuracy_group'].corr(train_with_specs['weekday'])
print('Correlation on full dataset:', round(weekday_corr, 2))

In [None]:
# weekday is not kept as a feature
train_with_specs = train_with_specs.drop(columns='weekday')

As we can see, 'weekday' doesn't give us useful info

In [None]:
# Distribution plot of the hour-of-day feature
sns.distplot(train_with_specs['hours'])

In [None]:
# Bar plot of accuracy_group per hour of day
ax = sns.barplot(x='hours', y='accuracy_group', data=train_with_specs, color="salmon", saturation=.6)

In [None]:
# Correlation between the target and hour of day, rounded to two decimals
hours_corr = train_with_specs['accuracy_group'].corr(train_with_specs['hours'])
print('Correlation on full dataset:', round(hours_corr, 2))

'hours' doesn't give us useful info either

In [None]:
# hours is not kept as a feature
train_with_specs = train_with_specs.drop(columns='hours')

>  ### event_data

In [None]:
# The raw JSON event payload is not used as a feature
train_with_specs = train_with_specs.drop(columns='event_data')

>  ### installation_id

In [None]:
# Remove both installation id columns produced by the train / train_labels merge
train_with_specs = train_with_specs.drop(columns=['installation_id_x', 'installation_id_y'])

>  ### event_count

Remove outliers

In [None]:
# Keep only rows with event_count <= 200; larger counts are treated as outliers.
# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning.
train_with_specs = train_with_specs.loc[train_with_specs['event_count'] <= 200].copy()

In [None]:
# Distribution plot of event_count after the outlier filter
sns.distplot(train_with_specs['event_count'])

In [None]:
# Line plot of accuracy_group against event_count after the outlier filter
sns.relplot(x="event_count", y="accuracy_group", kind="line", data=train_with_specs);

In [None]:
# Correlation between the target and event_count on the filtered frame
filtered_event_count_corr = train_with_specs['accuracy_group'].corr(train_with_specs['event_count'])
print('Correlation on full dataset:', round(filtered_event_count_corr, 2))

>  ### event_code

Bins encoding

In [None]:
# Bin event codes into three right-closed ranges: (0, 2500], (2500, 3500], (3500, 8000].
# - The upper edge is 8000, the same edge the test-side binning uses, so that both
#   frames are cut with identical bins.
# - labels=False makes pd.cut return the bin index (0, 1, 2) directly. Encoding the
#   Interval objects with LabelEncoder instead would number only the bins that
#   actually occur, so an empty bin would silently shift the codes.
# NOTE: this cell overwrites event_code, so it must not be run twice on the same frame.
bins = [0, 2500, 3500, 8000]
train_with_specs['event_code'] = pd.cut(train_with_specs['event_code'], bins=bins, labels=False)

In [None]:
# Distribution plot of the binned event_code
sns.distplot(train_with_specs['event_code'])

In [None]:
# Line plot of accuracy_group against the binned event_code.
# (This section is about event_code; plotting event_count here was a copy-paste slip.)
sns.relplot(x="event_code", y="accuracy_group", kind="line", data=train_with_specs);

In [None]:
# Correlation between the target and the binned event_code.
# (This section is about event_code; correlating event_count here was a copy-paste slip.)
binned_event_code_corr = train_with_specs['accuracy_group'].corr(train_with_specs['event_code'])
print('Correlation on full dataset:', round(binned_event_code_corr, 2))

>  ### game_time 

In [None]:
# game_time is not kept as a feature
train_with_specs = train_with_specs.drop(columns='game_time')

>  ### timestamp 

In [None]:
# The raw timestamp string is not kept as a feature
train_with_specs = train_with_specs.drop(columns='timestamp')

>  ### title  

In [None]:
# Drop the labels-side title column; the encoded train-side title_x stays
train_with_specs = train_with_specs.drop(columns='title_y')

>  ### type  

In [None]:
# Fit on the full list of media types so the codes do not depend on which types occur in the data
media_types = ['Game', 'Assessment', 'Activity', 'Clip']
train_with_specs['type'] = le.fit(media_types).transform(train_with_specs['type'])

>  ### world  

In [None]:
# Fit on the full list of worlds so the codes do not depend on which worlds occur in the data
worlds = ['NONE', 'TREETOPCITY', 'MAGMAPEAK', 'CRYSTALCAVES']
train_with_specs['world'] = le.fit(worlds).transform(train_with_specs['world'])

In [None]:
# Correlation between the target and the encoded world, rounded to two decimals
world_corr = train_with_specs['accuracy_group'].corr(train_with_specs['world'])
print('Correlation on full dataset:', round(world_corr, 2))

Filtering step:

In [None]:
# Remove the parsed datetime and the label-derived columns other than the target
leftover_columns = ['time', 'num_correct', 'num_incorrect', 'accuracy']
train_with_specs = train_with_specs.drop(columns=leftover_columns)

### * - Test*

Do pretty much the same as in the train data

>  ### event_id 

In [None]:
# Attach the specs columns to every test event, matching on event_id
specs_by_event = specs.set_index('event_id')
test_with_specs = test.join(specs_by_event, on='event_id')
for specs_column in ('info', 'args'):
    print(f'Num of unique values in "{specs_column}" column:', test_with_specs[specs_column].nunique())

In [None]:
# 'info' is not used as a feature
test_with_specs.drop('info', axis=1, inplace=True)

# Encode 'args' with the SAME string -> integer mapping the training frame received.
# The train-side encoder was fitted on the args of the event ids that occur in
# full_train, so that encoder is rebuilt here. Refitting on the test values instead
# would assign different integers to the same args string, and the model would
# then see features that mean something else than during training.
train_args = specs.loc[specs['event_id'].isin(full_train['event_id']), 'args']
args_encoder = preprocessing.LabelEncoder().fit(train_args)
args_to_code = {value: code for code, value in enumerate(args_encoder.classes_)}
# args values that never occurred in training get -1 instead of colliding with a known code
test_with_specs['args'] = test_with_specs['args'].map(args_to_code).fillna(-1).astype(int)
test_with_specs.head(2)

In [None]:
# Distribution plot of the encoded args in the test frame
sns.distplot(test_with_specs['args'])

In [None]:
# event_id is no longer needed once the specs columns have been joined
test_with_specs = test_with_specs.drop(columns='event_id')

>  ### game_session 

In [None]:
# Remove the session identifier from the test feature frame
test_with_specs = test_with_specs.drop(columns='game_session')

>  ### timestamp 

In [None]:
# The raw timestamp string is not used as a feature
test_with_specs = test_with_specs.drop(columns='timestamp')

>  ### event_data

In [None]:
# The raw JSON event payload is not used as a feature
test_with_specs = test_with_specs.drop(columns='event_data')

>  ### installation_id

nothing to be done

>  ### event_count

nothing to be done

>  ### event_code

In [None]:
# Distribution plot of the raw event_code values in the test frame
sns.distplot(test_with_specs['event_code'])

In [None]:
# Bin event codes into three right-closed ranges: (0, 2500], (2500, 3500], (3500, 8000].
# labels=False makes pd.cut return the bin index (0, 1, 2) directly. Encoding the
# Interval objects with LabelEncoder instead would number only the bins that
# actually occur, so an empty bin would silently shift the codes.
# NOTE: this cell overwrites event_code, so it must not be run twice on the same frame.
bins = [0, 2500, 3500, 8000]
test_with_specs['event_code'] = pd.cut(test_with_specs['event_code'], bins=bins, labels=False)

In [None]:
# Distribution plot of the binned event_code in the test frame
sns.distplot(test_with_specs['event_code'])

>  ### game_time

In [None]:
# game_time is not used as a feature
test_with_specs = test_with_specs.drop(columns='game_time')

>  ### title

In [None]:
# Encode 'title' with the SAME title -> integer mapping the model was trained on.
# The training feature title_x holds the train-side titles of the events whose
# game_session appears in train_labels, so the encoder is rebuilt from exactly
# those titles. Refitting on the test titles would give the same title a
# different integer than it had during training.
in_labelled_session = train['game_session'].isin(train_labels['game_session'])
train_titles = train.loc[in_labelled_session, 'title']
title_encoder = preprocessing.LabelEncoder().fit(train_titles)
title_to_code = {title: code for code, title in enumerate(title_encoder.classes_)}
# Titles that never occurred in the training frame get -1 instead of colliding with a known code
test_with_specs['title'] = test_with_specs['title'].map(title_to_code).fillna(-1).astype(int)

>  ### type

In [None]:
# Fit on the full list of media types so test and train receive the same codes
media_types = ['Game', 'Assessment', 'Activity', 'Clip']
test_with_specs['type'] = le.fit(media_types).transform(test_with_specs['type'])

>  ### world

In [None]:
# Fit on the full list of worlds so test and train receive the same codes
worlds = ['NONE', 'TREETOPCITY', 'MAGMAPEAK', 'CRYSTALCAVES']
test_with_specs['world'] = le.fit(worlds).transform(test_with_specs['world'])

In [None]:
# Summary of the prepared test frame, for comparison with the train frame
test_with_specs.info()

In [None]:
# Summary of the prepared train frame, for comparison with the test frame
train_with_specs.info()

# MODELLING

In [None]:
# Target vector and feature matrix (every column except the target)
Y = train_with_specs['accuracy_group']
X = train_with_specs.drop(columns='accuracy_group')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the forest (and therefore the submission) reproducible
# across "Restart & Run All"; without it every run trains a different model.
clf = RandomForestClassifier(n_estimators=10, random_state=42)
clf = clf.fit(X, Y)

Make a prediction for each event row (stored in `session_predictions`); the rows are aggregated per installation afterwards

In [None]:
# Build the test feature matrix with exactly the column names and order used in fit:
# - the test frame calls the title feature 'title' while the model was fitted with
#   'title_x'; recent scikit-learn versions reject mismatching feature names;
# - 'session_predictions' is excluded too, so re-running this cell does not feed
#   the previous predictions back in as a feature.
test_features = (
    test_with_specs
    .drop(columns=['installation_id', 'session_predictions'], errors='ignore')
    .rename(columns={'title': 'title_x'})
)[X.columns]
test_with_specs['session_predictions'] = clf.predict(test_features)

In [None]:
import math
# Average the row-level predictions per installation and round the mean up
mean_prediction = test_with_specs.groupby(['installation_id'])['session_predictions'].mean()
submit = pd.DataFrame(np.ceil(mean_prediction))

In [None]:
# Copy the installation id from the index into a regular column (appended last)
submit = submit.assign(installation_id=submit.index)

In [None]:
# Print one column name per line
for column_name in submit.columns:
    print(column_name)

In [None]:
# Move the last column to the front, keeping the order of the others
current_order = list(submit.columns)
cols = [*current_order[-1:], *current_order[:-1]]
submit = submit[cols]

In [None]:
# Give the prediction column the target name
submit = submit.rename(columns={'session_predictions': 'accuracy_group'})

In [None]:
# Rename the copied id column; the index still carries the installation id
submit = submit.rename(columns={'installation_id': 'id'})

In [None]:
# Turn the index back into a regular column
submit = submit.reset_index()

In [None]:
# Drop the duplicated 'id' column
submit = submit.drop(columns='id')

In [None]:
# Display the submission frame
submit

In [None]:
# Cast the predicted accuracy_group to int
submit = submit.astype({'accuracy_group': int})

In [None]:
# Display the submission frame
submit

In [None]:
# Write the submission file without the index column
submit.to_csv('submission.csv', index=False)