Code quality and readability should also apply to data science notebooks.

Here I hope to write a notebook that is understandable and modular, so that other Kagglers can reuse part or all of this code.

Credit:
- I first understood the data with this
  - https://www.kaggle.com/mhviraf/a-baseline-for-dsb-2019
- An example of quality starter code, although this is not a notebook
  - https://www.kaggle.com/gpreda/data-science-bowl-fast-compact-solution

Comments on this notebook:
- The line below is slow: it takes 38 minutes to compute over the entire dataframe. Suggestions for speeding it up are appreciated.
```
train_df_features = train_actions.loc[([index[0]], slice(None,index[1][:-1]), slice(None)), :]
```
- However, this step can be skipped on subsequent runs, because you can load the pickle it produces. I do not save the test dataset because I suspect the entire non-training dataset is replaced once you submit the notebook for evaluation on the public leaderboard.
- You can see the input and output dataframe just before the section - LightGBM training and predicting

In [None]:
import scipy as sp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm_notebook, tqdm

from sklearn.model_selection import StratifiedKFold, KFold

import lightgbm as lgb

# so that we can print multiple dataframe in the same cell
from IPython.display import display, HTML
def displayer(df, num_rows=2):
    """Render the first ``num_rows`` rows of ``df`` as an HTML table.

    Calling this several times in one cell shows several dataframes,
    which a bare ``df.head()`` at the end of the cell cannot do.
    """
    preview = df.head(num_rows)
    display(HTML(preview.to_html()))
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 4000)
tqdm.pandas()  # for progress_apply

!ls ../input/data-science-bowl-2019/

# Input Processing

In [None]:
%%time
# loading dataframes
# Action table for training; columns used later in this notebook include
# timestamp, installation_id, game_session, event_data and title.
train_actions = pd.read_csv("../input/data-science-bowl-2019/train.csv")
print(train_actions.shape)

# Action table for testing; it goes through the same processing steps.
test_actions = pd.read_csv("../input/data-science-bowl-2019/test.csv")
print(test_actions.shape)

# Labelled training rows; accuracy_group is the prediction target.
train_df = pd.read_csv("../input/data-science-bowl-2019/train_labels.csv")
print(train_df.shape)
# Target values, in the row order of train_labels.csv.
# NOTE(review): train_df is given a new index and passed through sort_index()
# further down; anything that pairs these labels with train_df rows by
# position must use the same row order.
train_labels = train_df["accuracy_group"]

# the actual test_df will be sourced from test_actions
# (this frame provides the installation_id order and the submission layout)
test_df_reference = pd.read_csv("../input/data-science-bowl-2019/sample_submission.csv")
print(test_df_reference.shape)

# Loaded and displayed here; not referenced again in this notebook.
specs = pd.read_csv("../input/data-science-bowl-2019/specs.csv")
print(specs.shape)

# Show the first two rows of every frame as a quick sanity check.
displayer(train_actions)
displayer(test_actions)
displayer(train_df)
displayer(test_df_reference)
displayer(specs)

In [None]:
def extract_time_features(df):
    """Parse ``timestamp`` in place and add a ``dayofweek`` column.

    The ``timestamp`` column of ``df`` is replaced by its
    ``pd.to_datetime`` conversion and ``dayofweek`` is derived from it.
    ``df`` itself is modified and is also returned.

    Other calendar parts (hour, date, month, year) are available from the
    same ``.dt`` accessor but are currently not extracted.
    """
    parsed = pd.to_datetime(df['timestamp'])
    df['timestamp'] = parsed
    df['dayofweek'] = parsed.dt.dayofweek
    return df

# Parse the timestamps and add the dayofweek column on both action tables.
train_actions = extract_time_features(train_actions)
test_actions = extract_time_features(test_actions)

In [None]:
%%time
# reassign game_session_id for indexing
# Every distinct game_session gets a sequential integer id, numbered in order
# of first appearance across train_actions, test_actions and train_df.
# Numbering starts at 10**8 + 1, so ids keep nine digits (as long as there are
# fewer than 9 * 10**8 sessions) and their string order equals numeric order.
game_session_lst = [
    *train_actions["game_session"],
    *test_actions["game_session"],
    *train_df["game_session"],
]
game_session_dict = {}
counter = 10**8
for game_session in game_session_lst:
    if game_session in game_session_dict:
        continue  # already numbered on an earlier occurrence
    counter += 1
    game_session_dict[game_session] = counter
        
def reassign(df):
    """Build the replacement ``game_session`` labels for ``df``.

    Each label is the row's ``installation_id`` followed by the sequential
    id stored in ``game_session_dict`` for its ``game_session`` and a
    trailing ``"x"``.

    Returns a list with one label per row, in row order.
    """
    return [
        installation + str(game_session_dict[session]) + "x"
        for installation, session in zip(df["installation_id"], df["game_session"])
    ]

# Replace game_session in all three frames with the
# "<installation_id><sequential id>x" labels built by reassign().
train_actions["game_session"] = reassign(train_actions)
test_actions["game_session"] = reassign(test_actions)
train_df["game_session"] = reassign(train_df)

In [None]:
%%time
# reindex dataframes and lexsort for segmentations later
# 'idx' records each row's position before sorting and becomes the third
# index level, which makes every index entry unique.
train_actions['idx'] = np.arange(train_actions.shape[0])
test_actions['idx'] = np.arange(test_actions.shape[0])
train_df['idx'] = np.arange(train_df.shape[0])

train_actions.set_index(['installation_id', 'game_session', 'idx'], inplace=True)
test_actions.set_index(['installation_id', 'game_session', 'idx'], inplace=True)
train_df.set_index(['installation_id', 'game_session', 'idx'], inplace=True)

train_actions = train_actions.sort_index()
test_actions = test_actions.sort_index()
train_df = train_df.sort_index()

# sort_index() can reorder the rows of train_df. The labels are later paired
# with the feature rows by position, so take them again from the sorted frame
# instead of keeping the ones captured in CSV row order.
train_labels = train_df["accuracy_group"]

# Sanity check that all three indexes are sorted. is_monotonic_increasing
# replaces MultiIndex.is_lexsorted(), which is deprecated and no longer
# available in pandas 2.x.
print(train_actions.index.is_monotonic_increasing,
      test_actions.index.is_monotonic_increasing,
      train_df.index.is_monotonic_increasing)

displayer(train_actions, 7)
displayer(test_actions, 7)
displayer(train_df, 7)

In [None]:
import json
def trim_event_data(json_string):
    """Return ``json_string`` re-serialised without a fixed set of keys.

    The string is parsed as JSON, the top-level keys ``event_data``,
    ``event_count``, ``event_code``, ``description`` and ``game_time`` are
    removed when present, and the remainder is dumped back to a JSON string.
    """
    payload = json.loads(json_string)
    for key in ("event_data", "event_count", "event_code", "description", "game_time"):
        if key in payload:
            del payload[key]
    return json.dumps(payload)
# Trim every event_data string; progress_apply is the tqdm-aware variant of
# apply and shows a progress bar.
# NOTE(review): event_data is dropped again at the start of the
# "Data aggregation" section, so the trimmed JSON is only visible in the
# displays below.
train_actions["event_data"] = train_actions["event_data"].progress_apply(trim_event_data)
test_actions["event_data"] = test_actions["event_data"].progress_apply(trim_event_data)
displayer(train_actions, 7)
displayer(test_actions, 7)

# Data aggregation

In [None]:
# drop information currently not aggregated
# (event_data, game_time, type and world are removed from both action tables)
train_actions = train_actions.drop(["event_data", "game_time", "type", "world"], axis=1)
test_actions = test_actions.drop(["event_data", "game_time", "type", "world"], axis=1)

In [None]:
# helper functions to aggregate player history
# we count the number of event_codes and titles the user has been through
def aggregate_column_types(columns):
    """Collect the distinct values of each listed column.

    For every name in ``columns`` the values found in ``train_actions`` and
    ``test_actions`` are pooled and de-duplicated.

    Returns a dict mapping column name -> list of distinct values.
    """
    return {
        col: list(set([*train_actions[col], *test_actions[col]]))
        for col in columns
    }


# Only the "title" column is aggregated at the moment.
aggr = aggregate_column_types(["title"])

def aggregate_df(aggr, df):
    """Turn a slice of actions into a flat dict of count features.

    ``aggr`` maps a column name to the list of values to count. For every
    (column, value) pair the result holds ``"<column>__<value>"`` with the
    number of rows of ``df`` carrying that value (0 when it does not occur).
    The key ``"length"`` holds the number of rows in ``df``.
    """
    features = {}
    for col, vals in aggr.items():
        counts = df[col].value_counts()
        for val in vals:
            features[f"{col}__{val}"] = counts[val] if val in counts else 0
    features["length"] = df.shape[0]
    return features

In [None]:
# aggregate information from training
# For every row of train_df, slice that installation's actions out of
# train_actions and turn the slice into count features.
train_df_features_lst = []  # one feature dict per train_df row, in row order
train_df_features_dfs = []  # the raw slices themselves, pickled in the next cell
for index, row in tqdm_notebook(train_df.iterrows()):
    # index is this row's (installation_id, game_session, idx) tuple; row is
    # not used. index[1][:-1] removes the trailing "x" that reassign()
    # appended, giving a slice bound that sorts just before this session's own
    # label: the slice is meant to keep the installation's earlier sessions
    # and leave out the assessment session itself.
    # NOTE(review): this .loc lookup is the slow step mentioned in the intro.
    train_df_features = train_actions.loc[([index[0]], slice(None,index[1][:-1]), slice(None)), :]
    train_df_features_dfs.append(train_df_features)
    train_df_features_lst.append(aggregate_df(aggr, train_df_features))

In [None]:
import pickle
# Save the per-row history slices so the slow aggregation loop can be skipped.
# File layout: first the number of dataframes, then every dataframe as its
# own pickle record in the same file.
with open('train_df_features_dfs.pkl', "wb") as f:
    pickle.dump(len(train_df_features_dfs), f)
    for train_df_feature in tqdm_notebook(train_df_features_dfs):
        # the three-level index is rebuilt on a fresh copy before dumping
        pickle.dump(train_df_feature.reset_index().set_index(['installation_id', 'game_session', 'idx']).copy(), f)

# loading code
# (reads the count first, then that many dataframes, mirroring the layout above)
# train_df_features_dfs = []
# with open('train_df_features_dfs.pkl', "rb") as f:
#     for _ in tqdm_notebook(range(pickle.load(f))):
#         train_df_features_dfs.append(pickle.load(f))

In [None]:
# aggregate information from testing
test_df_features_lst = []  # one feature dict per test installation
test_df_rows = []  # the last action row of each test installation
# Iterate over the distinct installation_ids (first index level). They come
# from a set, so the iteration order is not the sorted index order; both lists
# are filled in the same loop and therefore stay aligned with each other.
for index in tqdm_notebook(list(set([i[0] for i in test_actions.index]))):
    # all actions of this installation
    test_df_features = test_actions.loc[([index], slice(None), slice(None)), :]
    # features are computed from everything except the final row ...
    test_df_features_lst.append(aggregate_df(aggr, test_df_features[:-1]))
    # ... and the final row becomes this installation's row in test_df
    test_df_rows.append(test_df_features.tail(1))
test_df = pd.concat(test_df_rows, axis=0)

In [None]:
# map assessment titles to number
# Keep only the title column. .copy() makes each frame independent of the one
# it was sliced from, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
train_df = train_df[["title"]].copy()
test_df = test_df[["title"]].copy()
# Sorted so that a title always receives the same code; iterating a bare set
# of strings can yield a different order from one interpreter run to the next.
title_lst = sorted(set(train_df["title"]) | set(test_df["title"]))
# One dict lookup per row instead of a linear list.index() search.
title_to_code = {title: code for code, title in enumerate(title_lst)}
train_df['title'] = train_df['title'].map(title_to_code)
test_df['title'] = test_df['title'].map(title_to_code)

In [None]:
# include aggregated information
# The feature lists were filled in the same row order as the frames, so
# reusing each frame's index lines the feature columns up with its rows.
train_df = pd.concat([train_df,pd.DataFrame(train_df_features_lst, index=train_df.index)], sort=False, axis=1)
test_df = pd.concat([test_df,pd.DataFrame(test_df_features_lst, index=test_df.index)], sort=False, axis=1)

# Order the test rows by 'idx', i.e. by their original row position in test.csv.
test_df = test_df.sort_index(axis=0, level="idx")

# The predictions are written into test_df_reference row by row, so the
# installation order must match sample_submission.csv exactly.
assert list([i[0] for i in test_df.index]) == list(test_df_reference["installation_id"])

displayer(train_df)
displayer(test_df)

# LightGBM training and predicting

In [None]:
# defining LightGBM dataset and categorical_features
categorical_features = ["title"]  # the integer-coded assessment title
# train_labels is handed over separately from the features, so its order must
# match the row order of train_df.
# NOTE(review): free_raw_data=False is presumably there so the dataset can be
# subset per fold later — confirm against the LightGBM docs.
train_dataset = lgb.Dataset(train_df, label=train_labels,
                            free_raw_data=False,
                            categorical_feature=categorical_features)
# Built for completeness; the prediction cell calls bst.predict(test_df)
# on the dataframe directly and does not use this object.
test_dataset = lgb.Dataset(test_df,
                           free_raw_data=False,
                           categorical_feature=categorical_features)

In [None]:
# LightGBM parameters: a multiclass objective with 4 classes (presumably one
# per accuracy_group value — confirm against the labels); everything not
# listed here is left at its LightGBM default.
param = {'num_leaves': 127,
         'objective': 'multiclass',
         'num_class': 4}

In [None]:
# defined Kfold
num_rows = train_labels.shape[0]
# NOTE(review): despite the name `skf` this is a plain KFold, not the imported
# StratifiedKFold. train_labels is passed to split() as y — confirm whether
# stratified folds were intended.
skf = KFold(n_splits=10, shuffle=True, random_state=42)
# one (training positions, validation positions) pair per fold
folds = [fold for fold in skf.split(np.arange(num_rows), train_labels)]

# training for each fold
bst_lst = []  # one trained booster per fold
for i, (tr_idx, tx_idx) in enumerate(folds):
    print("fold number ", i)
    # valid_sets holds two subsets: the fold's training rows (constructed a
    # second time, separately from the train set argument) and its held-out rows.
    bst = lgb.train(param,
                    train_dataset.subset(tr_idx).construct(),
                    num_boost_round=500,
                    early_stopping_rounds=10,
                    verbose_eval=100,
                    categorical_feature=categorical_features,
                    valid_sets=[train_dataset.subset(tr_idx).construct(),
                                train_dataset.subset(tx_idx).construct()])
    bst_lst.append(bst)

In [None]:
# make prediction with each fold
preds = []
for bst in bst_lst:
    pred = bst.predict(test_df) # not test_dataset
    preds.append(pred)
preds = np.array(preds)  # per-fold predictions stacked along axis 0
print(preds.shape)

# average the predictions of the fold models
preds_mean = np.mean(preds, axis=0)
print(preds_mean.shape)

# predicted class = position of the largest averaged value in each row
res = [np.argmax(ar) for ar in preds_mean]
print(Counter(res))  # how often each class is predicted

# Written row by row; this relies on test_df being in the same installation
# order as test_df_reference, which the earlier assert checks.
test_df_reference["accuracy_group"] = res

In [None]:
# save and loading the submission file
# Write the reference frame (now carrying the predicted accuracy_group) and
# read it back to check what was written.
test_df_reference.to_csv('submission.csv', index=None)
df_read = pd.read_csv('submission.csv')
df_read.head()