In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

In [None]:
import os
import os.path as osp
import sys
from tqdm import tqdm_notebook as tqdm
from IPython.display import display, clear_output

# 1 EDA

In [None]:
%%time
# Competition input directory; every CSV below is read from it, in the listed order.
path = '/kaggle/input/data-science-bowl-2019/'
train_df, test_df, train_labels_df, specs_df, sub_df = (
    pd.read_csv(osp.join(path, csv_name))
    for csv_name in (
        'train.csv',
        'test.csv',
        'train_labels.csv',
        'specs.csv',
        'sample_submission.csv',
    )
)

In [None]:
def show_df_info(df):
    """Display a quick summary of ``df``: its first two rows, its column index, and its shape."""
    preview = df.head(2)
    column_index = df.columns
    dimensions = df.shape
    display(preview, column_index, dimensions)

In [None]:
# Preview train_df: first two rows, column index, and shape.
show_df_info(train_df)

In [None]:
# Preview train_labels_df: first two rows, column index, and shape.
show_df_info(train_labels_df)

```python
Index(['event_id', 'game_session', 'timestamp', 'event_data',
       'installation_id', 'event_count', 'event_code', 'game_time', 'title',
       'type', 'world'],
      dtype='object')
```

```python
Index(['game_session', 'installation_id', 'title', 'num_correct',
       'num_incorrect', 'accuracy', 'accuracy_group'],
      dtype='object')
```

In [None]:
def get_shared_columns(df_1, df_2):
    """Return the column labels of ``df_1`` that also appear in ``df_2``.

    Parameters
    ----------
    df_1, df_2 : pandas.DataFrame
        Frames whose column labels are compared.

    Returns
    -------
    list
        The shared labels, in the order they occur in ``df_1.columns``.
    """
    # Every x is drawn from df_1.columns, so only membership in df_2 needs testing.
    return [x for x in df_1.columns if x in df_2.columns]
    
# Column names that train_labels_df and train_df have in common.
shares_column_names = get_shared_columns(train_labels_df, train_df)
display(shares_column_names)

In [None]:
# Preview test_df: first two rows, column index, and shape.
show_df_info(test_df)

In [None]:
# Column names that train_labels_df and test_df have in common (shown as the cell output).
get_shared_columns(train_labels_df, test_df)

In [None]:
# Column names that train_df and test_df have in common (shown as the cell output).
get_shared_columns(train_df, test_df)

In [None]:
# Preview specs_df: first two rows, column index, and shape.
show_df_info(specs_df)

In [None]:
# For each of the other tables, show which column names it has in common with specs_df.
display(*(get_shared_columns(specs_df, other_df)
          for other_df in (train_df, train_labels_df, test_df)))

In [None]:
# Preview sub_df: first two rows, column index, and shape.
show_df_info(sub_df)

**What are the classes?**


In [None]:
# Copy the target column into a NumPy array and show the distinct labels it contains.
accuracy_group = train_labels_df['accuracy_group'].to_numpy(copy=True)
display(set(accuracy_group))

So this is a classification problem with 4 classes.

Next, we join the tables to build the training dataset.

In [None]:
%%time
# Join the event rows to their session labels on the three shared key columns.
# No `how` is given, so this is pandas' default inner join.
train = train_df.merge(
    train_labels_df,
    on=['game_session', 'installation_id', 'title'],
)
show_df_info(train)

In [None]:
%%time
# Attach the event specifications to every training row via event_id.
# NOTE(review): `train` is rebound to a merge of itself, so re-running this cell
# merges specs_df in a second time.
train = train.merge(specs_df, on=['event_id'])
show_df_info(train)

In [None]:
%%time
# Join the test events to the sample submission rows on installation_id.
test = test_df.merge(sub_df, on=['installation_id'])
show_df_info(test)

In [None]:
%%time
# Attach the event specifications to every test row via event_id.
# NOTE(review): `test` is rebound to a merge of itself, so re-running this cell
# merges specs_df in a second time.
test = test.merge(specs_df, on=['event_id'])
show_df_info(test)

In [None]:
# Names of the identifier and target columns; both are kept out of the feature list.
id_str = 'installation_id'
target_str = 'accuracy_group'

# Candidate features are the columns train and test share, minus id and target.
columns = get_shared_columns(train, test)
features = [name for name in columns if name not in (id_str, target_str)]

display(columns, len(columns), features, len(features))

**Almost all columns are categorical features, oh my god!**

In [None]:
%%time
# Number of distinct values in each feature column.
# Series.nunique is vectorised; with dropna=False a missing value counts as one
# category. Iterating a column into a Python set is slow on a large frame and can
# count every float NaN as a separate value.
features_numbers = [train[feature].nunique(dropna=False) for feature in features]
display(features, features_numbers)

# 2 Feature engineering

For now, I don't want to do any feature engineering. I will use a neural network (NN) to solve this problem.

# 3 Model

In [None]:
from fastai import *
from fastai.tabular import *

In [None]:
# Inputs for the fastai tabular pipeline built in the cells below.
dep_var = 'accuracy_group'  # label column; same string as target_str
cat_names = features  # every shared feature is treated as categorical (alias of `features`, not a copy)
cont_names = []  # no continuous columns are passed
procs = [FillMissing, Categorify, Normalize]  # preprocessing steps handed to TabularList.from_df

In [None]:
from sklearn.model_selection import StratifiedKFold

# Number of stratified folds.
n_splits = 3
# shuffle=True without a seed produces a different fold assignment on every run;
# pin random_state so the splits are reproducible.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [None]:
%%time
# Pull the id column, the feature columns, and the target column out of the merged frame.
train_id = train[id_str]
x, y = train[features], train[target_str]
display(x.shape, y.shape)

In [None]:
# Take the last 20% of the rows of `train` as a separate TabularList.
# The start index is derived from len(train) rather than a hard-coded row count,
# so the slice stays correct if the merged frame changes size.
val_start = int(len(train) * 0.8)
val = TabularList.from_df(train.iloc[val_start:].copy(), path=path, cat_names=cat_names, cont_names=cont_names)

In [None]:
%%time
# Build the DataBunch: the last 20% of the rows of `train` form the validation set,
# labels come from dep_var, and `val` is attached as the test set with a dummy label of 0.
# The index range is derived from len(train) rather than a hard-coded row count,
# so it cannot point past the end of the frame or silently leave rows out.
n_rows = len(train)
valid_idx = list(range(int(n_rows * 0.8), n_rows))
data = (TabularList.from_df(train, path=path, cat_names=cat_names, cont_names=cont_names, procs=procs)
                           .split_by_idx(valid_idx)
                           .label_from_df(cols=dep_var)
                           .add_test(val, label=0)
                           .databunch())

In [None]:
# Build the fastai tabular learner on `data` (layers=[200, 100]) with accuracy as the reported metric.
learn = tabular_learner(data, layers=[200,100], metrics=accuracy)

In [None]:
# Train with fit_one_cycle; arguments are 2 (cycle length) and 1e-3 (learning rate).
learn.fit_one_cycle(2, 1e-3)

In [None]:
# train_index_list = []
# val_index_list = []
# for train_index, val_index in skf.split(x, y):
#     train_index_list.append(train_index)
#     val_index_list.append(val_index)

# 4 Submission

In [None]:
# Placeholder predictions: one random label per submission row, shaped (n_rows, 1).
# randint's upper bound is exclusive, so only the values 2 and 3 are drawn.
# A locally seeded generator keeps the submission identical across runs without
# touching NumPy's global random state.
accuracy_group_list = np.random.RandomState(42).randint(2, 4, size=(sub_df.shape[0], 1))
accuracy_group_list

In [None]:
# Write the generated labels into sub_df's accuracy_group column (mutates sub_df in place),
# then show the first rows as a sanity check.
sub_df['accuracy_group'] = accuracy_group_list
sub_df.head()

In [None]:
# Save the submission as submission.csv (relative path), omitting the DataFrame index.
sub_df.to_csv('submission.csv', index=False)