In [None]:
import scipy as sp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold

# so that we can display multiple dataframes in the same cell
from IPython.display import display, HTML
def displayer(df, num_rows=2):
    """Render the first ``num_rows`` rows of ``df`` as an HTML table in the cell output.

    Calling ``display`` explicitly (rather than relying on the cell's last
    expression) allows several frames to be shown from a single cell.

    Args:
        df: Object providing ``head(n)`` whose result provides ``to_html()``,
            e.g. a pandas DataFrame.
        num_rows: Number of leading rows to render. Defaults to 2.

    Returns:
        None. The table is emitted through ``display``.
    """
    preview = df.head(num_rows)
    markup = preview.to_html()
    display(HTML(markup))
# Raise pandas' display limits (500 columns, 4000 rows) so the previews
# shown in this notebook are not cut short.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 4000

In [None]:
# Shell escape: list the contents of the competition input directory.
!ls ../input/data-science-bowl-2019/

In [None]:
# Location of the competition data. Every file path below is derived from this
# single constant, so pointing the notebook at another copy of the dataset is a
# one-line change.
INPUT_DIR = "../input/data-science-bowl-2019"

# NOTE: despite the `_dir` suffix, each of these names holds the path of a CSV
# *file*; the names are kept as-is because later cells refer to them.
train_dir = INPUT_DIR + "/train.csv"
test_dir = INPUT_DIR + "/test.csv"
train_labels_dir = INPUT_DIR + "/train_labels.csv"
specs_dir = INPUT_DIR + "/specs.csv"
sample_submission_dir = INPUT_DIR + "/sample_submission.csv"

In [None]:
%%time
def _read_and_report(csv_path):
    """Load one CSV file into a DataFrame, print its shape, and return it."""
    frame = pd.read_csv(csv_path)
    print(frame.shape)
    return frame


# Load the five competition tables; each shape is printed right after its load.
train = _read_and_report(train_dir)

test = _read_and_report(test_dir)

train_labels = _read_and_report(train_labels_dir)

specs = _read_and_report(specs_dir)

sample_submission = _read_and_report(sample_submission_dir)

# Short preview of every table (displayer's default row count).
displayer(train)
displayer(test)
displayer(train_labels)
displayer(specs)
displayer(sample_submission)

In [None]:
# Map every distinct game_session string seen in train, test and train_labels
# (scanned in that order) to a sequential integer id, numbered by first
# appearance starting at 10**9 + 1.
# NOTE(review): the 10**9 offset presumably keeps every id at the same number
# of digits so that the ids compare the same as strings and as numbers - confirm.
game_session_lst = list(train["game_session"]) + list(test["game_session"]) + list(train_labels["game_session"])

counter = 10**9
# dict.fromkeys() drops duplicates while keeping first-seen order (dicts are
# insertion-ordered), which replaces the per-row Python membership loop.
game_session_dict = {
    session_id: counter + rank
    for rank, session_id in enumerate(dict.fromkeys(game_session_lst), start=1)
}
# Leave `counter` at the last id handed out, as the explicit loop did.
counter += len(game_session_dict)

In [None]:
def reassign(df):
    """Build the replacement ``game_session`` values for ``df``.

    For every row, the row's ``installation_id`` is concatenated with the
    string form of the id that the module-level ``game_session_dict`` holds
    for the row's ``game_session``.

    Args:
        df: Mapping-like object with ``"installation_id"`` and
            ``"game_session"`` columns of equal length (e.g. a DataFrame).

    Returns:
        list: One combined string per row, in row order.

    Raises:
        KeyError: If a ``game_session`` value is missing from
            ``game_session_dict``.
    """
    return [
        installation + str(game_session_dict[session])
        for installation, session in zip(df["installation_id"], df["game_session"])
    ]

# Overwrite each frame's game_session column with the combined
# installation_id + sequential-id strings produced by reassign().
# NOTE: this cell is not re-runnable. Once overwritten, the column no longer
# holds the original session strings that key game_session_dict, so a second
# run would presumably fail with KeyError inside reassign().
train["game_session"] = reassign(train)
test["game_session"] = reassign(test)
train_labels["game_session"] = reassign(train_labels)

In [None]:
%%time
# Record each row's current position in an 'idx' column. Used as the last
# index level, it makes every index entry unique within its frame.
train['idx'] = np.arange(len(train))
test['idx'] = np.arange(len(test))
train_labels['idx'] = np.arange(len(train_labels))

# Move the three key columns into a MultiIndex, modifying each frame in place.
index_keys = ['installation_id', 'game_session', 'idx']
train.set_index(index_keys, inplace=True)
test.set_index(index_keys, inplace=True)
train_labels.set_index(index_keys, inplace=True)

In [None]:
# Sort every frame by its index so rows are ordered by index level, left to right.
train = train.sort_index()
test = test.sort_index()
train_labels = train_labels.sort_index()

# Sanity check: each index should now report as sorted (expected: True True True).
# MultiIndex.is_lexsorted() is deprecated and was removed in pandas 2.0;
# is_monotonic_increasing is its documented replacement and also exists on
# older pandas versions.
print(train.index.is_monotonic_increasing,
      test.index.is_monotonic_increasing,
      train_labels.index.is_monotonic_increasing)

In [None]:
# Show the first 50 rows of each re-indexed, sorted frame.
displayer(train, num_rows=50)
displayer(test, num_rows=50)
displayer(train_labels, num_rows=50)

In [None]:
%%time
# Persist all five tables to pickle files in the working directory so they can
# be reloaded without re-reading the CSVs.
pd.to_pickle(train, "train.pkl")
pd.to_pickle(test, "test.pkl")
pd.to_pickle(train_labels, "train_labels.pkl")
pd.to_pickle(specs, "specs.pkl")
pd.to_pickle(sample_submission, "sample_submission.pkl")

In [None]:
# Unbind the five frame names; the tables were written to .pkl files above and
# are read back from those files in the next cell.
del train, test, train_labels, specs, sample_submission

In [None]:
%%time
# Reload the five tables from the pickle files this notebook wrote earlier.
# NOTE: unpickling can execute arbitrary code, so only load .pkl files from a
# trusted source (here: files produced by this same notebook).
train = pd.read_pickle("train.pkl")
test = pd.read_pickle("test.pkl")
train_labels = pd.read_pickle("train_labels.pkl")
specs = pd.read_pickle("specs.pkl")
sample_submission = pd.read_pickle("sample_submission.pkl")