**Import libraries**

In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as patches

from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
# Use the fully qualified option name: the bare 'max_columns' pattern is
# ambiguous on recent pandas releases (it also matches the Styler option)
# and raises OptionError there.
pd.set_option('display.max_columns', 100)


py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot


import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from tqdm import tqdm_notebook


import altair as alt
from altair.vega import v5
from IPython.display import HTML

%matplotlib inline
plt.rc('figure', figsize=(15.0, 8.0))


In [None]:
# List the files available in the competition input directory.
print(os.listdir("../input/data-science-bowl-2019"))

In [None]:
%%time
# Directory that holds the competition CSV files.
root = '../input/data-science-bowl-2019/'

# Only these columns are read from the (large) event logs.
keep_cols = [
    'event_id', 'game_session', 'installation_id',
    'event_count', 'event_code', 'title',
    'game_time', 'type', 'world',
]

# Event logs: same column subset for train and test.
train, test = (
    pd.read_csv(root + file_name, usecols=keep_cols)
    for file_name in ('train.csv', 'test.csv')
)

# Auxiliary tables are read in full.
train_labels = pd.read_csv(root + 'train_labels.csv')
specs = pd.read_csv(root + 'specs.csv')
# sample_submission.csv is intentionally not loaded here.



Check the data 

In [None]:
# Report (rows, columns) for every loaded table.
for table_name, table in (
    ('train', train),
    ('train_labels', train_labels),
    ('specs', specs),
    ('test', test),
):
    print(f'Size of {table_name} data', table.shape)

**Reduce memory usage of the DataFrames**

Explore data 

In [None]:
# Summary statistics of the train event log (pandas `describe` defaults).
train.describe()

Unique classes in each column

specs column types

In [None]:
# Count how many columns of `specs` there are per dtype.
specs.dtypes.value_counts()

**Check for missing values**

**Check variable correlation**

**Credit to the notebooks below:**

https://www.kaggle.com/tanreinama/ds-bowl-2019-simple-lgbm-aggregated-data-with-cv
https://www.kaggle.com/caesarlupum/ds-bowl-start-here-a-gentle-introduction



**Combine all the summary metrics from above**

def group_and_reduce(df):
    """Aggregate raw event rows into one feature row per ``installation_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log containing at least the columns ``event_id``,
        ``game_session``, ``installation_id``, ``event_count``,
        ``event_code``, ``title``, ``game_time``, ``type`` and ``world``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``installation_id`` with single-level columns:

        * ``event_code_<code>`` - number of events of each code,
        * ``title_<t>`` / ``type_<t>`` / ``world_<w>`` - number of game
          sessions per title, type and world,
        * ``event_count_<stat>`` / ``game_time_<stat>`` with ``<stat>`` in
          ``sum``, ``mean``, ``std`` - statistics over the per-session
          maxima. ``std`` is NaN for installations with a single session.
    """
    # One row per game session: the max of event_count / game_time is the
    # final value reached in that session.
    sessions = df.drop(columns=['event_id', 'event_code']).groupby(
        ['game_session', 'installation_id', 'title', 'type', 'world']
    ).max().reset_index()

    # Number of events of each event_code per installation.
    event_code_counts = pd.get_dummies(
        df[['installation_id', 'event_code']],
        columns=['event_code']
    ).groupby(['installation_id']).sum()

    # Number of sessions per title / type / world per installation.
    session_counts = pd.get_dummies(
        sessions.drop(columns=['game_session', 'event_count', 'game_time']),
        columns=['title', 'type', 'world']
    ).groupby(['installation_id']).sum()

    # Summary statistics of the per-session maxima. String aggregator names
    # avoid the deprecation of passing NumPy callables to ``agg``.
    session_stats = sessions[
        ['installation_id', 'event_count', 'game_time']
    ].groupby(
        ['installation_id']
    ).agg(['sum', 'mean', 'std'])

    # ``agg`` with a list yields two-level columns; joining those onto the
    # single-level frames above is rejected by current pandas, so flatten
    # them to ``<column>_<stat>`` first.
    session_stats.columns = [
        f'{column}_{stat}' for column, stat in session_stats.columns
    ]

    return event_code_counts.join(session_counts).join(session_stats)

In [None]:
def summarize_df(df):
    """Count game sessions per title, type and world for each installation.

    The event log is first collapsed to one row per
    (game_session, installation_id, title, type, world) combination, the
    categorical columns are one-hot encoded, and the indicator columns are
    summed per ``installation_id``.

    Returns a DataFrame with ``installation_id`` as a regular column followed
    by the ``title_*``, ``type_*`` and ``world_*`` count columns.
    """
    session_keys = ['game_session', 'installation_id', 'title', 'type', 'world']

    # One row per game session.
    per_session = (
        df.drop(columns=['event_id', 'event_code'])
        .groupby(session_keys)
        .max()
        .reset_index()
    )

    # Keep only the installation id and the categorical descriptors.
    session_categories = per_session.drop(
        columns=['game_session', 'event_count', 'game_time']
    )
    encoded = pd.get_dummies(session_categories, columns=['title', 'type', 'world'])

    # Sum the indicator columns per installation and expose the id as a column.
    return encoded.groupby(['installation_id']).sum().reset_index()

In [None]:
%%time

##train_small = group_and_reduce(train)
##test_small = group_and_reduce(test)

# Per-installation session-count features for both splits.
train_small = summarize_df(train)
test_small = summarize_df(test)
print(train_small.shape)

# Attach the target to the training features. Installations without a label
# get NaN from the left merge and are removed by dropna().
small_labels = train_labels[['installation_id', 'accuracy_group']]
train_joined = train_small.merge(small_labels, on='installation_id', how='left').dropna()

# Preview the model inputs only (identifier and target removed); show the
# head rather than dumping the whole frame into the cell output.
train_joined.drop(columns=['accuracy_group', 'installation_id']).head()

In [None]:
from sklearn.model_selection import KFold

N_SPLITS = 10
N_CLASSES = 4
SEED = 2019

# Join the target onto the per-installation features; rows without a label
# become NaN in the left merge and are dropped.
# NOTE(review): if train_labels holds several rows per installation_id, the
# merge repeats the same feature row with different targets and plain KFold can
# put copies in both train and validation - consider GroupKFold. TODO confirm.
small_labels = train_labels[['installation_id', 'accuracy_group']]
train_joined = train_small.merge(small_labels, on='installation_id', how='left').dropna()

feature_cols = [
    col for col in train_joined.columns
    if col not in ('installation_id', 'accuracy_group')
]
X = train_joined[feature_cols].values
y = train_joined['accuracy_group'].values.astype(np.int32)

# Align the test matrix to the training features: same columns, same order,
# missing dummy columns filled with 0 and any extra column (e.g. a prediction
# column added by a later cell) ignored. This keeps the cell re-runnable.
test_df = test_small.reindex(columns=feature_cols, fill_value=0)
X_test = test_df.values

# Hyper-parameters are identical for every fold, so build them once.
params = {
    'learning_rate': 0.01,
    'bagging_fraction': 0.9,
    'feature_fraction': 0.9,
    'num_leaves': 14,
    'lambda_l1': 0.1,
    'lambda_l2': 1,
    'metric': 'multiclass',
    'objective': 'multiclass',
    'num_classes': N_CLASSES,
    'random_state': SEED
}

# random_state is only valid together with shuffle=True; recent scikit-learn
# raises ValueError otherwise.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Average of the per-fold class probabilities for every test installation.
y_pred = np.zeros((len(test_small), N_CLASSES))

# Index names deliberately differ from the `train` / `test` DataFrames so the
# raw data is not overwritten by the loop variables.
for train_idx, val_idx in kf.split(X):
    x_train, x_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    train_set = lgb.Dataset(x_train, y_train)
    val_set = lgb.Dataset(x_val, y_val)

    # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments
    # of older LightGBM releases; newer releases expect callbacks instead -
    # confirm against the installed version.
    model = lgb.train(params, train_set,
                      num_boost_round=100,
                      early_stopping_rounds=20,
                      valid_sets=[train_set, val_set],
                      verbose_eval=100)

    y_pred += model.predict(X_test) / N_SPLITS

In [None]:
# First ten rows of the per-class scores accumulated over the folds.
y_pred[:10]

In [None]:
# Predicted class = index of the highest score in each row.
y_pred_max = y_pred.argmax(axis=1)

# Build the submission positionally from the ids and the predicted classes.
# test_small is deliberately left untouched so that the feature matrix used
# by the training cell stays the same when cells are re-run.
submission = pd.DataFrame({
    'installation_id': test_small['installation_id'].values,
    'accuracy_group': y_pred_max,
})
submission.to_csv('submission.csv', index=False)

# Distribution of the predicted accuracy groups (sanity check).
submission['accuracy_group'].value_counts().sort_index()

In [None]:

##submission.to_csv('submission.csv', index= False)