In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.metrics import cohen_kappa_score
import xgboost as xgb
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import seaborn as sns
import matplotlib.pyplot as plt
import os
import plotly.graph_objects as go
import warnings
warnings.filterwarnings('ignore')

In [None]:
%%time
# Single place to change when the competition data lives somewhere else.
DATA_DIR = "/kaggle/input/data-science-bowl-2019"

# Raw competition files: event logs (train/test), assessment labels,
# event specifications and the submission template.
test = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))
train_labels = pd.read_csv(os.path.join(DATA_DIR, "train_labels.csv"))
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
specs = pd.read_csv(os.path.join(DATA_DIR, "specs.csv"))
submission = pd.read_csv(os.path.join(DATA_DIR, "sample_submission.csv"))
print("Data Loaded!")

In [None]:
# Preview the first rows of `train`.
train.head()

In [None]:
# Preview the first four rows of `train_labels`.
train_labels.head(4)

In [None]:
# Preview the first rows of `specs`.
specs.head()

In [None]:
# Show the full `info` text of the spec row with index label 1.
specs['info'][1]

In [None]:
# Fixed seed so the displayed sample is the same on every run of the notebook.
test.sample(5, random_state=42)

In [None]:
# Number of distinct values in the `title` column of `train`.
train['title'].nunique()

In [None]:
# Activate the seaborn theme *before* the figure/axes are created: the style
# is applied through matplotlib rcParams, which are read when the axes are built.
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(15, 8))
# Number of events recorded for each title.
sns.countplot(x='title', data=train, ax=ax)
# Titles are long strings, so draw the x tick labels vertically.
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Number of events per title')
plt.show()

In [None]:
# Share of events per `type`.
countsT = train["type"].value_counts()
# Take the labels from the counts themselves so every slice is paired with
# its own category regardless of the order value_counts() returns.
labels = countsT.index.astype(str).tolist()
sizes = countsT.values
# Offset every slice slightly and the last one (the least frequent category,
# since value_counts() sorts descending) much further so it stays visible.
explode = ([0.1] * (len(sizes) - 1) + [0.9]) if len(sizes) else []
fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
ax1.set_title('Share of events per type')
plt.show()

In [None]:
# Share of labelled assessments per `accuracy_group`.
countsT = train_labels["accuracy_group"].value_counts()
# value_counts() orders the groups by frequency, not by group number, so the
# labels must come from the index; a fixed '3','2','1','0' list mislabels the
# slices whenever the frequency order is different.
labels = countsT.index.astype(str).tolist()
sizes = countsT.values
# Offset every slice slightly and the last (least frequent) one much further.
explode = ([0.1] * (len(sizes) - 1) + [0.9]) if len(sizes) else []
fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
ax1.set_title('Share of assessments per accuracy group')
plt.show()

In [None]:
# Share of events per `world`.
countsT = train["world"].value_counts()
# Labels come from the counts so each slice is always paired with its own world.
labels = countsT.index.astype(str).tolist()
sizes = countsT.values
# Offset every slice slightly and the last (least frequent) one much further.
explode = ([0.1] * (len(sizes) - 1) + [0.9]) if len(sizes) else []
fig1, ax1 = plt.subplots()
ax1.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
# The chart shows the `world` column, so the title names it accordingly.
ax1.set_title('World to which Game/Video belongs')
plt.show()

In [None]:
# Parse the timestamps in place and derive a calendar-date column from them;
# both columns stay on `train` after this cell.
train["timestamp"] = pd.to_datetime(train["timestamp"])
train['date'] = train['timestamp'].dt.date

# Number of events recorded on each date.
group2 = train.groupby('date')['event_id'].count()

# Daily event volume as a single line trace.
fig = go.Figure(data=[go.Scatter(x=group2.index, y=group2.values, line_color="#B22222")])
fig.update_layout(title_text='Time Series for all Events')
fig.show()

**DATA PREPARATION**

**This part is taken from the kernel linked below; check it out and upvote it if you like.**

https://www.kaggle.com/shahules/xgboost-starter-dsbowl

In [None]:
# Remove the label columns that are not used further on; only the ids and
# `accuracy_group` are kept. Mutates `train_labels` in place.
train_labels.drop(columns=['num_correct', 'num_incorrect', 'accuracy', 'title'], inplace=True)

In [None]:
# Drop the raw `event_data` payload and the helper `date` column from `train`
# (in place); neither is used by the feature preparation below.
train.drop(columns=['event_data', 'date'], inplace=True)

In [None]:
# Installation ids that occur in `train` but have no row in `train_labels`.
not_req = set(train['installation_id'].unique()).difference(train_labels['installation_id'].unique())
# Keep only the events of installations that do have labels.
train = train.loc[~train['installation_id'].isin(not_req)]
print(train.shape)

In [None]:
def extract_time_features(df):
    """Add calendar features derived from the ``timestamp`` column.

    The frame is modified in place: ``timestamp`` is converted with
    ``pd.to_datetime`` and the columns ``date``, ``month``, ``hour``, ``year``,
    ``dayofweek`` and ``weekofyear`` are added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``timestamp`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the extra columns.
    """
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    ts = df['timestamp'].dt
    df['date'] = ts.date
    df['month'] = ts.month
    df['hour'] = ts.hour
    df['year'] = ts.year
    df['dayofweek'] = ts.dayofweek
    if hasattr(ts, 'isocalendar'):
        # `Series.dt.weekofyear` no longer exists in recent pandas;
        # isocalendar().week is its replacement. Cast back to a plain numpy
        # dtype (float when timestamps are missing) so the column looks the
        # way the old accessor produced it.
        week = ts.isocalendar().week
        df['weekofyear'] = week.astype('float64' if week.isna().any() else 'int64')
    else:
        # Older pandas without isocalendar(): fall back to the legacy accessor.
        df['weekofyear'] = ts.weekofyear
    return df

In [None]:
def prepare_data(df):
    """Aggregate raw events into one feature row per (installation_id, game_session).

    Side effect: ``df`` is modified in place (``timestamp`` is converted with
    ``pd.to_datetime`` and an ``hour_of_day`` column is added).

    Parameters
    ----------
    df : pandas.DataFrame
        Event-level frame with the columns ``installation_id``,
        ``game_session``, ``timestamp``, ``event_code``, ``event_count``,
        ``game_time``, ``event_id``, ``type``, ``world`` and ``title``.

    Returns
    -------
    pandas.DataFrame
        Columns, in this order: ``installation_id``, ``game_session``, one
        ``event_code_<code>`` count column per event code, ``event_count sum``,
        ``hour_of_day mean``, ``game_time sum``, ``game_time mean``,
        ``event_id count``, ``type``, ``world``, ``title``.
        Later cells slice columns by position, so this order matters.
    """
    keys = ['installation_id', 'game_session']

    df['timestamp'] = pd.to_datetime(df['timestamp'])
    # Vectorised accessor instead of a Python-level lambda per row.
    df['hour_of_day'] = df['timestamp'].dt.hour

    # One-hot encode event_code, then count occurrences of each code per session.
    # The string 'sum' is used instead of the builtin: passing builtin callables
    # to groupby aggregations is deprecated in recent pandas.
    join_one = (
        pd.get_dummies(df[['event_code', 'installation_id', 'game_session']], columns=['event_code'])
        .groupby(keys, as_index=False, sort=False)
        .agg('sum')
    )

    # Numeric summaries per session.
    agg = {'event_count': 'sum', 'hour_of_day': 'mean', 'game_time': ['sum', 'mean'], 'event_id': 'count'}
    join_two = df.drop(['timestamp'], axis=1).groupby(keys, as_index=False, sort=False).agg(agg)
    # Flatten the two-level column index into names such as 'game_time sum'.
    join_two.columns = [' '.join(col).strip() for col in join_two.columns.values]

    # Categorical descriptors of the session: first value seen in each group.
    join_three = (
        df[['installation_id', 'game_session', 'type', 'world', 'title']]
        .groupby(keys, as_index=False, sort=False)
        .first()
    )

    # The three frames come from the same grouping with sort=False, so their
    # rows are in the same order and can be joined on the positional index.
    join_four = join_one.join(join_two.drop(keys, axis=1)).join(join_three.drop(keys, axis=1))
    return join_four

In [None]:
join_train = prepare_data(train)
# Everything between the two id columns and the three trailing categorical columns.
cols = join_train.columns.to_list()[2:-3]
# Shrink memory without corrupting values: a blanket cast to int16 silently
# wraps large sums (e.g. 'game_time sum') and truncates the float means.
# Only integer columns are downcast, each to the smallest integer dtype that
# can hold its values; float columns are left untouched.
int_cols = join_train[cols].select_dtypes(include=[np.integer]).columns
join_train[int_cols] = join_train[int_cols].apply(pd.to_numeric, downcast='integer')

In [None]:
join_test = prepare_data(test)
# Everything between the two id columns and the three trailing categorical columns.
cols = join_test.columns.to_list()[2:-3]
# Shrink memory without corrupting values: a blanket cast to int16 silently
# wraps large sums (e.g. 'game_time sum') and truncates the float means.
# Only integer columns are downcast, each to the smallest integer dtype that
# can hold its values; float columns are left untouched.
int_cols = join_test[cols].select_dtypes(include=[np.integer]).columns
join_test[int_cols] = join_test[int_cols].apply(pd.to_numeric, downcast='integer')

In [None]:
# Columns that are summed per installation: the one-hot event-code counts
# (named 'event_code_<code>' by get_dummies) plus the per-session event count.
# Selecting by name avoids depending on how many trailing columns
# prepare_data emits, and restricting to codes present in both frames keeps
# the train and test feature sets identical.
cols = [c for c in join_test.columns
        if c.startswith('event_code_') and c in join_train.columns]
cols.append('event_id count')
# Grouping key, needed by the groupby calls that use `cols`.
cols.append('installation_id')

In [None]:
# Collapse the per-session test features to one row per installation.
# All three groupbys use sort=False, so their rows follow the same order.

# Mean of the numeric session summaries (installation_id kept as a column).
df = (
    join_test[['hour_of_day mean', 'event_count sum', 'game_time mean', 'game_time sum', 'installation_id']]
    .groupby('installation_id', as_index=False, sort=False)
    .mean()
)

# Totals of the event-code counts and of the event count.
df_two = (
    join_test[cols]
    .groupby('installation_id', as_index=False, sort=False)
    .sum()
    .drop(columns='installation_id')
)

# Categorical descriptors: last value per installation.
df_three = (
    join_test[['title', 'type', 'world', 'installation_id']]
    .groupby('installation_id', as_index=False, sort=False)
    .last()
    .drop(columns='installation_id')
)
        

In [None]:
# Attach the session features to every labelled assessment; the session id is
# only the join key and is dropped afterwards.
final_train = (
    train_labels
    .merge(join_train, on=['installation_id', 'game_session'], how='left')
    .drop(columns=['game_session'])
)

# Stitch the three per-installation test frames together by row position and
# drop the id so only model features remain.
final_test = df.join(df_two).join(df_three).drop(columns='installation_id')

In [None]:
# Collapse the labelled training rows to one row per installation, mirroring
# the aggregation applied to the test features.
# NOTE: this cell overwrites `final_train` with a frame derived from itself,
# so it must not be run twice without re-running the cell that builds it.

# Mean of the numeric session summaries.
df = (
    final_train[['hour_of_day mean', 'event_count sum', 'game_time mean', 'game_time sum', 'installation_id']]
    .groupby('installation_id', as_index=False, sort=False)
    .mean()
)

# Totals of the event-code counts and of the event count.
df_two = (
    final_train[cols]
    .groupby('installation_id', as_index=False, sort=False)
    .sum()
    .drop(columns='installation_id')
)

# Target and categorical descriptors: last value per installation.
df_three = (
    final_train[['accuracy_group', 'title', 'type', 'world', 'installation_id']]
    .groupby('installation_id', as_index=False, sort=False)
    .last()
    .drop(columns='installation_id')
)

final_train = df.join(df_two).join(df_three).drop(columns='installation_id')

In [None]:
# Stack train on top of test so each categorical column is encoded with one
# shared mapping, then split the stacked frame back at the original boundary.
final = pd.concat([final_train, final_test])
encoding = ['type', 'world', 'title']
for col in encoding:
    lb = LabelEncoder()
    final[col] = lb.fit_transform(final[col])

# The right-hand side is evaluated in full before either name is rebound,
# so both slices use the length of the pre-concat training frame.
final_train, final_test = final.iloc[:len(final_train)], final.iloc[len(final_train):]

In [None]:
# Target vector and feature matrix for the model.
y_train = final_train['accuracy_group']
X_train = final_train.drop(columns=['accuracy_group'])

**Model**

In [None]:
%%time

# XGBoost settings: multi-class model that outputs one probability per class.
pars = {
    'colsample_bytree': 0.5,
    'learning_rate': 0.01,
    'max_depth': 10,
    'subsample': 0.5,
    'objective': 'multi:softprob',
    'num_class': 4
}

kf = KFold(n_splits=10, shuffle=True, random_state=42)
# Running sum of the per-fold class probabilities for every test row.
y_pre = np.zeros((len(final_test), pars['num_class']), dtype=float)
# Keep the DMatrix under its own name: rebinding `final_test` to a DMatrix
# would make this cell fail on a second run (a DMatrix has no .drop).
dtest = xgb.DMatrix(final_test.drop('accuracy_group', axis=1))

for train_index, val_index in kf.split(X_train):
    # KFold yields row positions, so features and target are both indexed
    # positionally; label-based lookup only works while the index is 0..n-1.
    train_X = X_train.iloc[train_index]
    val_X = X_train.iloc[val_index]
    train_y = y_train.iloc[train_index]
    val_y = y_train.iloc[val_index]
    xgb_train = xgb.DMatrix(train_X, train_y)
    xgb_eval = xgb.DMatrix(val_X, val_y)

    xgb_model = xgb.train(pars,
                  xgb_train,
                  num_boost_round=10000,
                  evals=[(xgb_train, 'train'), (xgb_eval, 'val')],
                  verbose_eval=False,
                  early_stopping_rounds=100
                 )

    # NOTE(review): whether predict() uses only the best early-stopping
    # iteration or all boosted rounds depends on the installed xgboost
    # version - confirm and pass an explicit iteration limit if needed.
    # The validation DMatrix is reused for prediction; predict() only needs
    # the features.
    pred_val = xgb_model.predict(xgb_eval).argmax(axis=1)

    print('cohen_kappa_score :', cohen_kappa_score(pred_val, val_y, weights='quadratic'))

    # Accumulate this fold's test probabilities; the final argmax of the sum
    # is the same as the argmax of the fold average.
    y_pre += xgb_model.predict(dtest)

# Final class per test row: the class with the highest summed probability.
pred = y_pre.argmax(axis=1)

In [None]:
# `pred` follows the order in which installation ids first appear in the test
# features (they were built with groupby(..., sort=False)), so pair every
# prediction with its id explicitly instead of assuming the sample submission
# lists the ids in that same order.
test_ids = join_test['installation_id'].unique()
if len(test_ids) != len(pred):
    raise ValueError('number of predictions does not match number of test installations')
pred_by_id = pd.Series(pred, index=test_ids)

accuracy_group = submission['installation_id'].map(pred_by_id)
if accuracy_group.isna().any():
    raise ValueError('sample submission contains installation ids without a prediction')

sub = pd.DataFrame({'installation_id': submission['installation_id'],
                    'accuracy_group': accuracy_group.astype(int)})
sub.to_csv('submission.csv', index=False)