In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import gc

import sklearn as sk
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import cross_validate

import xgboost

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
# load raw test data
# Show string cells untruncated when frames are displayed. None is the supported
# "no limit" value; pandas releases that predate it only accept -1, while newer
# releases reject -1, so try None first and fall back.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
path = '/kaggle/input/data-science-bowl-2019/'
# timestamp is parsed to datetimes; the two event counters are read as 16-bit integers.
test = pd.read_csv(path+'test.csv',parse_dates=["timestamp"],dtype = {'event_count':np.int16,'event_code':np.int16})

In [None]:
def AssessmentScore(df):
    """Add a ``correct`` column holding the outcome of assessment attempts.

    A row counts as an attempt when it is
      * event_code 4110 with title 'Bird Measurer (Assessment)', or
      * event_code 4100 with type 'Assessment' and any other title.
    For those rows ``correct`` is True when ``event_data`` contains the text
    '"correct":true' and False otherwise; every other row gets NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``title``, ``type``, ``event_code`` and ``event_data``.

    Returns
    -------
    pandas.DataFrame
        ``df`` joined (on its index) with the new ``correct`` column.
    """
    is_bird_measurer = df['title'] == 'Bird Measurer (Assessment)'
    bird_attempts = is_bird_measurer & (df['event_code'] == 4110)
    other_attempts = (df['type'] == 'Assessment') & ~is_bird_measurer & (df['event_code'] == 4100)

    # The two masks are disjoint, so a single selection replaces building two
    # Series and gluing them together with Series.append (removed in pandas 2.0).
    attempt_data = df.loc[bird_attempts | other_attempts, 'event_data']
    correct = attempt_data.apply(lambda x: x.find('"correct":true') > -1)
    correct.name = 'correct'
    return df.join(correct)

def TimeDifference(df):
    gb = df.loc[:,['installation_id','timestamp']].groupby('installation_id',as_index=False)
    store = gb.diff()
    store.index = df.index
    store = store.rename(columns = {'timestamp':'dt'})
    store.dt = store.dt-pd.to_datetime(0,utc=True)
    store.dt = store.dt.dt.total_seconds()
    # store.dt = store.dt.astype(np.float32)
    return df.join(store)

def GameSessionStats(df):
    """Attach ``game_session_time``: the sum of ``dt`` over each game session.

    The per-session total is merged back onto every row of that session. The
    original row index is carried through the merge in a column called
    'index' and restored afterwards.
    """
    session_totals = (
        df[['game_session', 'dt']]
        .groupby('game_session')
        .sum()
        .rename(columns={'dt': 'game_session_time'})
    )
    rows = df.reset_index()
    merged = rows.merge(session_totals, how='left', on='game_session')
    return merged.set_index('index')

def InstallationIdStats(df):
    """Add one column per title counting the installation's earlier sessions of it.

    Distinct (installation_id, game_session, title) rows are taken in order of
    first appearance. For each of them, and for every title seen in ``df``,
    the new column named after the title holds how many previous such rows of
    the same installation had that title (the current one is not counted).
    The counts are merged back onto every row of the matching session.

    The running counts are computed on the one-hot title columns only and are
    grouped by the ``installation_id`` Series itself, so the result does not
    depend on how a given pandas version treats non-numeric or key columns in
    ``groupby.cumsum()`` / ``groupby.shift()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``installation_id``, ``game_session`` and ``title``.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus one count column per title; the index is restored under
        the name 'index'.
    """
    sessions = df.loc[:, ['installation_id', 'game_session', 'title']].drop_duplicates()
    titles = df['title'].unique()

    # One indicator column per title, aligned with the de-duplicated rows.
    one_hot = pd.DataFrame(
        {title: (sessions['title'] == title).astype(np.uint8) for title in titles},
        index=sessions.index,
    )

    by_installation = sessions['installation_id']
    played_so_far = one_hot.groupby(by_installation).cumsum()
    # Shift by one inside each installation so a row only sees earlier sessions.
    played_before = played_so_far.groupby(by_installation).shift(periods=1, fill_value=0)

    store = played_before.join(sessions.loc[:, ['installation_id', 'game_session']])
    return df.reset_index().merge(store, how='left', on=['installation_id', 'game_session']).set_index('index')

def _accuracy_to_group(accuracy):
    """Map a session accuracy (correct / all attempts) onto the 0-3 accuracy group."""
    if accuracy == 1:
        return 3
    if accuracy == 0.5:
        return 2
    if accuracy == 0:
        return 0
    return 1


def ExtendTrainData(df):
    """Collapse the event table to one feature row per installation.

    Each output row combines
      * the last non-null ``game_session``, ``title``, ``timestamp``, ``dt``
        and per-title count columns of the installation, and
      * running min / max / mean of ``accuracy`` and ``accuracy_group`` over
        the installation's sessions that contain scored attempts, taken at the
        last such session (``acc_*`` and ``acc_gr_*`` columns).
    Any value that is still missing afterwards (for example the ``acc_*``
    columns of an installation without scored attempts) is set to -1.

    ``num_correct`` and ``num_incorrect`` are always both computed, so data in
    which every attempt has the same outcome no longer fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``installation_id``, ``game_session``, ``title``, ``timestamp``,
        ``dt``, ``correct``, ``event_id`` and one column per distinct title.

    Returns
    -------
    pandas.DataFrame
        One row per installation, ordered by ``installation_id``.
    """
    titles = df.title.unique()
    col_list = ['installation_id', 'game_session', 'title', 'timestamp', 'dt'] + list(titles)
    # groupby.last() keeps the last non-null value of every column per installation.
    test_out = df.loc[:, col_list].groupby('installation_id').last().reset_index()

    # convert title cumcounts into integer
    for title in titles:
        test_out[title] = test_out[title].apply(np.uint8)

    # rows that carry an attempt outcome
    attempts = df[(df.correct == True) | (df.correct == False)]
    out = attempts.loc[:, ['installation_id', 'game_session']].drop_duplicates()

    # Count outcomes per session by summing indicator columns. Unlike
    # count().unstack(), this yields both counters even when one outcome never
    # occurs. As before, only rows with a non-null event_id are counted.
    was_correct = attempts.correct.astype(bool)
    was_logged = attempts.event_id.notna()
    tally = pd.DataFrame({
        'game_session': attempts.game_session,
        'installation_id': attempts.installation_id,
        'num_correct': (was_correct & was_logged).astype(np.float64),
        'num_incorrect': (~was_correct & was_logged).astype(np.float64),
    })
    store = tally.groupby(['game_session', 'installation_id']).sum().reset_index()
    out = out.merge(store.drop(columns='installation_id'), how='left', on='game_session')

    # calc accuracy and accuracy group
    out['accuracy'] = out.num_correct / (out.num_correct + out.num_incorrect)
    out['accuracy_group'] = out.accuracy.apply(_accuracy_to_group)

    # running statistics per installation, in session order
    for source, prefix in (('accuracy', 'acc'), ('accuracy_group', 'acc_gr')):
        running = out.groupby('installation_id')[source]
        running_min = running.cummin()
        running_max = running.cummax()
        running_avg = running.cumsum() / (running.cumcount() + 1)
        out[prefix + '_min'] = running_min.fillna(value=-1)
        out[prefix + '_max'] = running_max.fillna(value=-1)
        out[prefix + '_avg'] = running_avg.fillna(value=-1)

    # keep the statistics as of each installation's last scored session
    latest = out.groupby('installation_id').last()
    latest = latest.drop(columns=['game_session', 'num_correct', 'num_incorrect', 'accuracy', 'accuracy_group'])

    test_out = test_out.merge(latest, how='left', on='installation_id')
    test_out = test_out.fillna(value=-1)

    return test_out

In [None]:
# Run the feature-engineering steps in order; later steps read columns that
# earlier ones add (for example `correct` and `dt`).
test = (
    test
    .pipe(AssessmentScore)
    .pipe(TimeDifference)
    .pipe(GameSessionStats)
    .pipe(InstallationIdStats)
    .pipe(ExtendTrainData)
)

In [None]:
# Columns named after a title hold small counts and are read as unsigned bytes.
_TITLE_COUNT_COLUMNS = [
    'Welcome to Lost Lagoon!',
    'Magma Peak - Level 1',
    'Sandcastle Builder (Activity)',
    'Slop Problem',
    'Scrub-A-Dub',
    'Tree Top City - Level 1',
    'Ordering Spheres',
    'All Star Sorting',
    'Costume Box',
    'Fireworks (Activity)',
    '12 Monkeys',
    'Tree Top City - Level 2',
    'Flower Waterer (Activity)',
    "Pirate's Tale",
    'Mushroom Sorter (Assessment)',
    'Air Show',
    'Treasure Map',
    'Tree Top City - Level 3',
    'Crystals Rule',
    'Rulers',
    'Bug Measurer (Activity)',
    'Bird Measurer (Assessment)',
    'Watering Hole (Activity)',
    'Magma Peak - Level 2',
    'Dino Drink',
    'Bubble Bath',
    'Bottle Filler (Activity)',
    'Dino Dive',
    'Crystal Caves - Level 1',
    'Chow Time',
    'Cauldron Filler (Assessment)',
    'Balancing Act',
    'Crystal Caves - Level 2',
    'Crystal Caves - Level 3',
    'Chicken Balancer (Activity)',
    'Lifting Heavy Things',
    'Pan Balance',
    'Honey Cake',
    'Happy Camel',
    'Cart Balancer (Assessment)',
    'Heavy, Heavier, Heaviest',
    'Egg Dropper (Activity)',
    'Chest Sorter (Assessment)',
    'Leaf Leader',
]

# Running accuracy statistics are read as 32-bit floats.
_ACCURACY_STAT_COLUMNS = [
    'acc_min',
    'acc_max',
    'acc_avg',
    'acc_gr_min',
    'acc_gr_max',
    'acc_gr_avg',
]

# dtypes shared by the feature tables: title counts, then dt, then accuracy statistics
dtype_dict = {column: np.uint8 for column in _TITLE_COUNT_COLUMNS}
dtype_dict['dt'] = np.float64
for column in _ACCURACY_STAT_COLUMNS:
    dtype_dict[column] = np.float32

# the training table additionally carries the target columns, listed first
train_dtype_dict = {
    'accuracy': np.float32,
    'accuracy_group': np.int8,
    **dtype_dict,
}

In [None]:
# Load the extended training table from a separate input dataset; `timestamp`
# is parsed to datetimes and the other columns use the dtypes in train_dtype_dict.
# Note: `path` is rebound here and no longer points at the raw competition data.
path = '/kaggle/input/2019datasciencebowl-featureengineering-raw/'
train = pd.read_csv(path+'train_extend.csv',parse_dates=['timestamp'],dtype=train_dtype_dict)
# Disabled alternative: read an extended test table from disk instead of using
# the `test` frame built earlier in this notebook.
#path = '/kaggle/input/2019datasciencebowl-featureengineering-test-raw/'
#test = pd.read_csv(path+'test_extend.csv',parse_dates=['timestamp'],dtype=dtype_dict)

In [None]:
# preprocess data
def preprocess_train(df):
    """Split the training table into features, target and row identifiers.

    Adds month / day / weekday / hour columns derived from ``timestamp`` and
    replaces ``title`` with integer codes from a ``LabelEncoder`` fitted on the
    titles present in ``df``. The work is done on a derived frame, so the
    caller's ``df`` is left untouched.

    NOTE(review): the encoder is fitted on this frame only; the codes agree
    with ``preprocess_test`` only if both frames contain the same set of
    titles. Confirm, or share one encoder.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``timestamp`` (datetime), ``title``, ``installation_id``,
        ``game_session``, ``accuracy`` and ``accuracy_group``.

    Returns
    -------
    X : pandas.DataFrame
        Feature columns (identifiers, targets and ``timestamp`` removed).
    Y : pandas.Series
        The ``accuracy_group`` column.
    label : pandas.DataFrame
        ``installation_id`` and ``game_session`` for each row.
    """
    # Encode Title
    title_le = LabelEncoder()
    title_le.fit(df.title.unique())

    # assign() returns a new frame: new columns are appended in the order
    # given, and the existing `title` column is replaced in place.
    features = df.assign(
        MM=df.timestamp.dt.month.astype(np.uint8),
        DD=df.timestamp.dt.day.astype(np.uint8),
        dayofweek=df.timestamp.dt.dayofweek.astype(np.uint8),
        HH=df.timestamp.dt.hour.astype(np.uint8),
        title=title_le.transform(df.title).astype(np.uint8),
    )

    # finalize output
    Y = features.accuracy_group
    label = features.loc[:, ['installation_id', 'game_session']]
    X = features.drop(columns=['game_session', 'installation_id', 'accuracy', 'accuracy_group', 'timestamp'])
    return X, Y, label

In [None]:
# preprocess data
def preprocess_test(df):
    """Split the test table into features and row identifiers.

    Adds month / day / weekday / hour columns derived from ``timestamp`` and
    replaces ``title`` with integer codes from a ``LabelEncoder`` fitted on the
    titles present in ``df``. The work is done on a derived frame, so the
    caller's ``df`` is left untouched.

    NOTE(review): the encoder is fitted on this frame only; the codes agree
    with ``preprocess_train`` only if both frames contain the same set of
    titles. Confirm, or share one encoder.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``timestamp`` (datetime), ``title``, ``installation_id`` and
        ``game_session``.

    Returns
    -------
    X : pandas.DataFrame
        Feature columns (identifiers and ``timestamp`` removed).
    label : pandas.DataFrame
        ``installation_id`` and ``game_session`` for each row.
    """
    # Encode Title
    title_le = LabelEncoder()
    title_le.fit(df.title.unique())

    # assign() returns a new frame: new columns are appended in the order
    # given, and the existing `title` column is replaced in place.
    features = df.assign(
        MM=df.timestamp.dt.month.astype(np.uint8),
        DD=df.timestamp.dt.day.astype(np.uint8),
        dayofweek=df.timestamp.dt.dayofweek.astype(np.uint8),
        HH=df.timestamp.dt.hour.astype(np.uint8),
        title=title_le.transform(df.title).astype(np.uint8),
    )

    # finalize output
    label = features.loc[:, ['installation_id', 'game_session']]
    X = features.drop(columns=['game_session', 'installation_id', 'timestamp'])
    return X, label

In [None]:
# Build the model inputs for both tables.
train_X, train_Y, train_label = preprocess_train(train)
test_X, test_label = preprocess_test(test)
# Select the test columns by the training column labels so both frames share
# the same columns in the same order.
test_X = test_X.loc[:,train_X.columns]

In [None]:
# Hold out 20% of the training rows for validation; the seed is fixed so the split is repeatable.
X_train, X_val, Y_train, Y_val = train_test_split(train_X, train_Y, test_size=0.2, random_state=42)

In [None]:
# Classifier settings gathered in one mapping and unpacked into the constructor.
xgb_params = dict(
    # tree shape and boosting schedule
    max_depth=5,
    learning_rate=0.1,
    n_estimators=70,
    # logging
    verbosity=1,
    # learner
    objective='multi:softmax',
    booster='gbtree',
    tree_method='auto',
    n_jobs=1,
    # split constraints
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    # row / column sampling
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    # regularisation
    reg_alpha=0,
    reg_lambda=1,
    # remaining settings
    scale_pos_weight=1,
    base_score=0.5,
    random_state=0,
    missing=None,
)
model = xgboost.XGBClassifier(**xgb_params)

In [None]:
# Fit on the training split. eval_metric is not passed to fit(): no eval_set is
# given, so the metric would never be evaluated, and newer xgboost releases no
# longer accept it there.
model.fit(X_train, Y_train)
Y_pred = model.predict(X_val)
# accuracy_group is an ordered 0-3 label, so score agreement with quadratic
# weights (a miss by three groups costs more than a miss by one); y_true goes
# first as in the scikit-learn signature.
# NOTE(review): presumably this should match the competition's evaluation metric — confirm.
print(cohen_kappa_score(Y_val, Y_pred, weights='quadratic'))

In [None]:
def ModelTuning():
    """Cross-validate the classifier over a range of ``n_estimators`` values.

    For every value in ``range(50, 130, 10)`` an ``XGBClassifier`` with
    ``max_depth=5`` and ``learning_rate=0.1`` is scored with 5-fold
    cross-validation (``balanced_accuracy``) on the notebook globals
    ``train_X`` / ``train_Y``. The mean test score is printed per value and
    all means are drawn as a scatter plot.

    No ``eval_metric`` is forwarded to ``fit``: without an ``eval_set`` it is
    never evaluated, and the ``fit_params`` keyword of ``cross_validate`` is
    deprecated in newer scikit-learn releases.

    Returns
    -------
    None
    """
    num_est = list(range(50, 130, 10))
    mean_scores = []
    for N in num_est:
        model = xgboost.XGBClassifier(max_depth=5, n_estimators=N, learning_rate=0.1, objective='multi:softmax')
        cv = cross_validate(model, X=train_X, y=train_Y, cv=5, scoring='balanced_accuracy')
        mean_score = np.mean(cv['test_score'])
        print(N, mean_score)
        mean_scores.append(mean_score)
    ax = sns.scatterplot(x=num_est, y=mean_scores)
    ax.set(xlabel='n_estimators', ylabel='mean balanced accuracy (5-fold CV)')

In [None]:
# Fresh, unfitted classifier: the settings are collected in one mapping and
# unpacked into the constructor.
final_xgb_params = dict(
    # tree shape and boosting schedule
    max_depth=5,
    learning_rate=0.1,
    n_estimators=70,
    # logging
    verbosity=1,
    # learner
    objective='multi:softmax',
    booster='gbtree',
    tree_method='auto',
    n_jobs=1,
    # split constraints
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    # row / column sampling
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    colsample_bynode=1,
    # regularisation
    reg_alpha=0,
    reg_lambda=1,
    # remaining settings
    scale_pos_weight=1,
    base_score=0.5,
    random_state=0,
    missing=None,
)
model = xgboost.XGBClassifier(**final_xgb_params)

In [None]:
# Fit on the full training table. eval_metric is not passed to fit(): no
# eval_set is given, so the metric would never be evaluated, and newer xgboost
# releases no longer accept it there.
model.fit(train_X, train_Y)

In [None]:
# Predict a class for every row of the test feature frame with the fitted model.
test_predict = model.predict(test_X)

In [None]:
# Pair each test row's installation_id with its predicted accuracy_group.
submit = pd.DataFrame({'installation_id':test_label.installation_id, 'accuracy_group':test_predict})

In [None]:
# Write the submission file to the working directory, without the index column.
submit.to_csv('submission.csv',index=False)