In [None]:
import pandas as pd
import numpy as np
import datetime
from time import time
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.metrics import cohen_kappa_score, make_scorer
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.metrics import confusion_matrix
%load_ext autoreload
%autoreload 2

from xgboost import XGBClassifier, XGBRegressor, plot_importance
import xgboost as xgb
from sklearn.model_selection import KFold

from matplotlib import pyplot

import lightgbm as lgb

In [None]:
# Restrict the event logs to the columns listed in keep_cols.
keep_cols = [
    'event_id', 'game_session', 'installation_id', 'event_count',
    'event_code', 'title', 'game_time', 'type', 'world', 'timestamp',
]
train = pd.read_csv(
    '/kaggle/input/data-science-bowl-2019/train.csv',
    usecols=keep_cols,
)
test = pd.read_csv(
    '/kaggle/input/data-science-bowl-2019/test.csv',
    usecols=keep_cols,
)
# Only the identifiers and the target column are read from the label file.
train_labels = pd.read_csv(
    '/kaggle/input/data-science-bowl-2019/train_labels.csv',
    usecols=['installation_id', 'game_session', 'accuracy_group'],
)
specs = pd.read_csv('/kaggle/input/data-science-bowl-2019/specs.csv')

In [None]:
# Discard events from installations that never appear in train_labels.
# The shape is printed before and after so the reduction is visible.
print(train.shape)
train_ids = set(train.installation_id.unique())
labelled_ids = set(train_labels.installation_id.unique())
not_req = train_ids - labelled_ids
is_unlabelled = train['installation_id'].isin(not_req)
train = train[~is_unlabelled]
print(train.shape)

In [None]:
def get_time(df):
    """Add calendar features derived from ``df['timestamp']``.

    The frame is modified in place and the same object is returned.

    Columns written:
        timestamp   -- converted with ``pd.to_datetime``
        hour        -- hour of day taken from the timestamp
        dayofweek   -- pandas day-of-week number (Monday=0 ... Sunday=6)
        time_period -- 'morning' (hours 6-9), 'day' (10-17),
                       'evening' (18-22), otherwise 'night'
        week_day    -- 'work_day' for Monday-Friday, otherwise 'weekend'

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``timestamp`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the columns above added or overwritten.
    """
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['hour'] = df['timestamp'].dt.hour
    df['dayofweek'] = df['timestamp'].dt.dayofweek

    time_period_conditions = [
        (df['hour'] >= 6) & (df['hour'] <= 9),
        (df['hour'] > 9) & (df['hour'] <= 17),
        (df['hour'] > 17) & (df['hour'] <= 22),
    ]
    time_period_choices = ['morning', 'day', 'evening']
    df['time_period'] = np.select(time_period_conditions, time_period_choices, default='night')

    # pandas numbers the week Monday=0 ... Sunday=6, so the working week is
    # 0..4. The earlier 1..5 range tagged Monday as weekend and Saturday as a
    # work day.
    week_days_conditions = [
        (df['dayofweek'] >= 0) & (df['dayofweek'] <= 4),
    ]
    week_days_choices = ['work_day']
    df['week_day'] = np.select(week_days_conditions, week_days_choices, default='weekend')
    return df
    
# Derive the time features for both splits (train first, then test).
result, result_test = get_time(train), get_time(test)

In [None]:
# Event-level identifiers and the intermediate time columns are dropped here;
# the derived time_period / week_day columns are kept.
dropped_event_cols = ['event_id', 'event_code', 'timestamp', 'dayofweek', 'hour']
table1 = result.drop(dropped_event_cols, axis=1)
table1_test = result_test.drop(dropped_event_cols, axis=1)

In [None]:
# Collapse the event log to one row per (session, installation, title, type,
# world) by taking the column-wise maximum of everything else.
# NOTE(review): for the string columns time_period and week_day, max() is a
# lexicographic maximum, not "the period the session started in" - confirm
# this is the intended summary.
session_keys = ['game_session', 'installation_id', 'title', 'type', 'world']
table2 = table1.groupby(session_keys).max()
table2 = table2.reset_index()
table2_test = table1_test.groupby(session_keys).max()
table2_test = table2_test.reset_index()

In [None]:
# The session id and the per-session maxima are not carried into the
# installation-level features built from this table.
session_only_cols = ['game_session', 'event_count', 'game_time']
table3 = table2.drop(session_only_cols, axis=1)
table3_test = table2_test.drop(session_only_cols, axis=1)

In [None]:
# One-hot encode the categorical session attributes, then sum the indicators
# per installation so every feature is "number of sessions with this value".
dummy_cols = ['title', 'type', 'world', 'time_period', 'week_day']
table_final = (
    pd.get_dummies(table3, columns=dummy_cols)
    .groupby(['installation_id'])
    .sum()
)
table_final_test = (
    pd.get_dummies(table3_test, columns=dummy_cols)
    .groupby(['installation_id'])
    .sum()
    # Train and test are encoded independently, so a category seen in only
    # one of them would give the two frames different columns. Force the test
    # frame onto the training layout: missing indicators become 0, indicators
    # unknown to the training data are dropped.
    .reindex(columns=table_final.columns, fill_value=0)
)


In [None]:
def _most_common(values):
    """Return the first index entry of ``values.value_counts()`` (the most frequent value)."""
    return values.value_counts().index[0]


# One label per installation: the accuracy_group that occurs most often
# among that installation's labelled sessions.
small_labels = (
    train_labels[['installation_id', 'accuracy_group']]
    .groupby(['installation_id'])
    .agg(_most_common)
)

In [None]:
# Attach the per-installation label (index-on-index left join) and keep only
# rows without missing values.
features_with_label = table_final.join(small_labels, how='left')
train_joined = features_with_label.dropna()

In [None]:
# Split the joined frame into the label vector and the feature matrix.
label_col = 'accuracy_group'
target = train_joined[label_col]
train_joined = train_joined.drop(columns=[label_col])

In [None]:
# Number of target classes. Assumes accuracy_group takes the values 0..3,
# which is also what the LightGBM configuration in this notebook uses
# (num_classes = 4). The previous value of 5 allocated a class that never
# occurs.
n_classes = 4

# Native-API booster parameters. 'n_estimators' was removed from this dict:
# it belongs to the scikit-learn wrapper, while xgb.train takes the number of
# rounds from num_boost_round (bounded further by early stopping below).
pars = {
    'colsample_bytree': 0.8,
    'learning_rate': 0.1,
    'max_depth': 10,
    'subsample': 1,
    'objective': 'multi:softprob',
    'num_class': n_classes,
    'eval_metric': 'mlogloss',
    'min_child_weight': 3,
    'gamma': 0.04,
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Running sum of the per-fold class probabilities for the test set; only the
# argmax is used afterwards, so the sum does not need to be divided by the
# number of folds.
y_pre = np.zeros((len(table_final_test), n_classes), dtype=float)
final_test = xgb.DMatrix(table_final_test)

for train_index, val_index in kf.split(train_joined):
    train_X = train_joined.iloc[train_index]
    val_X = train_joined.iloc[val_index]
    train_y = target.iloc[train_index]
    val_y = target.iloc[val_index]
    xgb_train = xgb.DMatrix(train_X, train_y)
    xgb_eval = xgb.DMatrix(val_X, val_y)

    # The last entry of `evals` ('val') is the one early stopping monitors.
    xgb_model = xgb.train(
        pars,
        xgb_train,
        num_boost_round=1000,
        evals=[(xgb_train, 'train'), (xgb_eval, 'val')],
        verbose_eval=False,
        early_stopping_rounds=100,
    )

    # Reuse the validation DMatrix for scoring; its labels are ignored by predict.
    pred_val = np.argmax(xgb_model.predict(xgb_eval), axis=1)

    print('choen_kappa_score :', cohen_kappa_score(pred_val, val_y, weights='quadratic'))

    y_pre += xgb_model.predict(final_test)

# Final XGBoost prediction: the class with the highest summed probability.
pred = np.argmax(y_pre, axis=1)

In [None]:
# Class distribution of the XGBoost test predictions, one bar per class.
fig, ax = plt.subplots()
n_bars = int(np.max(pred)) + 1
# Bin edges at k - 0.5 centre each bar on its integer class label.
ax.hist(pred, bins=np.arange(n_bars + 1) - 0.5, rwidth=0.8)
ax.set_xticks(np.arange(n_bars))
ax.set(title='XGBoost test predictions',
       xlabel='predicted accuracy_group',
       ylabel='installations');

In [None]:
def cv_train(X, y, cv, **kwargs):
    """
    Train one LightGBM model per K-fold split and return all of them.

    Author: https://www.kaggle.com/xhlulu/
    Source: https://www.kaggle.com/xhlulu/ds-bowl-2019-simple-lgbm-using-aggregated-data

    Parameters
    ----------
    X : array indexable by integer index arrays
        Feature matrix; rows are selected with ``X[indices]``.
    y : array indexable by integer index arrays
        Labels aligned with the rows of ``X``.
    cv : int
        Number of folds.
    **kwargs
        Forwarded unchanged to ``lgb.train`` (e.g. ``params``,
        ``num_boost_round``). If ``verbose_eval`` is truthy, a separator line
        is printed after each fold.

    Returns
    -------
    list
        The fitted models, in fold order.
    """
    models = []

    # Folds are contiguous (no shuffling). random_state is deliberately not
    # passed: it has no effect without shuffle=True, and current scikit-learn
    # raises a ValueError for that combination.
    kf = KFold(n_splits=cv)

    for train_idx, val_idx in kf.split(X):
        x_train, x_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)

        # Both sets are evaluated each round; the validation set is listed last.
        model = lgb.train(train_set=train_set, valid_sets=[train_set, val_set], **kwargs)
        models.append(model)

        if kwargs.get("verbose_eval"):
            print("\n" + "="*50 + "\n")

    return models

def cv_predict(models, X):
    """Average the predictions of several models on the same input.

    Each element of ``models`` must expose ``predict(X)``; the element-wise
    mean across models is returned.
    """
    fold_predictions = []
    for fold_model in models:
        fold_predictions.append(fold_model.predict(X))
    return np.mean(fold_predictions, axis=0)

In [None]:
# cv_train indexes its inputs with integer arrays, so pass plain numpy arrays.
X = train_joined.values
y = target.values

params = {
    'learning_rate': 0.01,
    # Row subsampling only happens when bagging_freq is non-zero; without it
    # bagging_fraction is silently ignored.
    'bagging_fraction': 0.9,
    'bagging_freq': 1,
    'feature_fraction': 0.2,
    # Was 'max_height', which LightGBM does not recognise, so tree depth was
    # left unlimited; 'max_depth' is the actual parameter name.
    'max_depth': 3,
    'lambda_l1': 10,
    'lambda_l2': 10,
    'metric': 'multiclass',
    'objective': 'multiclass',
    'num_classes': 4,
    'random_state': 2019
}

models = cv_train(X, y, cv=5, params=params, num_boost_round=1000,
                  early_stopping_rounds=100, verbose_eval=500)

In [None]:
# Average the fold models' outputs on the test features, then take the
# highest-scoring column of each row as the predicted class.
test_proba = cv_predict(models=models, X=table_final_test)
test_pred = np.argmax(test_proba, axis=1)

In [None]:
# Class distribution of the LightGBM test predictions, one bar per class.
fig, ax = plt.subplots()
n_bars = int(np.max(test_pred)) + 1
# Bin edges at k - 0.5 centre each bar on its integer class label.
ax.hist(test_pred, bins=np.arange(n_bars + 1) - 0.5, rwidth=0.8)
ax.set_xticks(np.arange(n_bars))
ax.set(title='LightGBM test predictions',
       xlabel='predicted accuracy_group',
       ylabel='installations');

In [None]:
# Blend the two models' predicted class labels with a weighted average.
weights = {'xgb': 0.50, 'lgb': 0.50}

blended = (pred * weights['xgb']) + (test_pred * weights['lgb'])

# Round to the nearest class instead of truncating. With astype(int) alone,
# any blend of two adjacent labels collapsed to the lower one regardless of
# the weights. np.rint sends exact .5 ties to the nearest even label, so ties
# are not pushed in one fixed direction.
final_pred = np.rint(blended).astype(int)

In [None]:
# Attach the blended prediction to the test feature table as an integer column.
table_final_test = table_final_test.assign(accuracy_group=final_pred.astype(int))

In [None]:
# Two-column submission: the index (installation_id) moved back into a
# column, next to the integer prediction.
submission = (
    table_final_test[['accuracy_group']]
    .astype(int)
    .reset_index()
)
submission.to_csv('submission.csv', index=False)

In [None]:
# How many test installations were assigned to each predicted class.
table_final_test.loc[:, 'accuracy_group'].value_counts()

In [None]:
# Class distribution of the blended (final) test predictions, one bar per class.
fig, ax = plt.subplots()
n_bars = int(np.max(final_pred)) + 1
# Bin edges at k - 0.5 centre each bar on its integer class label.
ax.hist(final_pred, bins=np.arange(n_bars + 1) - 0.5, rwidth=0.8)
ax.set_xticks(np.arange(n_bars))
ax.set(title='Blended test predictions',
       xlabel='predicted accuracy_group',
       ylabel='installations');

In [None]:
# Preview the first rows of the submission instead of rendering the whole frame.
submission.head()