## Regressors and Classification by Optimal Rounding
**■Classification steps**<BR>
**Step 1** Create Regressor Models : Create multiple training datasets using `kFold` (**not for CV**) and train a regression model on each dataset. I used **CatBoost**, **XGBoost** and **LightGBM** with `kFold = 5`, so 15 models are created. <BR>
**Step 2** Predict each Model<BR>
**Step 3** Optimize Rounding Coefficients : The rounding coefficients of each model are optimized using `scipy.optimize.minimize()`. The final coefficients are then calculated as the weighted average of the optimal coefficients of each model.<BR>
**Step 4** Final Classification
    
Ver.5 : Introduced weighted average to calculate final rounding coefficient.

In [None]:
import os, sys
import datetime
from time import time
from tqdm import tqdm_notebook as tqdm
from collections import Counter

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split as split
from sklearn.metrics import cohen_kappa_score
import category_encoders as ce

# from catboost import CatBoostRegressor
import catboost as cat
import lightgbm as lgb
import xgboost as xgb

from functools import partial
import scipy as sp              # for optimize.minimize()

In [None]:
# Execution environment setting
# Kaggle = True  -> Kaggle kernel input layout
# Kaggle = False -> local checkout of the competition data
Kaggle = True

# (data directory, task type) pair for the selected environment
DIR, task_type = (
    ('../input/data-science-bowl-2019', 'CPU')
    if Kaggle
    else ('./data-science-bowl-2019', 'GPU')
)

## Observe the data

In [None]:
# Load the four competition tables from DIR, in the order
# train -> train_labels -> specs -> test.
train, train_labels, specs, test = (
    pd.read_csv(os.path.join(DIR, csv_name))
    for csv_name in ('train.csv', 'train_labels.csv', 'specs.csv', 'test.csv')
)

In [None]:
# Print the (rows, columns) shape of every loaded table.
print('train:\t\t',train.shape)
print('train_labels:\t',train_labels.shape)
print('specs:\t\t',specs.shape)
print('test:\t\t',test.shape)

### 1. train

In [None]:
# Preview the first rows of the raw train events.
train.head()

In [None]:
# Summary statistics of the identifier / categorical columns of train.
train[['event_id','game_session','installation_id',
       'title','type','world']].describe()

In [None]:
# Number of distinct 'event_code' values in train and their min-max range.
event_code_n = train['event_code'].nunique()
print("num of unique 'event_code':", event_code_n)
print("'event_code': ",
      train['event_code'].min(), "-", train['event_code'].max())

In [None]:
# 'event_data' examples (rows with index label 40, 41 and 43)
print(train['event_data'][40])
print(train['event_data'][41])
print(train['event_data'][43])

### 2. train_labels

In [None]:
# Preview the first rows of train_labels.
train_labels.head()

In [None]:
# Summary statistics of the identifier / title columns of train_labels.
train_labels[['game_session','installation_id', 'title']].describe()

In [None]:
# Distinct 'title' values that appear in train_labels
train_labels['title'].unique()

### 3. specs

In [None]:
# Preview the first rows of specs.
specs.head()

In [None]:
# Summary statistics of specs.
specs.describe()

In [None]:
# 'info' examples (rows with index label 0, 6 and 7)
print(specs['info'][0],'\n')
print(specs['info'][6],'\n')
print(specs['info'][7])

In [None]:
# 'args' examples (rows with index label 0 and 1)
print(specs['args'][0],'\n')
print(specs['args'][1])

### 4. test

In [None]:
# Preview the first 8 rows of the raw test events.
test.head(8)

In [None]:
# Summary statistics of the identifier / categorical columns of test.
test[['event_id','game_session','installation_id',
       'title','type','world']].describe()

## Compile data
Based on several kernels
- Hosseinali: https://www.kaggle.com/mhviraf/a-new-baseline-for-dsb-2019-catboost-model
- Bruno Aquino: https://www.kaggle.com/braquino/catboost-some-more-features
- Heng Zheng: https://www.kaggle.com/hengzheng/bayesian-optimization-seed-blending

In [None]:
# make 'title' and 'event_code' list (union of the values seen in train and test)
# The lists are sorted: iterating a plain set gives an order that is not
# reproducible between interpreter runs (string hashing is randomized),
# which would change the title -> number codes and the feature column order
# from one run to the next.
title_list = sorted(set(train['title'].value_counts().index)
                    | set(test['title'].value_counts().index))
event_code_list = sorted(set(train['event_code'].value_counts().index)
                         | set(test['event_code'].value_counts().index))

In [None]:
# makes dict 'title to number(integer)'
title2num = dict(zip(title_list, np.arange(len(title_list))))
# makes dict 'number to title'
num2title = dict(zip(np.arange(len(title_list)), title_list))
# titles of every 'Assessment' session seen in train or test.
# Sorted so that the order of the 'acc_<title>' feature columns built from
# this list is the same on every run (plain set order is not reproducible).
assess_titles = sorted(
    set(train[train['type'] == 'Assessment']['title'].value_counts().index)
    | set(test[test['type'] == 'Assessment']['title'].value_counts().index))
# makes dict 'title number to win event_code'
# (4100 except 'Bird Measurer' and 4110 for 'Bird Measurer')
title2win_code = {title_num: 4100 for title_num in title2num.values()}
title2win_code[title2num['Bird Measurer (Assessment)']] = 4110

In [None]:
# Convert 'title' to the number
# (titles missing from title2num become NaN, as with any Series.map on a dict)
train['title'] = train['title'].map(title2num)
test['title'] = test['title'].map(title2num)
train_labels['title'] = train_labels['title'].map(title2num)

# Convert 'timestamp' to datetime
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

In [None]:
# Convert the raw data into processed features
def get_data(user_sample, test_set=False):
    '''
    Build one feature row (dict) per assessment session of a single player.

    The sessions of the player are walked in the order they appear in
    user_sample; every feature of an assessment row only uses what happened
    BEFORE that assessment (running counters are updated after the row
    has been filled in).

    Relies on the module-level lookups title_list, event_code_list,
    assess_titles, num2title and title2win_code.

    user_sample : DataFrame from train/test group by 'installation_id'
    test_set    : False -> return a list with one dict per assessment that
                           has at least one win/loss event
                  True  -> return only the dict of the last assessment
                           (the one to be predicted); raises IndexError if
                           the player has no assessment session
    '''
    # Running state of this player
    user_assessments = []
    last_type = 0
    types_count_dc = {'Clip':0, 'Activity':0, 'Assessment':0, 'Game':0}
    time_spent_each_title_dc = {title:0 for title in title_list}
    event_code_count_dc = {code:0 for code in event_code_list}
    accuracy_groups_dc = {0:0, 1:0, 2:0, 3:0}

    accumu_accuracy_group = 0
    accumu_accuracy = 0
    accumu_win_n = 0
    accumu_loss_n = 0
    accumu_actions = 0
    counter = 0
    durations = []
    # accuracy of the latest attempt per assessment title (-1 = never tried)
    last_accuracy_title = {'acc_' + title: -1 for title in assess_titles}

    # group by 'game_session'
    for _, session in user_sample.groupby('game_session', sort=False):
        # session: DataFrame from user_sample group by 'game_session'
        session_type = session['type'].iloc[0]  # Game/Assessment/Activity/Clip
        session_title = session['title'].iloc[0]        # session_title:int
        session_title_text = num2title[session_title]

        if session_type != 'Assessment':
            time_spent = int(session['game_time'].iloc[-1] / 1000)   # [sec]
            time_spent_each_title_dc[session_title_text] += time_spent

        if session_type == 'Assessment' and (test_set or len(session) > 1):
            # rows with the "attempt" event_code, 4100 (4110 for Bird Measurer)
            win_code = title2win_code[session_title]
            all_4100_df = session[session['event_code'] == win_code]
            # numbers of wins and losses
            win_n = all_4100_df['event_data'].str.contains('true').sum()
            loss_n = all_4100_df['event_data'].str.contains('false').sum()

            # init features_dc and then update
            # (insertion order defines the column order of the final table)
            features_dc = types_count_dc.copy()
            features_dc.update(last_accuracy_title)
            features_dc.update(time_spent_each_title_dc)
            features_dc.update(event_code_count_dc)
            features_dc['session_title'] = session_title
            features_dc['accumu_win_n'] = accumu_win_n
            features_dc['accumu_loss_n'] = accumu_loss_n
            accumu_win_n += win_n
            accumu_loss_n += loss_n

            features_dc['installation_id'] = session['installation_id'].iloc[-1]
            features_dc['day_of_the_week'] = \
                session['timestamp'].iloc[-1].strftime('%a')

            # mean duration of the previous assessments (0 when there is none)
            features_dc['duration_mean'] = np.mean(durations) if durations else 0
            # Duration of this session, first to last event. The column is
            # addressed by name rather than by position, and total_seconds()
            # is used because Timedelta.seconds drops whole days.
            session_span = (session['timestamp'].iloc[-1]
                            - session['timestamp'].iloc[0])
            durations.append(int(session_span.total_seconds()))

            # average of the all accuracy of this player
            features_dc['accuracy_ave'] = accumu_accuracy / counter \
                                                if counter > 0 else 0
            accuracy = win_n / (win_n + loss_n) \
                                   if (win_n + loss_n) > 0 else 0
            accumu_accuracy += accuracy
            last_accuracy_title['acc_' + session_title_text] = accuracy
            # accuracy -> accuracy_group: 0 -> 0, 1 -> 3, 0.5 -> 2, else 1
            if accuracy == 0:
                features_dc['accuracy_group'] = 0
            elif accuracy == 1:
                features_dc['accuracy_group'] = 3
            elif accuracy == 0.5:
                features_dc['accuracy_group'] = 2
            else:
                features_dc['accuracy_group'] = 1
            # counts of the previous accuracy groups (keys 0..3)
            features_dc.update(accuracy_groups_dc)
            accuracy_groups_dc[features_dc['accuracy_group']] += 1
            # average of accuracy_groups_dc of this player
            features_dc['accuracy_group_ave'] = \
                    accumu_accuracy_group / counter if counter > 0 else 0
            accumu_accuracy_group += features_dc['accuracy_group']

            # how many actions the player has done before this game_session
            features_dc['accumu_actions'] = accumu_actions

            # if test_set, all sessions belong to the final dataset
            # elif train, only sessions with at least one attempt are kept
            if test_set or (win_n + loss_n) > 0:
                user_assessments.append(features_dc)

            counter += 1

        # how many actions was made in each event_code
        event_codes = Counter(session['event_code'])
        for key in event_codes.keys():
            event_code_count_dc[key] += event_codes[key]

        # how many actions the player has done
        accumu_actions += len(session)
        # a session type is counted only when it differs from the previous one
        if last_type != session_type:
            types_count_dc[session_type] += 1
            last_type = session_type

    # if test_set, only the last assessment must be predicted,
    # the previous ones are discarded
    if test_set:
        return user_assessments[-1]
    return user_assessments

In [None]:
# get_data function is applied to each installation_id
def compile_data(df, test_set):
    '''
    Run get_data on every player and assemble the results into one table.

    df       : raw train/test events
    test_set : passed through to get_data; when it equals False every player
               contributes a list of rows, otherwise a single row
    Returns a DataFrame with the get_data features plus three
    per-installation aggregates; 'installation_id' is dropped and
    'day_of_the_week' is mapped to 1 (Mon) .. 7 (Sun).
    '''
    rows = []
    # a list of rows is spliced in, a single row is appended
    collect = rows.extend if test_set == False else rows.append

    per_user = df.groupby('installation_id', sort=False)
    n_users = df['installation_id'].nunique()
    for ins_id, user_sample in tqdm(per_user, total=n_users):
        # user_sample : DataFrame group by 'installation_id'
        collect(get_data(user_sample, test_set))

    compiled_df = pd.DataFrame(rows)

    # additional feature engineering: aggregates over the rows of one player
    by_user = compiled_df.groupby(['installation_id'])
    session_count = by_user['Clip'].transform('count')
    duration_mean = by_user['duration_mean'].transform('mean')
    title_nunique = by_user['session_title'].transform('nunique')
    compiled_df['insta_session_count'] = session_count
    compiled_df['insta_duration_mean'] = duration_mean
    compiled_df['insta_title_nunique'] = title_nunique
    compiled_df = compiled_df.drop(columns='installation_id')

    # convert day_of_the_week to int
    weekday_codes = {'Mon':1, 'Tue': 2, 'Wed':3, 'Thu':4,
                     'Fri':5,'Sat':6, 'Sun':7}
    compiled_df['day_of_the_week'] = \
        compiled_df['day_of_the_week'].map(weekday_codes)

    return compiled_df

In [None]:
# compile train data (one row per assessment with at least one attempt)
new_train = compile_data(train, test_set = False).copy()
print(new_train.shape)
new_train.head(10)

In [None]:
# compile test data (one row per installation_id: its last assessment)
new_test = compile_data(test, test_set = True).copy()
print(new_test.shape)
new_test.head(10)

In [None]:
# reject almost same features (pairwise correlation > 0.99):
# of each such pair the feature that comes later in column order is dropped.
features = new_train.columns
counter = 0
to_remove = []
# The label 'accuracy_group' is kept out of the comparison: it must never be
# dropped itself (it is needed as y later) and must not cause a feature to
# be dropped.
candidates = [f for f in features if f != 'accuracy_group']
# Each unordered pair is checked once; f_b always comes after f_a, so the
# reverse (f_b, f_a) check can never trigger and is skipped.
for pos, f_a in enumerate(candidates):
    if f_a in to_remove:
        continue
    for f_b in candidates[pos + 1:]:
        if f_b in to_remove:
            continue
        c = np.corrcoef(new_train[f_a], new_train[f_b])[0][1]
        if c > 0.99:
            counter += 1
            to_remove.append(f_b)
            print('{}: {} vs {} : Correlation= {}'.format(counter, f_a, f_b, c))

In [None]:
# Number of columns before / after removing the rejected features
print(len(features))
features = [x for x in features if x not in to_remove]
print(len(features))

# Keep the same remaining columns in both tables
new_train = new_train[features]
new_test = new_test[features]

new_train.head()

In [None]:
# Preview the compiled test features.
new_test.head()

In [None]:
# all_features: every column except 'accuracy_group', which is the label y
all_features = [x for x in new_train.columns if x not in ['accuracy_group']]
# categorical feature
categorical_features = ['session_title','day_of_the_week']

In [None]:
# Encode categorical_features to integer (for use with LightGBM, XGBoost, etc.)

# concatenate train and test data so both share one encoding
temp_df = pd.concat([new_train[all_features], new_test[all_features]])
# encode
encoder = ce.ordinal.OrdinalEncoder(cols = categorical_features)
temp_df = encoder.fit_transform(temp_df)
# dataset: the first len(new_train) rows are train, the remaining rows are test
X, y = temp_df.iloc[:len(new_train),:].copy(), new_train['accuracy_group'].copy()
X_test = temp_df.iloc[len(new_train):,:].copy()

In [None]:
# Preview the encoded train features.
X.head()

In [None]:
# Preview the labels (accuracy_group).
y.head()

In [None]:
# Preview the encoded test features.
X_test.head()

In [None]:
# Drop the names of the raw and intermediate tables; X, y and X_test are used from here on.
del train,test,new_train, new_test

## Step 1 : Create Regressor Models
Create multiple training datasets using `kFold` and train a regression model on each dataset. I used **CatBoost**, **XGBoost** and **LightGBM**.

In [None]:
# Create multiple datasets to create multiple models (not for CV).
# The same splitter (shuffled, fixed random_state) is reused for every library.
NFOLDS = 5
folds = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=42)

### - CatBoost

In [None]:
# CatBoost: train one regressor per fold, early-stopped on the held-out part
start_time = time()
cat_models = []
scores = []

params = {
    'learning_rate': 0.02,
    'loss_function': 'RMSE',
    'random_seed': 42,
    'depth': 11,                            # 10
    'border_count': 37,                     # 108
    'bagging_temperature': 2.348502,        # 
    'task_type': task_type,
}

# Train and make models
for fold, (train_ids, val_ids) in enumerate(folds.split(X,y)):
    print('● Fold :', fold+1,'/',NFOLDS)
    # split() yields positional indices, so y is indexed with .iloc like X
    # (label indexing only matched because y carries a default 0..n-1 index)
    dtrain = cat.Pool(X.iloc[train_ids], y.iloc[train_ids],
                     cat_features=categorical_features)
    dval = cat.Pool(X.iloc[val_ids], y.iloc[val_ids],
                   cat_features=categorical_features)
    model = cat.train(params=params,
                      dtrain=dtrain,
                      eval_set=dval,        # =evals
                      iterations=5000,      # =num_boost_round
                      early_stopping_rounds=100,
                      verbose=200
                     )
    cat_models.append(model)
    
print('Time:', time() - start_time)

### - XGBoost

In [None]:
# XGBoost: train one regressor per fold, early-stopped on the held-out part
start_time = time()
xgb_models = []
scores = []

# 'n_estimators' was removed from this dict: it is a parameter of the
# scikit-learn wrapper and is not used by the native xgb.train() API, where
# the number of trees is controlled by num_boost_round / early stopping.
params = {
    'max_depth': 6,                     # 6,10,9
    'learning_rate': 0.01,              # =eta 0.1: [0,1]
    'objective': 'reg:squarederror',    # 'reg:linear'
    'subsample': 0.79,                  # 1,0.8,0.6    # 1, (0,1]    
    'colsample_bytree': 1.0,            # 1,0.8,1.0    # 1, (0, 1]   
    'gamma': 0.14,                      # 0.0
    'min_child_weight': 3,              # 5
    'seed' : 42,
}

# Train and make models
for fold, (train_ids, val_ids) in enumerate(folds.split(X,y)):
    print('● Fold :', fold+1,'/',NFOLDS)
    # split() yields positional indices, so y is indexed with .iloc like X
    dtrain = xgb.DMatrix(X.iloc[train_ids], y.iloc[train_ids])
    dval = xgb.DMatrix(X.iloc[val_ids], y.iloc[val_ids])
    model = xgb.train(params=params,
                      dtrain=dtrain,
                      num_boost_round=5000,
                      # the last entry of evals is the one used for early stopping
                      evals=[(dtrain, 'train'), (dval, 'val')],
                      early_stopping_rounds=100,
                      verbose_eval=200
                     )
    xgb_models.append(model)
    
print('Time:', time() - start_time)

### - LightGBM

In [None]:
# LightGBM: train one regressor per fold, early-stopped on the held-out part
start_time = time()
lgb_models = []
scores = []

params = {
    'n_jobs': -1,
    'seed': 42,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'feature_fraction': 0.95,        # 0.998, 0.967
    # NOTE(review): 'bagging_freq' is not set here; per the LightGBM docs
    # bagging_fraction presumably has no effect without it - confirm before
    # tuning this value further.
    'bagging_fraction': 0.85,        # 0.8, 0.872    =subsample
    'learning_rate': 0.01,
    'max_depth': 14,                 # 10,13
    'num_leaves': 957,               # 1024,440    # 2^max_depth < num_leaves ?
    'min_gain_to_split': 0.096104,   # 0.086, 0.053
    'min_child_weight': 1.189104,    # 1.087, 1.497
    'lambda_l1': 1.8,                # 1.0
    'lambda_l2': 1.5,                # 1.0
}

# Train and make models
for fold, (train_ids, val_ids) in enumerate(folds.split(X,y)):
    print('● Fold :', fold+1,'/',NFOLDS)
    # split() yields positional indices, so y is indexed with .iloc like X
    train_set = lgb.Dataset(X.iloc[train_ids], y.iloc[train_ids],
                           categorical_feature=categorical_features)
    val_set = lgb.Dataset(X.iloc[val_ids], y.iloc[val_ids],
                         categorical_feature=categorical_features)
    model = lgb.train(params=params,
                      train_set=train_set,
                      valid_sets=[train_set, val_set],
                      num_boost_round=5000,
                      early_stopping_rounds=100,
                      verbose_eval=100
                     )
    lgb_models.append(model)
    
print('\nTime:', time() - start_time)

## Step 2 : Predict each Model

In [None]:
# Predict the training features X with every model; one entry per model,
# in the order CatBoost -> XGBoost -> LightGBM.
preds = []

# CatBoost models
for model in cat_models:
    pred = model.predict(X)
    preds.append(pred)
    
# XGBoost models
for model in xgb_models:
    pred = model.predict(xgb.DMatrix(X))
    pred = pred.flatten()
    preds.append(pred)
    
# LightGBM models
for model in lgb_models:
    pred = model.predict(X,num_iteration=model.best_iteration)
    pred = pred.reshape(len(X),1).flatten()
    preds.append(pred)

# rows = samples, columns = models
reg_df = pd.DataFrame(preds).T
# Column names are derived from the model lists (C1.., X1.., L1..) instead of
# being hard-coded for 5 folds, so changing NFOLDS does not break this cell.
reg_df.columns = (
    [f'C{n}' for n in range(1, len(cat_models) + 1)]      # CatBoost
    + [f'X{n}' for n in range(1, len(xgb_models) + 1)]    # XGBoost
    + [f'L{n}' for n in range(1, len(lgb_models) + 1)]    # LightGBM
)

In [None]:
# Calculate the average value of each model pred (row-wise mean over the model columns)
reg_df['mean'] = reg_df.mean(axis = 'columns')
reg_df.head(10)

## Step 3 : Optimize Rounding Coefficients
The rounding coefficients of each model are optimized separately using `scipy.optimize.minimize()`.

The final coefficients are then calculated as the average of the per-model optimal coefficients, weighted by the kappa score of each model. In Step 4 these final coefficients are applied to the average value of the predictions of all models.

In [None]:
class OptRounder(object):
    """Find rounding thresholds that turn regression outputs into classes 0-3.

    Three thresholds split the real line into four bins labelled 0, 1, 2, 3.
    fit() searches the thresholds that maximize the quadratic weighted
    Cohen's kappa between the binned predictions and the true labels.

    initial_coef : starting thresholds handed to the optimizer
                   (default [0.7, 1.5, 2.3], the values used so far)
    """

    def __init__(self, initial_coef=(0.7, 1.5, 2.3)):
        self.initial_coef_ = list(initial_coef)
        self.res_ = []      # result object of the last fit()
        self.coef_ = []     # optimized thresholds of the last fit()

    def get_res(self):
        """Return the optimizer result of the last fit() ([] before any fit)."""
        return self.res_

    # objective function
    def func(self, coef, X, y):
        """Return the negative quadratic weighted kappa for thresholds coef.

        Negated because the optimizer minimizes.
        """
        kappa = cohen_kappa_score(self.bincut(coef, X), y,
                                  weights='quadratic')
        return -kappa

    def bincut(self, coef, X):
        """Bin the values X into the labels 0-3 using the thresholds coef.

        coef is sorted first, so its order does not matter. Bins are
        right-inclusive (pd.cut default): a value equal to a threshold falls
        into the lower class.
        """
        return pd.cut(X,
                      [-np.inf] + list(np.sort(coef)) + [np.inf],
                      labels = [0, 1, 2, 3])

    def fit(self, X, y):
        """Optimize the thresholds on predictions X against labels y.

        The result is stored in res_ and the thresholds in coef_.
        """
        # Imported explicitly: `import scipy as sp` alone does not guarantee
        # that the `optimize` submodule is loaded on every SciPy version.
        from scipy import optimize

        pfunc = partial(self.func, X=X, y=y)
        self.res_ = optimize.minimize(fun = pfunc,              # objective func
                                      x0 = self.initial_coef_,  # initial coef
                                      method='nelder-mead')     # solver
        self.coef_ = self.res_.x

    def predict(self, X, coef=None):
        """Bin X with the thresholds coef (default: the fitted coef_).

        Raises ValueError when coef is omitted and fit() has not been run.
        """
        if coef is None:
            coef = self.coef_
            if len(coef) == 0:
                raise ValueError('coef was not given and fit() has not been called')
        return self.bincut(coef, X)

In [None]:
optR = OptRounder()

# Optimize each model's coef
# (the last column of reg_df is skipped: only the per-model columns are fitted)
coef = []
for col in tqdm(reg_df.columns[:-1]):
    optR.fit(reg_df[col].values.reshape(-1,), y)
    res = optR.get_res()
    # res.fun is the minimized negative kappa, so -res.fun is the kappa itself
    coef.append(np.append(res.x, -res.fun))  # Optimized coef & kappa

# one row per model: its three optimized thresholds and its kappa
coef_df = pd.DataFrame(coef,
                       columns = ['coef0','coef1','coef2','kappa'],
                       index = reg_df.columns[:-1])
coef_df

In [None]:
# Final thresholds: for each coef column, the average over all models
# weighted by the kappa of each model.
kappa_weights = coef_df['kappa'].values
coefficients = [
    np.average(coef_df[coef_name].values, weights=kappa_weights)
    for coef_name in coef_df.columns[:-1]   # every column except 'kappa'
]
print(coefficients)

## Step 4 : Final Classification

In [None]:
# final classification: bin the mean prediction with the weighted thresholds
reg_df['predict'] = optR.predict(reg_df['mean'].values,
                                 coefficients).astype(int)

reg_df['y'] = y
# NOTE(review): this kappa is computed on X, the data the models and the
# thresholds were fitted on, so it is presumably optimistic compared with
# unseen data.
kappa = cohen_kappa_score(reg_df['predict'], y, weights='quadratic')
print('●Cohen Kappa score (traind X):',kappa)
reg_df[['mean','predict','y']].head(10)

In [None]:
# Histograms of the mean prediction, the predicted class and the true label
reg_df[['mean','predict','y']].plot(subplots=True,layout=(1, 3),
                                    figsize=(11, 3),kind='hist')

In [None]:
# binning plot of 'pred' versus 'y'
reg_df.plot.hexbin(x='y', y='predict', gridsize=(3,3),
                   sharex=False, title = "binning 'pred' vs 'y'")

## Make submission

In [None]:
# Predict X_test with every model, same order as for the training data:
# CatBoost -> XGBoost -> LightGBM.
preds = []
for model in cat_models:        # CatBoost
    pred = model.predict(X_test)
    preds.append(pred)
for model in xgb_models:        # XGBoost
    pred = model.predict(xgb.DMatrix(X_test))
    pred = pred.flatten()
    preds.append(pred)
for model in lgb_models:        # LightGBM
    pred = model.predict(X_test,num_iteration=model.best_iteration)
    pred = pred.reshape(len(X_test),1).flatten()
    preds.append(pred)
# rows = test samples, columns = models
df_s = pd.DataFrame(preds).T

# row-wise mean over all model columns
df_s['mean'] = df_s.mean(axis = 'columns')

# Classification: bin the mean prediction with the weighted thresholds
df_s['pred'] = optR.predict(df_s['mean'].values, coefficients).astype(int)

print(df_s.shape)
df_s[['mean','pred']].head(10)

In [None]:
# Histograms of the mean prediction and the predicted class on the test data
df_s[['mean','pred']].plot(subplots=True, layout=(1, 2),
                           figsize=(7, 3), kind='hist')

In [None]:
# Fill the sample submission with the predicted classes.
# NOTE(review): the assignment aligns on the row index only, so it assumes
# the rows of sample_submission.csv are in the same installation_id order as
# the compiled test rows - confirm against the data.
submission = pd.read_csv(os.path.join(DIR,'sample_submission.csv'))
submission['accuracy_group'] = df_s['pred']
submission.head(10)

In [None]:
# Write the submission file to the working directory.
submission.to_csv('submission.csv', index=None)