In [2]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import features
import warnings
warnings.filterwarnings('ignore')

In [3]:
feats=pd.read_csv('installation_features.csv')
labels=pd.read_csv('installation_labels.csv')

In [4]:
feature_pipe=features.get_data_processing_pipe(feats,log_features=['game_time', 'event_count'], categorical_features=['last_world', 'last_assessment'])

In [5]:
from sklearn.model_selection import GridSearchCV, train_test_split
X_train, X_test, y_train, y_test = train_test_split(feats, labels.accuracy_group, test_size=0.05, random_state=42)

In [6]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC 
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, cohen_kappa_score, make_scorer, accuracy_score, mean_squared_error
from sklearn.pipeline import Pipeline
import inspect
import xgboost as xgb
# import soft_kappa_loss as kappa
from sklearn.utils import class_weight

In [35]:
# Bagging Ensemble with logistic regression
kfold = model_selection.KFold(n_splits=5, random_state=7)
cart = LogisticRegression() # Use Logistic Regression as base estimator
num_trees = 10

lg_pipe = Pipeline(steps=[
    ('preprocess', feature_pipe),
    ('clf', BaggingClassifier(base_estimator=cart, 
                              n_estimators=num_trees, 
                              random_state=7))])

lg_pipe.fit(X_train, y_train)
y_pred = lg_pipe.predict(X_test)

print((y_pred==y_test).mean())
print("ck_score:", cohen_kappa_score(y1=y_test, y2=y_pred, weights='quadratic'))
confusion_matrix(y_test, y_pred)

0.6248587570621469
ck_score: 0.5432883500643164


array([[151,   6,   1,  73],
       [ 39,  12,   0,  71],
       [ 23,   4,   0,  84],
       [ 25,   6,   0, 390]], dtype=int64)

In [7]:
# Bagging Ensemble with Decision Tree
kfold = model_selection.KFold(n_splits=5, random_state=7)
cart = DecisionTreeClassifier() # Use Decision Tree as base estimator
num_trees = 100

dt_pipe = Pipeline(steps=[
    ('preprocess', feature_pipe),
    ('clf', BaggingClassifier(base_estimator=cart, 
                              n_estimators=num_trees, 
                              random_state=7))])

dt_pipe.fit(X_train, y_train)
y_pred = dt_pipe.predict(X_test)

print((y_pred==y_test).mean())
print("ck_score:", cohen_kappa_score(y1=y_test, y2=y_pred, weights='quadratic'))
confusion_matrix(y_test, y_pred)

0.6293785310734463
ck_score: 0.5575126184614507


array([[149,  12,   5,  65],
       [ 20,  27,   9,  66],
       [ 17,  13,   7,  74],
       [ 19,  17,  11, 374]], dtype=int64)

In [36]:
# Test AdaBoost Ensemble
from sklearn.ensemble import AdaBoostClassifier
seed = 7
num_trees = 70
kfold = model_selection.KFold(n_splits=10, random_state=seed)
adb_pipe = Pipeline(steps=[
    ('preprocess', feature_pipe),
    ('clf', AdaBoostClassifier(n_estimators=num_trees, random_state=seed))])

adb_pipe.fit(X_train, y_train)
y_pred = adb_pipe.predict(X_test)

print((y_pred==y_test).mean())
print("ck_score:", cohen_kappa_score(y1=y_test, y2=y_pred, weights='quadratic'))
confusion_matrix(y_test, y_pred)

0.631638418079096
ck_score: 0.5489783338603855


array([[145,   9,   0,  77],
       [ 27,  20,   0,  75],
       [ 14,   3,   0,  94],
       [ 15,  12,   0, 394]], dtype=int64)

In [39]:
# Voting-Base Ensemble
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

kfold = model_selection.KFold(n_splits=10, random_state=seed)
# create the sub models
estimators = []
model1 = Pipeline(steps=[
    ('preprocess', feature_pipe),
    ('clf', LogisticRegression())])
estimators.append(('logistic', model1))
model2 = Pipeline(steps=[
    ('preprocess', feature_pipe),
    ('clf', DecisionTreeClassifier())])
estimators.append(('cart', model2))
model3 = Pipeline(steps=[
    ('preprocess', feature_pipe),
    ('clf', SVC())])
estimators.append(('svm', model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)


ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

print((y_pred==y_test).mean())
print("ck_score:", cohen_kappa_score(y1=y_test, y2=y_pred, weights='quadratic'))
confusion_matrix(y_test, y_pred)

0.6237288135593221
ck_score: 0.5449248207043201


array([[148,   8,   0,  75],
       [ 40,  18,   0,  64],
       [ 23,   4,   0,  84],
       [ 21,  14,   0, 386]], dtype=int64)

In [41]:
regressor_kappa_score = make_scorer(lambda y, y_pred: cohen_kappa_score(y, y_pred, weights='quadratic'))
regressor_accuracy_score = make_scorer(lambda y, y_pred: accuracy_score(y, y_pred))

dt_cv = GridSearchCV(ordinal_pipe, cv=10, 
                      scoring={'kappa': regressor_kappa_score,
                               'accuracy': regressor_accuracy_score)},
                      param_grid={'clf__n_estimators': [5, 10, 20, 100]},
                      refit=False)
dt_cv.fit(feats, labels.accuracy_group)
pd.DataFrame(dt_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__n_estimators,params,split0_test_kappa,split1_test_kappa,split2_test_kappa,split3_test_kappa,...,split2_train_rmse,split3_train_rmse,split4_train_rmse,split5_train_rmse,split6_train_rmse,split7_train_rmse,split8_train_rmse,split9_train_rmse,mean_train_rmse,std_train_rmse
0,2.645448,0.694773,0.047403,0.00309,5,{'clf__n_estimators': 5},0.427293,0.385597,0.445234,0.403831,...,0.242337,0.254334,0.250942,0.25179,0.232823,0.256438,0.250911,0.233185,0.247573,0.009156
1,4.682973,0.224965,0.070351,0.009957,10,{'clf__n_estimators': 10},0.466582,0.405084,0.453943,0.444151,...,0.113819,0.114322,0.113254,0.108906,0.103128,0.111732,0.110727,0.11116,0.111507,0.004013
2,8.934751,0.295163,0.094249,0.004953,20,{'clf__n_estimators': 20},0.43396,0.451722,0.481136,0.455977,...,0.06005,0.059359,0.062123,0.061738,0.063057,0.067328,0.060796,0.065189,0.062176,0.002471
3,42.440261,3.17269,0.332231,0.0695,100,{'clf__n_estimators': 100},0.44593,0.46305,0.492764,0.48921,...,0.046294,0.045791,0.046545,0.042834,0.039379,0.04566,0.042771,0.04415,0.043911,0.002153


In [42]:
# print(ordinal_cv.best_params_)
pd.DataFrame(ftl_cv.cv_results_).sort_values('mean_test_kappa', ascending=False).head()[['mean_test_kappa', 'mean_test_accuracy', 'mean_test_rmse']]

Unnamed: 0,mean_test_kappa,mean_test_accuracy,mean_test_rmse
3,0.478088,0.583833,1.724251
2,0.458683,0.56173,1.78909
1,0.437443,0.537931,1.861504
0,0.405642,0.499774,1.952007
