In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import features

In [5]:
feats=pd.read_csv('installation_features.csv')
labels=pd.read_csv('installation_labels.csv')

In [6]:
feature_pipe=features.get_data_processing_pipe(feats,log_features=['game_time', 'event_count'], categorical_features=['last_world', 'last_assessment'])

In [59]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, cohen_kappa_score, make_scorer, accuracy_score, mean_squared_error
from sklearn.pipeline import Pipeline
import inspect
import xgboost as xgb
import soft_kappa_loss as kappa
from sklearn.utils import class_weight
from OrdinalRegressor import OrdinalRegressor
from OptimizeThresholds import OptimizedRounder

X_train, X_test, y_train, y_test = train_test_split(feats, labels.accuracy_group, test_size=0.05, random_state=42)

### setup the pipeline
ordinal_pipe = Pipeline(steps=[
    ('preprocess', feature_pipe),
    ('clf', OrdinalRegressor(xgb.XGBRegressor,
                              **{'colsample_bytree':0.5,
                                 'learning_rate':0.1,
                                 'max_depth':7,
                                 'subsample':1}))])


ordinal_pipe.fit(X_train, y_train)
y_pred = ordinal_pipe.predict(X_test, **{'classify': True})
print((y_pred==y_test).mean())
print(cohen_kappa_score(y1=y_test, y2=y_pred, weights='quadratic'))
confusion_matrix(y_test, y_pred)

0.5073446327683616
0.6124221150864135


array([[ 58, 113,  34,  26],
       [  5,  42,  42,  33],
       [  1,  22,  29,  59],
       [  0,  28,  73, 320]])

In [None]:
regressor_kappa_score = make_scorer(lambda y, y_pred: cohen_kappa_score(y, y_pred, weights='quadratic'), needs_proba=True)
regressor_accuracy_score = make_scorer(lambda y, y_pred: accuracy_score(y, y_pred), needs_proba=True)

ordinal_cv = GridSearchCV(ordinal_pipe, cv=10, 
                      scoring={'kappa': regressor_kappa_score,
                               'accuracy': regressor_accuracy_score,
                               'rmse': make_scorer(mean_squared_error)},
                      param_grid={'clf__max_depth': [5, 6, 7, 8, 9],
                                  'clf__learning_rate': [0.01, 0.03, 0.1],
                                  'clf__subsample': [0.8, 1],
                                  'clf__colsample_bytree': [0.3, 0.5, 0.8]},
                      refit=False)
ordinal_cv.fit(feats, labels.accuracy_group)
pd.DataFrame(ordinal_cv.cv_results_)

In [48]:
# print(ordinal_cv.best_params_)
pd.DataFrame(ordinal_cv.cv_results_).sort_values('mean_test_kappa', ascending=False).head()[['mean_test_kappa', 'mean_test_accuracy', 'mean_test_rmse']]

Unnamed: 0,mean_test_kappa,mean_test_accuracy,mean_test_rmse
51,0.545278,0.546128,1.063646
50,0.543564,0.547089,1.063029
80,0.542952,0.544432,1.064869
52,0.542376,0.542906,1.068208
23,0.541848,0.543358,1.072427


In [49]:
params = pd.DataFrame(ordinal_cv.cv_results_).sort_values('mean_test_kappa', ascending=False)['params'].values[0]
print(params)

{'clf__colsample_bytree': 0.5, 'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__subsample': 1}


In [50]:
ordinal_pipe = Pipeline(steps=[
    ('preprocess', feature_pipe),
    ('clf', OrdinalRegressor(xgb.XGBRegressor,
                              **{'clf__colsample_bytree': 0.5, 'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__subsample': 1}))])


ordinal_pipe.fit(X_train, y_train)
y_pred = ordinal_pipe.predict(X_test, **{'classify': True})
print((y_pred==y_test).mean())
print(cohen_kappa_score(y1=y_test, y2=y_pred, weights='quadratic'))
confusion_matrix(y_test, y_pred)

0.5864406779661017
0.6255592699807795


array([[147,  35,  23,  26],
       [ 28,  33,  25,  36],
       [ 15,  19,  17,  60],
       [ 21,  33,  45, 322]])