# 1. Importing Libraries and Data

Importing the relevant libraries and the cleaned dataset.

In [16]:
import pandas as pd
from datetime import date, timedelta
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

from IPython.display import display
from IPython.display import Image
from sklearn.externals.six import StringIO 
from subprocess import call

import warnings
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.optimizers import Adam
from keras.regularizers import l2

from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, classification_report
from sklearn.ensemble import StackingClassifier
from xgboost.sklearn import XGBClassifier
from sklearn import tree
from sklearn.tree import export_graphviz
from pickle import dump

from Classes import Classification
from Classes import Ensemble

# Notebook-wide settings.
# Suppress every warning raised from here on so cell output stays readable.
warnings.filterwarnings("ignore")
# Allow up to 500 columns to be rendered when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

ModuleNotFoundError: No module named 'deslib'

In [None]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data_cleaned6.csv')

The class distribution is checked to ensure that there is a balanced number of winners on both the fighter_x and fighter_y sides. This is already approximately a 50/50 split.

In [None]:
# Share of each target value (normalised counts) to check the class balance.
df['fx_win'].value_counts(normalize=True)

In [None]:
# Show the first rows as a quick sanity check of the loaded columns.
df.head()

# 2. Data Transformations

In this section the data is transformed using the RobustScaler class. The scaler is fit on the fighter_x training columns and then reused to transform the fighter_y columns, so both fighters are scaled identically.

In [None]:
# Target kept as a one-column DataFrame (not a Series).
y = df.loc[:, ['fx_win']]

In [None]:
# Features: everything except the target and the two fighter-name columns.
X = df.drop(['fx_win','name_x','name_y'], axis=1)

In [None]:
# Step 1: hold out exactly 400 rows as the test set, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=400,
    random_state=50,
    stratify=y,
)
# Step 2: split 20% of the remaining rows off as a validation set,
# again stratified and with the same seed.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=50,
    stratify=y_train,
)

In [None]:
# Columns to scale for each fighter: names containing '_x' / '_y',
# skipping any column whose name mentions 'stance'.
x_num_cols = list(filter(lambda name: 'stance' not in name and '_x' in name, X.columns))
y_num_cols = list(filter(lambda name: 'stance' not in name and '_y' in name, X.columns))

In [2]:
# One scaler instance, shared by the fit/transform cell and the pickling cell that follow.
scaler = RobustScaler()

NameError: name 'RobustScaler' is not defined

In [3]:
# Scale both fighters' numeric columns with statistics learned from the
# fighter_x columns of the training split only.
#
# The scaler is fit on frames whose columns carry the `_x` names. Recent
# scikit-learn releases refuse to transform a frame whose column names differ
# from those seen during fit, so the `_y` columns are relabelled to their
# positional `_x` counterparts before transforming. The scaled values are
# still written back into the `_y` columns.
# NOTE(review): this relies on x_num_cols and y_num_cols listing matching
# features in the same order — confirm against the dataset's column layout.
assert len(x_num_cols) == len(y_num_cols), "fighter_x / fighter_y column lists differ in length"
y_to_x = dict(zip(y_num_cols, x_num_cols))

# Fit once, on the training fighter_x columns.
X_train[x_num_cols] = scaler.fit_transform(X_train[x_num_cols])

# Apply the fitted scaler to the remaining fighter_x columns.
for split in (X_val, X_test):
    split[x_num_cols] = scaler.transform(split[x_num_cols])

# Apply the same fitted scaler to every fighter_y block.
for split in (X_train, X_val, X_test):
    split[y_num_cols] = scaler.transform(split[y_num_cols].rename(columns=y_to_x))

NameError: name 'scaler' is not defined

The scaler was then saved to be used on the data in the web application.

In [4]:
# Persist the fitted scaler. The context manager closes (and flushes) the file
# deterministically instead of leaving the handle open until garbage collection.
with open('scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

NameError: name 'dump' is not defined

# 3. Modelling

A stratified kfold with 5 splits was used for the cross validation of the models.

In [5]:
# 5-fold stratified splitter shared by every grid search below.
# `random_state` only has an effect when shuffling is enabled; without
# `shuffle=True` newer scikit-learn raises a ValueError and older versions
# silently ignore the seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=50)

NameError: name 'StratifiedKFold' is not defined

## 3.1. Logistic Regression

The first model tried was a logistic regression. This acts as a baseline against which the other models will be compared. The metric by which models will be assessed is the area under the ROC curve.

In [6]:
# Hand the train/validation splits to the project's `Classification` helper (local `Classes` module).
# NOTE(review): how the "Logistic Regression" label selects the estimator is not visible here — confirm in `Classes`.
log_reg1 = Classification("Logistic Regression", X_train, X_val, y_train, y_val)

NameError: name 'Classification' is not defined

A grid search is completed which returns the best performing hyperparameters for the model.

In [7]:
# Search space: regularisation strength over five orders of magnitude,
# both penalty types, and two solvers.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
)
log_reg1.get_scores(param_grid, skf)

NameError: name 'log_reg1' is not defined

## 3.2. Support Vector Machine

In [8]:
# Same project helper as the other models, labelled "SVM".
# NOTE(review): the estimator built for this label is defined in `Classes` — confirm kernel/probability settings there.
svm1 = Classification("SVM", X_train, X_val, y_train, y_val)

NameError: name 'Classification' is not defined

In [9]:
# Search space for the SVM.
# NOTE(review): in scikit-learn's SVC, `degree` only matters for kernel='poly'
# and `decision_function_shape` only for multiclass problems — confirm which
# kernel `Classification` uses, otherwise most of these combinations are duplicates.
param_grid = dict(
    C=[1, 2, 3, 4, 5, 6],
    degree=[1, 2, 3, 4, 5, 6],
    gamma=['scale', 'auto'],
    decision_function_shape=['ovo', 'ovr'],
)
svm1.get_scores(param_grid, skf)

NameError: name 'svm1' is not defined

## 3.3. Decision Tree

In [10]:
# First decision-tree search object, built with the project's `Classification` helper.
d_tree = Classification('Decision Tree', X_train, X_val, y_train, y_val)

NameError: name 'Classification' is not defined

In [11]:
# Coarse first pass: order-of-magnitude steps for both hyperparameters.
param_grid = dict(
    max_depth=[1, 10, 100, 1000],
    min_samples_leaf=[1, 10, 100, 1000],
)
d_tree.get_scores(param_grid, skf)

NameError: name 'd_tree' is not defined

In [12]:
# NOTE(review): `opt_plots` is defined in `Classes`; presumably it plots the grid-search results — confirm.
d_tree.opt_plots()

NameError: name 'd_tree' is not defined

In [None]:
# Standalone scikit-learn tree with fixed hyperparameters and a fixed seed.
# NOTE(review): 6 and 31 presumably come from one of the grid searches — confirm which run.
d_tree4 = DecisionTreeClassifier(max_depth=6, min_samples_leaf=31, random_state=50)

In [None]:
# Fit on the training split only. y_train is a one-column DataFrame
# (it derives from `y = df[['fx_win']]`), not a 1-D array.
d_tree4.fit(X_train, y_train)

In [None]:
# Keep the second probability column — presumably the fx_win == 1 class; confirm via d_tree4.classes_.
val_prob = d_tree4.predict_proba(X_val)[:,1]

In [None]:
# Area under the ROC curve on the validation split, the metric used to compare models in this notebook.
val_auc = roc_auc_score(y_val, val_prob)

In [None]:
# Display the validation AUC of the fixed-parameter tree.
val_auc

In [None]:
# Second decision-tree search object, kept separate from the first so that earlier results stay available.
d_tree2 = Classification('Decision Tree', X_train, X_val, y_train, y_val)

In [None]:
# Medium-resolution pass: 1..96 in steps of 5 for both hyperparameters.
param_grid = dict(
    max_depth=range(1, 101, 5),
    min_samples_leaf=range(1, 101, 5),
)
d_tree2.get_scores(param_grid, skf)

In [None]:
# NOTE(review): presumably plots the results of the medium-resolution search — confirm in `Classes`.
d_tree2.opt_plots()

In [None]:
# Third decision-tree search object; its scores_table is the one used in the model evaluation section.
d_tree3 = Classification('Decision Tree', X_train, X_val, y_train, y_val)

In [None]:
# Fine pass: unit steps over depth 1..15 and leaf size 15..45.
param_grid = dict(
    max_depth=range(1, 16, 1),
    min_samples_leaf=range(15, 46, 1),
)
d_tree3.get_scores(param_grid, skf)

In [None]:
# NOTE(review): presumably plots the results of the fine search — confirm in `Classes`.
d_tree3.opt_plots()

## 3.4. Random Forest

In [None]:
# First random-forest search object, built with the project's `Classification` helper.
rand_forr = Classification("Random Forest", X_train, X_val, y_train, y_val)

In [None]:
# Coarse first pass: order-of-magnitude steps for both hyperparameters.
param_grid = dict(
    max_depth=[1, 10, 100, 1000],
    min_samples_leaf=[1, 10, 100, 1000],
)
rand_forr.get_scores(param_grid, skf)

In [None]:
# NOTE(review): presumably plots the coarse random-forest search results — confirm in `Classes`.
rand_forr.opt_plots()

In [None]:
# Second random-forest search object, used for the unit-step grid in the next cell.
rand_forr2 = Classification("Random Forest", X_train, X_val, y_train, y_val)

In [None]:
# Fine pass: unit steps over depth 1..100 and leaf size 1..20
# (2,000 combinations per CV fold, so expect a long run).
param_grid = dict(
    max_depth=range(1, 101, 1),
    min_samples_leaf=range(1, 21, 1),
)
rand_forr2.get_scores(param_grid, skf)

In [None]:
# NOTE(review): presumably plots the fine random-forest search results — confirm in `Classes`.
rand_forr2.opt_plots()

## 3.5. Voting Classifier

In [None]:
# Build the (name, estimator) pairs for the voting ensemble.
# Distinct variable names are used so the `rand_forr` search object from the
# Random Forest section is not overwritten by a tuple.
# NOTE(review): the original referenced `rand_forr3`, which is never defined in
# this notebook; `rand_forr2` is the most refined random-forest search that
# exists. Swap back if a third search is restored.
svm_estimator = ('svm', svm1.best_model)
rand_forr_estimator = ('rand_forr', rand_forr2.best_model)
estimators = [svm_estimator, rand_forr_estimator]
voting = Ensemble("Voting", estimators, X_train, X_val, y_train, y_val)

In [None]:
# An empty parameter grid is passed, along with the shared CV splitter.
# NOTE(review): presumably this scores the ensemble as configured, with no tuning — confirm in `Classes`.
voting.ensemble_get_scores({}, skf)

## 3.6. Adaboost

In [None]:
# A single fitted model (not a list) is passed as the base estimator for boosting.
# NOTE(review): the original referenced `rand_forr3`, which is never defined in
# this notebook; `rand_forr2` is the most refined random-forest search that exists.
estimators = rand_forr2.best_model
adaboost = Ensemble("AdaBoost", estimators, X_train, X_val, y_train, y_val)

In [None]:
# Empty parameter grid plus the shared CV splitter.
# NOTE(review): presumably no hyperparameter tuning happens here — confirm in `Classes`.
adaboost.ensemble_get_scores({}, skf)

## 3.7. XGBoost

In [None]:
# No base estimators are supplied for this ensemble: an empty list is passed.
estimators = list()
xgboost = Ensemble("XGBoost", estimators, X_train, X_val, y_train, y_val)

In [None]:
# Empty parameter grid plus the shared CV splitter.
# NOTE(review): presumably the model's default hyperparameters are used — confirm in `Classes`.
xgboost.ensemble_get_scores({}, skf)

## 3.8. Stacking Classifier

In [None]:
# Base learners for the stack: the tuned SVM, the tuned random forest and the
# voting ensemble built from them.
# NOTE(review): the original referenced `rand_forr3`, which is never defined in
# this notebook; `rand_forr2` is the most refined random-forest search that exists.
estimators = [
    ('svm', svm1.best_model),
    ('rnd', rand_forr2.best_model),
    ('voting', voting.best_model),
]
stacking = Ensemble("Stacking", estimators, X_train, X_val, y_train, y_val)

In [None]:
# Empty parameter grid plus the shared CV splitter.
# NOTE(review): presumably the stack is scored as configured — confirm in `Classes`.
stacking.ensemble_get_scores({}, skf)

In [None]:
# Score the stacked model on the held-out test split.
# NOTE(review): if best_model is a scikit-learn classifier this is mean accuracy, not the AUC used elsewhere — confirm.
stacking.best_model.score(X_test, y_test)

## 3.9. Deep Neural Network

In [None]:
# L2 weight penalty shared by the hidden layers.
# Built with the `l2` imported from `keras.regularizers`, the same package that
# provides `Sequential`/`Dense`. The notebook-level name `keras` is bound to
# `tensorflow.keras`, and mixing objects from the two packages is not
# guaranteed to work on every Keras/TensorFlow combination.
l2_reg = l2(0.0001)

In [None]:
# Feed-forward binary classifier: two 64-unit ReLU layers, each followed by
# 10% dropout, then a single sigmoid output unit.
model2 = Sequential([
    Dense(
        64,
        activation='relu',
        input_dim=X_train.shape[1],  # one input per feature column
        kernel_initializer='normal',
        kernel_regularizer=l2_reg,
    ),
    Dropout(rate=0.1),
    Dense(
        64,
        activation='relu',
        kernel_initializer='normal',
        kernel_regularizer=l2_reg,
    ),
    Dropout(rate=0.1),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model2.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output; accuracy is the metric tracked during training.
model2.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

In [None]:
# Train for 40 epochs in mini-batches of 16, evaluating on the validation split after every epoch.
history2 = model2.fit(X_train, y_train, epochs=40, batch_size=16, validation_data=(X_val, y_val))

In [None]:
# Learning curves: one line per quantity recorded in the training history,
# on a shared 0-1 y-axis.
history_ax = pd.DataFrame(history2.history).plot(figsize=(15,10))
history_ax.grid(True)
history_ax.set_ylim(0,1)
plt.show()

In [None]:
# Sigmoid outputs for the held-out test rows: probabilities, not class labels.
y_pred = model2.predict(X_test)

In [None]:
# Loss and accuracy (the compiled metric) on the validation split.
model2.evaluate(X_val, y_val)

# 4. Model Evaluation

In [None]:
# Score tables of every tuned model, in the same order as `model_names` below.
# NOTE(review): the original referenced `rand_forr3`, which is never defined in
# this notebook; `rand_forr2` is the most refined random-forest search that exists.
model_evaluation_list = [log_reg1.scores_table,
                         svm1.scores_table,
                         d_tree3.scores_table,
                         rand_forr2.scores_table,
                         voting.scores_table,
                         adaboost.scores_table,
                         xgboost.scores_table,
                         stacking.scores_table]

# Labels, positionally aligned with `model_evaluation_list`.
# The stacking label lists the base learners actually passed to the stack
# (svm, random forest, voting); the original label said "xgboost".
model_names = ['logistic_regression',
               'svm',
               'decision_tree',
               'random_forest',
               'voting',
               'adaboost (random forest)',
               'xgboost',
               'stacking (svm, random forest, voting)']

In [None]:
# Stack the per-model score tables row-wise (pd.concat's default axis).
# NOTE(review): `model_names` is not used here — confirm whether it was meant to label the rows.
df_evaluation = pd.concat(model_evaluation_list)

In [None]:
# Persist the stacked model. The context manager closes (and flushes) the file
# deterministically instead of leaving the handle open until garbage collection.
with open('model.pkl', 'wb') as model_file:
    dump(stacking.best_model, model_file)