Load necessary libraries.

In [27]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from statistics import mean
import matplotlib.pyplot as plt

from sklearn.metrics import precision_recall_curve
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix

from imblearn.pipeline import Pipeline as imbpipeline

# I. Model Testing on All Data

Create explanatory and response variables.

In [28]:
df = pd.read_csv('train.csv')
X = df.drop(['outcome'], axis=1)
y = pd.get_dummies(df['outcome'], drop_first=True)
y = np.ravel(y).reshape((-1,))

In [29]:
y

array([1, 0, 0, ..., 1, 1, 1], dtype=uint8)

Identify numeric and categorical columns.

In [30]:
numeric_columns = X.select_dtypes(include=['number']).columns

categorical_columns = list(set(X.columns) - set(numeric_columns))

Preprocess data.

In [31]:
numeric_pipe = Pipeline([('imputer', SimpleImputer(strategy='median'))])
categorical_pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing')), ('ohe', OneHotEncoder())])
t = [('cat', categorical_pipe, categorical_columns), ('num', numeric_pipe, numeric_columns)]
col_transform = ColumnTransformer(transformers=t)
X_nonreg = col_transform.fit_transform(X) # only used for non regularized models. IE. basic logistic regression

Need a different pipeline that scales features for ridge, lasso, and elastic net.

In [32]:
numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', MinMaxScaler())])
categorical_transformer = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value='missing')), ('ohe', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_columns),
        ("cat", categorical_transformer, categorical_columns),
    ]
)

Calculating baseline.

In [33]:
# 1 == red wins
# assuming favourite always wins
ones = np.ones((len(y),), dtype=int)
print(-log_loss(y, ones))
# assuming favourite wins with probability proportional to historical favourite win rate
percent_favourite = sum(y)/len(y)
print(percent_favourite)
fav_win_rates = [percent_favourite] * len(y)
print(-log_loss(y, fav_win_rates))

-12.248972182133173
0.6453641415618602
-0.6502692790201589


In [8]:
df

Unnamed: 0,outcome,weightclass,time_format,title,r_kd,r_sub.att,r_rev.,r_ctrl_s,r_sig_str_percent,r_sig_str_att,...,b_percent_loss_by_doctor_stoppage,date,r_height,r_reach,r_stance,r_dob,b_height,b_reach,b_stance,b_dob
0,R,Middleweight,5 Rnd (5-5-5-5-5),False,0.222222,0.222222,0.000000,100.444444,0.410538,202.444444,...,0.0,16881696.0,73.0,76.0,Orthodox,6676128.0,74.0,78.0,Orthodox,6522336.0
1,B,Lightweight,3 Rnd (5-5-5),False,0.166667,0.000000,0.000000,195.666667,0.426600,132.833333,...,0.0,16881696.0,70.0,74.0,Orthodox,6655392.0,70.0,72.0,Switch,7617024.0
2,B,Welterweight,3 Rnd (5-5-5),False,0.571429,0.285714,0.214286,132.500000,0.502677,106.714286,...,0.0,16881696.0,71.0,76.0,Orthodox,5020704.0,72.0,79.0,Orthodox,9301824.0
3,R,Women's Flyweight Bout,3 Rnd (5-5-5),False,0.333333,0.222222,0.111111,123.111111,0.390576,106.111111,...,0.0,16881696.0,66.0,67.0,Orthodox,7595424.0,65.0,69.0,Orthodox,8309952.0
4,B,Lightweight,3 Rnd (5-5-5),False,1.000000,0.000000,0.000000,47.000000,0.693878,49.000000,...,0.0,16881696.0,68.0,71.0,Orthodox,8201088.0,71.0,73.0,Southpaw,8192448.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6833,R,Light Heavyweight,5 Rnd (5-5-5-5-5),True,,,,,,,...,0.0,9828864.0,75.0,74.0,Orthodox,1596672.0,72.0,74.0,Orthodox,350784.0
6834,R,Lightweight,5 Rnd (5-5-5-5-5),True,,,,,,,...,0.0,9828864.0,67.0,70.0,Southpaw,1555200.0,67.0,70.0,Southpaw,1687392.0
6835,R,Welterweight,3 Rnd (5-5-5),False,,,,,,,...,0.0,9828864.0,68.0,,Orthodox,179712.0,62.0,,Southpaw,-376704.0
6836,R,Middleweight,3 Rnd (5-5-5),False,,,,,,,...,0.0,9828864.0,75.0,77.0,Orthodox,352512.0,73.0,74.0,Orthodox,1781568.0


In [9]:
y

array([1, 0, 0, ..., 1, 1, 1], dtype=uint8)

## Logistic Regression
Get cv cross entropy estimate.

In [10]:
clf = LogisticRegression(penalty='none')
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)
cv_scores_logistic = cross_val_score(estimator=clf, X=X_nonreg, y=y, scoring='neg_log_loss', cv=rkf, n_jobs=-1)
print(mean(cv_scores_logistic))

-0.6307739491118018


## Ridge
Included along with elastic net because uses different solver.

In [11]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(penalty='l2'))]
)

param_grid = dict(classifier__C=[0.001, 0.01, 0.1, 1, 10])

rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)
grid = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='neg_log_loss', cv=rkf, n_jobs=-1)
grid.fit(X, y)
print(grid.best_score_)
print(grid.best_params_)

-0.6021754639475657
{'classifier__C': 1}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
ridge_grid_table = pd.concat([pd.DataFrame(grid.cv_results_['params']),
                         pd.DataFrame(grid.cv_results_['mean_test_score'],
                                      columns=['Negative Log Loss'])],axis=1)

ridge_grid_table.sort_values('Negative Log Loss', ascending=False)

Unnamed: 0,classifier__C,Negative Log Loss
3,1.0,-0.602175
4,10.0,-0.603675
2,0.1,-0.604013
1,0.01,-0.618997
0,0.001,-0.641229


## Lasso
Included along with elastic net because uses different solver.

In [13]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(penalty='l1', solver='liblinear'))]
)

param_grid = dict(classifier__C=[0.001, 0.01, 0.1, 1, 10])

rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)
grid = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='neg_log_loss', cv=rkf, n_jobs=-1)
grid.fit(X, y)
print(grid.best_score_)
print(grid.best_params_)

-0.6011158868881584
{'classifier__C': 1}


In [14]:
lasso_grid_table = pd.concat([pd.DataFrame(grid.cv_results_['params']),
                         pd.DataFrame(grid.cv_results_['mean_test_score'],
                                      columns=['Negative Log Loss'])],axis=1)

lasso_grid_table.sort_values('Negative Log Loss', ascending=False)

Unnamed: 0,classifier__C,Negative Log Loss
3,1.0,-0.601116
4,10.0,-0.604122
2,0.1,-0.606081
1,0.01,-0.65116
0,0.001,-0.693147


## Elastic Net
Tune model parameters and obtain cv accuracy estimates.

In [15]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(penalty='elasticnet', solver='saga', random_state=1))]
)

param_grid = dict(classifier__C=[0.001, 0.01, 0.1, 1, 10], 
                  classifier__l1_ratio=np.arange(0, 1.125, 0.125))
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)
grid = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='neg_log_loss', cv=rkf, n_jobs=-1)
grid.fit(X, y)
print(grid.best_score_)
print(grid.best_params_)

-0.6011213845413709
{'classifier__C': 1, 'classifier__l1_ratio': 1.0}


In [16]:
en_grid_table = pd.concat([pd.DataFrame(grid.cv_results_['params']),
                         pd.DataFrame(grid.cv_results_['mean_test_score'],
                                      columns=['Negative Log Loss'])],axis=1)

en_grid_table.sort_values('Negative Log Loss', ascending=False)

Unnamed: 0,classifier__C,classifier__l1_ratio,Negative Log Loss
35,1.0,1.0,-0.601121
34,1.0,0.875,-0.601182
33,1.0,0.75,-0.601259
32,1.0,0.625,-0.601359
31,1.0,0.5,-0.601485
30,1.0,0.375,-0.601641
29,1.0,0.25,-0.601802
28,1.0,0.125,-0.60197
27,1.0,0.0,-0.602148
19,0.1,0.125,-0.60392


# II. Model Testing on Data With No Missing Values

In [17]:
df = pd.read_csv('train_no_na.csv')
X = df.drop(['outcome'], axis=1)
y = pd.get_dummies(df['outcome'], drop_first=True)
y = np.ravel(y).reshape((-1,))

X_nonreg = col_transform.fit_transform(X) # only used for non regularized models. IE. basic logistic regression

Calculating Baseline

In [26]:
# 1 == red wins
# assuming favourite always wins
ones = np.ones((len(y),), dtype=int)
print(-log_loss(y, ones))
# assuming favourite wins with probability proportional to historical favourite win rate
percent_favourite = sum(y)/len(y)
print(percent_favourite)
fav_win_rates = [percent_favourite] * len(y)
print(-log_loss(y, fav_win_rates))

-12.928562768179523
0.6256884343036979
-0.6612105569516342


## Logistic Regression
Get cv accuracy estimate.

In [18]:
clf = LogisticRegression(penalty='none')
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)
cv_scores_logistic = cross_val_score(estimator=clf, X=X_nonreg, y=y, scoring='neg_log_loss', cv=rkf, n_jobs=-1)
print(mean(cv_scores_logistic))

-0.6376620313965908


## Ridge
Included along with elastic net because uses different solver.

In [19]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(penalty='l2'))]
)

param_grid = dict(classifier__C=[0.001, 0.01, 0.1, 1, 10])

rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)
grid = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='neg_log_loss', cv=rkf, n_jobs=-1)
grid.fit(X, y)
print(grid.best_score_)
print(grid.best_params_)

-0.6178083293920043
{'classifier__C': 1}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
ridge_grid_table = pd.concat([pd.DataFrame(grid.cv_results_['params']),
                         pd.DataFrame(grid.cv_results_['mean_test_score'],
                                      columns=['Negative Log Loss'])],axis=1)

ridge_grid_table.sort_values('Negative Log Loss', ascending=False)

Unnamed: 0,classifier__C,Negative Log Loss
3,1.0,-0.617808
2,0.1,-0.619947
4,10.0,-0.620268
1,0.01,-0.635706
0,0.001,-0.654772


## Lasso
Included along with elastic net because uses different solver.

In [21]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(penalty='l1', solver='liblinear'))]
)

param_grid = dict(classifier__C=[0.001, 0.01, 0.1, 1, 10])

rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)
grid = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='neg_log_loss', cv=rkf, n_jobs=-1)
grid.fit(X, y)
print(grid.best_score_)
print(grid.best_params_)

-0.6162806233221458
{'classifier__C': 1}


In [22]:
lasso_grid_table = pd.concat([pd.DataFrame(grid.cv_results_['params']),
                         pd.DataFrame(grid.cv_results_['mean_test_score'],
                                      columns=['Negative Log Loss'])],axis=1)

lasso_grid_table.sort_values('Negative Log Loss', ascending=False)

Unnamed: 0,classifier__C,Negative Log Loss
3,1.0,-0.616281
4,10.0,-0.62057
2,0.1,-0.621638
1,0.01,-0.662608
0,0.001,-0.693147


## Elastic Net
Tune model parameters and obtain cv accuracy estimates.

In [23]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(penalty='elasticnet', solver='saga', random_state=1))]
)

param_grid = dict(classifier__C=[0.001, 0.01, 0.1, 1, 10], 
                  classifier__l1_ratio=np.arange(0, 1.125, 0.125))
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)
grid = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='neg_log_loss', cv=rkf, n_jobs=-1)
grid.fit(X, y)
print(grid.best_score_)
print(grid.best_params_)

-0.6162116307486469
{'classifier__C': 1, 'classifier__l1_ratio': 1.0}


In [24]:
en_grid_table = pd.concat([pd.DataFrame(grid.cv_results_['params']),
                         pd.DataFrame(grid.cv_results_['mean_test_score'],
                                      columns=['Negative Log Loss'])],axis=1)

en_grid_table.sort_values('Negative Log Loss', ascending=False)

Unnamed: 0,classifier__C,classifier__l1_ratio,Negative Log Loss
35,1.0,1.0,-0.616212
34,1.0,0.875,-0.616329
33,1.0,0.75,-0.616471
32,1.0,0.625,-0.61664
31,1.0,0.5,-0.616841
30,1.0,0.375,-0.617062
29,1.0,0.25,-0.617298
28,1.0,0.125,-0.617531
27,1.0,0.0,-0.61776
19,0.1,0.125,-0.619725
