In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from joblib import dump
# Base directory for data paths: the working directory the notebook is run from.
DIR_NAME = os.getcwd()
# Level-0 index label of the rows that are held out as the test set.
TEST_YR = '18-19'

In [2]:
# Load the feature table; the first two CSV columns form a two-level index.
feats_path = DIR_NAME + '/Data/' + 'Feats.csv'
full_df = pd.read_csv(feats_path, index_col=[0, 1])

Since the overall home win rate is almost constant throughout, the effect of the advantage for the home team is captured by the intercept term, so we drop it from the model.

In [3]:
# Rows whose level-0 index label differs from TEST_YR are used for training;
# the remaining rows are held out for testing.
level0_labels = full_df.index.get_level_values(0)
mask = level0_labels != TEST_YR

# Features are taken by position (columns 1 to 4); the target is the 'FTR' column.
X = full_df.iloc[:, 1:5]
y = full_df['FTR']

X_tr, y_tr = X[mask], y[mask]
X_test, y_test = X[~mask], y[~mask]

In [4]:
def model_selection(mods, param_grids, X=None, y=None):
    '''
    Trains models and selects the one with the smallest cross-validated log loss.
    Hyperparameter tuning and model selection are done using cross-validation.
    Parameters:
    mods (list): list of sklearn estimators
    param_grids (list): list of dictionaries mapping hyperparameters to values
    to be tested, one per estimator in mods (paired by position)
    X (array-like, optional): training features; defaults to the notebook-level
    X_tr when omitted
    y (array-like, optional): training targets; defaults to the notebook-level
    y_tr when omitted
    Returns:
    best_mod (GridSearchCV): the fitted grid search whose best cross-validated
    neg_log_loss is highest (i.e. whose log loss is lowest), refit on the
    training data with its best hyperparameters; None if mods is empty

    '''
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if X is None:
        X = X_tr
    if y is None:
        y = y_tr

    # The 'neg_log_loss' scorer is greater-is-better (values are <= 0), so the
    # best candidate is the one with the HIGHEST best_score_. Start from -inf.
    best_score = -np.inf
    best_mod = None
    for mod, param_grid in zip(mods, param_grids):
        clf = GridSearchCV(mod, param_grid, scoring=['neg_log_loss','accuracy'],
                           refit='neg_log_loss')
        clf.fit(X, y)
        # With multi-metric scoring, best_score_ refers to the refit metric.
        if clf.best_score_ > best_score:
            best_score = clf.best_score_
            best_mod = clf

    return best_mod
        
        

In [5]:
# Single candidate: standardize the features, then fit a logistic regression.
logreg_pipeline = Pipeline([
    ('standardization', StandardScaler()),
    ('clf', LogisticRegression()),
])
# 100 geometrically spaced values of the classifier's C between 0.1 and 10.
logreg_grid = {'clf__C': np.geomspace(1e-1, 1e1, 100)}

mods = [logreg_pipeline]
param_grids = [logreg_grid]
best_mod = model_selection(mods, param_grids)


In [6]:
# Score on the held-out rows. The search was built with refit='neg_log_loss',
# so this is the negative log loss (closer to 0 is better), not accuracy.
best_mod.score(X_test, y_test)

-0.9466265539398268

In [7]:
# Accuracy on the held-out rows: share of predictions equal to the true label.
test_preds = best_mod.predict(X_test)
(test_preds == y_test).sum() / y_test.shape[0]

0.579136690647482

In [8]:
# Coefficients of the 'clf' (LogisticRegression) step of the refit pipeline.
# The scaler runs before the classifier, so these are on the standardized
# feature scale.
best_mod.best_estimator_['clf'].coef_

array([[-0.23739684,  0.21996692, -0.06305141,  0.17921413],
       [-0.05458513, -0.03002893, -0.01334554,  0.03636626],
       [ 0.29198197, -0.18993799,  0.07639694, -0.2155804 ]])

In [9]:
# Persist the fitted search object (the whole GridSearchCV, not only its
# best_estimator_). Create the output directory first so that a missing
# 'Mods' folder does not make the notebook fail after training has finished.
os.makedirs('Mods', exist_ok=True)
dump(best_mod, 'Mods/mod.joblib')

['Mods/mod.joblib']