### Grid search for hyperparameters in the logistic regression model
1. Load and process the data according to the modelling pipeline
2. Define functions for betting evaluation, cross-validation & grid search
3. Perform the grid search
4. Investigate the results to determine the optimal hyperparameters


In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

### LOAD DATA

## Folder inside the dropbox that holds the data, and the file to read from it
db_path = 'C:/Users/dosef/Dropbox/ExamProject/data/'
data_set = 'data_train.csv'

## Reading 'data_train.csv'; the first csv column is used as the row index
raw_data = pd.read_csv(db_path + data_set, index_col = 0)

## Columns that are carried forward into the modelling steps
selected_columns = [
    'season', 'league', 'team_home_s', 'team_away_s',
    'team_home', 'team_awat',
    'draw', 'odds_prob_draw', 'odds_draw',
    'home_agg_overall', 'home_def_overall', 'away_agg_overall', 'away_def_overall',
    'RR', 'ELO_home', 'ELO_away',
]

## Working copy with the rows renumbered 0..n-1 (the index read from the csv is discarded)
data = raw_data.loc[:, selected_columns].copy()
data = data.reset_index(drop = True)
data.shape

(18185, 16)

In [10]:
## Columns that are targets / betting information rather than explanatory variables
target_columns = ['draw', 'odds_prob_draw', 'odds_draw']

## matrix containing explanatory variables (everything except the columns above)
x = data.drop(columns = target_columns).copy()

## Binary target array
y = data['draw'].copy()

## target array containing odds
y_odds = data['odds_draw'].copy()

## target array containing implied probabilities
y_probs = data['odds_prob_draw']


In [11]:
### IMPUTATION
## Missing values in the four *_overall columns are replaced by the mean of the same
## column within the same team (home columns grouped by 'team_home', away columns by 'team_awat').
## NOTE(review): 'team_awat' is the column name used throughout this notebook - presumably a
## typo carried over from the csv; confirm against the data before renaming it.
team_grouped_columns = {
    'team_home': ['home_agg_overall', 'home_def_overall'],
    'team_awat': ['away_agg_overall', 'away_def_overall'],
}
for team_col, value_cols in team_grouped_columns.items():
    for col in value_cols:
        team_mean = x.groupby(team_col)[col].transform('mean')
        x.loc[x[col].isnull(), col] = team_mean

## One-hot encoding of the league
x = pd.get_dummies(x, columns = ['league'])

## The team and season columns are not used as features
x = x.drop(columns = ['team_home', 'team_awat', 'season', 'team_home_s', 'team_away_s'])

## Anything that is still missing is replaced by the overall column mean
x = x.fillna(x.mean())


In [12]:
### BETTING PAYOFF FUNCTION
def betting_payoff(target, odds, impl_probs, pred_probs):
    """Average net return per bet when betting one unit wherever the model sees value.

    A bet is placed on every observation whose predicted probability is strictly
    larger than the implied probability. A winning bet pays out its odds, a
    losing bet pays out nothing.

    All four inputs are matched by position (not by index label), so they must
    have the same length and the same ordering.

    Parameters
    ----------
    target : array-like of bool or 0/1
        Outcome indicator; truthy where the bet would have won.
    odds : array-like of float
        Odds paid out on a winning bet.
    impl_probs : array-like of float
        Implied probabilities the predictions are compared against.
    pred_probs : array-like of float
        Predicted probabilities of the outcome.

    Returns
    -------
    float
        (sum of odds over winning bets) / (number of bets) - 1,
        or NaN when no bet is placed at all.
    """
    # Cast to a real boolean mask: with a 0/1 integer target the expression
    # `odds[draw & bet]` would be treated as a label lookup instead of a filter.
    draw = np.asarray(target).astype(bool)
    odds_values = np.asarray(odds, dtype=float)
    bet = np.asarray(pred_probs, dtype=float) > np.asarray(impl_probs, dtype=float)

    n_bets = bet.sum()
    if n_bets == 0:
        # The return per bet is undefined without bets; NaN is what 0/0 produced before.
        return float('nan')

    payoff = odds_values[draw & bet]
    return payoff.sum()/n_bets - 1

In [13]:
## STANDARDIZING
from sklearn.preprocessing import StandardScaler

## Only the first seven columns of x are standardized; all later columns are left as they are.
## The fitted scaler is kept in `scaler`.
scaler = StandardScaler()
x.iloc[:,:7] = scaler.fit_transform(x.iloc[:,:7])

In [6]:
## GRID SEARCH FOR THE LOGIT MODEL WITHOUT POLYNOMIAL FEATURES
## For every (penalty, C) combination the betting payoff is averaged over a
## 10-fold cross-validation and stored in res_sin_poly.
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

## Making a copy of x
x_sin_poly = x.copy()

## penalty function
penalty_func = ['l1','l2']

## Inverse of regularization strength: 12 values, log-spaced from e^-2 to e^4
inv_reg = np.logspace(-2,4, num = 12, base = np.exp(1))

## Dict used to store the results (key: description of the combination, value: mean payoff)
res_sin_poly = dict()

## Object used to obtain train and test splits. It is created once: with a fixed
## integer seed it yields the same folds for every hyperparameter combination.
kf = KFold(n_splits = 10, random_state = 1003, shuffle = True)
nsplits = float(kf.n_splits)

for p in penalty_func:
    for c in inv_reg:

        ## Initializing the mean_payoff
        mean_payoff = 0

        ## initializing the model; the solver is named explicitly because it has to
        ## support both the l1 and the l2 penalty (liblinear does)
        model = LogisticRegression(penalty = p, C = c, solver = 'liblinear')

        for train_index, test_index in kf.split(x_sin_poly):

            ## Creating training and test data sets. KFold returns row positions,
            ## so positional indexing (.iloc) is used throughout.
            x_train, x_test = x_sin_poly.iloc[train_index], x_sin_poly.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            ## fitting the model; column 1 of predict_proba is the second class in model.classes_
            model.fit(x_train, y_train)
            pred_prob = model.predict_proba(x_test)[:,1]

            ## updating the average payoff
            fold_payoff = betting_payoff(y_test, y_odds.iloc[test_index], y_probs.iloc[test_index], pred_prob)
            mean_payoff += fold_payoff/nsplits

        key = 'penalty: ' + p + ' C: ' + str(c)
        res_sin_poly[key] = mean_payoff

In [7]:
import operator

## Entry of res_sin_poly with the largest cross-validated payoff
best_sin_poly_combination, best_sin_poly_payoff = max(res_sin_poly.items(), key = operator.itemgetter(1))

print('Combination that yields highest cross-validated payoff: ', best_sin_poly_combination)
print('Highest cross-validated payoff: ', round(best_sin_poly_payoff,3))

Combination that yields highest cross-validated payoff:  penalty: l1 C: 1.1993961020353856
Highest cross-validated payoff:  -0.079


In [15]:
## Retraining the model on the entire data set

## Hyperparameters are the best combination reported by the grid search without
## polynomial features. The solver is named explicitly because the l1 penalty
## needs a solver that supports it (liblinear does).
model = LogisticRegression(penalty = 'l1', C = 1.1993961020353856, solver = 'liblinear')

model.fit(x_sin_poly,y)
pred_prob = model.predict_proba(x_sin_poly)[:,1]

## Payoff on the same data the model was fitted on, i.e. not an out-of-sample estimate
print('In-sample payoff: ', round(betting_payoff(y, y_odds, y_probs, pred_prob),3))

In-sample payoff:  -0.055


In [8]:
## ADDING POLYNOMIAL FEATURES
from sklearn.preprocessing import PolynomialFeatures

## The expansion is built from x_sin_poly (the copy of x taken before any expansion)
## instead of from x itself, so re-running this cell does not expand an already
## expanded matrix. PolynomialFeatures is used with its default settings and
## fit_transform returns an array, so x is no longer a DataFrame afterwards.
x = PolynomialFeatures().fit_transform(x_sin_poly)

In [9]:
## GRID SEARCH FOR THE LOGIT MODEL WITH POLYNOMIAL FEATURES
## For every (penalty, C) combination the betting payoff is averaged over a
## 10-fold cross-validation and stored in res.

## penalty function
penalty_func = ['l1','l2']

## Inverse of regularization strength: 12 values, log-spaced from e^-2 to e^4
inv_reg = np.logspace(-2,4, num = 12, base = np.exp(1))

## Dict used to store the results (key: description of the combination, value: mean payoff)
res = dict()

## Object used to obtain train and test splits. It is created once: with a fixed
## integer seed it yields the same folds for every hyperparameter combination.
kf = KFold(n_splits = 10, random_state = 1003, shuffle = True)
nsplits = float(kf.n_splits)

for p in penalty_func:
    for c in inv_reg:

        ## Initializing the mean_payoff
        mean_payoff = 0

        ## initializing the model; the solver is named explicitly because it has to
        ## support both the l1 and the l2 penalty (liblinear does)
        model = LogisticRegression(penalty = p, C = c, solver = 'liblinear')

        for train_index, test_index in kf.split(x):

            ## Creating training and test data sets. KFold returns row positions,
            ## so the pandas targets are indexed positionally (.iloc) to match x.
            x_train, x_test = x[train_index], x[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            ## fitting the model
            model.fit(x_train, y_train)
            pred_prob = model.predict_proba(x_test)

            ## updating the average payoff; column 1 is the second class in model.classes_
            fold_payoff = betting_payoff(y_test, y_odds.iloc[test_index], y_probs.iloc[test_index], pred_prob[:,1])
            mean_payoff += fold_payoff/nsplits

        key = 'penalty: ' + p + ' C: ' + str(c)
        res[key] = mean_payoff
        

In [10]:
import operator

## Entry of res with the largest cross-validated payoff
best_poly_combination, best_poly_payoff = max(res.items(), key = operator.itemgetter(1))

print('Combination that yields highest cross-validated payoff: ', best_poly_combination)
print('Highest cross-validated payoff: ', round(best_poly_payoff,3))

Combination that yields highest cross-validated payoff:  penalty: l2 C: 18.34011815147647
Highest cross-validated payoff:  -0.045


In [11]:
## Retraining the model on the entire data set

## Hyperparameters are the best combination reported by the grid search with
## polynomial features. The solver is pinned so the refit does not depend on the
## library's default solver, which differs between scikit-learn versions.
model = LogisticRegression(penalty = 'l2', C = 18.34011815147647, solver = 'liblinear')

model.fit(x,y)
pred_prob = model.predict_proba(x)[:,1]

## Payoff on the same data the model was fitted on, i.e. not an out-of-sample estimate
print('In-sample payoff: ', round(betting_payoff(y, y_odds, y_probs, pred_prob),3))

In-sample payoff:  0.052
