In [233]:
import pandas as pd
import numpy as np

from sklearn.metrics import brier_score_loss, make_scorer

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, TimeSeriesSplit, KFold
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [2]:
data = pd.read_csv('soccer18.csv')
data['GameID'] = data.index
data['HomeWin'] = 1*(data['FTHG'] > data['FTAG'])
data.head()

Unnamed: 0,Div,Date,Y,HomeTeam,AwayTeam,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST,home_xG,away_xG,GameID,HomeWin
0,EPL,2014-08-16,14,Arsenal,Crystal Palace,2,1,1,1,14,4,6,2,1.55411,0.158151,0,1
1,EPL,2014-08-16,14,Leicester,Everton,2,2,1,2,11,13,3,3,1.2783,0.613273,1,0
2,EPL,2014-08-16,14,Man United,Swansea,1,2,0,1,14,5,5,4,1.16635,0.278076,2,0
3,EPL,2014-08-16,14,QPR,Hull,0,1,0,0,19,11,6,4,1.90067,1.11757,3,0
4,EPL,2014-08-16,14,Stoke,Aston Villa,0,1,0,0,12,7,2,2,0.423368,0.909774,4,0


In [40]:
# Splitting each game into two rows and identifying home team
datamelt = pd.melt(data, 
                   id_vars = ['GameID','Date','Div','Y','FTHG','FTAG','HTHG','HTAG','HS','AS','HST','AST','home_xG','away_xG',
                              'HomeWin'], value_vars = ['HomeTeam','AwayTeam'], var_name = 'HA', value_name = 'Team')
datamelt['isHome'] = 1*(datamelt['HA'] == 'HomeTeam') - 1*(datamelt['HA'] == 'AwayTeam')
datamelt['isHome2'] = 1*(datamelt['HA'] == 'HomeTeam')

In [41]:
# Adding differentials for each stat
datamelt['GDiff'] = (datamelt['FTHG'] - datamelt['FTAG'])*datamelt['isHome']
datamelt['SDiff'] = (datamelt['HS'] - datamelt['AS'])*datamelt['isHome']
datamelt['STDiff'] = (datamelt['HST'] - datamelt['AST'])*datamelt['isHome']
datamelt['xGDiff'] = (datamelt['home_xG'] - datamelt['away_xG'])*datamelt['isHome']

In [226]:
datamelt = datamelt.sort_values(['Date','GameID'])

# Using ewm to compute rolling weighted averages for each stat to capture recent trends/momentum
datamelt['AvgGDiff'] = datamelt.groupby('Team')['GDiff'].transform(lambda x : x.ewm(halflife = 15).mean().shift(1, fill_value = 0))
datamelt['AvgSDiff'] = datamelt.groupby('Team')['SDiff'].transform(lambda x : x.ewm(halflife = 15).mean().shift(1, fill_value = 0))
datamelt['AvgSTDiff'] = datamelt.groupby('Team')['STDiff'].transform(lambda x : x.ewm(halflife = 15).mean().shift(1, fill_value = 0))
datamelt['AvgxGDiff'] = datamelt.groupby('Team')['xGDiff'].transform(lambda x : x.ewm(halflife = 15).mean().shift(1, fill_value = 0))

# Games played at home or away
datamelt['GamesPlayedHome'] = datamelt.groupby('Team')['isHome2'].apply(lambda x: x.shift().fillna(0).cumsum())
datamelt['GamesPlayedAway'] = datamelt.groupby('Team')['isHome2'].apply(lambda x: x.sub(1).abs().shift().fillna(0).cumsum())

# Calculating win percentage at home and away for each team
datamelt['WinAtHome'] = datamelt['HomeWin'] * (datamelt['HA'] == 'HomeTeam')
datamelt['WinPctAtHome'] = datamelt.groupby('Team')['WinsAtHome'].transform(lambda x : x.cumsum().shift(1, fill_value = 0))/datamelt['GamesPlayedHome']
datamelt['WinPctAtHome'] = datamelt.groupby('Team')['WinPctAtHome'].transform(lambda x : x.ewm(halflife = 10).mean())

datamelt['WinAway'] = datamelt['HomeWin'].replace({0: 1, 1: 0}) * (datamelt['HA'] == 'AwayTeam')
datamelt['WinPctAway'] = datamelt.groupby('Team')['WinAway'].transform(lambda x : x.cumsum().shift(1, fill_value = 0))/datamelt['GamesPlayedAway']
datamelt['WinPctAway'] = datamelt.groupby('Team')['WinPctAway'].transform(lambda x : x.ewm(halflife = 10).mean())

In [227]:
# Pivoting data back into original format and taking differences between home and away differentials
pivotdata = datamelt.pivot_table(index=['GameID','Y','Date','Div'], columns=['HA'], 
                                 values=['AvgGDiff','AvgSDiff','AvgSTDiff','AvgxGDiff','Team','WinPctAtHome', 'WinPctAway'], 
                                 aggfunc='first')
pivotdata.columns = ['AwayAvgGDiff','HomeAvgGDiff','AwayAvgSDiff','HomeAvgSDiff','AwayAvgSTDiff','HomeAvgSTDiff',
                     'AwayAvgxGDiff','HomeAvgxGDiff','AwayTeam','HomeTeam','AwayWinPctAtHome','HomeWinPctAtHome',
                     'AwayWinPctAway', 'HomeWinPctAway']
pivotdata.reset_index(inplace = True)
pivotdata['AvgGDiffDiff'] = pivotdata['HomeAvgGDiff'] - pivotdata['AwayAvgGDiff']
pivotdata['AvgSDiffDiff'] = pivotdata['HomeAvgSDiff'] - pivotdata['AwayAvgSDiff']
pivotdata['AvgSTDiffDiff'] = pivotdata['HomeAvgSTDiff'] - pivotdata['AwayAvgSTDiff']
pivotdata['AvgxGDiffDiff'] = pivotdata['HomeAvgxGDiff'] - pivotdata['AwayAvgxGDiff']
pivotdata['WinPctDiff'] = pivotdata['HomeWinPctAtHome'] - pivotdata['AwayWinPctAway']

pivotdata = pivotdata.sort_values(['Date','GameID'])

In [228]:
# Filtering/cleaning dataframe for use in model
datamaster = data.merge(pivotdata[['Date','GameID','AvgGDiffDiff','AvgSDiffDiff',
                                   'AvgSTDiffDiff','AvgxGDiffDiff','HomeWinPctAtHome','AwayWinPctAway']], 
                        on = ['Date','GameID'], how = 'inner').sort_values(['Date','GameID'])
datamaster.drop(['GameID','Date','Div','FTHG','FTAG','HTHG','HTAG','HS','AS','HST','AST','home_xG','away_xG'], 
                axis = 1, inplace = True)

In [237]:
x_train = datamaster[datamaster['Y'] < 18].drop(['Y','HomeWin','HomeTeam','AwayTeam'], axis = 1).fillna(0)
y_train = datamaster[datamaster['Y'] < 18]['HomeWin']
x_test = datamaster[datamaster['Y'] == 18].drop(['Y','HomeWin','HomeTeam','AwayTeam'], axis = 1).fillna(0)
y_test = datamaster[datamaster['Y'] == 18]['HomeWin']

In [238]:
# Logistic Regression
tscv = TimeSeriesSplit(n_splits = 4)

#Create a set of steps. All but the last step is a transformer (something that processes data). 
#Build a list of steps, where the first is StandardScaler and the last is RandomForest
steps = [('scaler', StandardScaler()),
         ('lr', LogisticRegression(solver = 'liblinear'))]

#Now set up the pipeline
pipeline = Pipeline(steps)

#Now set up the parameter grid
parameters_scaler = dict(lr__C = [10**i for i in range(-3, 3)],
                         lr__penalty = ['l1', 'l2'],
                         lr__max_iter = [100, 1000, 10000])

#Now run a grid search
lr_grid_search_scaler = GridSearchCV(pipeline, param_grid = parameters_scaler, cv = tscv, scoring = 'neg_brier_score')

In [239]:
#Now fit model to training data
lr_grid_search_scaler.fit(x_train, y_train)

proba = lr_grid_search_scaler.predict_proba(x_test)
brierscore = brier_score_loss(y_test, proba[:,1])

In [209]:
brierscore

0.21347867563728137

In [197]:
lr_grid_search_scaler.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('lr',
                 LogisticRegression(C=1, max_iter=1000, penalty='l1',
                                    solver='liblinear'))])

In [234]:
# Support Vector Machine
tscv = TimeSeriesSplit(n_splits = 4)

#Create a set of steps. All but the last step is a transformer (something that processes data). 
#Build a list of steps, where the first is StandardScaler and the last is RandomForest
steps = [('scaler', StandardScaler()),
         ('svc', SVC(probability = True))]

#Now set up the pipeline
pipeline = Pipeline(steps)

BrierLoss = make_scorer(brier_score_loss, greater_is_better = False, needs_proba = True)

#Now set up the parameter grid
parameters_scaler = dict(svc__C = [10**i for i in range(-3, 3)],
                         svc__kernel = ['linear', 'sigmoid', 'rbf'],
                         svc__gamma = [1,0.1,0.01,0.001])

#Now run a grid search
svc_grid_search_scaler = GridSearchCV(pipeline, param_grid = parameters_scaler, cv = tscv, scoring = BrierLoss)

In [235]:
#Now fit model to training data
svc_grid_search_scaler.fit(x_train, y_train)

proba = svc_grid_search_scaler.predict_proba(x_test)
brierscore = brier_score_loss(y_test, proba[:,1])

In [236]:
brierscore

0.21350713986911402