In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as st
from scipy.special import logit, expit
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.metrics import brier_score_loss

In [3]:
# Read the match table and tag every row with a GameID equal to its row label.
data = pd.read_csv('soccer18.csv').assign(GameID=lambda frame: frame.index)
data.head()

Unnamed: 0,Div,Date,Y,HomeTeam,AwayTeam,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,AST,home_xG,away_xG,GameID
0,EPL,2014-08-16,14,Arsenal,Crystal Palace,2,1,1,1,14,4,6,2,1.55411,0.158151,0
1,EPL,2014-08-16,14,Leicester,Everton,2,2,1,2,11,13,3,3,1.2783,0.613273,1
2,EPL,2014-08-16,14,Man United,Swansea,1,2,0,1,14,5,5,4,1.16635,0.278076,2
3,EPL,2014-08-16,14,QPR,Hull,0,1,0,0,19,11,6,4,1.90067,1.11757,3
4,EPL,2014-08-16,14,Stoke,Aston Villa,0,1,0,0,12,7,2,2,0.423368,0.909774,4


In [4]:
# Reshape to long form: every game contributes one row for its home team and one for its away team.
datamelt = data.melt(
    id_vars=['GameID', 'Date', 'Div', 'Y', 'FTHG', 'FTAG', 'HTHG', 'HTAG',
             'HS', 'AS', 'HST', 'AST', 'home_xG', 'away_xG'],
    value_vars=['HomeTeam', 'AwayTeam'],
    var_name='HA',
    value_name='Team',
)
# isHome is a sign flag: +1 on the home team's row, -1 on the away team's row.
datamelt['isHome'] = datamelt['HA'].map({'HomeTeam': 1, 'AwayTeam': -1})

In [5]:
# Goal differential from the perspective of the row's team: the home-minus-away margin,
# sign-flipped on away rows through the +1/-1 isHome flag.
datamelt['GDiff'] = datamelt['FTHG'].sub(datamelt['FTAG']).mul(datamelt['isHome'])

In [6]:
# Order rows chronologically so the per-team running totals only look backwards in time.
datamelt = datamelt.sort_values(by=['Date', 'GameID'])
gdiff_by_team = datamelt.groupby('Team')['GDiff']
# Running goal differential *before* each game: cumulative sum pushed down one row, starting at 0.
datamelt['CumGDiff'] = gdiff_by_team.transform(lambda s: s.cumsum().shift(1, fill_value=0))
# Number of earlier rows for the same team (0 on a team's first appearance).
datamelt['GamesPlayed'] = gdiff_by_team.cumcount()
# A team's first game divides 0 by 0, which leaves AvgGDiff as NaN for that row.
datamelt['AvgGDiff'] = datamelt['CumGDiff'].div(datamelt['GamesPlayed'])

In [7]:
# Go back to one row per game, with the home and away pre-game statistics side by side.
pivotdata = datamelt.pivot_table(index=['GameID', 'Y', 'Date', 'Div'], columns=['HA'],
                                 values=['AvgGDiff', 'Team', 'GamesPlayed'], aggfunc='first')
# The pivot produces (value, HA) column pairs. Build the flat names from those pairs instead of
# assigning a hard-coded list by position, so a change in column order can never mislabel data
# (an unexpected value or HA label raises KeyError instead).
side_prefix = {'HomeTeam': 'Home', 'AwayTeam': 'Away'}
value_suffix = {'AvgGDiff': 'AvgGDiff', 'GamesPlayed': 'GPlayed', 'Team': 'Team'}
pivotdata.columns = [side_prefix[side] + value_suffix[value] for value, side in pivotdata.columns]
pivotdata = pivotdata.reset_index()
# Absolute gap between the two teams' prior average goal differentials
# (NaN when either team has no earlier game).
pivotdata['AvgGDiffDiff'] = (pivotdata['HomeAvgGDiff'] - pivotdata['AwayAvgGDiff']).abs()
pivotdata = pivotdata.sort_values(['Date', 'GameID'])

In [8]:
# Question 1)a)i)
# Games with Y < 18: show the seven with the largest gap in prior average goal differential.
pivotdata17 = pivotdata.loc[pivotdata['Y'].lt(18)]
pivotdata17.nlargest(n=7, columns='AvgGDiffDiff')

Unnamed: 0,GameID,Y,Date,Div,AwayAvgGDiff,HomeAvgGDiff,AwayGPlayed,HomeGPlayed,AwayTeam,HomeTeam,AvgGDiffDiff
5326,5326,14,2014-08-22,Ligue_1,1.0,-3.5,2,2,Paris SG,Evian Thonon Gaillard,4.5
7214,7214,14,2014-09-21,Serie_A,1.0,-3.5,2,2,Sampdoria,Sassuolo,4.5
6464,6464,17,2017-08-13,Ligue_1,0.078261,-4.0,115,1,Lille,Strasbourg,4.078261
1910,1910,14,2014-08-30,La_Liga,2.0,-2.0,1,1,Celta,Cordoba,4.0
1912,1912,14,2014-08-31,La_Liga,1.0,-3.0,1,1,Granada,Elche,4.0
7197,7197,14,2014-09-13,Serie_A,2.0,-2.0,1,1,Roma,Empoli,4.0
7212,7212,14,2014-09-21,Serie_A,3.5,-0.5,2,2,Inter,Palermo,4.0


In [9]:
# Question 1)a)ii)
# Same ranking, limited to games where both teams already had at least 100 earlier games.
pivotdata17_100 = pivotdata17.loc[
    pivotdata17['AwayGPlayed'].ge(100) & pivotdata17['HomeGPlayed'].ge(100)
]
pivotdata17_100.nlargest(n=7, columns='AvgGDiffDiff')

Unnamed: 0,GameID,Y,Date,Div,AwayAvgGDiff,HomeAvgGDiff,AwayGPlayed,HomeGPlayed,AwayTeam,HomeTeam,AvgGDiffDiff
2940,2940,16,2017-04-02,La_Liga,2.192308,-0.875,104,104,Barcelona,Granada,3.067308
3393,3393,17,2018-05-13,La_Liga,2.14,-0.705357,150,112,Barcelona,Levante,2.845357
3008,3008,16,2017-05-06,La_Liga,1.9,-0.936937,110,111,Real Madrid,Granada,2.836937
3293,3293,17,2018-03-01,La_Liga,2.208633,-0.623762,139,101,Barcelona,Las Palmas,2.832395
3370,3370,17,2018-04-29,La_Liga,2.142857,-0.621622,147,148,Barcelona,La Coruna,2.764479
2921,2921,16,2017-03-12,La_Liga,2.22549,-0.519608,102,102,Barcelona,La Coruna,2.745098
3190,3190,17,2017-12-17,La_Liga,-0.527132,2.186047,129,129,La Coruna,Barcelona,2.713178


In [10]:
# Question 1)a)iii)
# In the first table, most of the games involve teams that had played no more than 2 prior games.
# A team that performed extremely well or extremely poorly in its first or second game (mostly at the
# start of 2014) therefore has a lopsided average goal differential compared to the average team.
# When we restrict the data to teams with 100+ prior matches, each average is based on a much larger
# sample, so the size of the disparities naturally regresses toward more moderate values.

In [11]:
# Training subset (Y < 18) with a binary outcome: 1 when the home side scored more, else 0.
data17 = data.loc[data['Y'] < 18].assign(
    HomeWin=lambda games: games['FTHG'].gt(games['FTAG']).astype('int64')
)

In [12]:
# Question 1)b)i)
# Intercept-only logistic model: the only regressor is a column of ones,
# so a single coefficient (on the logit scale) is fitted for all games.
y = data17['HomeWin']
x = np.ones(len(y))
binomial_family = sm.families.Binomial()
result = sm.GLM(endog=y, exog=x, family=binomial_family).fit()
result.summary()

0,1,2,3
Dep. Variable:,HomeWin,No. Observations:,7304.0
Model:,GLM,Df Residuals:,7303.0
Model Family:,Binomial,Df Model:,0.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-5037.4
Date:,"Sun, 14 Feb 2021",Deviance:,10075.0
Time:,13:55:32,Pearson chi2:,7300.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1669,0.023,-7.106,0.000,-0.213,-0.121


In [13]:
# Hold-out subset (Y == 18) with the same HomeWin definition as the training data.
data18 = data.loc[data['Y'] == 18].assign(
    HomeWin=lambda games: games['FTHG'].gt(games['FTAG']).astype('int64')
)
# Predict with whatever model `result` currently holds, fed a column of ones
# (presumably the intercept-only fit from the previous cell; verify execution order).
baseline_probs = result.predict(np.ones(len(data18)))
print('Brier score loss is:', brier_score_loss(data18['HomeWin'], baseline_probs))

Brier score loss is: 0.2473559477379797


In [None]:
# Note: there could be draws, so HomeWin = 0 covers both draws and away wins.

In [14]:
# Question 1)b)ii)
# Logistic regression of HomeWin on each side's prior average goal differential.
# A team with no earlier games has an undefined average (0/0 -> NaN); fillna(0) treats it as neutral.
# Features are indexed by GameID so the response can be matched to them by key.
datalog = pivotdata17.set_index('GameID')[['HomeAvgGDiff', 'AwayAvgGDiff']].fillna(0)
X = sm.add_constant(datalog)
# Pair outcomes with features explicitly on GameID instead of relying on two separately sorted
# frames happening to share the same row order; a missing game raises KeyError.
home_win17 = data17.set_index('GameID')['HomeWin'].loc[datalog.index]
result = sm.GLM(home_win17, X, family = sm.families.Binomial()).fit()
result.summary()

  return ptp(axis=axis, out=out, **kwargs)


0,1,2,3
Dep. Variable:,HomeWin,No. Observations:,7304.0
Model:,GLM,Df Residuals:,7301.0
Model Family:,Binomial,Df Model:,2.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-4606.5
Date:,"Sun, 14 Feb 2021",Deviance:,9212.9
Time:,13:55:37,Pearson chi2:,7350.0
No. Iterations:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.1791,0.025,-7.183,0.000,-0.228,-0.130
HomeAvgGDiff,0.7853,0.039,20.128,0.000,0.709,0.862
AwayAvgGDiff,-0.7619,0.040,-19.082,0.000,-0.840,-0.684


In [16]:
# Out-of-sample evaluation of the model held in `result` on the Y == 18 games.
pivotdata18 = pivotdata[pivotdata['Y'] == 18]
# Same feature construction as for training: NaN averages (no earlier games) become 0,
# and rows are indexed by GameID.
datalog18 = pivotdata18.set_index('GameID')[['HomeAvgGDiff', 'AwayAvgGDiff']].fillna(0)
X = sm.add_constant(datalog18)
# brier_score_loss compares its two inputs position by position without checking labels, so select
# the outcomes in the feature rows' GameID order explicitly; a missing game raises KeyError.
home_win18 = data18.set_index('GameID')['HomeWin'].loc[datalog18.index]
print('Brier score loss is:', brier_score_loss(home_win18, result.predict(X)))

Brier score loss is: 0.21726101075298784
