In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from scipy.special import logit
pd.options.display.max_columns = 100

In [132]:
# Match-level table: rename the team columns to the <stub>_<Home|Away> pattern
# and order rows by date (ties broken by game_id).
df = (
    pd.read_csv('soccer18_full.csv')
    .rename(columns={'HomeTeam': 'Team_Home', 'AwayTeam': 'Team_Away'})
    .sort_values(['Date', 'game_id'])
)
# Shot-level table (presumably one row per shot; linked to matches via game_id).
dfshots = pd.read_csv('soccer18_shots.csv')

In [133]:
# 1a
# Keep only the matches that also appear in the shot-level data.
ids = df.game_id.unique()
idshots = dfshots.game_id.unique()

# Set membership avoids rescanning the idshots array for every game id.
shot_id_set = set(idshots)
missing = [x for x in ids if x not in shot_id_set]
# Drop *every* match without shot data (the previous `missing[0]` removed only
# the first one and raised IndexError when nothing was missing).
df = df[~df.game_id.isin(missing)]

In [111]:
# 1b
# Season-level xG involvement per player: xG of shots taken plus xG of shots assisted.
# Own goals are excluded before aggregating; Y (season) comes from the match table.
xgs = pd.merge(dfshots[dfshots.result != 'OwnGoal'], df[['game_id','Y']], on = ['game_id'])
topxgshots = xgs.groupby(['Y','player'])['xG'].sum().to_frame().reset_index()
topxgassists = xgs.groupby(['Y','player_assisted'])['xG'].sum().to_frame().reset_index()

topxgs = pd.merge(topxgshots, topxgassists, how = 'outer', left_on = ['Y','player'], right_on = ['Y','player_assisted'])
# The outer merge leaves NaN on one side for players who only shot or only
# assisted in a season. Fill the name from the assist side and treat the missing
# xG as 0, so those rows get a real total instead of NaN.
topxgs['player'] = topxgs['player'].fillna(topxgs['player_assisted'])
topxgs['xG'] = topxgs['xG_x'].add(topxgs['xG_y'], fill_value = 0)
topxgs.nlargest(10,'xG')[['Y','player','xG']]

Unnamed: 0,Y,player,xG
1185,14,Lionel Messi,53.503719
380,14,Cristiano Ronaldo,52.909403
3393,15,Luis Suárez,49.109693
3679,15,Neymar,46.124972
2536,15,Cristiano Ronaldo,44.19541
7686,17,Lionel Messi,44.046686
3349,15,Lionel Messi,42.973189
10329,18,Robert Lewandowski,42.75666
9771,18,Lionel Messi,41.332334
5525,16,Lionel Messi,40.840305


#1c
Players on better teams (surrounded by better teammates) tend to get more shooting and assist opportunities, so they are involved in more xG-generating plays. The raw xG total therefore partly reflects team quality and may not translate directly into individual skill.

In [152]:
# 1d
# Per (game, team) event counts taken from the shot table:
#   OG = own goals, SP = shots that hit the post, HG = headed goals.
team_keys = ['game_id','team_id']
is_own_goal = dfshots.result == 'OwnGoal'
is_on_post = dfshots.result == 'ShotOnPost'
is_headed_goal = (dfshots.result == 'Goal')&(dfshots.shot_type == 'Head')

ogs = dfshots[is_own_goal].groupby(team_keys).result.count().rename('OG')
sps = dfshots[is_on_post].groupby(team_keys).result.count().rename('SP')
hgs = dfshots[is_headed_goal].groupby(team_keys).result.count().rename('HG')

# Attach each count twice, once keyed on the home team and once on the away team.
# The first merge adds the bare column (e.g. 'OG'); the second merge collides on
# that name, so the suffixes turn the pair into '<X>_Home' / '<X>_Away'.
df_all = df
for counts in (ogs, sps, hgs):
    df_all = df_all.merge(counts, how = 'left', right_index = True,
                          left_on = ['game_id','home_team_id'])
    df_all = df_all.merge(counts, how = 'left', right_index = True,
                          left_on = ['game_id','away_team_id'],
                          suffixes = ['_Home','_Away'])

# A (game, team) pair with no matching events has no row to join: count it as 0.
flds = ['OG_Home','OG_Away','SP_Home','SP_Away','HG_Home','HG_Away']
df_all[flds] = df_all[flds].fillna(0)

In [153]:
# Per-match totals (home + away) of own goals, shots on post and headed goals,
# computed on a copy so df_all itself is left untouched.
df_temp = df_all.copy()
for stat in ('OG', 'SP', 'HG'):
    df_temp[f'{stat}_Total'] = df_temp[f'{stat}_Home'] + df_temp[f'{stat}_Away']

# Take the maximum of each total column directly. The previous `df_temp.max()[...]`
# reduced every column of the frame three times; on a frame that also holds
# string columns that is wasteful and, depending on the pandas version, can
# raise when an object column mixes strings and NaN.
og_max = df_temp['OG_Total'].max()
sp_max = df_temp['SP_Total'].max()
hg_max = df_temp['HG_Total'].max()

In [154]:
# Matches whose combined own-goal count equals the maximum.
df_temp.loc[df_temp.OG_Total == og_max, ['Team_Home','Team_Away','Date','Div','OG_Total']]

Unnamed: 0,Team_Home,Team_Away,Date,Div,OG_Total
76,Southampton,Sunderland,2014-10-18,EPL,3.0
77,QPR,Liverpool,2014-10-19,EPL,3.0
7516,Empoli,Napoli,2015-04-30,Serie_A,3.0


In [155]:
# Matches whose combined shots-on-post count equals the maximum.
df_temp.loc[df_temp.SP_Total == sp_max, ['Team_Home','Team_Away','Date','Div','SP_Total']]

Unnamed: 0,Team_Home,Team_Away,Date,Div,SP_Total
1933,Sociedad,Almeria,2014-09-21,La_Liga,5.0
2470,Getafe,Espanol,2016-01-17,La_Liga,5.0
1123,West Ham,Liverpool,2017-05-14,EPL,5.0
3190,Barcelona,La Coruna,2017-12-17,La_Liga,5.0
5152,Hoffenheim,Mainz,2018-12-23,Bundesliga,5.0


In [156]:
# Matches whose combined headed-goal count equals the maximum.
df_temp.loc[df_temp.HG_Total == hg_max, ['Team_Home','Team_Away','Date','Div','HG_Total']]

Unnamed: 0,Team_Home,Team_Away,Date,Div,HG_Total
8030,Cagliari,Fiorentina,2016-10-23,Serie_A,5.0


In [159]:
# 1e
# Home-minus-away differential for each statistic, keyed by the stub used in the
# column names: stub -> (home column, away column).
diff_sources = {
    'GD':  ('FTHG', 'FTAG'),
    'xGD': ('home_xG', 'away_xG'),
    'OGD': ('OG_Home', 'OG_Away'),
    'SPD': ('SP_Home', 'SP_Away'),
    'HGD': ('HG_Home', 'HG_Away'),
}
for stub, (home_col, away_col) in diff_sources.items():
    df_all[f'{stub}_Home'] = df_all[home_col] - df_all[away_col]

# The away side's differential is the home differential with the sign flipped.
flds = ['GD','xGD','OGD','SPD','HGD']
for fld in flds:
    df_all[f'{fld}_Away'] = -df_all[f'{fld}_Home']

# Keep identifiers, pH and the Home/Away differential pairs only.
id_cols = ['Div','Date','Y','game_id','Team_Home','Team_Away','pH']
diff_cols = [f'{fld}_{ha}' for fld in flds for ha in ['Home','Away']]
df_fil = df_all.loc[:, id_cols + diff_cols]

In [164]:
# Reshape to one row per (game, side): every '<stub>_Home' / '<stub>_Away' pair
# collapses into a single '<stub>' column, with the side stored in 'isHome'.
df_long = pd.wide_to_long(
    df_fil,
    stubnames=['Team','GD','xGD','OGD','SPD','HGD'],
    i='game_id',
    j='isHome',
    sep='_',
    suffix=r'\w+',
).reset_index()
# Turn the 'Home' / 'Away' label into a boolean flag.
df_long['isHome'] = df_long['isHome'] == 'Home'
df_long = (
    df_long
    .sort_values(['Date','game_id','isHome'])
    .reset_index(drop = True)
)

In [165]:
# Season-to-date form per team. Rows are grouped by (season, team); the frame
# was sorted by date in the previous cell, so positions within a group follow
# that order.
gp = df_long.groupby(['Y','Team'])
# sn: number of earlier rows for the same team in the same season.
df_long['sn'] = gp.Y.cumcount()


def _total_before(col):
    """Running total of `col` over the preceding rows of the group (0 on the first row)."""
    return col.cumsum().shift(1, fill_value = 0)


# Dividing by (5 + games so far) instead of the plain count pulls averages based
# on few games towards 0.
shrunk_n = 5 + gp.Y.cumcount()
for stat in ('GD', 'xGD', 'OGD', 'SPD', 'HGD'):
    df_long[f'savg{stat}'] = gp[stat].transform(_total_before) / shrunk_n

# Every column whose name starts with 's' (sn and the savg* columns).
sflds = [col for col in df_long.columns if col.startswith('s')]

In [170]:
# Bring the season-to-date columns back to one row per match: the home team's
# values get the '_Home' suffix and the away team's values get '_Away'.
form_cols = ['game_id'] + sflds
home_form = df_long.loc[df_long.isHome, form_cols]
away_form = df_long.loc[~df_long.isHome, form_cols]
dfm = (
    df_fil
    .merge(home_form, how = 'inner', on = 'game_id')
    .merge(away_form, how = 'inner', on = 'game_id', suffixes = ['_Home','_Away'])
)

In [171]:
# Pre-match gap between the two teams' season-to-date averages (home minus away).
for stat in ('GD', 'xGD', 'OGD', 'SPD', 'HGD'):
    dfm[f'{stat}D'] = dfm[f'savg{stat}_Home'] - dfm[f'savg{stat}_Away']

# Restrict to seasons 14 through 17 (inclusive) ...
dfm1417 = dfm[dfm.Y.between(14,17)]
# ... and to matches where the two teams' prior-game counts sum to more than 9.
enough_history = (dfm1417.sn_Home + dfm1417.sn_Away) > 9
model_cols = (['Div','Date','Y','game_id','Team_Home','Team_Away','pH']
              + [f'{f}D' for f in flds])
df_final = dfm1417.loc[enough_history, model_cols]
df_final.head()

Unnamed: 0,Div,Date,Y,game_id,Team_Home,Team_Away,pH,GDD,xGDD,OGDD,SPDD,HGDD
169,Ligue_1,2014-09-19,14,6155,Bordeaux,Evian Thonon Gaillard,0.629685,1.5,0.369822,-0.1,0.0,-0.2
188,Ligue_1,2014-09-20,14,6156,Marseille,Rennes,0.609827,0.2,0.610045,-0.1,0.2,0.1
189,Ligue_1,2014-09-20,14,6157,Lorient,Reims,0.553418,0.6,0.612688,0.2,0.3,-0.1
190,Ligue_1,2014-09-20,14,6158,Metz,Bastia,0.447615,-0.1,-0.111996,-0.1,0.0,0.2
191,Ligue_1,2014-09-20,14,6159,Nantes,Nice,0.530376,0.3,0.700922,0.0,0.2,-0.1


In [174]:
# i.
# OLS of logit(pH) on the own-goal differential gap (OGDD) alone
(
    smf.ols(formula = 'logit(pH) ~ OGDD', data = df_final)
    .fit()
    .summary()
)

0,1,2,3
Dep. Variable:,logit(pH),R-squared:,0.042
Model:,OLS,Adj. R-squared:,0.042
Method:,Least Squares,F-statistic:,277.4
Date:,"Thu, 25 Mar 2021",Prob (F-statistic):,5.38e-61
Time:,17:14:28,Log-Likelihood:,-8342.2
No. Observations:,6321,AIC:,16690.0
Df Residuals:,6319,BIC:,16700.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2238,0.011,-19.644,0.000,-0.246,-0.201
OGDD,-2.5428,0.153,-16.656,0.000,-2.842,-2.243

0,1,2,3
Omnibus:,44.16,Durbin-Watson:,2.041
Prob(Omnibus):,0.0,Jarque-Bera (JB):,61.529
Skew:,-0.079,Prob(JB):,4.36e-14
Kurtosis:,3.457,Cond. No.,13.4


In [176]:
# ii.
# OLS of logit(pH) on the goal differential gap (GDD) plus OGDD
(
    smf.ols(formula = 'logit(pH) ~ GDD + OGDD', data = df_final)
    .fit()
    .summary()
)

0,1,2,3
Dep. Variable:,logit(pH),R-squared:,0.78
Model:,OLS,Adj. R-squared:,0.78
Method:,Least Squares,F-statistic:,11230.0
Date:,"Thu, 25 Mar 2021",Prob (F-statistic):,0.0
Time:,17:17:38,Log-Likelihood:,-3686.5
No. Observations:,6321,AIC:,7379.0
Df Residuals:,6318,BIC:,7399.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2072,0.005,-37.969,0.000,-0.218,-0.196
GDD,0.9648,0.007,145.758,0.000,0.952,0.978
OGDD,0.3478,0.076,4.592,0.000,0.199,0.496

0,1,2,3
Omnibus:,76.282,Durbin-Watson:,1.959
Prob(Omnibus):,0.0,Jarque-Bera (JB):,131.654
Skew:,-0.049,Prob(JB):,2.58e-29
Kurtosis:,3.7,Cond. No.,13.9


In [177]:
# iii.
# OLS of logit(pH) on the expected-goal differential gap (xGDD) plus OGDD
(
    smf.ols(formula = 'logit(pH) ~ xGDD + OGDD', data = df_final)
    .fit()
    .summary()
)

0,1,2,3
Dep. Variable:,logit(pH),R-squared:,0.827
Model:,OLS,Adj. R-squared:,0.827
Method:,Least Squares,F-statistic:,15090.0
Date:,"Thu, 25 Mar 2021",Prob (F-statistic):,0.0
Time:,17:18:18,Log-Likelihood:,-2935.0
No. Observations:,6321,AIC:,5876.0
Df Residuals:,6318,BIC:,5896.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2044,0.005,-42.184,0.000,-0.214,-0.195
xGDD,1.2263,0.007,169.248,0.000,1.212,1.240
OGDD,-0.1177,0.066,-1.771,0.077,-0.248,0.013

0,1,2,3
Omnibus:,106.763,Durbin-Watson:,2.035
Prob(Omnibus):,0.0,Jarque-Bera (JB):,208.03
Skew:,-0.054,Prob(JB):,6.709999999999999e-46
Kurtosis:,3.882,Cond. No.,13.7


#iv. 


In [178]:
# 1f
# OLS of logit(pH) on the shots-on-post differential gap (SPDD) alone
(
    smf.ols(formula = 'logit(pH) ~ SPDD', data = df_final)
    .fit()
    .summary()
)

0,1,2,3
Dep. Variable:,logit(pH),R-squared:,0.267
Model:,OLS,Adj. R-squared:,0.267
Method:,Least Squares,F-statistic:,2298.0
Date:,"Thu, 25 Mar 2021",Prob (F-statistic):,0.0
Time:,17:37:47,Log-Likelihood:,-7497.9
No. Observations:,6321,AIC:,15000.0
Df Residuals:,6319,BIC:,15010.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2167,0.010,-21.737,0.000,-0.236,-0.197
SPDD,2.2059,0.046,47.933,0.000,2.116,2.296

0,1,2,3
Omnibus:,21.274,Durbin-Watson:,2.041
Prob(Omnibus):,0.0,Jarque-Bera (JB):,25.939
Skew:,-0.061,Prob(JB):,2.33e-06
Kurtosis:,3.289,Cond. No.,4.62


In [179]:
# OLS of logit(pH) on the goal differential gap (GDD) plus SPDD
(
    smf.ols(formula = 'logit(pH) ~ GDD + SPDD', data = df_final)
    .fit()
    .summary()
)

0,1,2,3
Dep. Variable:,logit(pH),R-squared:,0.796
Model:,OLS,Adj. R-squared:,0.796
Method:,Least Squares,F-statistic:,12350.0
Date:,"Thu, 25 Mar 2021",Prob (F-statistic):,0.0
Time:,17:38:10,Log-Likelihood:,-3449.8
No. Observations:,6321,AIC:,6906.0
Df Residuals:,6318,BIC:,6926.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2060,0.005,-39.202,0.000,-0.216,-0.196
GDD,0.8855,0.007,128.159,0.000,0.872,0.899
SPDD,0.6177,0.027,22.676,0.000,0.564,0.671

0,1,2,3
Omnibus:,68.239,Durbin-Watson:,1.97
Prob(Omnibus):,0.0,Jarque-Bera (JB):,115.718
Skew:,-0.026,Prob(JB):,7.450000000000001e-26
Kurtosis:,3.661,Cond. No.,5.22


In [180]:
# OLS of logit(pH) on the expected-goal differential gap (xGDD) plus SPDD
(
    smf.ols(formula = 'logit(pH) ~ xGDD + SPDD', data = df_final)
    .fit()
    .summary()
)

0,1,2,3
Dep. Variable:,logit(pH),R-squared:,0.831
Model:,OLS,Adj. R-squared:,0.831
Method:,Least Squares,F-statistic:,15550.0
Date:,"Thu, 25 Mar 2021",Prob (F-statistic):,0.0
Time:,17:38:27,Log-Likelihood:,-2856.7
No. Observations:,6321,AIC:,5719.0
Df Residuals:,6318,BIC:,5740.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2040,0.005,-42.633,0.000,-0.213,-0.195
xGDD,1.1770,0.008,145.325,0.000,1.161,1.193
SPDD,0.3255,0.026,12.717,0.000,0.275,0.376

0,1,2,3
Omnibus:,97.027,Durbin-Watson:,2.033
Prob(Omnibus):,0.0,Jarque-Bera (JB):,183.664
Skew:,-0.044,Prob(JB):,1.31e-40
Kurtosis:,3.831,Cond. No.,5.43


In [181]:
# 1g
# OLS of logit(pH) on the headed-goal differential gap (HGDD) alone
(
    smf.ols(formula = 'logit(pH) ~ HGDD', data = df_final)
    .fit()
    .summary()
)

0,1,2,3
Dep. Variable:,logit(pH),R-squared:,0.185
Model:,OLS,Adj. R-squared:,0.185
Method:,Least Squares,F-statistic:,1432.0
Date:,"Thu, 25 Mar 2021",Prob (F-statistic):,1.21e-282
Time:,17:39:28,Log-Likelihood:,-7832.5
No. Observations:,6321,AIC:,15670.0
Df Residuals:,6319,BIC:,15680.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2182,0.011,-20.762,0.000,-0.239,-0.198
HGDD,1.9639,0.052,37.842,0.000,1.862,2.066

0,1,2,3
Omnibus:,44.719,Durbin-Watson:,2.041
Prob(Omnibus):,0.0,Jarque-Bera (JB):,60.586
Skew:,-0.093,Prob(JB):,6.98e-14
Kurtosis:,3.442,Cond. No.,4.94


In [182]:
# Goal var and header goal var
smf.ols('logit(pH) ~ GDD + HGDD', df_final).fit().summary()

0,1,2,3
Dep. Variable:,logit(pH),R-squared:,0.78
Model:,OLS,Adj. R-squared:,0.78
Method:,Least Squares,F-statistic:,11200.0
Date:,"Thu, 25 Mar 2021",Prob (F-statistic):,0.0
Time:,17:39:29,Log-Likelihood:,-3692.1
No. Observations:,6321,AIC:,7390.0
Df Residuals:,6318,BIC:,7411.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2072,0.005,-37.947,0.000,-0.218,-0.197
GDD,0.9686,0.007,130.760,0.000,0.954,0.983
HGDD,-0.0982,0.031,-3.143,0.002,-0.159,-0.037

0,1,2,3
Omnibus:,68.56,Durbin-Watson:,1.953
Prob(Omnibus):,0.0,Jarque-Bera (JB):,115.888
Skew:,-0.033,Prob(JB):,6.84e-26
Kurtosis:,3.66,Cond. No.,5.77


In [183]:
# OLS of logit(pH) on the expected-goal differential gap (xGDD) plus HGDD
(
    smf.ols(formula = 'logit(pH) ~ xGDD + HGDD', data = df_final)
    .fit()
    .summary()
)

0,1,2,3
Dep. Variable:,logit(pH),R-squared:,0.828
Model:,OLS,Adj. R-squared:,0.828
Method:,Least Squares,F-statistic:,15190.0
Date:,"Thu, 25 Mar 2021",Prob (F-statistic):,0.0
Time:,17:39:29,Log-Likelihood:,-2917.6
No. Observations:,6321,AIC:,5841.0
Df Residuals:,6318,BIC:,5862.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.2042,0.005,-42.257,0.000,-0.214,-0.195
xGDD,1.2077,0.008,153.625,0.000,1.192,1.223
HGDD,0.1637,0.027,6.160,0.000,0.112,0.216

0,1,2,3
Omnibus:,110.608,Durbin-Watson:,2.035
Prob(Omnibus):,0.0,Jarque-Bera (JB):,217.421
Skew:,-0.06,Prob(JB):,6.129999999999999e-48
Kurtosis:,3.901,Cond. No.,5.55
