In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn
from scipy.stats import poisson,skellam

In [2]:
import warnings

# Silence warnings for the rest of the session. The DeprecationWarning filter
# is registered first and the blanket filter second, so the blanket "ignore"
# ends up at the front of the filter list.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
warnings.simplefilter(action="ignore")

In [3]:
# Download the E0 results file for each season code and stack the seasons
# into a single frame of match results.
loc = "http://football-data.co.uk/mmz4281/{}/E0.csv"
years = ['1314','1415','1516','1617']
# Only these columns are used by the rest of the notebook.
wanted_columns = ['HomeTeam','AwayTeam','FTHG','FTAG','HST','AST']

# on_bad_lines="skip" is the replacement for error_bad_lines=False, which was
# removed in pandas 2.0 (requires pandas >= 1.3): malformed rows are dropped
# instead of aborting the read.
season_frames = [
    pd.read_csv(loc.format(season), on_bad_lines="skip", escapechar='\n')[wanted_columns]
    for season in years
]

# Concatenate once rather than growing the frame with DataFrame.append inside
# a loop (append was removed in pandas 2.0 and re-copies the frame each time).
# Each season keeps its own row index, exactly as append did.
data_goals = pd.concat(season_frames)

In [4]:
# Give the full-time goal columns descriptive names.
data_goals = data_goals.rename(columns=dict(FTHG='HomeGoals', FTAG='AwayGoals'))

In [5]:
# Preview the first five rows of the combined frame.
data_goals.head(n=5)

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals,HST,AST
0,Arsenal,Aston Villa,1.0,3.0,4.0,4.0
1,Liverpool,Stoke,1.0,0.0,11.0,4.0
2,Norwich,Everton,2.0,2.0,2.0,6.0
3,Sunderland,Fulham,0.0,1.0,3.0,1.0
4,Swansea,Man United,1.0,4.0,6.0,7.0


In [6]:
# Number of missing values in each column.
data_goals.isna().sum()

HomeTeam     1
AwayTeam     1
HomeGoals    1
AwayGoals    1
HST          1
AST          1
dtype: int64

In [7]:
# Discard every row that has at least one missing value.
data_goals = data_goals.dropna(axis=0, how='any')

In [8]:
# Mean of each numeric column. numeric_only=True excludes the team-name string
# columns explicitly; since pandas 2.0 a bare mean() raises TypeError on them
# instead of silently dropping them.
data_goals.mean(numeric_only=True)

HomeGoals    1.534211
AwayGoals    1.174342
HST          4.800658
AST          3.826316
dtype: float64

In [9]:
# importing for the Poisson regression model
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Reshape the results to one row per team per match: each match contributes a
# home-side row (home=1) and an away-side row (home=0), both with the columns
# team / opponent / goals / home.
home_rows = (
    data_goals[['HomeTeam','AwayTeam','HomeGoals']]
    .assign(home=1)
    # Only the three selected columns exist here, so only they are renamed.
    .rename(columns={'HomeTeam':'team', 'AwayTeam':'opponent','HomeGoals':'goals'})
)
away_rows = (
    data_goals[['AwayTeam','HomeTeam','AwayGoals']]
    .assign(home=0)
    .rename(columns={'AwayTeam':'team', 'HomeTeam':'opponent','AwayGoals':'goals'})
)
goal_model_data = pd.concat([home_rows, away_rows])

# Poisson GLM of goals on the home indicator plus team and opponent terms.
poisson_model = smf.glm(formula="goals ~ home + team + opponent", data=goal_model_data,
                        family=sm.families.Poisson()).fit()
poisson_model.summary()

0,1,2,3
Dep. Variable:,goals,No. Observations:,3040
Model:,GLM,Df Residuals:,2988
Model Family:,Poisson,Df Model:,51
Link Function:,log,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-4360.6
Date:,"Tue, 03 Sep 2019",Deviance:,3422.1
Time:,05:56:05,Pearson chi2:,2.96e+03
No. Iterations:,5,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.2099,0.103,2.046,0.041,0.009,0.411
team[T.Aston Villa],-0.7618,0.118,-6.461,0.000,-0.993,-0.531
team[T.Bournemouth],-0.2951,0.117,-2.530,0.011,-0.524,-0.066
team[T.Burnley],-0.7088,0.136,-5.206,0.000,-0.976,-0.442
team[T.Cardiff],-0.7931,0.187,-4.244,0.000,-1.159,-0.427
team[T.Chelsea],0.0216,0.084,0.257,0.797,-0.143,0.186
team[T.Crystal Palace],-0.4954,0.097,-5.083,0.000,-0.686,-0.304
team[T.Everton],-0.1929,0.089,-2.166,0.030,-0.367,-0.018
team[T.Fulham],-0.5580,0.169,-3.295,0.001,-0.890,-0.226


In [10]:
def simulate_match(foot_model, homeTeam, awayTeam, max_goals=6):
    """Report the goal counts the model predicts for both sides of a fixture.

    Parameters
    ----------
    foot_model
        Object exposing ``predict(DataFrame)``; the first element of the
        result's ``.values`` is taken as the predicted goal count.
    homeTeam, awayTeam
        Values placed in the ``team`` / ``opponent`` columns of the
        one-row frames handed to ``foot_model.predict``.
    max_goals
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    str
        A sentence containing the home and away predictions.
    """
    def expected_goals(side, rival, at_home):
        # One-row frame describing the fixture from `side`'s point of view.
        fixture = pd.DataFrame(
            data={'team': side, 'opponent': rival, 'home': at_home},
            index=[1],
        )
        return foot_model.predict(fixture).values[0]

    home_goals_avg = expected_goals(homeTeam, awayTeam, 1)
    away_goals_avg = expected_goals(awayTeam, homeTeam, 0)
    template = "HomeTeam Probable goals Scored {}, AwayTeam Probable goals Scored {}"
    return template.format(home_goals_avg, away_goals_avg)

In [11]:
# Example: predicted goals for Arsenal at home against Sunderland.
simulate_match(poisson_model, homeTeam='Arsenal', awayTeam='Sunderland')

'HomeTeam Probable goals Scored 2.4238993593162226, AwayTeam Probable goals Scored 0.6676936108019813'