# Football Forecasting - WDSS Demo [ Tidy Version ]

Here we provide a baseline model for our upcoming Premier League forecasting competition.

In [1]:
# Import some modules
from scipy.stats import poisson, skellam
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator
import pandas as pd

In [2]:
# Import some more modules...
import statsmodels.api as sm
import statsmodels.formula.api as smf

### Data

In [3]:
# Define a function to quickly get data from any given EPL season.
# N.B: Current GW fixtures are only added after the GW has finished.

def get_epl_data(season):
    """
    Takes in season formatted as YYYY (last 2 digits of Y1 and Y2 - no spaces or slashes)
    """
    x = pd.read_csv("http://www.football-data.co.uk/mmz4281/" + str(season) + "/E0.csv") # input season year within hyperlink
    
    x = x[['HomeTeam','AwayTeam','FTHG','FTAG', 'FTR']] # isolate required columns
    x = x.rename(columns={'FTHG': 'HomeGoals', 'FTAG': 'AwayGoals', 'FTR':'Result'})
    globals()["epl_"+str(season)] = x
    
    return globals()["epl_"+str(season)]

In [6]:
# Get data from the 2018/2019 season
# Consider COVID19's impact on home crowds and the home advantage - we will start with 18/19 instead
get_epl_data(1819)

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals,Result
0,Man United,Leicester,2,1,H
1,Bournemouth,Cardiff,2,0,H
2,Fulham,Crystal Palace,0,2,A
3,Huddersfield,Chelsea,0,3,A
4,Newcastle,Tottenham,1,2,A
...,...,...,...,...,...
375,Liverpool,Wolves,2,0,H
376,Man United,Cardiff,0,2,A
377,Southampton,Huddersfield,1,1,D
378,Tottenham,Everton,2,2,D


### Model

In [9]:
# Remove final week of fixtures
# Notice the home advantage
epl_1819 = epl_1819[:-10]
epl_1819.mean()

HomeGoals    1.574286
AwayGoals    1.245714
dtype: float64

In [11]:
# probability of draw between home and away team
skellam.pmf(0.0,  epl_1819.mean()[0],  epl_1819.mean()[1])

0.24795571573757016

In [12]:
# Prepare the dataset
# Separate home and away teams/goals - then concatenate
goal_model_data = pd.concat([epl_1819[['HomeTeam','AwayTeam','HomeGoals']].assign(home=1).rename(
            columns={'HomeTeam':'team', 'AwayTeam':'opponent','HomeGoals':'goals'}),
           epl_1819[['AwayTeam','HomeTeam','AwayGoals']].assign(home=0).rename(
            columns={'AwayTeam':'team', 'HomeTeam':'opponent','AwayGoals':'goals'})])

In [13]:
# Building the model
# Poisson Regression: log-linear model
poisson_model = smf.glm(formula="goals ~ home + team + opponent", data=goal_model_data, 
                        family=sm.families.Poisson()).fit()

In [14]:
# Get a statistical summary of the poisson model
poisson_model.summary()

0,1,2,3
Dep. Variable:,goals,No. Observations:,700.0
Model:,GLM,Df Residuals:,660.0
Model Family:,Poisson,Df Model:,39.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-974.49
Date:,"Sun, 26 Sep 2021",Deviance:,685.96
Time:,01:46:25,Pearson chi2:,593.0
No. Iterations:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.5036,0.197,2.558,0.011,0.118,0.890
team[T.Bournemouth],-0.3406,0.187,-1.817,0.069,-0.708,0.027
team[T.Brighton],-0.7855,0.214,-3.669,0.000,-1.205,-0.366
team[T.Burnley],-0.4553,0.193,-2.357,0.018,-0.834,-0.077
team[T.Cardiff],-0.8120,0.219,-3.703,0.000,-1.242,-0.382
team[T.Chelsea],-0.1752,0.178,-0.985,0.325,-0.524,0.173
team[T.Crystal Palace],-0.4631,0.195,-2.376,0.018,-0.845,-0.081
team[T.Everton],-0.3338,0.186,-1.793,0.073,-0.699,0.031
team[T.Fulham],-0.7074,0.212,-3.332,0.001,-1.124,-0.291


### Simulation & Validation

In [15]:
# Build a function to simulate a match using the newly generated poisson model
# Outputs the probability distribution 
# Considers 8 goals as a maximum for either team

def simulate_match(foot_model, homeTeam, awayTeam, max_goals=8):
    home_goals_avg = foot_model.predict(pd.DataFrame(data={'team': homeTeam, 
                                                            'opponent': awayTeam,'home':1},
                                                      index=[1])).values[0]
    away_goals_avg = foot_model.predict(pd.DataFrame(data={'team': awayTeam, 
                                                            'opponent': homeTeam,'home':0},
                                                      index=[1])).values[0]
    team_pred = [[poisson.pmf(i, team_avg) for i in range(0, max_goals+1)] for team_avg in [home_goals_avg, away_goals_avg]]
    return(np.outer(np.array(team_pred[0]), np.array(team_pred[1])))


simulate_match(poisson_model, 'Chelsea', 'Man City')

array([[7.53180722e-02, 1.30364597e-01, 1.12821051e-01, 6.50922606e-02,
        2.81663021e-02, 9.75035210e-03, 2.81273954e-03, 6.95491695e-04,
        1.50474335e-04],
       [6.44105870e-02, 1.11485331e-01, 9.64824229e-02, 5.56656669e-02,
        2.40872874e-02, 8.33831621e-03, 2.40540151e-03, 5.94771308e-04,
        1.28682797e-04],
       [2.75413562e-02, 4.76700703e-02, 4.12549692e-02, 2.38021113e-02,
        1.02994957e-02, 3.56538495e-03, 1.02852688e-03, 2.54318572e-04,
        5.50235439e-05],
       [7.85094852e-03, 1.35888467e-02, 1.17601558e-02, 6.78503807e-03,
        2.93597780e-03, 1.01634986e-03, 2.93192227e-04, 7.24961399e-05,
        1.56850304e-05],
       [1.67849557e-03, 2.90523100e-03, 2.51426555e-03, 1.45060897e-03,
        6.27698134e-04, 2.17290782e-04, 6.26831082e-05, 1.54993310e-05,
        3.35338514e-06],
       [2.87083516e-04, 4.96899691e-04, 4.30030443e-04, 2.48106657e-04,
        1.07359108e-04, 3.71645912e-05, 1.07210811e-05, 2.65094679e-06,
        5.7

In [17]:
# Similar function, selecting most probable scoreline

def simulate_match_output(homeTeam, awayTeam, max_goals=8, foot_model=poisson_model):
    # Predict avg goals
    home_goals_avg = foot_model.predict(pd.DataFrame(data={'team': homeTeam, 
                                                            'opponent': awayTeam,'home':1},
                                                      index=[1])).values[0]
    away_goals_avg = foot_model.predict(pd.DataFrame(data={'team': awayTeam, 
                                                            'opponent': homeTeam,'home':0},
                                                      index=[1])).values[0]
    
    team_pred = [[poisson.pmf(i, team_avg) for i in range(0, max_goals+1)] for team_avg in [home_goals_avg, away_goals_avg]]
    distribution = np.outer(np.array(team_pred[0]), np.array(team_pred[1])) # multiply distributions together
    
    # Get most likely score from the matrix
    global h, a
    h = np.argmax(distribution)//(max_goals + 1) 
    a = np.argmax(distribution)%(max_goals + 1) 
    output = homeTeam + ": " + str(h) + "\n" + awayTeam + ": " + str(a)
    
    return print(output)
    return(h, a)

simulate_match_output('Chelsea', 'Man City')

Chelsea: 0
Man City: 1


In [18]:
# Similar function, built to be iterated

def simulate_match_clean(homeTeam, awayTeam, max_goals=8, foot_model=poisson_model):
    # Predict avg goals
    home_goals_avg = foot_model.predict(pd.DataFrame(data={'team': homeTeam, 
                                                            'opponent': awayTeam,'home':1},
                                                      index=[1])).values[0]
    away_goals_avg = foot_model.predict(pd.DataFrame(data={'team': awayTeam, 
                                                            'opponent': homeTeam,'home':0},
                                                      index=[1])).values[0]
    
    team_pred = [[poisson.pmf(i, team_avg) for i in range(0, max_goals+1)] for team_avg in [home_goals_avg, away_goals_avg]]
    distribution = np.outer(np.array(team_pred[0]), np.array(team_pred[1])) # *multiply distributions together
    
    # Get most likely score
    global h, a
    h = np.argmax(distribution)//(max_goals + 1) 
    a = np.argmax(distribution)%(max_goals + 1) 
    
    return(h, a)

simulate_match_clean('Chelsea', 'Man City')

(0, 1)

In [40]:
# Simulate matches for any given PL seaason
# Takes in dataset as input

def simulate_test(x):
    
    data = x
    
    data['HomePred'] = [0] * len(data)
    data['AwayPred'] = [0] * len(data)
    
    for i in range(len(data)):
        homeTeam = data['HomeTeam'][i]
        awayTeam = data['AwayTeam'][i]
        
        simulate_match_clean(homeTeam, awayTeam)
        data.loc[i, 'HomePred'] = int(h)
        data.loc[i, 'AwayPred'] = int(a)
    
    data = pd.DataFrame.from_dict(data)
    
    return(data)

epl_1819_post = simulate_test(epl_1819)
epl_1819_post

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals,Result,HomePred,AwayPred
0,Man United,Leicester,2,1,H,2,1
1,Bournemouth,Cardiff,2,0,H,2,0
2,Fulham,Crystal Palace,0,2,A,1,1
3,Huddersfield,Chelsea,0,3,A,0,2
4,Newcastle,Tottenham,1,2,A,0,1
...,...,...,...,...,...,...,...
345,Chelsea,Burnley,2,2,D,2,0
346,Tottenham,Brighton,1,0,H,2,0
347,Watford,Southampton,1,1,D,1,1
348,Man United,Man City,0,2,A,0,2


In [46]:
def update_df(data):
    
    ResultPred = []
    
    for i in data['HomePred']:
        if i > data['AwayPred'][i]:
            ResultPred.append('H')
        elif i == data['AwayPred'][i]:
            ResultPred.append('D')
        else:
            ResultPred.append('A')
        
    data = pd.concat([data, pd.Series(ResultPred)], axis=1).rename({0: 'ResultPred'}, axis =1)
    
    correctFTR = []
    
    for i in range(len(data)):
        if data['ResultPred'][i] is str(data['Result'][i]):
            correctFTR.append(True)
        else:
            correctFTR.append(False)
        
    data = pd.concat([data, pd.Series(correctFTR)], axis=1).rename({0: 'correctFTR'}, axis =1)
    
    return data

update_df(epl_1819_post)

Unnamed: 0,HomeTeam,AwayTeam,HomeGoals,AwayGoals,Result,HomePred,AwayPred,ResultPred,correctFTR
0,Man United,Leicester,2,1,H,2,1,H,True
1,Bournemouth,Cardiff,2,0,H,2,0,H,True
2,Fulham,Crystal Palace,0,2,A,1,1,H,False
3,Huddersfield,Chelsea,0,3,A,0,2,A,True
4,Newcastle,Tottenham,1,2,A,0,1,A,True
...,...,...,...,...,...,...,...,...,...
345,Chelsea,Burnley,2,2,D,2,0,H,False
346,Tottenham,Brighton,1,0,H,2,0,H,True
347,Watford,Southampton,1,1,D,1,1,H,False
348,Man United,Man City,0,2,A,0,2,A,True


### Plots

### Next Steps