# How to predict soccer results with the Poisson distribution
## A deep dive into the mechanics

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn
from scipy.stats import poisson,skellam

# Download the 2018/19 E0 results file and keep only the team names and the
# FTHG/FTAG columns, renamed to HomeGoals/AwayGoals.
epl_1819 = (
    pd.read_csv("http://www.football-data.co.uk/mmz4281/1819/E0.csv")
    [['HomeTeam','AwayTeam','FTHG','FTAG']]
    .rename(columns={'FTHG': 'HomeGoals', 'FTAG': 'AwayGoals'})
)

In [None]:
# Keep a handle on the full table, hold out the last 10 rows as the test set
# and continue with the remaining rows as training data.
epl_1819_master = epl_1819
epl_1819_test = epl_1819[-10:]
epl_1819 = epl_1819[:-10]
# Average goals per column. numeric_only=True skips the team-name columns,
# which otherwise make DataFrame.mean() raise a TypeError on pandas >= 2.0.
epl_1819.mean(numeric_only=True)

In [None]:
# Largest value in the HomeGoals and AwayGoals columns of the training rows.
epl_1819[['HomeGoals','AwayGoals']].max()

In [None]:
# Poisson probability of exactly 1 event when the expected count is 1.575676.
# NOTE(review): 1.575676 is presumably the HomeGoals mean printed by the
# earlier cell - confirm it still matches the data.
poisson.pmf(1,1.575676)

In [None]:
import plotly.graph_objects as go

# X-axis values for the Poisson lines. They are derived from the same range
# the probabilities are computed over, so x and y always have equal length
# instead of relying on a hard-coded list of nine labels.
goal_range = range(epl_1819['HomeGoals'].max()+3)
goals = [str(i) for i in goal_range]

# Poisson predictions as percentage.
# The means are selected by column label and computed once each; calling
# epl_1819.mean()[j] fails on pandas >= 2.0 (non-numeric team columns) and
# indexes a label-indexed Series by position.
goal_means = [epl_1819['HomeGoals'].mean(), epl_1819['AwayGoals'].mean()]
probs_goals = [[poisson.pmf(i, goal_mean) for i in goal_range] for goal_mean in goal_means]
probs_home_new = [p*100 for p in probs_goals[0]]
probs_away_new = [p*100 for p in probs_goals[1]]

# Observed distributions as percentage histograms.
fig = go.Figure()
fig.add_trace(go.Histogram(
    x=epl_1819['HomeGoals'],
    histnorm='percent',
    name='Home',
    marker_color='#EB89B5',
    opacity=0.50
))
fig.add_trace(go.Histogram(
    x=epl_1819['AwayGoals'],
    histnorm='percent',
    name='Away',
    marker_color='#330C73',
    opacity=0.50
))

# Poisson predictions drawn as dashed lines in the matching colours.
fig.add_trace(go.Scatter(x=goals, y=probs_home_new,
                    mode='lines+markers',
                    line = dict(color='#EB89B5', width=4, dash='dash'),
                    name='Home Poisson', ))

fig.add_trace(go.Scatter(x=goals, y=probs_away_new,
                    mode='lines+markers',
                    line = dict(color='#330C73', width=4, dash='dash'),
                    name='Away Poisson'))

fig.update_layout(title='Real number of goals per match vs Poisson prediction (Premier League 2018/19)',
                   xaxis_title='Goals',
                   yaxis_title='Share in %',
                  barmode='group')
#fig.show()

## Predict a match of the Premier League

In [None]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Reshape the results into one row per team per match: each fixture
# contributes a row for the home side (home=1) and one for the away side
# (home=0), both with the columns team / opponent / goals.
home_rows = (
    epl_1819[['HomeTeam','AwayTeam','HomeGoals']]
    .assign(home=1)
    .rename(columns={'HomeTeam':'team', 'AwayTeam':'opponent','HomeGoals':'goals'})
)
away_rows = (
    epl_1819[['AwayTeam','HomeTeam','AwayGoals']]
    .assign(home=0)
    .rename(columns={'AwayTeam':'team', 'HomeTeam':'opponent','AwayGoals':'goals'})
)
goal_model_data = pd.concat([home_rows, away_rows])

# Poisson GLM of goals on the home flag, the team and the opponent.
poisson_model = smf.glm(
    formula="goals ~ home + team + opponent",
    data=goal_model_data,
    family=sm.families.Poisson(),
).fit()
#poisson_model.summary()

In [None]:
# Expected goals are e raised to the sum of the relevant coefficients.
# np.exp is used rather than the truncated constant 2.718 so the result is
# not biased by the approximation of e.
# NOTE(review): the coefficients below look hand-copied from
# poisson_model.summary() - confirm they match the current fit.
liverpool_home_goals = np.exp(0.4925+0.1849-0.2537+0.2526)
chelsea_away_goals = np.exp(0.4925-0.1249-0.8112)
display(liverpool_home_goals)
display(chelsea_away_goals)

## Testing the last matchday

In [None]:
# Renumber the held-out rows from 0 so the prediction loop below can look
# them up with .loc[n]; the previous index is kept as an 'index' column.
epl_1819_test = epl_1819_test.reset_index()

In [None]:
def simulate_match(foot_model, homeTeam, awayTeam, max_goals=10):
    """Return the score-line probability matrix for a single fixture.

    ``foot_model.predict`` is called once per side with a one-row frame
    (columns ``team``, ``opponent``, ``home``) to obtain that side's expected
    goals. Each expectation is turned into Poisson probabilities for
    0..``max_goals`` goals, and the two vectors are combined with an outer
    product.

    Args:
        foot_model: object exposing ``predict(DataFrame)``.
        homeTeam: name of the home team.
        awayTeam: name of the away team.
        max_goals: highest goal count per side included in the matrix.

    Returns:
        Array of shape ``(max_goals + 1, max_goals + 1)``; entry ``[i, j]`` is
        P(home scores i) * P(away scores j).
    """
    def expected_goals(team, opponent, home_flag):
        # One-row frame describing the fixture from ``team``'s point of view.
        fixture = pd.DataFrame(
            data={'team': team, 'opponent': opponent, 'home': home_flag},
            index=[1],
        )
        return foot_model.predict(fixture).values[0]

    home_rate = expected_goals(homeTeam, awayTeam, 1)
    away_rate = expected_goals(awayTeam, homeTeam, 0)

    goal_counts = range(0, max_goals + 1)
    home_probs = np.array([poisson.pmf(k, home_rate) for k in goal_counts])
    away_probs = np.array([poisson.pmf(k, away_rate) for k in goal_counts])
    return np.outer(home_probs, away_probs)


In [None]:
from numpy import unravel_index

# Accumulators for the per-match results table.
home_match = []
away_match = []
home_goals = []
away_goals = []
pred_list_home = []
pred_list_away = []

# Walk the held-out fixtures column-wise. Iterating the values directly
# (instead of range(len(...)) with .loc[n]) does not depend on the frame
# having a 0..n-1 index.
for home_team, away_team, home_scored, away_scored in zip(
        epl_1819_test['HomeTeam'], epl_1819_test['AwayTeam'],
        epl_1819_test['HomeGoals'], epl_1819_test['AwayGoals']):

    # Score-line probability matrix; its largest cell is the predicted score.
    score_probs = simulate_match(poisson_model, home_team, away_team, max_goals=7)
    predicted_home, predicted_away = unravel_index(score_probs.argmax(), score_probs.shape)

    home_match.append(home_team)
    away_match.append(away_team)
    home_goals.append(home_scored)
    away_goals.append(away_scored)
    pred_list_home.append(predicted_home)
    pred_list_away.append(predicted_away)

final_df = pd.DataFrame({'Home': home_match, 'Away': away_match,
                         'Home Goals': home_goals, 'Away Goals': away_goals,
                         'Home Prediction': pred_list_home, 'Away Prediction': pred_list_away})

In [None]:
def transform_df(mod_df):
    """Annotate a results/predictions frame in place and return it.

    Expects the columns 'Home Goals', 'Away Goals', 'Home Prediction' and
    'Away Prediction'. Adds:
      * '1x2'       - actual outcome ('1' home win, 'X' draw, '2' away win)
      * 'Home Prediction Int' / 'Away Prediction Int' - rounded predictions
      * '1x2_Pred'  - outcome implied by the rounded predictions
      * 'Exact_Res' - 'YES' when both rounded predictions equal the goals
      * 'Match'     - 'Yes' when '1x2' equals '1x2_Pred', otherwise 'No'
    """
    def label_outcome(home_col, away_col, target_col):
        # Write the 1/X/2 label for every row into ``target_col``.
        home, away = mod_df[home_col], mod_df[away_col]
        mod_df.loc[home > away, target_col] = '1'
        mod_df.loc[home == away, target_col] = 'X'
        mod_df.loc[home < away, target_col] = '2'

    label_outcome('Home Goals', 'Away Goals', '1x2')

    for side in ('Home', 'Away'):
        mod_df[side + ' Prediction Int'] = mod_df[side + ' Prediction'].round(0).astype(int)
    label_outcome('Home Prediction Int', 'Away Prediction Int', '1x2_Pred')

    # Default to 'NO', then flip the rows where both scores were hit exactly.
    mod_df['Exact_Res'] = 'NO'
    exact_hit = ((mod_df['Home Goals'] == mod_df['Home Prediction Int']) &
                 (mod_df['Away Goals'] == mod_df['Away Prediction Int']))
    mod_df.loc[exact_hit, 'Exact_Res'] = 'YES'

    same_outcome = mod_df['1x2'] == mod_df['1x2_Pred']
    different_outcome = mod_df['1x2'] != mod_df['1x2_Pred']
    mod_df.loc[same_outcome, 'Match'] = 'Yes'
    mod_df.loc[different_outcome, 'Match'] = 'No'
    return mod_df

In [None]:
# Adds the outcome / prediction / match columns to final_df in place; the
# returned frame is only used for display here.
transform_df(final_df)

In [None]:
# Recomputes the 'Match' column. transform_df already sets it with the same
# two assignments, so this cell repeats that step.
final_df.loc[final_df['1x2'] == final_df['1x2_Pred'], 'Match'] = 'Yes'
final_df.loc[final_df['1x2'] != final_df['1x2_Pred'], 'Match'] = 'No'

In [None]:
# List the columns now present on final_df.
final_df.columns

In [None]:
# Slimmed-down view without the rounded-prediction and exact-result columns.
final_df_s = final_df[['Home', 'Away', 'Home Goals', 'Away Goals', 'Home Prediction',
       'Away Prediction', '1x2',
       '1x2_Pred', 'Match']]

In [None]:
final_df_s

In [None]:
# Number of rows per 'Match' value ('Yes' / 'No').
final_df.groupby('Match')['Match'].count()

## Test for all 5 top leagues

In [None]:
def loading_dfs(leagues,league_dfs,years,test_size=100):
    """Download one season of results per league and split each into train/test.

    Args:
        leagues: division codes used in the download URL (e.g. 'E0').
        league_dfs: list with at least ``len(leagues)`` slots; slot ``n`` is
            overwritten in place with the training frame of ``leagues[n]``.
        years: starting year of the season as a number (18 -> "1819"). It is
            now actually used; previously a global ``year`` was read instead.
        test_size: number of trailing rows per league held out for testing.

    Returns:
        ``(league_dfs, league_dfs_test)``: the training frames and the
        matching held-out frames (index reset, old index kept as a column).
        The test list is built locally rather than written into a global.
    """
    # Zero-padded and wrapped so seasons such as 09/10 or 99/00 also produce
    # a valid four-digit season code.
    season = "{:02d}{:02d}".format(years % 100, (years + 1) % 100)

    league_dfs_test = []
    for n, league in enumerate(leagues):
        results = pd.read_csv("http://www.football-data.co.uk/mmz4281/{}/{}.csv".format(season, league))
        results = results[['Div','HomeTeam','AwayTeam','FTHG','FTAG']]
        results = results.rename(columns={'FTHG': 'HomeGoals', 'FTAG': 'AwayGoals'})

        # Explicit split position keeps test_size=0 and test_size > len(results)
        # well defined (a plain [-0:] slice would return every row).
        split_at = max(len(results) - test_size, 0)
        league_dfs_test.append(results.iloc[split_at:].reset_index())
        league_dfs[n] = results.iloc[:split_at]
    return league_dfs, league_dfs_test

def modelling_dfs(league_df):
    """Fit one Poisson goal model per league.

    Args:
        league_df: sequence whose first item is the list of training frames,
            each with the columns HomeTeam, AwayTeam, HomeGoals, AwayGoals.

    Returns:
        List of fitted GLM results, in the same order as ``league_df[0]``.
    """
    # Imported once per call instead of on every loop iteration.
    import statsmodels.api as sm
    import statsmodels.formula.api as smf

    psn_modells = []
    for training_df in league_df[0]:
        # One row per team per match: home side flagged home=1, away side home=0.
        home_rows = training_df[['HomeTeam','AwayTeam','HomeGoals']].assign(home=1).rename(
            columns={'HomeTeam':'team', 'AwayTeam':'opponent','HomeGoals':'goals'})
        away_rows = training_df[['AwayTeam','HomeTeam','AwayGoals']].assign(home=0).rename(
            columns={'AwayTeam':'team', 'HomeTeam':'opponent','AwayGoals':'goals'})
        goal_model_data = pd.concat([home_rows, away_rows])

        poisson_model = smf.glm(formula="goals ~ home + team + opponent", data=goal_model_data,
                                family=sm.families.Poisson()).fit()
        psn_modells.append(poisson_model)
    return psn_modells

def transform_df(mod_df):
    """Add outcome, rounded-prediction and hit/miss columns to ``mod_df``.

    The frame is modified in place and also returned. Required input columns:
    'Home Goals', 'Away Goals', 'Home Prediction', 'Away Prediction'.
    Columns written: '1x2', 'Home Prediction Int', 'Away Prediction Int',
    '1x2_Pred', 'Exact_Res' ('YES'/'NO') and 'Match' ('Yes'/'No').
    """
    # Outcome label paired with the comparison that selects its rows.
    outcome_rules = (
        ('1', lambda home, away: home > away),
        ('X', lambda home, away: home == away),
        ('2', lambda home, away: home < away),
    )

    # Actual result as 1 / X / 2.
    for label, applies in outcome_rules:
        mod_df.loc[applies(mod_df['Home Goals'], mod_df['Away Goals']), '1x2'] = label

    # Predicted result, based on the predictions rounded to whole goals.
    mod_df['Home Prediction Int'] = mod_df['Home Prediction'].round(0).astype(int)
    mod_df['Away Prediction Int'] = mod_df['Away Prediction'].round(0).astype(int)
    for label, applies in outcome_rules:
        rows = applies(mod_df['Home Prediction Int'], mod_df['Away Prediction Int'])
        mod_df.loc[rows, '1x2_Pred'] = label

    # Exact score hit: both sides' rounded predictions equal the real goals.
    mod_df['Exact_Res'] = 'NO'
    home_hit = mod_df['Home Goals'] == mod_df['Home Prediction Int']
    away_hit = mod_df['Away Goals'] == mod_df['Away Prediction Int']
    mod_df.loc[home_hit & away_hit, 'Exact_Res'] = 'YES'

    # Outcome hit: predicted 1/X/2 equals the actual 1/X/2.
    mod_df.loc[mod_df['1x2'] == mod_df['1x2_Pred'], 'Match'] = 'Yes'
    mod_df.loc[mod_df['1x2'] != mod_df['1x2_Pred'], 'Match'] = 'No'
    return mod_df

def predicting_dfs(league_df,psn_modells):
    """Predict the most likely score for every held-out match of every league.

    Args:
        league_df: sequence whose second item is the list of held-out frames
            (columns HomeTeam, AwayTeam, HomeGoals, AwayGoals), one per league.
        psn_modells: fitted models, one per league, in the same order.

    Returns:
        List with one frame per league holding teams, actual goals, predicted
        goals and the columns added by ``transform_df``.
    """
    # Imported once per call instead of on every loop iteration.
    from numpy import unravel_index

    prediction_dfs = []
    for fixtures, model in zip(league_df[1], psn_modells):
        home_match = []
        away_match = []
        home_goals = []
        away_goals = []
        pred_list_home = []
        pred_list_away = []

        # Iterate the column values directly; unlike range(len(...)) with
        # .loc[n] this does not require the frame to have a 0..n-1 index.
        for home_team, away_team, home_scored, away_scored in zip(
                fixtures['HomeTeam'], fixtures['AwayTeam'],
                fixtures['HomeGoals'], fixtures['AwayGoals']):

            # The largest cell of the score-line matrix is the predicted score.
            score_probs = simulate_match(model, home_team, away_team, max_goals=7)
            predicted_home, predicted_away = unravel_index(score_probs.argmax(), score_probs.shape)

            home_match.append(home_team)
            away_match.append(away_team)
            home_goals.append(home_scored)
            away_goals.append(away_scored)
            pred_list_home.append(predicted_home)
            pred_list_away.append(predicted_away)

        final_df = pd.DataFrame({'Home': home_match, 'Away': away_match,
                                 'Home Goals': home_goals, 'Away Goals': away_goals,
                                 'Home Prediction': pred_list_home, 'Away Prediction': pred_list_away})

        final_df = transform_df(final_df)

        prediction_dfs.append(final_df)

    return prediction_dfs

In [None]:
# Division codes of the five leagues to load.
leagues = ['E0','SP1','D1','I1','F1']
# Placeholder lists with one named entry per league.
league_dfs = ['{}_df'.format(code) for code in leagues]
league_dfs_test = ['{}_df_test'.format(code) for code in leagues]
# Two-digit starting year of the season.
year = 18

# Train/test frames for every league.
league_df = loading_dfs(leagues,league_dfs,year)

In [None]:
psn_modells = modelling_dfs(league_df)

In [None]:
predicting_dfs = predicting_dfs(league_df,psn_modells)

In [None]:
correct_pl = predicting_dfs[0].groupby('Match')['Match'].count()[1]
correct_sp = predicting_dfs[1].groupby('Match')['Match'].count()[1]
correct_ger = predicting_dfs[2].groupby('Match')['Match'].count()[1]
correct_it = predicting_dfs[3].groupby('Match')['Match'].count()[1]
correct_fr = predicting_dfs[4].groupby('Match')['Match'].count()[1]
incorrect_pl = predicting_dfs[0].groupby('Match')['Match'].count()[0]
incorrect_sp = predicting_dfs[1].groupby('Match')['Match'].count()[0]
incorrect_ger = predicting_dfs[2].groupby('Match')['Match'].count()[0]
incorrect_it = predicting_dfs[3].groupby('Match')['Match'].count()[0]
incorrect_fr = predicting_dfs[4].groupby('Match')['Match'].count()[0]

In [None]:
yes = correct_pl + correct_sp + correct_ger + correct_it + correct_fr
no = incorrect_pl + incorrect_sp + incorrect_ger + incorrect_it + incorrect_fr

In [None]:
round((yes/(yes+no)*100),2)

In [None]:
league_dic = ['England','Spain','Germany','Italy','France']
league = []
wins_per_matchday = []
lose_per_matchday = []
matchday = []

for k in range(0,len(league_dic)):
    for n in range(0,100,10):
        matchday.append((n+10)/10)
        league.append(league_dic[k])
        wins_per_matchday.append(predicting_dfs[k][0+n:10+n].groupby('Match')['Match'].count()[1])
        lose_per_matchday.append(predicting_dfs[k][0+n:10+n].groupby('Match')['Match'].count()[0])
        
df_vis = pd.DataFrame({'League': league, 'Wins': wins_per_matchday, 'Loses': lose_per_matchday, 'Matchday': matchday})

In [None]:
bla = pd.DataFrame(df_vis.groupby(['League','Matchday'])['Wins'].sum()).reset_index()

In [None]:
bla2 = bla.groupby('Matchday').sum().reset_index()

In [None]:
import plotly.express as px
league_dic = ['England','Spain','Germany','Italy','France']
# Bars share one colour and are told apart by opacity, in league order.
league_opacities = [0.20, 0.40, 0.60, 0.80, 1.00]

# One bar trace per league. x and y come from the same filtered rows so they
# always have equal length; passing the full bla['Matchday'] column as x
# paired 50 x values with 10 y values.
league_bars = []
for league_name, league_opacity in zip(league_dic, league_opacities):
    league_rows = bla.loc[bla['League'] == league_name]
    league_bars.append(go.Bar(name=league_name, x=league_rows['Matchday'], y=league_rows['Wins'],
                              marker_color='#330C73', opacity=league_opacity))

fig = go.Figure(data=league_bars)

# Average number of correct predictions per matchday across the leagues.
fig.add_trace(go.Scatter(x=bla2['Matchday'], y=bla2['Wins']/len(league_dic),
                    mode='lines',
                         line = dict(color='Orange', width=4),
                    name='Average'))

fig.update_layout(title='Correct predicted matches out of 10 per matchday',
                   xaxis_title='-- Matchday ->',
                   yaxis_title='Correct predicted matches',
                  barmode='group')
#fig.show()