### Predicting Football Results Using Bayesian Modelling with Python and PyMC3
https://pena.lt/y/2021/08/25/predicting-football-results-using-bayesian-statistics-with-python-and-pymc3/

In [2]:
import pandas as pd
import numpy as np
from penaltyblog.scrapers import ESPN

In [3]:
espn = ESPN("ENG Premier League", "2019-2020")

df = espn.get_fixtures()

df.head()

Unnamed: 0_level_0,espn_id,datetime,attendance,team_home,team_away,goals_home,goals_away,appearances_home,fouls_committed_home,won_corners_home,...,won_corners_away,goal_assists_away,possession_pct_away,shot_assists_away,shots_on_target_away,total_goals_away,total_shots_away,season,competition,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1565308800---liverpool---norwich_city,541844,2019-08-09 19:00:00+00:00,53333,Liverpool,Norwich City,4,1,0,9,11,...,2,1,42.1,10,5,1,12,2019-2020,ENG Premier League,2019-08-09
1565395200---afc_bournemouth---sheffield_united,541840,2019-08-10 14:00:00+00:00,10714,AFC Bournemouth,Sheffield United,1,1,0,10,3,...,4,0,47.1,5,3,1,8,2019-2020,ENG Premier League,2019-08-10
1565395200---burnley---southampton,541841,2019-08-10 14:00:00+00:00,19784,Burnley,Southampton,3,0,0,6,2,...,7,0,53.5,5,3,0,11,2019-2020,ENG Premier League,2019-08-10
1565395200---crystal_palace---everton,541839,2019-08-10 14:00:00+00:00,25151,Crystal Palace,Everton,0,0,0,16,6,...,2,0,64.6,7,3,0,10,2019-2020,ENG Premier League,2019-08-10
1565395200---tottenham_hotspur---aston_villa,541837,2019-08-10 16:30:00+00:00,60407,Tottenham Hotspur,Aston Villa,3,1,0,13,14,...,0,1,29.9,7,4,1,7,2019-2020,ENG Premier League,2019-08-10


In [4]:
df.dtypes

espn_id                              object
datetime                datetime64[ns, UTC]
attendance                            int64
team_home                            object
team_away                            object
goals_home                            int64
goals_away                            int64
appearances_home                     object
fouls_committed_home                  int64
won_corners_home                      int64
goal_assists_home                     int64
possession_pct_home                 float64
shot_assists_home                     int64
shots_on_target_home                  int64
total_goals_home                      int64
total_shots_home                      int64
appearances_away                     object
fouls_committed_away                  int64
won_corners_away                      int64
goal_assists_away                     int64
possession_pct_away                 float64
shot_assists_away                     int64
shots_on_target_away            

In [5]:
def log_likelihood(
    goals_home_observed,
    goals_away_observed,
    home_attack,
    home_defence,
    away_attack,
    away_defence,
    home_advantage,
    rho,
    weight
):
    goal_expectation_home = np.exp(home_attack + away_defence + home_advantage)
    goal_expectation_away = np.exp(away_attack + home_defence)

    home_llk = poisson.pmf(goals_home_observed, goal_expectation_home)
    away_llk = poisson.pmf(goals_away_observed, goal_expectation_away)
    adj_llk = rho_correction(
        goals_home_observed,
        goals_away_observed,
        goal_expectation_home,
        goal_expectation_away,
        rho,
    )

    if goal_expectation_home < 0 or goal_expectation_away < 0 or adj_llk < 0:
        return 10000

    log_llk = weight * (np.log(home_llk) + np.log(away_llk) + np.log(adj_llk))

    return -log_llk

In [6]:
def fit_poisson_model(df, xi=0.0001):
    teams = np.sort(np.unique(np.concatenate([df["team_home"], df["team_away"]])))
    n_teams = len(teams)

    df["days_since"] = (df["datetime"].max() - df["datetime"]).dt.days
    df["weight"] = dc_decay(xi, df["days_since"])

    params = np.concatenate(
        (
            np.random.uniform(0.5, 1.5, (n_teams)),  # attack strength
            np.random.uniform(0, -1, (n_teams)),  # defence strength
            [0.25],  # home advantage
            [-0.1],  # rho
        )
    )

    def _fit(params, df, teams):
        attack_params = dict(zip(teams, params[:n_teams]))
        defence_params = dict(zip(teams, params[n_teams : (2 * n_teams)]))
        home_advantage = params[-2]
        rho = params[-1]

        llk = list()
        for idx, row in df.iterrows():
            tmp = log_likelihood(
                row["fthg"],
                row["ftag"],
                attack_params[row["team_home"]],
                defence_params[row["team_home"]],
                attack_params[row["team_away"]],
                defence_params[row["team_away"]],
                home_advantage,
                rho,
                row["weight"],
            )
            llk.append(tmp)

        return np.sum(llk)

    options = {
        "maxiter": 100,
        "disp": False,
    }

    constraints = [{"type": "eq", "fun": lambda x: sum(x[:n_teams]) - n_teams}]

    res = minimize(
        _fit,
        params,
        args=(df, teams),
        constraints=constraints,
        options=options,
    )

    model_params = dict(
        zip(
            ["attack_" + team for team in teams]
            + ["defence_" + team for team in teams]
            + ["home_adv", "rho"],
            res["x"],
        )
    )

    return model_params

In [7]:
def predict(params, home_team, away_team):
    home_attack = params["attack_" + home_team]
    home_defence = params["defence_" + home_team]
    away_attack = params["attack_" + away_team]
    away_defence = params["defence_" + away_team]
    home_advantage = params["home_adv"]
    rho = params["rho"]

    home_goal_expectation = np.exp(home_attack + away_defence + home_advantage)
    away_goal_expectation = np.exp(away_attack + home_defence)

    home_probs = poisson.pmf(range(10), home_goal_expectation)
    away_probs = poisson.pmf(range(10), away_goal_expectation)

    m = np.outer(home_probs, away_probs)

    m[0, 0] *= 1 - home_goal_expectation * away_goal_expectation * rho
    m[0, 1] *= 1 + home_goal_expectation * rho
    m[1, 0] *= 1 + away_goal_expectation * rho
    m[1, 1] *= 1 - rho    

    home = np.sum(np.tril(m, -1))
    draw = np.sum(np.diag(m))
    away = np.sum(np.triu(m, 1))

    return home, draw, away

In [8]:
def calculate_rps(params, df):
    rps = list()
    for idx, row in df.iterrows():
        if row["FTR"] == "H":
            outcome = 0
        elif row["FTR"] == "D":
            outcome = 1
        elif row["FTR"] == "A":
            outcome = 2  

        predictions = predict(params, row["team_home"], row["team_away"])
        rps.append(pb.metrics.rps(predictions, outcome))

    return np.mean(rps)

In [9]:
df.iloc[-380:]

Unnamed: 0_level_0,espn_id,datetime,attendance,team_home,team_away,goals_home,goals_away,appearances_home,fouls_committed_home,won_corners_home,...,won_corners_away,goal_assists_away,possession_pct_away,shot_assists_away,shots_on_target_away,total_goals_away,total_shots_away,season,competition,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1565308800---liverpool---norwich_city,541844,2019-08-09 19:00:00+00:00,53333,Liverpool,Norwich City,4,1,0,9,11,...,2,1,42.1,10,5,1,12,2019-2020,ENG Premier League,2019-08-09
1565395200---afc_bournemouth---sheffield_united,541840,2019-08-10 14:00:00+00:00,10714,AFC Bournemouth,Sheffield United,1,1,0,10,3,...,4,0,47.1,5,3,1,8,2019-2020,ENG Premier League,2019-08-10
1565395200---burnley---southampton,541841,2019-08-10 14:00:00+00:00,19784,Burnley,Southampton,3,0,0,6,2,...,7,0,53.5,5,3,0,11,2019-2020,ENG Premier League,2019-08-10
1565395200---crystal_palace---everton,541839,2019-08-10 14:00:00+00:00,25151,Crystal Palace,Everton,0,0,0,16,6,...,2,0,64.6,7,3,0,10,2019-2020,ENG Premier League,2019-08-10
1565395200---tottenham_hotspur---aston_villa,541837,2019-08-10 16:30:00+00:00,60407,Tottenham Hotspur,Aston Villa,3,1,0,13,14,...,0,1,29.9,7,4,1,7,2019-2020,ENG Premier League,2019-08-10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595721600---leicester_city---manchester_united,541469,2020-07-26 15:00:00+00:00,0,Leicester City,Manchester United,0,2,0,12,3,...,3,0,54.2,4,3,2,7,2019-2020,ENG Premier League,2020-07-26
1595721600---manchester_city---norwich_city,541470,2020-07-26 15:00:00+00:00,0,Manchester City,Norwich City,5,0,0,7,9,...,0,0,26.9,4,4,0,5,2019-2020,ENG Premier League,2020-07-26
1595721600---newcastle_united---liverpool,541468,2020-07-26 15:00:00+00:00,0,Newcastle United,Liverpool,1,3,0,11,2,...,4,3,74.7,10,6,3,14,2019-2020,ENG Premier League,2020-07-26
1595721600---southampton---sheffield_united,541471,2020-07-26 15:00:00+00:00,0,Southampton,Sheffield United,3,1,0,9,9,...,1,1,27.9,5,3,1,5,2019-2020,ENG Premier League,2020-07-26


In [12]:
train = df.iloc[-380:]
(train["datetime"].max() - train["datetime"]).dt.days

id
1565308800---liverpool---norwich_city              351
1565395200---afc_bournemouth---sheffield_united    351
1565395200---burnley---southampton                 351
1565395200---crystal_palace---everton              351
1565395200---tottenham_hotspur---aston_villa       350
                                                  ... 
1595721600---leicester_city---manchester_united      0
1595721600---manchester_city---norwich_city          0
1595721600---newcastle_united---liverpool            0
1595721600---southampton---sheffield_united          0
1595721600---west_ham_united---aston_villa           0
Name: datetime, Length: 380, dtype: int64

In [11]:
from pprint import pprint
import numpy as np
from scipy.optimize import minimize
from scipy.stats import poisson

xis = [0, 0.00005, 0.0001, 0.001, 0.0025,]
rps = list()
for xi in xis:
    train = df.iloc[-380:]
    test = df.iloc[-380:]

    train.iloc["days_since"] = (train["datetime"].max() - train["datetime"]).dt.days
    train["weight"] = dc_decay(xi, train["days_since"])

    params = fit_poisson_model(train, xi)
    rps.append(calculate_rps(params, test))

plt.plot(xis, rps)
plt.ticklabel_format(useOffset=False)
plt.xlabel("xi")
plt.ylabel("RPS")   

ValueError: Must have equal len keys and value when setting with an iterable