## Predicting Premier League Results

Can we do it? Probably not. But anyway.

Setup imports etc. first

In [81]:
import os
import pandas as pd
from IPython.display import display
from pandas.api.types import is_string_dtype

from fastai.tabular.transform import add_datepart

First, load the raw data

In [17]:
PATH = "data/"
pd.set_option("display.max_rows", 40)

raw_dataframe = pd.read_csv(f'{PATH}EPL_Set.csv', low_memory=False, parse_dates=["Date"])
display(raw_dataframe)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Season
0,E0,1993-08-14,Arsenal,Coventry,0,3,A,,,,1993-94
1,E0,1993-08-14,Aston Villa,QPR,4,1,H,,,,1993-94
2,E0,1993-08-14,Chelsea,Blackburn,1,2,A,,,,1993-94
3,E0,1993-08-14,Liverpool,Sheffield Weds,2,0,H,,,,1993-94
4,E0,1993-08-14,Man City,Leeds,1,1,D,,,,1993-94
...,...,...,...,...,...,...,...,...,...,...,...
9659,E0,2018-05-13,Newcastle,Chelsea,3,0,H,1.0,0.0,H,2017-18
9660,E0,2018-05-13,Southampton,Man City,0,1,A,0.0,0.0,D,2017-18
9661,E0,2018-05-13,Swansea,Stoke,1,2,A,1.0,2.0,A,2017-18
9662,E0,2018-05-13,Tottenham,Leicester,5,4,H,1.0,2.0,A,2017-18


Now let's remove data we do not need (we don't want half-time data, as we want to be able to predict before a game has even started

In [18]:
cleaned_dataframe = raw_dataframe.drop(["Div", "HTHG", "HTAG", "HTR"], axis=1)
display(cleaned_dataframe)

os.makedirs('tmp', exist_ok=True)
cleaned_dataframe.to_feather('tmp/epl-cleaned')

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,Season
0,1993-08-14,Arsenal,Coventry,0,3,A,1993-94
1,1993-08-14,Aston Villa,QPR,4,1,H,1993-94
2,1993-08-14,Chelsea,Blackburn,1,2,A,1993-94
3,1993-08-14,Liverpool,Sheffield Weds,2,0,H,1993-94
4,1993-08-14,Man City,Leeds,1,1,D,1993-94
...,...,...,...,...,...,...,...
9659,2018-05-13,Newcastle,Chelsea,3,0,H,2017-18
9660,2018-05-13,Southampton,Man City,0,1,A,2017-18
9661,2018-05-13,Swansea,Stoke,1,2,A,2017-18
9662,2018-05-13,Tottenham,Leicester,5,4,H,2017-18


Now let's add derived data - calculate each team's form and matchday

In [229]:
import math
import numpy as np

augmented_dataframe = pd.read_feather('tmp/epl-cleaned')
add_datepart(augmented_dataframe, "Date")
augmented_dataframe = augmented_dataframe.drop(["Elapsed"], axis=1)

def form_score(results, length, default_value=0):
    """
    Calculate the recent form of this team, given a list of results and the # of games asked for
    """
    if not results.size:
        return default_value
    return sum(results[-length:])/len(results[-length:])

K = 16

def elo_score(R_home, R_away, result):
    """
    Calculates the Elo score for the home & away teams in a match
    Based on Elo with two modifications:
     - Expected home & away adjusted for home advantage
     - Goal difference taken into account
    https://www.eloratings.net/about
    """
        
    Q_home = 10**((R_home + 50)/400)
    Q_away = 10**((R_away - 50)/400)
    
    E_home = Q_home/(Q_home + Q_away)
    E_away = Q_away/(Q_home + Q_away)
    
    if result > 0:
        S_home = 1
        S_away = 0
    elif result == 0:
        S_home = 0.5
        S_away = 0.5
    elif result < 0:
        S_home = 0
        S_away = 1
        
    gd = abs(result)
    if gd < 2:
        k = K
    elif gd < 4:
        k = K*(1 + gd/4)
    else:
        k = K*(1.75 + (gd - 3)/8)
    
    R_home = R_home + k*(S_home - E_home)
    R_away = R_away + k*(S_away - E_away)
    return (R_home, R_away)

    
FORM_LENGTH = 10

matchdays = {}
elo = {}
home_form = {}
away_form = {}

for (i, row) in augmented_dataframe.iterrows():
        
    # Get teams
    home_team = row["HomeTeam"]
    away_team = row["AwayTeam"]
    
    # Get each team's respective matchday, form, and Elo score
    home_matchday = matchdays.get((home_team, row["Season"]), 1)
    away_matchday = matchdays.get((away_team, row["Season"]), 1)
    
    home_elo = elo.get(home_team, 1500 if row["Season"] == "1993-94" else 1400)
    away_elo = elo.get(away_team, 1500 if row["Season"] == "1993-94" else 1400)
    
    home_team_form = home_form.get(home_team, [])
    away_team_form = away_form.get(away_team, [])
    
    # Update the current row with these stats
    
    augmented_dataframe.at[i, "HomeMatchday"] = home_matchday
    augmented_dataframe.at[i, "AwayMatchday"] = away_matchday
        
    augmented_dataframe.at[i, "HomeElo"] = home_elo
    augmented_dataframe.at[i, "AwayElo"] = away_elo

    augmented_dataframe.at[i, "HomeForm"] = sum(home_team_form[-10:])
    augmented_dataframe.at[i, "AwayForm"] = sum(away_team_form[-10:])
    
    # Update the stats
    
    matchdays[(home_team, row["Season"])] = home_matchday + 1
    matchdays[(away_team, row["Season"])] = away_matchday + 1

    new_home_elo, new_away_elo = elo_score(home_elo, away_elo, row["FTHG"] - row["FTAG"])
    elo[home_team] = new_home_elo
    elo[away_team] = new_away_elo
    
    home_form[home_team] = home_team_form + [(new_home_elo - home_elo)]
    away_form[away_team] = away_team_form + [(new_away_elo - away_elo)]
            
augmented_dataframe = augmented_dataframe.drop(["Season"], axis=1)
augmented_dataframe.to_feather('tmp/epl-augmented')

display(augmented_dataframe)

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,Year,Month,Week,Day,Dayofweek,...,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,HomeMatchday,AwayMatchday,HomeElo,AwayElo,HomeForm,AwayForm
0,Arsenal,Coventry,0,3,A,1993,8,32,14,5,...,False,False,False,False,1.0,1.0,1500.000000,1500.000000,0.000000,0.000000
1,Aston Villa,QPR,4,1,H,1993,8,32,14,5,...,False,False,False,False,1.0,1.0,1500.000000,1500.000000,0.000000,0.000000
2,Chelsea,Blackburn,1,2,A,1993,8,32,14,5,...,False,False,False,False,1.0,1.0,1500.000000,1500.000000,0.000000,0.000000
3,Liverpool,Sheffield Weds,2,0,H,1993,8,32,14,5,...,False,False,False,False,1.0,1.0,1500.000000,1500.000000,0.000000,0.000000
4,Man City,Leeds,1,1,D,1993,8,32,14,5,...,False,False,False,False,1.0,1.0,1500.000000,1500.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9659,Newcastle,Chelsea,3,0,H,2018,5,19,13,6,...,False,False,False,False,38.0,38.0,1444.778743,1701.977359,11.539939,-6.661626
9660,Southampton,Man City,0,1,A,2018,5,19,13,6,...,False,False,False,False,38.0,38.0,1467.668590,1827.466509,-38.956622,31.722641
9661,Swansea,Stoke,1,2,A,2018,5,19,13,6,...,False,False,False,False,38.0,38.0,1430.460516,1418.054330,19.085539,-1.054051
9662,Tottenham,Leicester,5,4,H,2018,5,19,13,6,...,False,False,False,False,38.0,38.0,1749.140418,1510.624895,6.923503,4.689006


Now, let's have a go at classifying the data:

In [230]:
from sklearn import preprocessing
from sklearn.ensemble import forest, RandomForestClassifier

def split_vals(a, n):
    return a[:n].copy(), a[n:].copy()

final_dataframe = pd.read_feather('tmp/epl-augmented')

# Label results
result_encoder = preprocessing.LabelEncoder()
result_encoder.fit(final_dataframe.FTR)
final_dataframe.FTR = result_encoder.transform(augmented_dataframe.FTR)

# Label team names
team_encoder = preprocessing.LabelEncoder()
team_encoder.fit(final_dataframe.HomeTeam)
final_dataframe.HomeTeam = team_encoder.transform(augmented_dataframe.HomeTeam)
final_dataframe.AwayTeam = team_encoder.transform(augmented_dataframe.AwayTeam)

# Drop result, and H/A goals from the data
source_dataframe = final_dataframe.drop(["FTR", "FTHG", "FTAG"], axis=1)
training_size = len(source_dataframe) - 760

source_train, source_valid = split_vals(source_dataframe[110:], training_size)
result_train, result_valid = split_vals(augmented_dataframe.FTR[110:], training_size)

We need to redefine the RF sampling here as fastai v1 doesn't have them

In [146]:
def set_rf_samples(n):
    forest._generate_sample_indices = (lambda rs, n_samples:
            forest.check_random_state(rs).randint(0, n_samples, n))
    
def reset_rf_samples():
    forest._generate_sample_indices = (lambda rs, n_samples:
        forest.check_random_state(rs).randint(0, n_samples, n_samples))
    


Now, finally, we can run the classifier

In [240]:
reset_rf_samples()
set_rf_samples(2000)

classifier = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=0.5, min_samples_leaf=3, oob_score=True)
%time classifier.fit(source_train, result_train)

print ("""
CLASSIFICATION:

Training data score: {}
Validation data score: {}
Out-of-bag score: {}""".format(
    classifier.score(source_train, result_train),
    classifier.score(source_valid, result_valid),
    classifier.oob_score_,
))

CPU times: user 1.34 s, sys: 52.1 ms, total: 1.39 s
Wall time: 529 ms

CLASSIFICATION:

Training data score: 0.7022686433063792
Validation data score: 0.5784615384615385
Out-of-bag score: 0.5037061994609164


## Baseline

Without any Elo data - just teams and dates, validation score is about `0.51`.

## Current model

With random forest, using form, goals for & against & Elo score combined, accuracy of about `0.57`

## TODOs

* Add results since 1981 (when three points for a win was introduced)
* Use weighted average of Elo form?
