## Predicting Premier League Results

Can we do it? Probably not. But anyway.

Set up the imports first.

In [8]:
import os
import pandas as pd
from IPython.display import display
from pandas.api.types import is_string_dtype

from fastai.tabular.transform import add_datepart

First, load the raw data

In [9]:
# Folder that holds the raw CSV. The trailing slash matters because the file
# name is appended to it directly when building the path below.
PATH = "data/"

# Cap how many rows pandas shows when a frame is displayed.
pd.set_option("display.max_rows", 20)

# Read the match results, parsing the "Date" column as dates.
raw_dataframe = pd.read_csv(
    f'{PATH}EPL_Set.csv',
    parse_dates=["Date"],
    low_memory=False,
)
display(raw_dataframe)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Season
0,E0,1993-08-14,Arsenal,Coventry,0,3,A,,,,1993-94
1,E0,1993-08-14,Aston Villa,QPR,4,1,H,,,,1993-94
2,E0,1993-08-14,Chelsea,Blackburn,1,2,A,,,,1993-94
3,E0,1993-08-14,Liverpool,Sheffield Weds,2,0,H,,,,1993-94
4,E0,1993-08-14,Man City,Leeds,1,1,D,,,,1993-94
5,E0,1993-08-14,Newcastle,Tottenham,0,1,A,,,,1993-94
6,E0,1993-08-14,Oldham,Ipswich,0,3,A,,,,1993-94
7,E0,1993-08-14,Sheffield United,Swindon,3,1,H,,,,1993-94
8,E0,1993-08-14,Southampton,Everton,0,2,A,,,,1993-94
9,E0,1993-08-14,West Ham,Wimbledon,0,2,A,,,,1993-94


Now let's remove the data we do not need (we don't want half-time data, as we want to be able to predict before a game has even started).

In [10]:
# Drop the division code and the three half-time columns; the remaining
# columns are the date, the two teams, the full-time score/result and season.
cleaned_dataframe = raw_dataframe.drop(columns=["Div", "HTHG", "HTAG", "HTR"])
display(cleaned_dataframe)

# Persist the cleaned frame so later cells can start from the file on disk.
os.makedirs('tmp', exist_ok=True)
cleaned_dataframe.to_feather('tmp/epl-cleaned')

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,Season
0,1993-08-14,Arsenal,Coventry,0,3,A,1993-94
1,1993-08-14,Aston Villa,QPR,4,1,H,1993-94
2,1993-08-14,Chelsea,Blackburn,1,2,A,1993-94
3,1993-08-14,Liverpool,Sheffield Weds,2,0,H,1993-94
4,1993-08-14,Man City,Leeds,1,1,D,1993-94
5,1993-08-14,Newcastle,Tottenham,0,1,A,1993-94
6,1993-08-14,Oldham,Ipswich,0,3,A,1993-94
7,1993-08-14,Sheffield United,Swindon,3,1,H,1993-94
8,1993-08-14,Southampton,Everton,0,2,A,1993-94
9,1993-08-14,West Ham,Wimbledon,0,2,A,1993-94


Let's clean up the data, splitting up the date and categorising the teams

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,Season,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start
0,Arsenal,Coventry,0,3,A,1993-94,8,32,14,5,226,False,False,False,False,False,False
1,Aston Villa,QPR,4,1,H,1993-94,8,32,14,5,226,False,False,False,False,False,False
2,Chelsea,Blackburn,1,2,A,1993-94,8,32,14,5,226,False,False,False,False,False,False
3,Liverpool,Sheffield Weds,2,0,H,1993-94,8,32,14,5,226,False,False,False,False,False,False
4,Man City,Leeds,1,1,D,1993-94,8,32,14,5,226,False,False,False,False,False,False
5,Newcastle,Tottenham,0,1,A,1993-94,8,32,14,5,226,False,False,False,False,False,False
6,Oldham,Ipswich,0,3,A,1993-94,8,32,14,5,226,False,False,False,False,False,False
7,Sheffield United,Swindon,3,1,H,1993-94,8,32,14,5,226,False,False,False,False,False,False
8,Southampton,Everton,0,2,A,1993-94,8,32,14,5,226,False,False,False,False,False,False
9,West Ham,Wimbledon,0,2,A,1993-94,8,32,14,5,226,False,False,False,False,False,False


Now let's add derived data - calculate each team's matchday, recent form, goals for and against, and Elo rating going into each game

In [215]:
import math

# Start again from the cleaned frame saved on disk.
augmented_dataframe = pd.read_feather('tmp/epl-cleaned')

# Expand "Date" into separate date-part columns (modifies the frame in place),
# then discard the two generated parts that are not wanted as features.
add_datepart(augmented_dataframe, "Date")
augmented_dataframe = augmented_dataframe.drop(columns=["Elapsed", "Year"])

def form_score(results, length, default_value=0):
    """Return the mean of the last `length` entries of `results`.

    Args:
        results: sequence of numeric per-match values, oldest first.
        length: how many of the most recent entries to average; if fewer
            are available, all of them are used.
        default_value: returned unchanged when `results` is empty.

    Returns:
        The arithmetic mean of the trailing window, or `default_value`.
    """
    if results:
        recent = results[-length:]
        return sum(recent) / len(recent)
    return default_value

# Step size of a single Elo update: the largest number of rating points that
# can change hands in one match.
K = 32

def elo_score(R_home, R_away, result):
    """Apply one Elo update for a finished match.

    Args:
        R_home: home team's rating before the match.
        R_away: away team's rating before the match.
        result: signed outcome from the home side's point of view; only the
            sign is used (positive = home win, zero = draw, negative = away
            win).

    Returns:
        A `(new_home_rating, new_away_rating)` tuple.
    """
    # Expected scores from the logistic Elo curve (400-point scale).
    strength_home = 10**(R_home/400)
    strength_away = 10**(R_away/400)
    combined = strength_home + strength_away
    expected_home = strength_home/combined
    expected_away = strength_away/combined

    # Actual score for the home side; the away side gets the remainder.
    if result > 0:
        actual_home = 1
    elif result == 0:
        actual_home = 0.5
    elif result < 0:
        actual_home = 0
    actual_away = 1 - actual_home

    return (
        R_home + K*(actual_home - expected_home),
        R_away + K*(actual_away - expected_away),
    )


# Running state carried from one match to the next while walking the frame.
form = {}           # team -> list of per-match points (1 win, 0.5 draw, 0 loss)
goals_for = {}      # team -> list of goals scored in each match
goals_against = {}  # team -> list of goals conceded in each match
elo = {}            # team -> current Elo rating
matchdays = {}      # (team, season) -> matchday number of that team's next game

# Number of most recent matches averaged by form_score below.
# NOTE(review): the columns written below are suffixed "10" although the
# window is 20 - the names and the constant disagree.
FORM_LENGTH = 20


# Walk the matches in frame order. For every row the stats are first written
# from the state as it was BEFORE this match and only then updated with the
# match's result, so a row never contains information from its own outcome.
# assumes rows are in chronological order - TODO confirm against the CSV
for (i, row) in augmented_dataframe.iterrows():

    # Get teams
    home_team = row["HomeTeam"]
    away_team = row["AwayTeam"]

    # Get each team's respective form, goals for & against, and Elo score
    # Matchday counters are per (team, season) and start at 1.
    home_matchday = matchdays.get((home_team, row["Season"]), 1)
    away_matchday = matchdays.get((away_team, row["Season"]), 1)

    # Form and goal histories are keyed by team only, so they carry over
    # from one season to the next.
    home_form = form.get(home_team, [])
    away_form = form.get(away_team, [])

    home_goals_for = goals_for.get(home_team, [])
    away_goals_for = goals_for.get(away_team, [])
    home_goals_against = goals_against.get(home_team, [])
    away_goals_against = goals_against.get(away_team, [])

    # Teams not seen before start at a rating of 1500.
    home_elo = elo.get(home_team, 1500)
    away_elo = elo.get(away_team, 1500)

    # Update the current row with these stats

    augmented_dataframe.at[i, "HomeMatchday"] = home_matchday
    augmented_dataframe.at[i, "AwayMatchday"] = away_matchday

    # With no history yet, form defaults to 0.5 (the value of a draw) ...
    augmented_dataframe.at[i, "HomeForm10"] = form_score(home_form, FORM_LENGTH, 0.5)
    augmented_dataframe.at[i, "AwayForm10"] = form_score(away_form, FORM_LENGTH, 0.5)

    # ... while the goal averages fall back to form_score's default of 0.
    augmented_dataframe.at[i, "HomeGF10"] = form_score(home_goals_for, FORM_LENGTH)
    augmented_dataframe.at[i, "AwayGF10"] = form_score(away_goals_for, FORM_LENGTH)
    augmented_dataframe.at[i, "HomeGA10"] = form_score(home_goals_against, FORM_LENGTH)
    augmented_dataframe.at[i, "AwayGA10"] = form_score(away_goals_against, FORM_LENGTH)

    augmented_dataframe.at[i, "HomeElo"] = home_elo
    augmented_dataframe.at[i, "AwayElo"] = away_elo

    # Update the stats

    matchdays[(home_team, row["Season"])] = home_matchday + 1
    matchdays[(away_team, row["Season"])] = away_matchday + 1

    # Record the result as points for each side. The lists are extended in
    # place; a result other than H/D/A leaves both histories untouched.
    if row["FTR"] == "H":
        home_form.append(1)
        away_form.append(0)
    elif row["FTR"] == "D":
        home_form.append(0.5)
        away_form.append(0.5)
    elif row["FTR"] == "A":
        home_form.append(0)
        away_form.append(1)

    # Store the lists back so a team seen for the first time gets an entry.
    form[home_team] = home_form
    form[away_team] = away_form

    # Goal histories are rebuilt as new lists rather than appended in place.
    goals_for[home_team] = home_goals_for + [row["FTHG"]]
    goals_for[away_team] = away_goals_for + [row["FTAG"]]
    goals_against[home_team] = home_goals_against + [row["FTAG"]]
    goals_against[away_team] = away_goals_against + [row["FTHG"]]

    # The goal difference (home minus away) is passed as the signed result.
    new_home_elo, new_away_elo = elo_score(home_elo, away_elo, row["FTHG"] - row["FTAG"])
    elo[home_team] = new_home_elo
    elo[away_team] = new_away_elo

# Season was only needed for the matchday counters; drop it before saving.
augmented_dataframe = augmented_dataframe.drop(["Season"], axis=1)
augmented_dataframe.to_feather('tmp/epl-augmented')

display(augmented_dataframe)

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,Month,Week,Day,Dayofweek,Dayofyear,...,HomeMatchday,AwayMatchday,HomeForm10,AwayForm10,HomeGF10,AwayGF10,HomeGA10,AwayGA10,HomeElo,AwayElo
0,Arsenal,Coventry,0,3,A,8,32,14,5,226,...,1.0,1.0,0.500,0.500,0.00,0.00,0.00,0.00,1500.000000,1500.000000
1,Aston Villa,QPR,4,1,H,8,32,14,5,226,...,1.0,1.0,0.500,0.500,0.00,0.00,0.00,0.00,1500.000000,1500.000000
2,Chelsea,Blackburn,1,2,A,8,32,14,5,226,...,1.0,1.0,0.500,0.500,0.00,0.00,0.00,0.00,1500.000000,1500.000000
3,Liverpool,Sheffield Weds,2,0,H,8,32,14,5,226,...,1.0,1.0,0.500,0.500,0.00,0.00,0.00,0.00,1500.000000,1500.000000
4,Man City,Leeds,1,1,D,8,32,14,5,226,...,1.0,1.0,0.500,0.500,0.00,0.00,0.00,0.00,1500.000000,1500.000000
5,Newcastle,Tottenham,0,1,A,8,32,14,5,226,...,1.0,1.0,0.500,0.500,0.00,0.00,0.00,0.00,1500.000000,1500.000000
6,Oldham,Ipswich,0,3,A,8,32,14,5,226,...,1.0,1.0,0.500,0.500,0.00,0.00,0.00,0.00,1500.000000,1500.000000
7,Sheffield United,Swindon,3,1,H,8,32,14,5,226,...,1.0,1.0,0.500,0.500,0.00,0.00,0.00,0.00,1500.000000,1500.000000
8,Southampton,Everton,0,2,A,8,32,14,5,226,...,1.0,1.0,0.500,0.500,0.00,0.00,0.00,0.00,1500.000000,1500.000000
9,West Ham,Wimbledon,0,2,A,8,32,14,5,226,...,1.0,1.0,0.500,0.500,0.00,0.00,0.00,0.00,1500.000000,1500.000000


Now, let's have a go at classifying the data:

In [252]:
from sklearn import preprocessing
from sklearn.ensemble import forest, RandomForestClassifier

def split_vals(a, n):
    """Split `a` at position `n` into two independent copies.

    Args:
        a: any sliceable object whose slices provide `.copy()`.
        n: index at which to cut.

    Returns:
        `(first_n, remainder)` - copies of `a[:n]` and `a[n:]`.
    """
    head = a[:n]
    tail = a[n:]
    return head.copy(), tail.copy()

# Reload the engineered features from disk. Everything below reads from
# `final_dataframe` only, so this cell runs on a fresh kernel as long as the
# feather file exists and does not depend on variables from earlier cells.
final_dataframe = pd.read_feather('tmp/epl-augmented')

# Encode the full-time result (the prediction target) as integer labels.
# `result_encoder.inverse_transform` maps predictions back to the originals.
result_encoder = preprocessing.LabelEncoder()
final_dataframe["FTR"] = result_encoder.fit_transform(final_dataframe["FTR"])

# One shared encoder for both team columns so a club gets the same code
# whether it plays at home or away. It is fitted on the names from BOTH
# columns so a team that only ever appears on one side cannot make
# `transform` fail with an unseen label.
team_encoder = preprocessing.LabelEncoder()
team_encoder.fit(list(final_dataframe["HomeTeam"]) + list(final_dataframe["AwayTeam"]))
final_dataframe["HomeTeam"] = team_encoder.transform(final_dataframe["HomeTeam"])
final_dataframe["AwayTeam"] = team_encoder.transform(final_dataframe["AwayTeam"])

# Features: everything except the result and the final score, which are only
# known after the match.
source_dataframe = final_dataframe.drop(["FTR", "FTHG", "FTAG"], axis=1)

# Hold out the last 760 rows for validation and train on everything before.
# NOTE(review): 760 is presumably two 380-match seasons - confirm.
training_size = len(source_dataframe) - 760

# Features and (encoded) targets are cut at the same row so they stay aligned.
source_train, source_valid = split_vals(source_dataframe, training_size)
result_train, result_valid = split_vals(final_dataframe["FTR"], training_size)

def set_rf_samples(n):
    """Make forests fitted afterwards draw `n` row indices per tree.

    Replaces `_generate_sample_indices` on the imported `forest` module with
    a function that ignores the usual sample count and always draws `n`
    indices (with replacement) from `range(n_samples)`.

    NOTE(review): this patches a private scikit-learn helper and assumes it
    is called as `(random_state, n_samples)` - confirm against the installed
    scikit-learn version.
    """
    def _draw_fixed_size(rs, n_samples):
        return forest.check_random_state(rs).randint(0, n_samples, n)

    forest._generate_sample_indices = _draw_fixed_size
    
def reset_rf_samples():
    """Undo `set_rf_samples`: draw as many indices as there are rows.

    Installs a replacement for `forest._generate_sample_indices` that draws
    `n_samples` indices (with replacement) from `range(n_samples)`.

    NOTE(review): this installs a new function rather than restoring the
    library's original one, and assumes the private helper is called as
    `(random_state, n_samples)` - confirm against the installed version.
    """
    def _draw_full_size(rs, n_samples):
        return forest.check_random_state(rs).randint(0, n_samples, n_samples)

    forest._generate_sample_indices = _draw_full_size
    
set_rf_samples(1000)

classifier = RandomForestClassifier(n_jobs=-1, n_estimators=1000, oob_score=True)
%time classifier.fit(source_train, result_train)

print ("""CLASSIFICATION:
Training data score: {}
Validation data score: {}
Out-of-bag score: {}""".format(
    classifier.score(source_train, result_train),
    classifier.score(source_valid, result_valid),
    classifier.oob_score_,
))


CPU times: user 6.34 s, sys: 462 ms, total: 6.81 s
Wall time: 3.91 s
CLASSIFICATION:
Training data score: 0.676774483378257
Validation data score: 0.5763157894736842
Out-of-bag score: 0.5056154537286612


## Current model

Random forest, using form, goals for & against & Elo score combined

Validation accuracy of about `0.576`

Future work:
 * Split form trackers into home/away
 * Wider gaps in Elo?