Set up the imports first

In [37]:
import os
import pandas as pd
from IPython.display import display
from fastai.tabular.transform import add_datepart
# from fastai.structured import train_cats

First, load the raw data

In [70]:
# Directory containing the input data; the trailing slash is required because
# the file name is appended directly in the f-string below.
PATH = "data/"

# Raise the row limit pandas uses before truncating a displayed frame.
pd.set_option("display.max_rows", 1000)

# Read the match results, parsing the "Date" column into datetimes on load.
raw_dataframe = pd.read_csv(
    f'{PATH}EPL_Set.csv',
    parse_dates=["Date"],
    low_memory=False,
)
display(raw_dataframe)

Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Season
0,E0,1993-08-14,Arsenal,Coventry,0,3,A,,,,1993-94
1,E0,1993-08-14,Aston Villa,QPR,4,1,H,,,,1993-94
2,E0,1993-08-14,Chelsea,Blackburn,1,2,A,,,,1993-94
3,E0,1993-08-14,Liverpool,Sheffield Weds,2,0,H,,,,1993-94
4,E0,1993-08-14,Man City,Leeds,1,1,D,,,,1993-94
5,E0,1993-08-14,Newcastle,Tottenham,0,1,A,,,,1993-94
6,E0,1993-08-14,Oldham,Ipswich,0,3,A,,,,1993-94
7,E0,1993-08-14,Sheffield United,Swindon,3,1,H,,,,1993-94
8,E0,1993-08-14,Southampton,Everton,0,2,A,,,,1993-94
9,E0,1993-08-14,West Ham,Wimbledon,0,2,A,,,,1993-94


Now let's remove the data we do not need (we don't want half-time data, as we want to be able to predict before a game has even started).

In [39]:
# Keep only the columns used later: drop the division, the half-time
# goals/result and the season label.
cleaned_dataframe = raw_dataframe.drop(
    columns=["Div", "HTHG", "HTAG", "HTR", "Season"]
)
display(cleaned_dataframe)

# Cache the cleaned frame on disk so the next stage can reload it.
os.makedirs('tmp', exist_ok=True)
cleaned_dataframe.to_feather('tmp/epl-cleaned')

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR
0,1993-08-14,Arsenal,Coventry,0,3,A
1,1993-08-14,Aston Villa,QPR,4,1,H
2,1993-08-14,Chelsea,Blackburn,1,2,A
3,1993-08-14,Liverpool,Sheffield Weds,2,0,H
4,1993-08-14,Man City,Leeds,1,1,D
5,1993-08-14,Newcastle,Tottenham,0,1,A
6,1993-08-14,Oldham,Ipswich,0,3,A
7,1993-08-14,Sheffield United,Swindon,3,1,H
8,1993-08-14,Southampton,Everton,0,2,A
9,1993-08-14,West Ham,Wimbledon,0,2,A


Let's augment the data by splitting the date up into its component parts (categorising the teams is currently disabled in the code below)

In [82]:
# Start from the cleaned frame cached at 'tmp/epl-cleaned'.
augmented_dataframe = pd.read_feather('tmp/epl-cleaned')

# add_datepart is called for its effect on the frame (its return value is not
# used): the displayed result has Year/Month/Week/... columns in place of "Date".
add_datepart(augmented_dataframe, "Date")

# "Elapsed" is one of the generated columns; it is not wanted as a feature here.
augmented_dataframe = augmented_dataframe.drop(columns=["Elapsed"])
# train_cats(augmented_dataframe)
display(augmented_dataframe)

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start
0,Arsenal,Coventry,0,3,A,1993,8,32,14,5,226,False,False,False,False,False,False
1,Aston Villa,QPR,4,1,H,1993,8,32,14,5,226,False,False,False,False,False,False
2,Chelsea,Blackburn,1,2,A,1993,8,32,14,5,226,False,False,False,False,False,False
3,Liverpool,Sheffield Weds,2,0,H,1993,8,32,14,5,226,False,False,False,False,False,False
4,Man City,Leeds,1,1,D,1993,8,32,14,5,226,False,False,False,False,False,False
5,Newcastle,Tottenham,0,1,A,1993,8,32,14,5,226,False,False,False,False,False,False
6,Oldham,Ipswich,0,3,A,1993,8,32,14,5,226,False,False,False,False,False,False
7,Sheffield United,Swindon,3,1,H,1993,8,32,14,5,226,False,False,False,False,False,False
8,Southampton,Everton,0,2,A,1993,8,32,14,5,226,False,False,False,False,False,False
9,West Ham,Wimbledon,0,2,A,1993,8,32,14,5,226,False,False,False,False,False,False


Now let's calculate each team's form and Elo rating going into every match

In [86]:
# Running per-team state, filled in while iterating over the matches below:
#   form: team name -> list of past results (1 win, 0.5 draw, 0 loss)
#   elo:  team name -> current Elo rating
form, elo = {}, {}

def form_score(results, length):
    """Return the mean of the last `length` entries of `results`.

    Args:
        results: sequence of numeric match results, oldest first.
        length: number of most recent results to average over.

    Returns:
        The average of the trailing `length` results, or None when
        `results` is empty/falsy or holds fewer than `length` entries.
    """
    if not results or len(results) < length:
        return None

    recent = results[-length:]
    return sum(recent) / len(recent)

# Elo K-factor: scales how far a rating moves after a single match.
K = 32


def elo_score(R_home, R_away, result):
    """Return the updated Elo ratings of both teams after one match.

    Args:
        R_home: rating of the home team before the match.
        R_away: rating of the away team before the match.
        result: full-time result, one of "H" (home win), "D" (draw)
            or "A" (away win).

    Returns:
        Tuple ``(new_R_home, new_R_away)``.

    Raises:
        ValueError: if `result` is not "H", "D" or "A" (for example a
            missing value). Previously this surfaced as an opaque
            UnboundLocalError on `S_home`.
    """
    # Actual score of each side: 1 for a win, 0.5 for a draw, 0 for a loss.
    if result == "H":
        S_home = 1
        S_away = 0
    elif result == "D":
        S_home = 0.5
        S_away = 0.5
    elif result == "A":
        S_home = 0
        S_away = 1
    else:
        raise ValueError(
            f"Unknown match result {result!r}; expected 'H', 'D' or 'A'"
        )

    Q_home = 10**(R_home/400)
    Q_away = 10**(R_away/400)

    # Expected score of each side given the pre-match ratings.
    E_home = Q_home/(Q_home + Q_away)
    E_away = Q_away/(Q_home + Q_away)

    # Move each rating by K times the gap between actual and expected score.
    R_home = R_home + K*(S_home - E_home)
    R_away = R_away + K*(S_away - E_away)
    return (R_home, R_away)


# Walk the matches in frame order, recording each side's pre-match Elo rating
# on the row and then updating the running `elo` and `form` state.
for (i, row) in augmented_dataframe.iterrows():
    home_team = row["HomeTeam"]
    away_team = row["AwayTeam"]
    outcome = row["FTR"]

    # Results seen so far for each side; a team not seen yet starts empty.
    home_form = form.get(home_team, [])
    away_form = form.get(away_team, [])

#     augmented_dataframe.at[i, "HomeForm5"] = form_score(home_form, 5)
#     augmented_dataframe.at[i, "AwayForm5"] = form_score(away_form, 5)
#     augmented_dataframe.at[i, "HomeForm10"] = form_score(home_form, 10)
#     augmented_dataframe.at[i, "AwayForm10"] = form_score(away_form, 10)
#     augmented_dataframe.at[i, "HomeForm20"] = form_score(home_form, 20)
#     augmented_dataframe.at[i, "AwayForm20"] = form_score(away_form, 20)

    # Ratings are written BEFORE the update below, so the row only carries
    # what was known going into the match. Unseen teams start at 1500.
    home_elo = elo.get(home_team, 1500)
    away_elo = elo.get(away_team, 1500)
    augmented_dataframe.at[i, "HomeElo"] = home_elo
    augmented_dataframe.at[i, "AwayElo"] = away_elo

    new_home_elo, new_away_elo = elo_score(home_elo, away_elo, outcome)
    elo[home_team] = new_home_elo
    elo[away_team] = new_away_elo

    # Form points as (home, away); any other outcome appends nothing.
    if outcome == "H":
        form_points = (1, 0)
    elif outcome == "D":
        form_points = (0.5, 0.5)
    elif outcome == "A":
        form_points = (0, 1)
    else:
        form_points = None

    if form_points is not None:
        home_form.append(form_points[0])
        away_form.append(form_points[1])

    form[home_team] = home_form
    form[away_team] = away_form

display(augmented_dataframe)


Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,Year,Month,Week,Day,Dayofweek,Dayofyear,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,HomeElo,AwayElo
0,Arsenal,Coventry,0,3,A,1993,8,32,14,5,226,False,False,False,False,False,False,1500.000000,1500.000000
1,Aston Villa,QPR,4,1,H,1993,8,32,14,5,226,False,False,False,False,False,False,1500.000000,1500.000000
2,Chelsea,Blackburn,1,2,A,1993,8,32,14,5,226,False,False,False,False,False,False,1500.000000,1500.000000
3,Liverpool,Sheffield Weds,2,0,H,1993,8,32,14,5,226,False,False,False,False,False,False,1500.000000,1500.000000
4,Man City,Leeds,1,1,D,1993,8,32,14,5,226,False,False,False,False,False,False,1500.000000,1500.000000
5,Newcastle,Tottenham,0,1,A,1993,8,32,14,5,226,False,False,False,False,False,False,1500.000000,1500.000000
6,Oldham,Ipswich,0,3,A,1993,8,32,14,5,226,False,False,False,False,False,False,1500.000000,1500.000000
7,Sheffield United,Swindon,3,1,H,1993,8,32,14,5,226,False,False,False,False,False,False,1500.000000,1500.000000
8,Southampton,Everton,0,2,A,1993,8,32,14,5,226,False,False,False,False,False,False,1500.000000,1500.000000
9,West Ham,Wimbledon,0,2,A,1993,8,32,14,5,226,False,False,False,False,False,False,1500.000000,1500.000000


Remembering each team's form data