# Introduction

Importing all the packages that are required for the notebook to run.

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler, LabelEncoder

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

import seaborn as sns
sns.set_theme(style="whitegrid")

import matplotlib.pyplot as plt

# Data Import

In [None]:
root_dir = './' # ENTER YOUR WORKING DIRECTORY HERE #
epl_training_path = root_dir + 'epl-training.csv'
epl_teams_path = root_dir + 'epl-teams.csv'

# Load both csv files: the match records and the per-team, per-year ratings
df = pd.read_csv(epl_training_path)
df_teams = pd.read_csv(epl_teams_path)

# Only the last expression of a cell is rendered automatically, so the summary
# statistics are displayed explicitly; otherwise their output is silently dropped.
display(df.describe())
df_teams.tail()

In [None]:
# Preview the last rows of the match data as loaded from the csv file
df.tail()

# Data Transformation and Exploration

Replacing all the Team names with labels

In [None]:
# Remove every row that has a missing value in any column. The explicit copies
# make `df` an independent frame, so the column assignments in this notebook do
# not operate on a slice of the raw data (pandas' SettingWithCopyWarning).
df = df.dropna().copy()
# NOTE(review): assumes pandas infers the csv date format correctly - confirm,
# and pass `dayfirst=True` or an explicit `format=` if the dates are day-first.
df['Date'] = pd.to_datetime(df['Date'])
df = df[df['Date'].dt.year >= 2013].copy()

# Check that only valid (string) team names are left
assert df['HomeTeam'].map(lambda name: isinstance(name, str)).all()
assert df['AwayTeam'].map(lambda name: isinstance(name, str)).all()

# Fit the encoder on every team that appears as either host or visitor, so a
# team that is missing from one of the two columns cannot break the transforms.
team_encoder = LabelEncoder()
team_encoder.fit(pd.concat([df['HomeTeam'], df['AwayTeam']]))
df['HomeTeamLabel'] = team_encoder.transform(df['HomeTeam'])
df['AwayTeamLabel'] = team_encoder.transform(df['AwayTeam'])
# LabelEncoder.transform raises a ValueError if `df_teams` lists a team that
# does not occur in the filtered match data.
df_teams['TeamLabel'] = team_encoder.transform(df_teams['Team'])

print(f"Team Names: {team_encoder.classes_}")


In [None]:
# One encoder, fitted on the full-time results only, labels both the
# full-time (FTR) and the half-time (HTR) result columns.
result_encoder = LabelEncoder().fit(df['FTR'])
for source_column, label_column in (('FTR', 'FTRLabel'), ('HTR', 'HTRLabel')):
    df[label_column] = result_encoder.transform(df[source_column])

print(f"Result names: {result_encoder.classes_}")

In [None]:
# Summary statistics again, now that the rows have been filtered and the
# encoded label columns have been added
df.describe()

## Base rates of home wins, away wins and draws

The base rates below imply that there is a home advantage, so it is worth considering home and away performance independently.

In [None]:
# Share of matches per full-time result (normalize=True converts counts to probabilities)
ftr_counts = df['FTR'].value_counts(normalize=True)

# value_counts orders its result by frequency, so every tick label is derived
# from the result code of its own bar instead of from an assumed bar position.
# NOTE(review): assumes FTR uses the codes H / A / D - confirm against the data;
# any other code is shown unchanged.
outcome_names = {'H': 'Home Win', 'A': 'Away Win', 'D': 'Draw'}
tick_labels = [outcome_names.get(code, code) for code in ftr_counts.index]

plt.figure(figsize=(8, 6))
ftr_counts.plot(kind='bar', color='skyblue', edgecolor='black')
plt.xlabel('Game Outcome')
plt.ylabel('Probability')
plt.xticks(range(len(ftr_counts)), tick_labels, rotation=0)
plt.grid(axis='y')
plt.tight_layout()
plt.show()

In [None]:
home_columns = ['FTHG', 'HTHG', 'HS', 'HST', 'HF', 'HC', 'HY', 'HR']
away_columns = ['FTAG', 'HTAG', 'AS', 'AST', 'AF', 'AC', 'AY', 'AR']
team_columns_single = ['OA', 'AT', 'MD', 'DF', 'CW']
team_columns = ['HOA', 'HAT', 'HMD', 'HDF', 'HCW', 'AOA', 'AAT', 'AMD', 'ADF', 'ACW']

td = pd.Timedelta(days=365) # Half-life used for the exponential weighting


def _time_weighted_form(matches, stat_columns):
    """Return the time-weighted running mean of `stat_columns` for `matches`.

    The rows are put in chronological order first, because `ewm(times=...)`
    expects monotonically increasing times and later lookups take the last
    row before a given date. The value on each row includes that row's own
    match. An observation's weight halves for every `td` of elapsed time.
    """
    # Copy so the assignment below does not write into a slice of `df`
    form = matches[['Date', *stat_columns]].sort_values('Date', kind='stable').copy()
    form[stat_columns] = form[stat_columns].ewm(times=form['Date'], halflife=td).mean()
    return form


# Precomputing features: per team, one frame for its home matches and one for its away matches
precomputed_features = {}

for team in team_encoder.classes_:
    precomputed_features[team] = {
        'Home': _time_weighted_form(df[df['HomeTeam'] == team], home_columns),
        'Away': _time_weighted_form(df[df['AwayTeam'] == team], away_columns),
    }


In [None]:
# League date ranges as (start_date, end_date, season_year) tuples for the
# seasons starting in 2014 through 2023. Each season is assumed to run from
# August 1 to June 1 of the following year. Matches played before the first
# window are never emitted as samples; they only feed the running averages.
league_dates = [
    (f"{year}-08-01", f"{year + 1}-06-01", year)
    for year in range(2014, 2023 + 1)
]

feature_columns = ['Date', 'League', 'HomeTeam', 'AwayTeam', *home_columns, *away_columns, *team_columns, 'FTRLabel']

# Default team stats, used when `df_teams` has no usable row for a team
default_team_ratings = pd.Series([60, 60, 60, 60, 1], index=team_columns_single)


def _latest_form_before(form, match_date):
    """Return the last row of `form` dated strictly before `match_date`, or None.

    `form` is expected to be in chronological order, so the last remaining
    row is the most recent one.
    """
    earlier = form[form['Date'] < match_date]
    return None if earlier.empty else earlier.iloc[-1]


def _team_ratings_for_season(team_name, season_year):
    """Return the ratings row to use for `team_name` in `season_year`.

    Preference order: the row of exactly that year, otherwise the row of the
    most recent earlier year, otherwise the default team stats.
    """
    ratings = df_teams[df_teams['Team'] == team_name]

    same_year = ratings[ratings['Year'] == season_year]
    if not same_year.empty:
        return same_year.iloc[0]

    earlier_years = ratings[ratings['Year'] < season_year]
    if not earlier_years.empty:
        # Pick the newest earlier year explicitly instead of relying on the
        # row order of the csv file.
        return earlier_years.iloc[earlier_years['Year'].to_numpy().argmax()]

    return default_team_ratings


feature_list = []

for start, end, year in league_dates:
    print(f"{start} : {end}")
    league = df[(df['Date'] >= start) & (df['Date'] <= end)]

    for _, row in league.iterrows():
        home = _latest_form_before(precomputed_features[row['HomeTeam']]['Home'], row['Date'])
        away = _latest_form_before(precomputed_features[row['AwayTeam']]['Away'], row['Date'])
        # A match is skipped when either side has no earlier match to draw form from
        if home is None or away is None:
            continue

        home_team = _team_ratings_for_season(row['HomeTeam'], year)
        away_team = _team_ratings_for_season(row['AwayTeam'], year)

        feature_list.append([row['Date'], year, row['HomeTeam'], row['AwayTeam'],
                            *home[home_columns], *away[away_columns],
                            *home_team[team_columns_single], *away_team[team_columns_single], row['FTRLabel']])

df_features = pd.DataFrame(feature_list, columns=feature_columns)
df_features.tail()

In [None]:
# Chronological split on the season start year: seasons up to and including
# 2018 are used for training, all later seasons for testing.
df_features_train = df_features.loc[df_features['League'] <= 2018]
df_features_test = df_features.loc[df_features['League'] > 2018]

## Plot the correlation matrix

In [None]:
# Pairwise correlations of all model inputs in long format:
# one row per (level_0, level_1) feature pair with its correlation value
corr_mat = (
    df_features_train[[*home_columns, *away_columns, *team_columns]]
    .corr()
    .stack()
    .reset_index(name="correlation")
)

# Every feature pair becomes one scatter point; colour and size both encode the correlation
g = sns.relplot(
    data=corr_mat,
    x="level_0",
    y="level_1",
    hue="correlation",
    size="correlation",
    palette="vlag",
    hue_norm=(-1, 1),
    edgecolor=".7",
    height=10,
    sizes=(50, 250),
    size_norm=(-.2, .8),
)

# Final touches: no axis titles, square cells, no spines, small margins, vertical x tick labels
g.set(xlabel="", ylabel="", aspect="equal")
g.despine(left=True, bottom=True)
g.ax.margins(.02)
plt.setp(g.ax.get_xticklabels(), rotation=90)

# Methodology Overview

## Comparing classifier models

In [None]:
# Model inputs: every home, away and team-rating feature; target: the encoded full-time result
training_inputs = df_features_train[[*home_columns, *away_columns, *team_columns]]
X_train = training_inputs.to_numpy()
y_train = df_features_train['FTRLabel'].to_numpy()

In [None]:
def train_and_compare_classifiers(X, y, cv=5, random_state=42):
    """Cross-validate several classifiers on (X, y) and plot their mean accuracy.

    Each classifier is wrapped in a pipeline that standardises the features
    first, so the scaler is fitted inside every cross-validation fold.

    Parameters
    ----------
    X : feature matrix passed to `cross_val_score`.
    y : target labels passed to `cross_val_score`.
    cv : cross-validation setting forwarded to `cross_val_score` (default 5).
    random_state : seed for the random forest, so repeated runs give the
        same comparison (default 42).

    Returns
    -------
    dict mapping classifier name to its mean cross-validated score.
    """
    classifiers = {
        'SVM': SVC(),
        'KNN': KNeighborsClassifier(),
        # Seeded: an unseeded random forest scores differently on every run
        'Random Forest': RandomForestClassifier(random_state=random_state),
        'Naive Bayes': GaussianNB(),
        'Logistic Regression': LogisticRegression()
    }

    accuracies = {}
    for clf_name, clf in classifiers.items():
        # Create a pipeline with StandardScaler and the classifier to normalize values
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', clf)
        ])
        accuracies[clf_name] = cross_val_score(pipeline, X, y, cv=cv).mean()

    # Plotting accuracies
    plt.figure(figsize=(10, 6))
    bars = plt.bar(list(accuracies.keys()), list(accuracies.values()), color='skyblue')
    plt.ylabel('Accuracy')
    plt.title('Comparison of Classifier Accuracies')
    plt.ylim([0, 1])
    plt.tight_layout()

    # Add labels on top of each bar
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, height, f'{height:.3f}', ha='center', va='bottom')

    plt.show()
    return accuracies

# The trailing semicolon keeps the returned dict out of the cell output; the chart is the result here
train_and_compare_classifiers(X_train, y_train);

# Model Training and Validation

In [None]:
# Pipeline under evaluation: standardise the features, then classify with an SVC
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', SVC()),
])

# Mean 5-fold cross-validation score on the training seasons
scores = cross_val_score(pipeline, X_train, y_train, cv=5)
print(f"Cross validation accuracy of the pipeline is {scores.mean()}")

# Held-out seasons, with the same feature columns as the training matrix
X_test = df_features_test[[*home_columns, *away_columns, *team_columns]].to_numpy()
y_test = df_features_test['FTRLabel'].to_numpy()

# Fit on all training seasons, then score on the held-out seasons
pipeline.fit(X_train, y_train)
test_score = pipeline.score(X_test, y_test)
print(f"Test accuracy of the pipeline is {test_score}")

# Results

# Final Predictions on Test Set

# References

- [Forecasting football](https://mercurius.io/en/learn/predicting-forecasting-football)
- [Prediction of football match results with Machine Learning](https://www.sciencedirect.com/science/article/pii/S1877050922007955)
    - You have to account for home advantage in the base rates 
    - https://sofifa.com/ for scores on player ability and the scores on team ability
- [Predicting Football Matches Results using Bayesian Networks for English Premier League (EPL)](https://iopscience.iop.org/article/10.1088/1757-899X/226/1/012099)
- [Predicting Football Results Using Machine Learning Techniques](https://www.imperial.ac.uk/media/imperial-college/faculty-of-engineering/computing/public/1718-ug-projects/Corentin-Herbinet-Using-Machine-Learning-techniques-to-predict-the-outcome-of-profressional-football-matches.pdf)
- [Forecasting football match results using a player rating based model](https://www.sciencedirect.com/science/article/pii/S016920702300033X)
    - Use historical betting odds as the predictive label
- [Forecasting football matches by predicting match statistics](https://content.iospress.com/articles/journal-of-sports-analytics/jsa200462)
- Datasets
    - https://www.kaggle.com/datasets/hugomathien/soccer

- https://www.football-data.co.uk/ratings.pdf