# Predicting Premier League Outcomes Using Articles and Statistics

### Imports

In [5]:
import os
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, StackingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from autofeat import AutoFeatClassifier

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Statistics

In [11]:
matches = pd.read_csv("13_23.csv")
s23_24 = pd.read_csv("23_24.csv")
match_training = matches.drop(columns=["Div"])
test_set = s23_24.drop(columns=["Date","Time","Div"])
#test_set.to_csv('s23_24.csv', index=False)

match_training.head()

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA
0,9/8/2019,Liverpool,Norwich,4,1,H,4,0,H,M Oliver,...,0,1.14,10.0,19.0,1.14,8.25,18.5,1.15,8.0,18.0
1,10/8/2019,West Ham,Man City,0,5,A,0,1,A,M Dean,...,0,12.0,6.5,1.22,11.5,5.75,1.26,11.0,6.1,1.25
2,10/8/2019,Bournemouth,Sheffield United,1,1,D,0,0,D,K Friend,...,0,1.95,3.6,3.6,1.95,3.6,3.9,1.97,3.55,3.8
3,10/8/2019,Burnley,Southampton,3,0,H,0,0,D,G Scott,...,0,2.62,3.2,2.75,2.65,3.2,2.75,2.65,3.2,2.75
4,10/8/2019,Crystal Palace,Everton,0,0,D,0,0,D,J Moss,...,1,3.0,3.25,2.37,3.2,3.2,2.35,3.1,3.2,2.4


In [4]:
s23_24.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam
0,E0,11/8/2023,20:00,Burnley,Man City
1,E0,12/8/2023,12:30,Arsenal,Nott'm Forest
2,E0,12/8/2023,15:00,Bournemouth,West Ham
3,E0,12/8/2023,15:00,Brighton,Luton
4,E0,12/8/2023,15:00,Everton,Fulham


In [5]:
match_training.shape

(3800, 29)

In [6]:
match_training.dtypes

HomeTeam     object
AwayTeam     object
FTHG          int64
FTAG          int64
FTR          object
HTHG          int64
HTAG          int64
HTR          object
HS            int64
AS            int64
HST           int64
AST           int64
HF            int64
AF            int64
HC            int64
AC            int64
HY            int64
AY            int64
HR            int64
AR            int64
B365H       float64
B365D       float64
B365A       float64
BWH         float64
BWD         float64
BWA         float64
IWH         float64
IWD         float64
IWA         float64
dtype: object

Dropping:

Date: converted to higher weights for recent years since they are more relevant

Time: only the times for the 23-24 are available from our data source

Div: all games are in the first division

Referee: referees have impact but their patterns cannot be determined by our data and some of the referees have already retired. Games are also scheduled to not have the same referee for the same games if possible. Therefore, it will add noise and extra complexity in predictions.

In [4]:
describe = (match_training.describe())

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print((match_training.describe()))

              FTHG         FTAG         HTHG         HTAG           HS  \
count  3800.000000  3800.000000  3800.000000  3800.000000  3800.000000   
mean      1.525263     1.216842     0.690000     0.544211    13.959474   
std       1.319022     1.192161     0.855563     0.755183     5.666333   
min       0.000000     0.000000     0.000000     0.000000     0.000000   
25%       1.000000     0.000000     0.000000     0.000000    10.000000   
50%       1.000000     1.000000     0.000000     0.000000    13.000000   
75%       2.000000     2.000000     1.000000     1.000000    17.000000   
max       9.000000     9.000000     5.000000     5.000000    43.000000   

                AS          HST          AST           HF           AF  \
count  3800.000000  3800.000000  3800.000000  3800.000000  3800.000000   
mean     11.387632     4.729737     3.911579    10.559211    10.901316   
std       4.908749     2.617265     2.296655     3.400551     3.543212   
min       0.000000     0.000000     0

### Data Preprocessing

In [12]:
# Encoding
label_encoder = LabelEncoder()

match_training['FTR'] = label_encoder.fit_transform(match_training['FTR'])
match_training['HTR'] = label_encoder.fit_transform(match_training['HTR'])
tr_mapping_legend = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
tr_mapping_legend = {v: k for k, v in tr_mapping_legend.items()}

# match_training['HomeTeam'] = label_encoder.fit_transform(match_training['HomeTeam'])
# match_training['AwayTeam'] = label_encoder.fit_transform(match_training['AwayTeam'])
# team_mapping_legend = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
# team_mapping_legend = {v: k for k, v in team_mapping_legend.items()}

match_training['Referee'] = label_encoder.fit_transform(match_training['Referee'])
ref_mapping_legend = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
ref_mapping_legend = {v: k for k, v in ref_mapping_legend.items()}

print("TR Mapping Legend:", tr_mapping_legend)
# print("Team Mapping Legend:", team_mapping_legend)
print("Ref Mapping Legend:", ref_mapping_legend)

match_training.to_csv('encoded_13_23.csv', index=False)

match_training.head()

TR Mapping Legend: {0: 'A', 1: 'D', 2: 'H'}
Ref Mapping Legend: {0: 'A Madley', 1: 'A Marriner', 2: 'A Moss', 3: 'A Taylor', 4: 'C Foy', 5: 'C Kavanagh', 6: 'C Pawson', 7: 'D Bond', 8: 'D Coote', 9: 'D England', 10: 'G Scott', 11: 'H Webb', 12: 'J Brooks', 13: 'J Gillett', 14: 'J Moss', 15: 'J Smith', 16: 'K Friend', 17: 'K Stroud', 18: 'L Mason', 19: 'L Probert', 20: 'M Atkinson', 21: 'M Clattenburg', 22: 'M Dean', 23: 'M Jones', 24: 'M Oliver', 25: 'M Salisbury', 26: 'N Swarbrick', 27: 'O Langford', 28: 'P Bankes', 29: 'P Dowd', 30: 'P Tierney', 31: 'R East', 32: 'R Jones', 33: 'R Madley', 34: 'S Attwell', 35: 'S Hooper', 36: 'S Scott', 37: 'T Bramall', 38: 'T Harrington', 39: 'T Robinson', 40: 'l Mason'}


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA
0,9/8/2019,Liverpool,Norwich,4,1,2,4,0,2,24,...,0,1.14,10.0,19.0,1.14,8.25,18.5,1.15,8.0,18.0
1,10/8/2019,West Ham,Man City,0,5,0,0,1,0,22,...,0,12.0,6.5,1.22,11.5,5.75,1.26,11.0,6.1,1.25
2,10/8/2019,Bournemouth,Sheffield United,1,1,1,0,0,1,16,...,0,1.95,3.6,3.6,1.95,3.6,3.9,1.97,3.55,3.8
3,10/8/2019,Burnley,Southampton,3,0,2,0,0,1,10,...,0,2.62,3.2,2.75,2.65,3.2,2.75,2.65,3.2,2.75
4,10/8/2019,Crystal Palace,Everton,0,0,1,0,0,1,14,...,1,3.0,3.25,2.37,3.2,3.2,2.35,3.1,3.2,2.4


In [56]:
match_training.dtypes

HomeTeam      int32
AwayTeam      int32
FTHG          int64
FTAG          int64
FTR           int32
HTHG          int64
HTAG          int64
HTR           int32
HS            int64
AS            int64
HST           int64
AST           int64
HF            int64
AF            int64
HC            int64
AC            int64
HY            int64
AY            int64
HR            int64
AR            int64
B365H       float64
B365D       float64
B365A       float64
BWH         float64
BWD         float64
BWA         float64
IWH         float64
IWD         float64
IWA         float64
dtype: object

## Our Model

### Random Forest
To find non-linearities

In [57]:
rf =  RandomForestClassifier(n_estimators=50,min_samples_split=10, random_state=1)
features = ['FTHG','FTAG','HTHG','HTAG','HTR','HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR','B365H','B365D','B365A','BWH','BWD','BWA','IWH','IWD','IWA']
rf.fit(match_training[features], match_training["FTR"])

ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

### XGBoost

In [15]:
features = match_training[['FTHG','FTAG','HTHG','HTAG','HTR','HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR','B365H','B365D','B365A','BWH','BWD','BWA','IWH','IWD','IWA']]
target = match_training['FTR']  # Full-Time Result

# Map categorical variables (if any)
# For simplicity, let's assume the only categorical variable is 'HTR'
#features['HTR'] = features['HTR'].map({'H': 0, 'D': 1, 'A': 2})

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train the XGBoost model
model = XGBClassifier()
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 100.00%


In [13]:
xgb_model = xgb.XGBClassifier( # run gridsearch to find best parameters
    learning_rate=0.1,
    n_estimators=500,
    max_depth=3,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.2,
    objective='binary:logistic'
)

xgb_model.fit(X_train, y_train)

xgb_predictions = xgb_model.predict(test_set)

xgb_predictions_df = pd.DataFram({
    'Id': match_training['Id'],
    'Y': xgb_predictions
})

xgb_predictions_df.to_csv('single_xgb.csv', index=False)

## GPT-4 Model Implementation

## Model Ensembling