# Predicting Premier League Outcomes Using Articles and Statistics

### Imports

In [7]:
import os
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, StackingClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from autofeat import AutoFeatClassifier

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Approach:

Make lookup table for each possible matchup to make it identifiable, so predicting is just indexing table

Elements are array of averaged stats for every matchup in training set

This is passed into model to give a result to put into table

Each element is a weight determined by a model

23/24 Teams Undefined: Luton Town

### Statistics

In [8]:
matches = pd.read_csv("MODEL_TRAIN.csv", index_col=0)
matches = matches.dropna()
matches

Unnamed: 0_level_0,HomeTeam,AwayTeam,result,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,...,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Liverpool,Norwich,2,4,1,4,0,15,12,7,...,0,1.14,10.00,19.00,1.14,8.25,18.50,1.15,8.00,18.00
2,West Ham,Man City,0,0,5,0,1,5,14,3,...,0,12.00,6.50,1.22,11.50,5.75,1.26,11.00,6.10,1.25
3,Bournemouth,Sheffield United,1,1,1,0,0,13,8,3,...,0,1.95,3.60,3.60,1.95,3.60,3.90,1.97,3.55,3.80
4,Burnley,Southampton,2,3,0,0,0,10,11,4,...,0,2.62,3.20,2.75,2.65,3.20,2.75,2.65,3.20,2.75
5,Crystal Palace,Everton,1,0,0,0,0,6,10,2,...,1,3.00,3.25,2.37,3.20,3.20,2.35,3.10,3.20,2.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3796,Norwich,Arsenal,0,0,2,0,0,11,14,5,...,0,4.50,3.80,1.85,4.00,3.50,1.87,4.00,3.60,1.80
3797,Southampton,Man United,1,1,1,1,0,15,8,6,...,0,2.63,3.50,2.80,2.60,3.30,2.60,2.60,3.20,2.60
3798,Sunderland,Swansea,0,1,3,0,2,20,8,4,...,0,2.30,3.40,3.40,2.20,3.40,3.10,2.00,3.30,3.60
3799,Tottenham,Aston Villa,2,3,0,3,0,12,4,6,...,0,1.50,4.50,7.50,1.45,4.50,6.25,1.45,4.00,7.00


In [9]:
# Encoding
label_encoder = LabelEncoder()
matches['HomeTeam'] = label_encoder.fit_transform(matches['HomeTeam'])
matches['AwayTeam'] = label_encoder.fit_transform(matches['AwayTeam'])
team_mapping_legend = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
team_mapping_legend = {v: k for k, v in team_mapping_legend.items()}
print("Team Mapping Legend:", team_mapping_legend)
num_teams = len(team_mapping_legend)
matches

Team Mapping Legend: {0: 'Arsenal', 1: 'Aston Villa', 2: 'Bournemouth', 3: 'Brentford', 4: 'Brighton', 5: 'Burnley', 6: 'Cardiff', 7: 'Chelsea', 8: 'Crystal Palace', 9: 'Everton', 10: 'Fulham', 11: 'Huddersfield', 12: 'Hull', 13: 'Leeds', 14: 'Leicester', 15: 'Liverpool', 16: 'Man City', 17: 'Man United', 18: 'Middlesbrough', 19: 'Newcastle', 20: 'Norwich', 21: "Nott'm Forest", 22: 'QPR', 23: 'Sheffield United', 24: 'Southampton', 25: 'Stoke', 26: 'Sunderland', 27: 'Swansea', 28: 'Tottenham', 29: 'Watford', 30: 'West Brom', 31: 'West Ham', 32: 'Wolves'}


Unnamed: 0_level_0,HomeTeam,AwayTeam,result,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,...,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,15,20,2,4,1,4,0,15,12,7,...,0,1.14,10.00,19.00,1.14,8.25,18.50,1.15,8.00,18.00
2,31,16,0,0,5,0,1,5,14,3,...,0,12.00,6.50,1.22,11.50,5.75,1.26,11.00,6.10,1.25
3,2,23,1,1,1,0,0,13,8,3,...,0,1.95,3.60,3.60,1.95,3.60,3.90,1.97,3.55,3.80
4,5,24,2,3,0,0,0,10,11,4,...,0,2.62,3.20,2.75,2.65,3.20,2.75,2.65,3.20,2.75
5,8,9,1,0,0,0,0,6,10,2,...,1,3.00,3.25,2.37,3.20,3.20,2.35,3.10,3.20,2.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3796,20,0,0,0,2,0,0,11,14,5,...,0,4.50,3.80,1.85,4.00,3.50,1.87,4.00,3.60,1.80
3797,24,17,1,1,1,1,0,15,8,6,...,0,2.63,3.50,2.80,2.60,3.30,2.60,2.60,3.20,2.60
3798,26,27,0,1,3,0,2,20,8,4,...,0,2.30,3.40,3.40,2.20,3.40,3.10,2.00,3.30,3.60
3799,28,1,2,3,0,3,0,12,4,6,...,0,1.50,4.50,7.50,1.45,4.50,6.25,1.45,4.00,7.00


In [10]:
stats = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0, 0]
matchups = [[stats for _ in range(num_teams)] for _ in range(num_teams)]
# for row in matchups:
#     print(row)

In [20]:
train_set = matches.drop(columns=["HomeTeam","AwayTeam"])
X = train_set.drop(["result","FTHG","FTAG","HTHG","HTAG"], axis=1)
y = train_set["result"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

catboost_model = CatBoostClassifier(
    iterations=1000, 
    depth=3, 
    learning_rate=0.05, 
    l2_leaf_reg=3,
    loss_function='MultiClass',
)
xgb_model = xgb.XGBClassifier(
    learning_rate=0.05,
    n_estimators=1000,
    max_depth=4,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.3,
    objective='multi:softprob',
    num_class=len(y_train.unique())
)

estimators = [('xgb', xgb_model), ('catboost', catboost_model)]
stacking_model = StackingClassifier(estimators=estimators)
stacking_model.fit(X_train, y_train)

y_pred = stacking_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))


0:	learn: 1.0827007	total: 1.32ms	remaining: 1.32s
1:	learn: 1.0676991	total: 2.35ms	remaining: 1.17s
2:	learn: 1.0557336	total: 3.43ms	remaining: 1.14s
3:	learn: 1.0442180	total: 4.46ms	remaining: 1.11s
4:	learn: 1.0320313	total: 5.63ms	remaining: 1.12s
5:	learn: 1.0222969	total: 6.65ms	remaining: 1.1s
6:	learn: 1.0123196	total: 7.69ms	remaining: 1.09s
7:	learn: 1.0031726	total: 8.69ms	remaining: 1.08s
8:	learn: 0.9939773	total: 9.79ms	remaining: 1.08s
9:	learn: 0.9855601	total: 10.8ms	remaining: 1.07s
10:	learn: 0.9782273	total: 11.8ms	remaining: 1.06s
11:	learn: 0.9734460	total: 12.8ms	remaining: 1.05s
12:	learn: 0.9663317	total: 13.9ms	remaining: 1.05s
13:	learn: 0.9596990	total: 15.2ms	remaining: 1.07s
14:	learn: 0.9544335	total: 16.2ms	remaining: 1.06s
15:	learn: 0.9499988	total: 17.3ms	remaining: 1.06s
16:	learn: 0.9441069	total: 18.3ms	remaining: 1.06s
17:	learn: 0.9404751	total: 19.3ms	remaining: 1.05s
18:	learn: 0.9359896	total: 20.2ms	remaining: 1.04s
19:	learn: 0.9316565	to

In [13]:
def match_predict(feature_values):
    feature_df = pd.DataFrame([feature_values], columns=X.columns)
    return stacking_model.predict(feature_df)

In [14]:
print(match_predict([22,14,10,6,9,3,14,5,0,1,0,0,1.3,6,9,1.3,6,9,1.3,6.25,8.75]))

[2]


In [15]:
matches

Unnamed: 0_level_0,HomeTeam,AwayTeam,result,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,...,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA
match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,15,20,2,4,1,4,0,15,12,7,...,0,1.14,10.00,19.00,1.14,8.25,18.50,1.15,8.00,18.00
2,31,16,0,0,5,0,1,5,14,3,...,0,12.00,6.50,1.22,11.50,5.75,1.26,11.00,6.10,1.25
3,2,23,1,1,1,0,0,13,8,3,...,0,1.95,3.60,3.60,1.95,3.60,3.90,1.97,3.55,3.80
4,5,24,2,3,0,0,0,10,11,4,...,0,2.62,3.20,2.75,2.65,3.20,2.75,2.65,3.20,2.75
5,8,9,1,0,0,0,0,6,10,2,...,1,3.00,3.25,2.37,3.20,3.20,2.35,3.10,3.20,2.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3796,20,0,0,0,2,0,0,11,14,5,...,0,4.50,3.80,1.85,4.00,3.50,1.87,4.00,3.60,1.80
3797,24,17,1,1,1,1,0,15,8,6,...,0,2.63,3.50,2.80,2.60,3.30,2.60,2.60,3.20,2.60
3798,26,27,0,1,3,0,2,20,8,4,...,0,2.30,3.40,3.40,2.20,3.40,3.10,2.00,3.30,3.60
3799,28,1,2,3,0,3,0,12,4,6,...,0,1.50,4.50,7.50,1.45,4.50,6.25,1.45,4.00,7.00


In [16]:
for id, row in matches.iterrows():
    features = ['HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR','B365H','B365D','B365A','BWH','BWD','BWA','IWH','IWD','IWA']
    df = row[features]
    feat_arr = df.to_numpy()
    home = int(row['HomeTeam'])
    away = int(row['AwayTeam'])
    prev = matchups[home][away]
    if(sum(prev) <= 0):
        matchups[home][away] = feat_arr
    else:
        matchups[home][away] = [(x + y) / 2 for x, y in zip(prev, df)]

In [17]:
matchups_predictions = [['-' for _ in range(num_teams)] for _ in range(num_teams)]
for row_index, row in enumerate(matchups_predictions):
    for col_index, element in enumerate(row):
            if(row_index != col_index):
                prediction = match_predict(matchups[row_index][col_index])
                matchups_predictions[row_index][col_index] = str(prediction[0])
for row in matchups_predictions:
    print(row)

['-', '2', '2', '2', '2', '2', '2', '0', '2', '2', '2', '2', '2', '2', '2', '2', '0', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2']
['0', '-', '2', '2', '2', '0', '2', '0', '1', '0', '0', '2', '2', '2', '2', '0', '0', '0', '2', '0', '2', '2', '2', '1', '0', '0', '2', '2', '0', '0', '2', '2', '2']
['0', '2', '-', '0', '0', '2', '2', '0', '2', '2', '2', '2', '2', '2', '2', '0', '0', '0', '2', '0', '2', '2', '2', '2', '0', '0', '2', '2', '0', '2', '2', '2', '0']
['0', '2', '2', '-', '2', '2', '2', '0', '2', '0', '2', '2', '2', '2', '1', '0', '0', '2', '2', '0', '2', '2', '2', '2', '2', '2', '2', '2', '0', '2', '2', '2', '1']
['0', '2', '2', '2', '-', '0', '0', '0', '1', '0', '0', '2', '2', '2', '0', '0', '0', '2', '2', '2', '2', '2', '2', '2', '2', '1', '2', '2', '0', '2', '2', '2', '2']
['0', '0', '2', '1', '2', '-', '2', '0', '2', '0', '1', '2', '1', '0', '0', '0', '0', '0', '2', '1', '2', '2', '0', '2', '2', '1', '2', '2', '0', '2', '2', '0', '2']
['0'

In [18]:
test_set = pd.read_csv("s23_24.csv")
test_set['HomeTeam_encoded'] = label_encoder.fit_transform(test_set['HomeTeam'])
test_set['AwayTeam_encoded'] = label_encoder.fit_transform(test_set['AwayTeam'])
test_set['Predicted Outcome'] = [matchups_predictions[row['HomeTeam_encoded']][row['AwayTeam_encoded']] for _, row in test_set.iterrows()]
outcome_mapping = {'0': 'L', '1': 'D', '2': 'W'}
test_set['Predicted Outcome'] = test_set['Predicted Outcome'].map(outcome_mapping)
test_set = test_set.drop(['HomeTeam_encoded', 'AwayTeam_encoded'], axis=1)

# test_set.to_csv('OUTPUT.csv', index=False)
test_set


Unnamed: 0,HomeTeam,AwayTeam,Predicted Outcome
0,Burnley,Man City,W
1,Arsenal,Nott'm Forest,W
2,Bournemouth,West Ham,L
3,Everton,Fulham,L
4,Sheffield United,Crystal Palace,W
...,...,...,...
112,Sheffield United,Bournemouth,W
113,Brentford,Arsenal,L
114,Tottenham,Aston Villa,W
115,Everton,Man United,D


## GPT-4 Model Implementation

## Model Ensembling