# Predicting Premier League Outcomes Using Articles and Statistics

### Imports

In [83]:
import os
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, StackingClassifier, StackingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from autofeat import AutoFeatClassifier

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Approach:

Make lookup table for each possible matchup to make it identifiable, so predicting is just indexing table

Elements are array of averaged stats for every matchup in training set

This is passed into model to give a result to put into table

Each element is a weight determined by a model

23/24 Teams Undefined: Luton Town

### Statistics

In [84]:
matches = pd.read_csv("data/SPREADS_BIN_CLASS.csv", index_col=0)
matches = matches.dropna()
matches

FileNotFoundError: [Errno 2] No such file or directory: 'SPREADS_BIN_CLASS.csv'

In [None]:
# Encoding
label_encoder = LabelEncoder()
matches['HomeTeam'] = label_encoder.fit_transform(matches['HomeTeam'])
matches['AwayTeam'] = label_encoder.fit_transform(matches['AwayTeam'])
team_mapping_legend = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
team_mapping_legend = {v: k for k, v in team_mapping_legend.items()}
print("Team Mapping Legend:", team_mapping_legend)
num_teams = len(team_mapping_legend)
matches

Team Mapping Legend: {0: 'Arsenal', 1: 'Aston Villa', 2: 'Bournemouth', 3: 'Brentford', 4: 'Brighton', 5: 'Burnley', 6: 'Cardiff', 7: 'Chelsea', 8: 'Crystal Palace', 9: 'Everton', 10: 'Fulham', 11: 'Huddersfield', 12: 'Hull', 13: 'Leeds', 14: 'Leicester', 15: 'Liverpool', 16: 'Man City', 17: 'Man United', 18: 'Middlesbrough', 19: 'Newcastle', 20: 'Norwich', 21: "Nott'm Forest", 22: 'QPR', 23: 'Sheffield United', 24: 'Southampton', 25: 'Stoke', 26: 'Sunderland', 27: 'Swansea', 28: 'Tottenham', 29: 'Watford', 30: 'West Brom', 31: 'West Ham', 32: 'Wolves'}


Unnamed: 0,HomeTeam,AwayTeam,result,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,...,HY,AY,HR,AR,B365,B365Spread,BW,BSpread,IW,ISpread
1,15,20,1.0,4,1,4,0,15,12,7,...,0,2,0,0,2,8.930,2,1.570,2,1.575
2,31,16,0.0,0,5,0,1,5,14,3,...,2,2,0,0,0,-5.390,0,0.630,0,0.025
3,2,23,0.5,1,1,0,0,13,8,3,...,2,1,0,0,2,0.825,2,-0.675,2,-0.665
4,5,24,1.0,3,0,0,0,10,11,4,...,0,0,0,0,2,0.065,1,-0.500,1,-0.500
5,8,9,0.5,0,0,0,0,6,10,2,...,2,1,0,1,0,-0.315,0,-0.425,1,-0.450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3796,20,0,0.0,0,2,0,0,11,14,5,...,1,0,0,0,0,-1.325,0,-0.565,0,-0.700
3797,24,17,0.5,1,1,1,0,15,8,6,...,3,2,0,0,2,0.085,1,-0.700,2,-0.600
3798,26,27,0.0,1,3,0,2,20,8,4,...,2,3,0,0,2,0.550,1,-0.750,2,-0.500
3799,28,1,1.0,3,0,3,0,12,4,6,...,1,0,0,0,2,3.000,2,-0.650,2,0.225


In [None]:
stats = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
matchups = [[stats for _ in range(num_teams)] for _ in range(num_teams)]

In [None]:
train_set = matches.drop(columns=["HomeTeam","AwayTeam"])
X = train_set.drop(["result","FTHG","FTAG"], axis=1)
y = train_set["result"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

catboost_model = CatBoostRegressor(
    iterations=1000, 
    depth=3, 
    learning_rate=0.05, 
    l2_leaf_reg=3,
    loss_function='RMSE'
)

xgb_model = xgb.XGBRegressor(
    learning_rate=0.05,
    n_estimators=1000,
    max_depth=4,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.3,
    objective='binary:logistic'
)

estimators = [('xgb', xgb_model), ('catboost', catboost_model)]
stacking_model = StackingRegressor(estimators=estimators)
stacking_model.fit(X_train, y_train)

0:	learn: 0.4267716	total: 625us	remaining: 625ms
1:	learn: 0.4191294	total: 1.14ms	remaining: 569ms
2:	learn: 0.4118595	total: 1.69ms	remaining: 562ms
3:	learn: 0.4053313	total: 2.19ms	remaining: 546ms
4:	learn: 0.3992491	total: 2.72ms	remaining: 541ms
5:	learn: 0.3938016	total: 3.17ms	remaining: 525ms
6:	learn: 0.3885252	total: 3.66ms	remaining: 520ms
7:	learn: 0.3837685	total: 4.18ms	remaining: 518ms
8:	learn: 0.3792293	total: 4.65ms	remaining: 512ms
9:	learn: 0.3751512	total: 5.14ms	remaining: 509ms
10:	learn: 0.3719085	total: 5.66ms	remaining: 509ms
11:	learn: 0.3682733	total: 6.2ms	remaining: 510ms
12:	learn: 0.3647429	total: 6.68ms	remaining: 507ms
13:	learn: 0.3619320	total: 7.19ms	remaining: 506ms
14:	learn: 0.3590061	total: 7.69ms	remaining: 505ms
15:	learn: 0.3560876	total: 8.17ms	remaining: 503ms
16:	learn: 0.3531152	total: 8.67ms	remaining: 502ms
17:	learn: 0.3501965	total: 9.21ms	remaining: 503ms
18:	learn: 0.3479880	total: 9.73ms	remaining: 503ms
19:	learn: 0.3456250	tot

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow import keras
from tensorflow.keras import layers

# Assuming you have a DataFrame named train_set with the given structure
# Replace this with your actual DataFrame

# Load your dataset
# train_set = pd.read_csv("your_dataset.csv")

# Drop unnecessary columns
X = train_set.drop(["result", "FTHG", "FTAG", "HTHG", "HTAG", "result_encoded"], axis=1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Neural network model using Keras
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(3, activation='softmax')  # Assuming 3 classes in the target variable
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1, verbose=2)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy:", accuracy)


Epoch 1/100
86/86 - 0s - loss: 3.0349 - accuracy: 0.3659 - val_loss: 1.0369 - val_accuracy: 0.4309 - 458ms/epoch - 5ms/step
Epoch 2/100
86/86 - 0s - loss: 1.3103 - accuracy: 0.3936 - val_loss: 1.0466 - val_accuracy: 0.5822 - 75ms/epoch - 873us/step
Epoch 3/100
86/86 - 0s - loss: 1.1243 - accuracy: 0.4349 - val_loss: 1.0431 - val_accuracy: 0.5822 - 77ms/epoch - 896us/step
Epoch 4/100
86/86 - 0s - loss: 1.0847 - accuracy: 0.4635 - val_loss: 1.0226 - val_accuracy: 0.5493 - 78ms/epoch - 901us/step
Epoch 5/100
86/86 - 0s - loss: 1.0576 - accuracy: 0.4755 - val_loss: 1.0200 - val_accuracy: 0.5592 - 75ms/epoch - 872us/step
Epoch 6/100
86/86 - 0s - loss: 1.0398 - accuracy: 0.4784 - val_loss: 0.9797 - val_accuracy: 0.5789 - 80ms/epoch - 928us/step
Epoch 7/100
86/86 - 0s - loss: 1.0159 - accuracy: 0.5040 - val_loss: 0.9678 - val_accuracy: 0.6250 - 76ms/epoch - 878us/step
Epoch 8/100
86/86 - 0s - loss: 0.9985 - accuracy: 0.5175 - val_loss: 0.9435 - val_accuracy: 0.6184 - 76ms/epoch - 878us/step
E

In [None]:
def match_predict(feature_values):
    feature_df = pd.DataFrame([feature_values], columns=X.columns)
    return stacking_model.predict(feature_df)

In [None]:
print(match_predict([4,0,15,12,7,5,9,9,11,2,0,2,0,0,2,8.93,2,1.57,2,1.575]))

[0.9539599]


In [None]:
matches

Unnamed: 0,HomeTeam,AwayTeam,result,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,...,HY,AY,HR,AR,B365,B365Spread,BW,BSpread,IW,ISpread
1,15,20,1.0,4,1,4,0,15,12,7,...,0,2,0,0,2,8.930,2,1.570,2,1.575
2,31,16,0.0,0,5,0,1,5,14,3,...,2,2,0,0,0,-5.390,0,0.630,0,0.025
3,2,23,0.5,1,1,0,0,13,8,3,...,2,1,0,0,2,0.825,2,-0.675,2,-0.665
4,5,24,1.0,3,0,0,0,10,11,4,...,0,0,0,0,2,0.065,1,-0.500,1,-0.500
5,8,9,0.5,0,0,0,0,6,10,2,...,2,1,0,1,0,-0.315,0,-0.425,1,-0.450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3796,20,0,0.0,0,2,0,0,11,14,5,...,1,0,0,0,0,-1.325,0,-0.565,0,-0.700
3797,24,17,0.5,1,1,1,0,15,8,6,...,3,2,0,0,2,0.085,1,-0.700,2,-0.600
3798,26,27,0.0,1,3,0,2,20,8,4,...,2,3,0,0,2,0.550,1,-0.750,2,-0.500
3799,28,1,1.0,3,0,3,0,12,4,6,...,1,0,0,0,2,3.000,2,-0.650,2,0.225


In [None]:
for id, row in matches.iterrows():
    features = ['HTHG','HTAG','HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR','B365','B365Spread','BW','BSpread','IW','ISpread']
    df = row[features]
    feat_arr = df.to_numpy()
    home = int(row['HomeTeam'])
    away = int(row['AwayTeam'])
    prev = matchups[home][away]
    if(sum(prev) <= 0):
        matchups[home][away] = feat_arr
    else:
        matchups[home][away] = [(x + y) / 2 for x, y in zip(prev, df)]

In [None]:
matchups_predictions = [['-' for _ in range(num_teams)] for _ in range(num_teams)]
for row_index, row in enumerate(matchups_predictions):
    for col_index, element in enumerate(row):
            if(row_index != col_index):
                prediction = match_predict(matchups[row_index][col_index])
                # matchups_predictions[row_index][col_index] = prediction[0]
                matchups_predictions[row_index][col_index] = float(np.where(prediction[0] < 0.25, 0, np.where(prediction[0] <= 0.7, 0.5, 1)))
for row in matchups_predictions:
    print(row)

['-', 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 0.0, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.5, 0.5, 1.0, 1.0, 1.0, 0.5]
[0.0, '-', 0.5, 1.0, 1.0, 0.5, 1.0, 0.5, 0.5, 0.5, 0.5, 0.0, 1.0, 1.0, 0.5, 0.0, 0.0, 0.5, 0.0, 0.0, 1.0, 1.0, 1.0, 0.5, 0.5, 0.0, 0.5, 0.5, 0.0, 0.0, 0.5, 0.5, 0.5]
[0.5, 0.5, '-', 0.5, 0.5, 1.0, 1.0, 0.0, 0.5, 0.5, 0.5, 1.0, 1.0, 1.0, 0.5, 0.0, 0.0, 0.5, 1.0, 0.0, 1.0, 0.5, 0.0, 0.5, 0.5, 0.0, 1.0, 1.0, 0.0, 0.5, 0.5, 1.0, 0.5]
[0.0, 0.5, 1.0, '-', 0.5, 1.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 1.0, 0.5, 0.5, 0.0, 1.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.5, 0.5, 0.0, 1.0, 0.5]
[0.5, 0.5, 0.5, 1.0, '-', 0.5, 0.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.0, 0.5, 0.5, 0.0, 0.0, 0.5, 0.0, 0.5, 0.5, 1.0, 0.0, 0.5, 0.5, 0.5, 0.0, 1.0, 0.5, 0.5, 1.0, 0.5, 1.0]
[0.5, 0.0, 1.0, 1.0, 0.5, '-', 1.0, 0.0, 1.0, 0.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.0, 0.0, 0.0, 1.0, 0.5, 1.0, 0.0, 1.0, 1.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1.0, 0.5, 0.5]
[0.0

In [None]:
test_set = pd.read_csv("s23_24.csv")
test_set['HomeTeam_encoded'] = label_encoder.fit_transform(test_set['HomeTeam'])
test_set['AwayTeam_encoded'] = label_encoder.fit_transform(test_set['AwayTeam'])
test_set['Predicted Outcome'] = [matchups_predictions[row['HomeTeam_encoded']][row['AwayTeam_encoded']] for _, row in test_set.iterrows()]
# outcome_mapping = {0: 'L', 0.5: 'D', 1: 'W'}
# test_set['Predicted Outcome'] = test_set['Predicted Outcome'].map(outcome_mapping)
test_set = test_set.drop(['HomeTeam_encoded', 'AwayTeam_encoded'], axis=1)

test_set.to_csv('OUTPUT.csv', index=False)
test_set


Unnamed: 0,HomeTeam,AwayTeam,Predicted Outcome
0,Burnley,Man City,0.5
1,Arsenal,Nott'm Forest,1.0
2,Bournemouth,West Ham,0.5
3,Everton,Fulham,0.5
4,Sheffield United,Crystal Palace,0.5
...,...,...,...
112,Sheffield United,Bournemouth,1.0
113,Brentford,Arsenal,0.0
114,Tottenham,Aston Villa,1.0
115,Everton,Man United,1.0


## GPT-4 Model Implementation

## Model Ensembling