# Premier League Predictions Using Statistics, Betting Odds, and GPT-4

### Imports

In [1]:
import os
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, StackingClassifier, StackingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from autofeat import AutoFeatClassifier

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Approach:

Make lookup table for each possible matchup to make it identifiable, so predicting is just indexing table of model predictions based on previous matchups' aggregated stats.

Elements are array of aggregated stats and betting odds for every matchup in training set

This is passed into model to give a result to replace into table, this can be predicted outcome or win chances (classifier or regressor)

23/24 Teams Undefined: Luton Town, first season in premier league, no sufficient data

### Statistics

In [16]:
matches = pd.read_csv("data/matches.csv", index_col=0)
matches = matches.dropna()
matches

Unnamed: 0,Season,HomeTeam,AwayTeam,Result,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA
1,2023,Arsenal,Newcastle,D,0,0,1,0,0,1,...,0,1.80,3.80,4.33,1.80,3.6,4.40,1.83,3.75,4.20
2,2023,Arsenal,Man United,W,3,2,2,1,1,1,...,0,1.80,3.80,4.20,1.87,3.7,4.00,1.85,3.80,4.00
3,2023,Arsenal,Brentford,D,1,1,1,0,0,1,...,0,1.44,4.75,7.00,1.43,4.5,7.50,1.45,4.60,7.00
4,2023,Arsenal,Man City,L,1,3,0,1,1,1,...,0,2.90,3.50,2.35,2.85,3.4,2.40,2.90,3.30,2.45
5,2023,Arsenal,Everton,W,4,0,2,2,0,2,...,0,1.36,4.75,10.00,1.34,5.0,9.25,1.35,5.00,8.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3796,2018,Wolves,Tottenham,L,2,3,0,0,2,0,...,0,3.10,3.40,2.45,3.00,3.3,2.40,3.15,3.30,2.30
3797,2018,Wolves,Huddersfield,L,0,2,0,0,1,0,...,0,1.50,4.33,8.00,1.50,4.0,7.50,1.53,4.10,6.30
3798,2018,Wolves,Chelsea,W,2,1,2,0,1,0,...,0,5.00,3.75,1.80,4.75,3.7,1.75,4.50,3.60,1.80
3799,2018,Wolves,Bournemouth,W,2,0,2,1,0,2,...,0,1.95,3.60,4.33,1.95,3.5,4.00,2.00,3.40,3.80


#### Preprocessing:

Compute betting predictions by seeing which of H/D/A was highest/lowest and their spread.

Convert result to 0/0.5/1 for W/D/L

Drop Seasons

In [17]:
matches = pd.read_csv("data/matches_processed.csv", index_col=0)
matches = matches.dropna()
matches

Unnamed: 0,HomeTeam,AwayTeam,result,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,...,HY,AY,HR,AR,B365,B365Spread,BW,BSpread,IW,ISpread
1,Liverpool,Norwich,1.0,4,1,4,0,15,12,7,...,0,2,0,0,2,8.930,2,1.570,2,1.575
2,West Ham,Man City,0.0,0,5,0,1,5,14,3,...,2,2,0,0,0,-5.390,0,0.630,0,0.025
3,Bournemouth,Sheffield United,0.5,1,1,0,0,13,8,3,...,2,1,0,0,2,0.825,2,-0.675,2,-0.665
4,Burnley,Southampton,1.0,3,0,0,0,10,11,4,...,0,0,0,0,2,0.065,1,-0.500,1,-0.500
5,Crystal Palace,Everton,0.5,0,0,0,0,6,10,2,...,2,1,0,1,0,-0.315,0,-0.425,1,-0.450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3796,Norwich,Arsenal,0.0,0,2,0,0,11,14,5,...,1,0,0,0,0,-1.325,0,-0.565,0,-0.700
3797,Southampton,Man United,0.5,1,1,1,0,15,8,6,...,3,2,0,0,2,0.085,1,-0.700,2,-0.600
3798,Sunderland,Swansea,0.0,1,3,0,2,20,8,4,...,2,3,0,0,2,0.550,1,-0.750,2,-0.500
3799,Tottenham,Aston Villa,1.0,3,0,3,0,12,4,6,...,1,0,0,0,2,3.000,2,-0.650,2,0.225


Encode numerical labels for each team

In [3]:
# Encoding
label_encoder = LabelEncoder()
matches['HomeTeam'] = label_encoder.fit_transform(matches['HomeTeam'])
matches['AwayTeam'] = label_encoder.fit_transform(matches['AwayTeam'])
team_mapping_legend = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
team_mapping_legend = {v: k for k, v in team_mapping_legend.items()}
print("Team Mapping Legend:", team_mapping_legend)
num_teams = len(team_mapping_legend)
matches

Team Mapping Legend: {0: 'Arsenal', 1: 'Aston Villa', 2: 'Bournemouth', 3: 'Brentford', 4: 'Brighton', 5: 'Burnley', 6: 'Cardiff', 7: 'Chelsea', 8: 'Crystal Palace', 9: 'Everton', 10: 'Fulham', 11: 'Huddersfield', 12: 'Hull', 13: 'Leeds', 14: 'Leicester', 15: 'Liverpool', 16: 'Man City', 17: 'Man United', 18: 'Middlesbrough', 19: 'Newcastle', 20: 'Norwich', 21: "Nott'm Forest", 22: 'QPR', 23: 'Sheffield United', 24: 'Southampton', 25: 'Stoke', 26: 'Sunderland', 27: 'Swansea', 28: 'Tottenham', 29: 'Watford', 30: 'West Brom', 31: 'West Ham', 32: 'Wolves'}


Unnamed: 0,HomeTeam,AwayTeam,result,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,...,HY,AY,HR,AR,B365,B365Spread,BW,BSpread,IW,ISpread
1,15,20,1.0,4,1,4,0,15,12,7,...,0,2,0,0,2,8.930,2,1.570,2,1.575
2,31,16,0.0,0,5,0,1,5,14,3,...,2,2,0,0,0,-5.390,0,0.630,0,0.025
3,2,23,0.5,1,1,0,0,13,8,3,...,2,1,0,0,2,0.825,2,-0.675,2,-0.665
4,5,24,1.0,3,0,0,0,10,11,4,...,0,0,0,0,2,0.065,1,-0.500,1,-0.500
5,8,9,0.5,0,0,0,0,6,10,2,...,2,1,0,1,0,-0.315,0,-0.425,1,-0.450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3796,20,0,0.0,0,2,0,0,11,14,5,...,1,0,0,0,0,-1.325,0,-0.565,0,-0.700
3797,24,17,0.5,1,1,1,0,15,8,6,...,3,2,0,0,2,0.085,1,-0.700,2,-0.600
3798,26,27,0.0,1,3,0,2,20,8,4,...,2,3,0,0,2,0.550,1,-0.750,2,-0.500
3799,28,1,1.0,3,0,3,0,12,4,6,...,1,0,0,0,2,3.000,2,-0.650,2,0.225


Initialize a matchup table

In [4]:
stats = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
matchups = [[stats for _ in range(num_teams)] for _ in range(num_teams)]

### XGBoost + Catboost

In [None]:
train_set = matches.drop(columns=["HomeTeam","AwayTeam"])
X = train_set.drop(["result","FTHG","FTAG"], axis=1)
y = train_set["result"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

catboost_model = CatBoostRegressor(
    iterations=1000, 
    depth=3, 
    learning_rate=0.05, 
    l2_leaf_reg=3,
    loss_function='RMSE'
)

xgb_model = xgb.XGBRegressor(
    learning_rate=0.05,
    n_estimators=1000,
    max_depth=4,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.3,
    objective='binary:logistic'
)

estimators = [('xgb', xgb_model), ('catboost', catboost_model)]
stacking_model = StackingRegressor(estimators=estimators)
stacking_model.fit(X_train, y_train)

### Tensorflow experimenting

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Drop unnecessary columns
X = train_set.drop(["result", "FTHG", "FTAG", "HTHG", "HTAG"], axis=1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Neural network model using Keras
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(3, activation='softmax')  # Assuming 3 classes in the target variable
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1, verbose=2)


Metrics

In [15]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy: ", accuracy)

Accuracy: 0.7502631378173828


Predict function based on a single set of features

In [7]:
def match_predict(feature_values):
    feature_df = pd.DataFrame([feature_values], columns=X.columns)
    return stacking_model.predict(feature_df)

Store every matchup combination for all teams, stats are aggregated.

In [9]:
for id, row in matches.iterrows():
    features = ['HTHG','HTAG','HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR','B365','B365Spread','BW','BSpread','IW','ISpread']
    df = row[features]
    feat_arr = df.to_numpy()
    home = int(row['HomeTeam'])
    away = int(row['AwayTeam'])
    prev = matchups[home][away]
    if(sum(prev) <= 0):
        matchups[home][away] = feat_arr
    else:
        matchups[home][away] = [(x + y) / 2 for x, y in zip(prev, df)]

Make a prediction for each team's matchup based on their past matchups by calling our match_predict function from earlier

In [12]:
matchups_predictions = [['-' for _ in range(num_teams)] for _ in range(num_teams)]
for row_index, row in enumerate(matchups_predictions):
    for col_index, element in enumerate(row):
            if(row_index != col_index):
                prediction = match_predict(matchups[row_index][col_index])
                matchups_predictions[row_index][col_index] = float(np.where(prediction[0] < 0.25, 0, np.where(prediction[0] <= 0.7, 0.5, 1)))
                matchups_predictions[row_index][col_index] = prediction[0]

Given a list of match fixtures, use the predictions and matchup table to compute a "win percentage" for each home team

0% = Loss;    50% = Draw;    100% = Win

In [13]:
test_set = pd.read_csv("data/s23_24.csv")
test_set['HomeTeam_encoded'] = label_encoder.fit_transform(test_set['HomeTeam'])
test_set['AwayTeam_encoded'] = label_encoder.fit_transform(test_set['AwayTeam'])
win_chances = [matchups_predictions[row['HomeTeam_encoded']][row['AwayTeam_encoded']] for _, row in test_set.iterrows()]
test_set['Win Chance'] = [f"{min(matchups_predictions[row['HomeTeam_encoded']][row['AwayTeam_encoded']] * 100, 95):.2f}%" for _, row in test_set.iterrows()]

# outcome_mapping = {0: 'L', 0.5: 'D', 1: 'W'}
# test_set['Predicted Outcome'] = test_set['Predicted Outcome'].map(outcome_mapping)
test_set = test_set.drop(['HomeTeam_encoded', 'AwayTeam_encoded'], axis=1)

test_set.to_csv('OUTPUT.csv', index=False)
test_set


Unnamed: 0,HomeTeam,AwayTeam,Win Chance
0,Burnley,Man City,65.65%
1,Arsenal,Nott'm Forest,88.26%
2,Bournemouth,West Ham,34.45%
3,Everton,Fulham,42.36%
4,Sheffield United,Crystal Palace,35.27%
...,...,...,...
112,Sheffield United,Bournemouth,90.65%
113,Brentford,Arsenal,15.50%
114,Tottenham,Aston Villa,90.14%
115,Everton,Man United,73.41%


## GPT-4

List of target stats to return

In [6]:
list = ["HTAG", "HTHG", "FTAG", "FTHG"]

Create list of home teams, away teams, and dates based on input csv for 2023-2024 season

In [2]:
import csv


listOfHomeTeams = []
listOfAwayTeams = []
listOfDates = []
with open('data/23_24.csv', newline='') as csvfile:
    csv_reader = csv.reader(csvfile)

    for row in csv_reader:
        listOfDates.append(row[1])
        listOfHomeTeams.append(row[3])
        listOfAwayTeams.append(row[4])
    
    listOfHomeTeams.remove('HomeTeam')
    listOfAwayTeams.remove('AwayTeam')
    listOfDates.remove('Date')
    print(listOfHomeTeams)
    print(listOfAwayTeams)
    print(listOfDates)


['Burnley', 'Arsenal', 'Bournemouth', 'Everton', 'Sheffield United', 'Newcastle', 'Brentford', 'Chelsea', 'Man United', "Nott'm Forest", 'Fulham', 'Liverpool', 'Wolves', 'Tottenham', 'Man City', 'Aston Villa', 'West Ham', 'Crystal Palace', 'Bournemouth', 'Arsenal', 'Brentford', 'Everton', 'Man United', 'Brighton', 'Burnley', 'Sheffield United', 'Newcastle', 'Sheffield United', 'Brentford', 'Burnley', 'Chelsea', 'Man City', 'Brighton', 'Crystal Palace', 'Liverpool', 'Arsenal', 'Wolves', 'Aston Villa', 'Man United', 'Tottenham', 'West Ham', 'Newcastle', 'Bournemouth', 'Everton', "Nott'm Forest", 'Crystal Palace', 'Man City', 'Brentford', 'Burnley', 'Arsenal', 'Brighton', 'Chelsea', 'Liverpool', 'Sheffield United', 'Aston Villa', 'Bournemouth', 'Man United', 'Newcastle', 'West Ham', 'Wolves', 'Tottenham', "Nott'm Forest", 'Fulham', 'Burnley', 'Everton', 'Fulham', 'Man United', 'Crystal Palace', 'Brighton', 'West Ham', 'Wolves', 'Arsenal', 'Liverpool', 'Bournemouth', 'Brentford', 'Man City

Convert a CSV to text format to input to GPT

In [27]:
csv_path = 'data/truncated_matches.csv'

with open(csv_path, 'r') as file:
    csv_content = file.read()

print(csv_content)

,Season,HomeTeam,AwayTeam,Result,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA
1,2023,Arsenal,Newcastle,D,0,0,1,0,0,1,0,17,8,4,1,10,16,5,5,4,4,0,0,1.8,3.8,4.33,1.8,3.6,4.4,1.83,3.75,4.2
2,2023,Arsenal,Man United,W,3,2,2,1,1,1,3,25,6,5,4,8,9,12,4,1,2,0,0,1.8,3.8,4.2,1.87,3.7,4,1.85,3.8,4
3,2023,Arsenal,Brentford,D,1,1,1,0,0,1,28,23,9,7,2,9,9,7,4,0,2,0,0,1.44,4.75,7,1.43,4.5,7.5,1.45,4.6,7
4,2023,Arsenal,Man City,L,1,3,0,1,1,1,3,10,9,1,6,11,15,1,3,2,3,0,0,2.9,3.5,2.35,2.85,3.4,2.4,2.9,3.3,2.45
5,2023,Arsenal,Everton,W,4,0,2,2,0,2,24,15,8,5,5,5,12,5,1,0,2,0,0,1.36,4.75,10,1.34,5,9.25,1.35,5,8.75
6,2023,Arsenal,Bournemouth,W,3,2,2,0,1,0,5,31,4,9,4,6,8,17,1,0,2,0,0,1.22,7,12,1.21,6.5,14,1.22,6.75,14
7,2023,Arsenal,Crystal Palace,W,4,1,2,2,0,2,34,15,8,5,4,8,10,5,4,0,1,0,0,1.25,5.5,15,1.26,5.75,12,1.25,5.75,12
8,2023,Arsenal,Leeds,W,4,1,2,1,0,2,9,13,7,6,5,11,13,4,3,0,2,0,0,1.29,6,9,1.29,5.75,10,1.3,5.75,10
9,2023,Arsenal,So

Create GPT queries

In [4]:
import openai
from openai import OpenAI
client = OpenAI(api_key = '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')
dictionaryOfResults = {}
for i in range(len(listOfHomeTeams)):
  prompt = "Can you make guesses on the " + listOfHomeTeams[i] + "vs " + listOfAwayTeams[i] + " match." + "Return only your prediction of the score, no other text in your response, just the final score."
  completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
      {"role": "system", "content": "You are an expert in predicting premier leageue scores"},
      {"role": "user", "content": prompt}
    ]
  )
  print(listOfHomeTeams[i] + ":" + listOfAwayTeams[i] + ' ' + completion.choices[0].message.content)
  dictionaryOfResults[listOfHomeTeams[i] + ":" + listOfAwayTeams[i]] = completion.choices[0].message.content




Burnley:Man City 2-1
Arsenal:Nott'm Forest 3-1
Bournemouth:West Ham 1-1
Everton:Fulham 2-1
Sheffield United:Crystal Palace 2-1
