# Premier League Predictions Using Statistics, Betting Odds, and GPT-4

### Imports

In [1]:
import os
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier, XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, StackingClassifier, StackingRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from autofeat import AutoFeatClassifier

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Approach:

Make lookup table for each possible matchup to make it identifiable, so predicting is just indexing table

Elements are array of averaged stats for every matchup in training set

This is passed into model to give a result to put into table

Each element is a weight determined by a model

23/24 Teams Undefined: Luton Town

### Statistics

In [2]:
matches = pd.read_csv("data/SPREADS_BIN_CLASS.csv", index_col=0)
matches = matches.dropna()
matches

Unnamed: 0,HomeTeam,AwayTeam,result,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,...,HY,AY,HR,AR,B365,B365Spread,BW,BSpread,IW,ISpread
1,Liverpool,Norwich,1.0,4,1,4,0,15,12,7,...,0,2,0,0,2,8.930,2,1.570,2,1.575
2,West Ham,Man City,0.0,0,5,0,1,5,14,3,...,2,2,0,0,0,-5.390,0,0.630,0,0.025
3,Bournemouth,Sheffield United,0.5,1,1,0,0,13,8,3,...,2,1,0,0,2,0.825,2,-0.675,2,-0.665
4,Burnley,Southampton,1.0,3,0,0,0,10,11,4,...,0,0,0,0,2,0.065,1,-0.500,1,-0.500
5,Crystal Palace,Everton,0.5,0,0,0,0,6,10,2,...,2,1,0,1,0,-0.315,0,-0.425,1,-0.450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3796,Norwich,Arsenal,0.0,0,2,0,0,11,14,5,...,1,0,0,0,0,-1.325,0,-0.565,0,-0.700
3797,Southampton,Man United,0.5,1,1,1,0,15,8,6,...,3,2,0,0,2,0.085,1,-0.700,2,-0.600
3798,Sunderland,Swansea,0.0,1,3,0,2,20,8,4,...,2,3,0,0,2,0.550,1,-0.750,2,-0.500
3799,Tottenham,Aston Villa,1.0,3,0,3,0,12,4,6,...,1,0,0,0,2,3.000,2,-0.650,2,0.225


In [3]:
# Encoding
label_encoder = LabelEncoder()
matches['HomeTeam'] = label_encoder.fit_transform(matches['HomeTeam'])
matches['AwayTeam'] = label_encoder.fit_transform(matches['AwayTeam'])
team_mapping_legend = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
team_mapping_legend = {v: k for k, v in team_mapping_legend.items()}
print("Team Mapping Legend:", team_mapping_legend)
num_teams = len(team_mapping_legend)
matches

Team Mapping Legend: {0: 'Arsenal', 1: 'Aston Villa', 2: 'Bournemouth', 3: 'Brentford', 4: 'Brighton', 5: 'Burnley', 6: 'Cardiff', 7: 'Chelsea', 8: 'Crystal Palace', 9: 'Everton', 10: 'Fulham', 11: 'Huddersfield', 12: 'Hull', 13: 'Leeds', 14: 'Leicester', 15: 'Liverpool', 16: 'Man City', 17: 'Man United', 18: 'Middlesbrough', 19: 'Newcastle', 20: 'Norwich', 21: "Nott'm Forest", 22: 'QPR', 23: 'Sheffield United', 24: 'Southampton', 25: 'Stoke', 26: 'Sunderland', 27: 'Swansea', 28: 'Tottenham', 29: 'Watford', 30: 'West Brom', 31: 'West Ham', 32: 'Wolves'}


Unnamed: 0,HomeTeam,AwayTeam,result,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,...,HY,AY,HR,AR,B365,B365Spread,BW,BSpread,IW,ISpread
1,15,20,1.0,4,1,4,0,15,12,7,...,0,2,0,0,2,8.930,2,1.570,2,1.575
2,31,16,0.0,0,5,0,1,5,14,3,...,2,2,0,0,0,-5.390,0,0.630,0,0.025
3,2,23,0.5,1,1,0,0,13,8,3,...,2,1,0,0,2,0.825,2,-0.675,2,-0.665
4,5,24,1.0,3,0,0,0,10,11,4,...,0,0,0,0,2,0.065,1,-0.500,1,-0.500
5,8,9,0.5,0,0,0,0,6,10,2,...,2,1,0,1,0,-0.315,0,-0.425,1,-0.450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3796,20,0,0.0,0,2,0,0,11,14,5,...,1,0,0,0,0,-1.325,0,-0.565,0,-0.700
3797,24,17,0.5,1,1,1,0,15,8,6,...,3,2,0,0,2,0.085,1,-0.700,2,-0.600
3798,26,27,0.0,1,3,0,2,20,8,4,...,2,3,0,0,2,0.550,1,-0.750,2,-0.500
3799,28,1,1.0,3,0,3,0,12,4,6,...,1,0,0,0,2,3.000,2,-0.650,2,0.225


In [4]:
stats = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
matchups = [[stats for _ in range(num_teams)] for _ in range(num_teams)]

In [None]:
train_set = matches.drop(columns=["HomeTeam","AwayTeam"])
X = train_set.drop(["result","FTHG","FTAG"], axis=1)
y = train_set["result"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

catboost_model = CatBoostRegressor(
    iterations=1000, 
    depth=3, 
    learning_rate=0.05, 
    l2_leaf_reg=3,
    loss_function='RMSE'
)

xgb_model = xgb.XGBRegressor(
    learning_rate=0.05,
    n_estimators=1000,
    max_depth=4,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.3,
    objective='binary:logistic'
)

estimators = [('xgb', xgb_model), ('catboost', catboost_model)]
stacking_model = StackingRegressor(estimators=estimators)
stacking_model.fit(X_train, y_train)

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Drop unnecessary columns
X = train_set.drop(["result", "FTHG", "FTAG", "HTHG", "HTAG"], axis=1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Neural network model using Keras
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.5),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(3, activation='softmax')  # Assuming 3 classes in the target variable
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1, verbose=2)


In [15]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy: ", accuracy)

Accuracy: 0.7502631378173828


In [7]:
def match_predict(feature_values):
    feature_df = pd.DataFrame([feature_values], columns=X.columns)
    return stacking_model.predict(feature_df)

In [8]:
matches

Unnamed: 0,HomeTeam,AwayTeam,result,FTHG,FTAG,HTHG,HTAG,HS,AS,HST,...,HY,AY,HR,AR,B365,B365Spread,BW,BSpread,IW,ISpread
1,15,20,1.0,4,1,4,0,15,12,7,...,0,2,0,0,2,8.930,2,1.570,2,1.575
2,31,16,0.0,0,5,0,1,5,14,3,...,2,2,0,0,0,-5.390,0,0.630,0,0.025
3,2,23,0.5,1,1,0,0,13,8,3,...,2,1,0,0,2,0.825,2,-0.675,2,-0.665
4,5,24,1.0,3,0,0,0,10,11,4,...,0,0,0,0,2,0.065,1,-0.500,1,-0.500
5,8,9,0.5,0,0,0,0,6,10,2,...,2,1,0,1,0,-0.315,0,-0.425,1,-0.450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3796,20,0,0.0,0,2,0,0,11,14,5,...,1,0,0,0,0,-1.325,0,-0.565,0,-0.700
3797,24,17,0.5,1,1,1,0,15,8,6,...,3,2,0,0,2,0.085,1,-0.700,2,-0.600
3798,26,27,0.0,1,3,0,2,20,8,4,...,2,3,0,0,2,0.550,1,-0.750,2,-0.500
3799,28,1,1.0,3,0,3,0,12,4,6,...,1,0,0,0,2,3.000,2,-0.650,2,0.225


In [9]:
for id, row in matches.iterrows():
    features = ['HTHG','HTAG','HS','AS','HST','AST','HF','AF','HC','AC','HY','AY','HR','AR','B365','B365Spread','BW','BSpread','IW','ISpread']
    df = row[features]
    feat_arr = df.to_numpy()
    home = int(row['HomeTeam'])
    away = int(row['AwayTeam'])
    prev = matchups[home][away]
    if(sum(prev) <= 0):
        matchups[home][away] = feat_arr
    else:
        matchups[home][away] = [(x + y) / 2 for x, y in zip(prev, df)]

In [12]:
matchups_predictions = [['-' for _ in range(num_teams)] for _ in range(num_teams)]
for row_index, row in enumerate(matchups_predictions):
    for col_index, element in enumerate(row):
            if(row_index != col_index):
                prediction = match_predict(matchups[row_index][col_index])
                matchups_predictions[row_index][col_index] = float(np.where(prediction[0] < 0.25, 0, np.where(prediction[0] <= 0.7, 0.5, 1)))
                matchups_predictions[row_index][col_index] = prediction[0]

In [13]:
test_set = pd.read_csv("data/s23_24.csv")
test_set['HomeTeam_encoded'] = label_encoder.fit_transform(test_set['HomeTeam'])
test_set['AwayTeam_encoded'] = label_encoder.fit_transform(test_set['AwayTeam'])
win_chances = [matchups_predictions[row['HomeTeam_encoded']][row['AwayTeam_encoded']] for _, row in test_set.iterrows()]
test_set['Win Chance'] = [f"{min(matchups_predictions[row['HomeTeam_encoded']][row['AwayTeam_encoded']] * 100, 95):.2f}%" for _, row in test_set.iterrows()]

# outcome_mapping = {0: 'L', 0.5: 'D', 1: 'W'}
# test_set['Predicted Outcome'] = test_set['Predicted Outcome'].map(outcome_mapping)
test_set = test_set.drop(['HomeTeam_encoded', 'AwayTeam_encoded'], axis=1)

test_set.to_csv('OUTPUT.csv', index=False)
test_set


Unnamed: 0,HomeTeam,AwayTeam,Win Chance
0,Burnley,Man City,65.65%
1,Arsenal,Nott'm Forest,88.26%
2,Bournemouth,West Ham,34.45%
3,Everton,Fulham,42.36%
4,Sheffield United,Crystal Palace,35.27%
...,...,...,...
112,Sheffield United,Bournemouth,90.65%
113,Brentford,Arsenal,15.50%
114,Tottenham,Aston Villa,90.14%
115,Everton,Man United,73.41%


## GPT-4