In [46]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, Normalizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight

from xgboost import XGBClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

import warnings
warnings.filterwarnings("ignore")

import sys
sys.path.append("..")

import utils

In [47]:
round_number = 5

## 1- Import data

In [48]:
seasons = {
    '2025-26': '../data/2025-26.csv',
    '2024-25': '../data/2024-25.csv',
    '2023-24': '../data/2023-24.csv',
    '2022-23': '../data/2022-23.csv',
    '2021-22': '../data/2021-22.csv',
    '2020-21': '../data/2020-21.csv',
    '2019-20': '../data/2019-20.csv',
    '2018-19': '../data/2018-19.csv',
    '2017-18': '../data/2017-18.csv',
    '2016-17': '../data/2016-17.csv'
}

master_df = utils.build_master_df(seasons, save_path="../data/all_seasons_data.csv")

Loading 2025-26...
Loading 2024-25...
Loading 2023-24...
Loading 2022-23...
Loading 2021-22...
Loading 2020-21...
Loading 2019-20...
Loading 2018-19...
Loading 2017-18...
Loading 2016-17...
Saved master dataset to ../data/all_seasons_data.csv (Total matches: 3460).


### Add betting market consensus probabilities

In [49]:
#Keep odds where >90% are populated. Odds are decimal.
odds = ['B365', 'BW', 'IW', 'VC', 'WH']

home_odds = [i + 'H' for i in odds]
draw_odds = [i + 'D' for i in odds]
away_odds = [i + 'A' for i in odds]

#Imput NA with mean across rows
master_df[home_odds] = master_df[home_odds].apply(lambda row: row.fillna(row.mean()), axis=1)
master_df[draw_odds] = master_df[draw_odds].apply(lambda row: row.fillna(row.mean()), axis=1)
master_df[away_odds] = master_df[away_odds].apply(lambda row: row.fillna(row.mean()), axis=1)

master_df['home_prob'] = 1/master_df[home_odds].mean(axis=1)
master_df['draw_prob'] = 1/master_df[draw_odds].mean(axis=1)
master_df['away_prob'] = 1/master_df[away_odds].mean(axis=1)

### Adding fixtures for 2025/26

In [50]:
fixtures = pd.read_csv('../data/fixtures.csv')

In [51]:
fixtures['Date'] = pd.to_datetime(fixtures['Date'])
fixtures['season'] = '2025-26'

In [52]:
team_map = {'Spurs': 'Tottenham',
            'Man Utd': 'Man United'}

fixture_column_map = {'Date': 'date',
                      'Home Team': 'home_team',
                      'Away Team': 'away_team',
                      'Round Number': 'round_number'}

fixtures['Home Team'] = fixtures['Home Team'].map(lambda x: team_map.get(x, x))
fixtures['Away Team'] = fixtures['Away Team'].map(lambda x: team_map.get(x, x))

fixtures = fixtures.rename(columns=fixture_column_map)
fixtures = fixtures[['date', 'home_team', 'away_team', 'round_number']]
fixtures = fixtures.loc[fixtures['round_number']==round_number]

In [53]:
master_column_map = {'Date': 'date',
                     'HomeTeam': 'home_team',
                     'AwayTeam': 'away_team',
                     'FTHG': 'home_goals',
                     'FTAG': 'away_goals',
                     'FTR': 'result',
                     'home_prob': 'home_prob',
                     'draw_prob': 'draw_prob',
                     'away_prob': 'away_prob',
                     'season': 'season'}

In [54]:
master_df = master_df.rename(columns=master_column_map)
master_df = master_df[list(master_column_map.values())]
master_df = master_df.assign(round_number=0)

In [55]:
master_df = pd.concat([master_df, fixtures])

## 2 - Adding Features

### Rolling features

In [56]:
window_len = 3

# Home points and Away points
points_map_home = {"H": 3, "D": 1, "A": 0}
points_map_away = {"H": 0, "D": 1, "A": 3}
master_df["home_points"] = master_df["result"].map(points_map_home)
master_df["away_points"] = master_df["result"].map(points_map_away)

# Rolling average home and away goals
master_df[f'avg_home_goals_last_{window_len}'] = master_df.groupby('home_team')['home_goals'].transform(lambda x: x.shift().rolling(window_len, min_periods=1).mean())
master_df[f'avg_away_goals_last_{window_len}'] = master_df.groupby('away_team')['away_goals'].transform(lambda x: x.shift().rolling(window_len, min_periods=1).mean())

# olling average home and away points per game
master_df[f'home_ppg_last_{window_len}'] = master_df.groupby('home_team')['home_points'].transform(lambda x: x.shift().rolling(window_len, min_periods=1).mean())
master_df[f'away_ppg_last_{window_len}'] = master_df.groupby('away_team')['away_points'].transform(lambda x: x.shift().rolling(window_len, min_periods=1).mean())

### Elo rating

In [57]:
elo_ratings, elo_df = utils.calculate_elo(master_df, k_factor=20, home_adv=100, start_rating=500)
master_df = master_df.merge(elo_df[["date", "home_team", "away_team", "home_elo_before", "away_elo_before", "home_elo_after", "away_elo_after"]], on=["date", "home_team", "away_team"])

In [58]:
master_df['date'] = pd.to_datetime(master_df['date'])

### Add rest days

In [59]:
master_df = utils.add_rest_days(master_df)

## 3 - Prepare Training Dataset

In [60]:
team_list = master_df['home_team'].unique().tolist()
home_team_cols = ['home_team_' + team for team in team_list]
away_team_cols = ['away_team_' + team for team in team_list]

In [61]:
feat_cols = ['home_team', 'away_team',
             f'avg_home_goals_last_{window_len}', f'avg_away_goals_last_{window_len}',
             f'home_ppg_last_{window_len}', f'away_ppg_last_{window_len}',
             'home_elo_before', 'away_elo_before',
             'days_rest_home_team', 'days_rest_away_team',
             'date', 'result',
             'home_prob', 'draw_prob',
             'away_prob', 'round_number']

feature_df = master_df[feat_cols]

In [62]:
train_test_data = feature_df.loc[feature_df['round_number'] != round_number].dropna()

predict_data = master_df.loc[master_df['round_number'] == round_number]
predict_data = predict_data[['home_team', 'away_team',
                             f'avg_home_goals_last_{window_len}', f'avg_away_goals_last_{window_len}',
                             f'home_ppg_last_{window_len}', f'away_ppg_last_{window_len}',
                             'home_elo_before', 'away_elo_before',
                             'days_rest_home_team', 'days_rest_away_team',
                             'home_prob', 'draw_prob', 
                             'away_prob']].reset_index(drop=True)

In [63]:
# Time based split for improved accuracy
split_date = pd.to_datetime('31-07-2024')

x_train = train_test_data[train_test_data['date'] < split_date].drop(['date', 'result', 'round_number'], axis=1).reset_index(drop=True)
y_train = train_test_data[train_test_data['date'] < split_date].reset_index(drop=True)
x_test  = train_test_data[train_test_data['date'] >= split_date].drop(['date', 'result', 'round_number'], axis=1).reset_index(drop=True)
y_test  = train_test_data[train_test_data['date'] >= split_date].reset_index(drop=True)

y_train = y_train[['result']]
y_test = y_test[['result']]

mapping = {'H': 2, 'D': 1, 'A': 0}

y_train['result'] = y_train['result'].map(mapping)
y_test['result'] = y_test['result'].map(mapping)

## 4 - Train Dataset

### Hyperparameter tuning

In [19]:
model_choice = 'xgb'

if model_choice == 'xgb':
    
    model = XGBClassifier(eval_metric='logloss', random_state=42)
    
    param_grid = {'model__n_estimators': [35, 40, 45],      
                  'model__max_depth': [2, 3, 4],              
                  'model__learning_rate': [0.05, 0.1, 0.15], 
                  'model__subsample': [0.15, 0.2, 0.25],    
                  'model__colsample_bytree': [0.8, 1, 1.2], 
                  'model__gamma': [6.25, 6.5, 6.75],
                  'model__reg_alpha': [0, 0.1, 0.5],
                  'model__reg_lambda': [1, 1.5, 2]}

elif model_choice == 'rf':
    
    model = RandomForestClassifier(random_state=42)

    param_grid = {'model__n_estimators': [100, 200, 300],
                  'model__max_depth': [4, 6, 8],
                  'model__min_samples_split': [2, 5, 10],
                  'model__min_samples_leaf': [1, 2, 4], 
                  'model__max_features': ['sqrt', 'log2', None]}


numeric_features = ['avg_home_goals_last_3', 'avg_away_goals_last_3', 
                    'home_ppg_last_3', 'away_ppg_last_3', 
                    'home_elo_before', 'away_elo_before', 
                    'days_rest_home_team', 'days_rest_away_team', 
                    'home_prob', 'draw_prob', 'away_prob']

categorical_features = ['home_team', 'away_team']

pipe = Pipeline([
    ('preprocessor', ColumnTransformer([
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_features)
    ])),
    ('model', model)
])

grid_search = GridSearchCV(estimator=pipe,
                           param_grid=param_grid,
                           cv=TimeSeriesSplit(n_splits=5),                 
                           scoring='accuracy',                 
                           verbose=1,
                           n_jobs=-1)

# Fit
grid_result = grid_search.fit(x_train, y_train)

# Best parameters
print("\nBest params:", grid_result.best_params_)
print("\nBest score:", grid_result.best_score_)

Fitting 5 folds for each of 6561 candidates, totalling 32805 fits

Best params: {'model__colsample_bytree': 1, 'model__gamma': 6.5, 'model__learning_rate': 0.05, 'model__max_depth': 2, 'model__n_estimators': 40, 'model__reg_alpha': 0, 'model__reg_lambda': 1, 'model__subsample': 0.2}

Best score: 0.558316633266533


### Fit model using tuned parameters

In [20]:
print("RF Log Loss:", round(log_loss(y_test, grid_search.best_estimator_.predict_proba(x_test)),4))
print("\nRF Accuracy:", round(accuracy_score(y_test, grid_search.best_estimator_.predict(x_test)),4))

labels = ['Win', 'Draw', 'Loss']

cm_df = pd.DataFrame(confusion_matrix(y_test, grid_search.best_estimator_.predict(x_test)), index=labels, columns=labels)
print('\n', cm_df)

RF Log Loss: 0.9925

RF Accuracy: 0.5431

       Win  Draw  Loss
Win    82     0    60
Draw   37     0    65
Loss   29     0   145


### Diagnosis - Feature importance

In [21]:
importances = grid_search.best_estimator_.named_steps['model'].feature_importances_

feature_names = grid_search.best_estimator_.named_steps['preprocessor'].get_feature_names_out()

feat_importances = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False).reset_index(drop=True)

feat_importances.head(10)

Unnamed: 0,feature,importance
0,num__away_prob,0.209885
1,num__home_prob,0.186547
2,num__home_elo_before,0.114487
3,num__draw_prob,0.093517
4,num__home_ppg_last_3,0.08885
5,num__away_elo_before,0.082992
6,cat__away_team_Burnley,0.078927
7,cat__away_team_Everton,0.072811
8,cat__home_team_Brighton,0.071984
9,num__avg_home_goals_last_3,0.0


## 7 - Predict this weeks results

### Add this week's odds

In [None]:
#Due to lack of free downloadable odds adding in manually. Odds are fractional
predict_data.loc[0, ['home_odds', 'draw_odds', 'away_odds']] = [10/21, 39/10, 6/1]
predict_data.loc[1, ['home_odds', 'draw_odds', 'away_odds']] = [6/4, 13/5, 112/100]
predict_data.loc[2, ['home_odds', 'draw_odds', 'away_odds']] = [63/50, 27/10, 205/100]
predict_data.loc[3, ['home_odds', 'draw_odds', 'away_odds']] = [49/20, 49/20, 6/5]
predict_data.loc[4, ['home_odds', 'draw_odds', 'away_odds']] = [41/20, 5/2, 29/20]
predict_data.loc[5, ['home_odds', 'draw_odds', 'away_odds']] = [17/10, 11/5, 182/100]
predict_data.loc[6, ['home_odds', 'draw_odds', 'away_odds']] = [37/20, 14/5, 148/100]
predict_data.loc[7, ['home_odds', 'draw_odds', 'away_odds']] = [21/20, 5/2, 3/1]
predict_data.loc[8, ['home_odds', 'draw_odds', 'away_odds']] = [13/5, 5/2, 3/1]
predict_data.loc[9, ['home_odds', 'draw_odds', 'away_odds']] = [19/20, 14/5, 3/1]

predict_data['home_prob'] = 1/(predict_data['home_odds']+1)
predict_data['draw_prob'] = 1/(predict_data['draw_odds']+1)
predict_data['away_prob'] = 1/(predict_data['away_odds']+1)
predict_data.drop(['home_odds', 'draw_odds', 'away_odds'], axis=1, inplace=True)

In [None]:
fixtures_this_week = fixtures.loc[fixtures['round_number']==round_number]
fixtures_this_week = fixtures_this_week[['home_team', 'away_team']].reset_index(drop=True)
fixtures_this_week['prediction'] = rf.predict(predict_data)

In [None]:
probabilities = pd.DataFrame(rf.predict_proba(predict_data), columns=['away_prob', 'draw_prob', 'home_prob'])
probabilities = probabilities.round(2)
fixtures_this_week = pd.concat([fixtures_this_week, probabilities], axis=1)
fixtures_this_week = fixtures_this_week[['home_team', 'away_team', 'prediction', 'home_prob', 'draw_prob', 'away_prob']]

In [None]:
fixtures_this_week

### Training a Neural Network

In [66]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

encoded = encoder.fit_transform(train_test_data[['home_team', 'away_team']])
feature_names = encoder.get_feature_names_out(['home_team', 'away_team'])
encoded_df = pd.DataFrame(encoded, columns=feature_names)

train_test_data = pd.concat([train_test_data.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1).drop(columns=['home_team', 'away_team'])

In [67]:
# Time based split for improved accuracy
split_date = pd.to_datetime('31-07-2024')

x_train = train_test_data[train_test_data['date'] < split_date].drop(['date', 'result', 'round_number'], axis=1).reset_index(drop=True)
y_train = train_test_data[train_test_data['date'] < split_date].reset_index(drop=True)
x_test  = train_test_data[train_test_data['date'] >= split_date].drop(['date', 'result', 'round_number'], axis=1).reset_index(drop=True)
y_test  = train_test_data[train_test_data['date'] >= split_date].reset_index(drop=True)

y_train = y_train[['result']]
y_test = y_test[['result']]

mapping = {'H': 2, 'D': 1, 'A': 0}

y_train['result'] = y_train['result'].map(mapping)
y_test['result'] = y_test['result'].map(mapping)

In [70]:
model = Sequential([
    Dense(128, activation='relu', input_shape=(x_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.25),

    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])

model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

history = model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    epochs=40,
    batch_size=32,
    verbose=1)

Epoch 1/40
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.4296 - loss: 1.2283 - val_accuracy: 0.5526 - val_loss: 1.0233
Epoch 2/40
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.4923 - loss: 1.0653 - val_accuracy: 0.5209 - val_loss: 1.0512
Epoch 3/40
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.5065 - loss: 1.0320 - val_accuracy: 0.5676 - val_loss: 0.9551
Epoch 4/40
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.5269 - loss: 1.0074 - val_accuracy: 0.5710 - val_loss: 0.9516
Epoch 5/40
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.5198 - loss: 1.0048 - val_accuracy: 0.5509 - val_loss: 0.9741
Epoch 6/40
[1m75/75[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.5374 - loss: 0.9962 - val_accuracy: 0.5693 - val_loss: 0.9555
Epoch 7/40
[1m75/75[0m [32m━━━━

In [71]:
test_loss, test_acc = model.evaluate(x_test, y_test)
print("Test accuracy:", test_acc)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5167 - loss: 0.9976
Test accuracy: 0.5167464017868042
