In [374]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import log_loss, accuracy_score
import warnings

import sys
sys.path.append("..")

import utils

warnings.filterwarnings("ignore")

In [375]:
round_number = 5

## 1- Import data

In [376]:
seasons = {
    '2025-26': '../data/2025-26.csv',
    '2024-25': '../data/2024-25.csv',
    '2023-24': '../data/2023-24.csv',
    '2022-23': '../data/2022-23.csv',
    '2021-22': '../data/2021-22.csv',
    '2020-21': '../data/2020-21.csv',
    '2019-20': '../data/2019-20.csv',
    '2018-19': '../data/2018-19.csv',
    '2017-18': '../data/2017-18.csv',
    '2016-17': '../data/2016-17.csv'
}

master_df = utils.build_master_df(seasons, save_path="../data/all_seasons_data.csv")

Loading 2025-26...
Loading 2024-25...
Loading 2023-24...
Loading 2022-23...
Loading 2021-22...
Loading 2020-21...
Loading 2019-20...
Loading 2018-19...
Loading 2017-18...
Loading 2016-17...
Saved master dataset to ../data/all_seasons_data.csv (Total matches: 3460).


### Add betting market consensus probabilities

In [377]:
#Keep odds where >90% are populated. Odds are decimal.
odds = ['B365', 'BW', 'IW', 'VC', 'WH']

home_odds = [i + 'H' for i in odds]
draw_odds = [i + 'D' for i in odds]
away_odds = [i + 'A' for i in odds]

In [378]:
#Imput NA with mean across rows
master_df[home_odds] = master_df[home_odds].apply(lambda row: row.fillna(row.mean()), axis=1)
master_df[draw_odds] = master_df[draw_odds].apply(lambda row: row.fillna(row.mean()), axis=1)
master_df[away_odds] = master_df[away_odds].apply(lambda row: row.fillna(row.mean()), axis=1)

In [379]:
master_df['home_prob'] = 1/master_df[home_odds].mean(axis=1)
master_df['draw_prob'] = 1/master_df[draw_odds].mean(axis=1)
master_df['away_prob'] = 1/master_df[away_odds].mean(axis=1)

### Adding fixtures for 2025/26

In [380]:
fixtures = pd.read_csv('../data/fixtures.csv')

In [381]:
fixtures['Date'] = pd.to_datetime(fixtures['Date'])
fixtures['season'] = '2025-26'

In [382]:
team_map = {'Spurs': 'Tottenham',
            'Man Utd': 'Man United'}

fixture_column_map = {'Date': 'date',
                      'Home Team': 'home_team',
                      'Away Team': 'away_team',
                      'Round Number': 'round_number'}

fixtures['Home Team'] = fixtures['Home Team'].map(lambda x: team_map.get(x, x))
fixtures['Away Team'] = fixtures['Away Team'].map(lambda x: team_map.get(x, x))

fixtures = fixtures.rename(columns=fixture_column_map)
fixtures = fixtures[['date', 'home_team', 'away_team', 'round_number']]
fixtures = fixtures.loc[fixtures['round_number']==round_number]

In [383]:
master_column_map = {'Date': 'date',
                     'HomeTeam': 'home_team',
                     'AwayTeam': 'away_team',
                     'FTHG': 'home_goals',
                     'FTAG': 'away_goals',
                     'FTR': 'result',
                     'home_prob': 'home_prob',
                     'draw_prob': 'draw_prob',
                     'away_prob': 'away_prob',
                     'season': 'season'}

In [384]:
master_df = master_df.rename(columns=master_column_map)
master_df = master_df[list(master_column_map.values())]
master_df = master_df.assign(round_number=0)

In [385]:
master_df = pd.concat([master_df, fixtures])

## 2 - Encoding Teams

In [386]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

encoded = encoder.fit_transform(master_df[['home_team', 'away_team']])

feature_names = encoder.get_feature_names_out(['home_team', 'away_team'])

encoded_df = pd.DataFrame(encoded, columns=feature_names)

master_df = pd.concat([master_df.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

## 3 - Adding Features

### Rolling features

In [387]:
window_len = 3

# Home points and Away points
points_map_home = {"H": 3, "D": 1, "A": 0}
points_map_away = {"H": 0, "D": 1, "A": 3}
master_df["home_points"] = master_df["result"].map(points_map_home)
master_df["away_points"] = master_df["result"].map(points_map_away)

# Rolling average home and away goals
master_df[f'avg_home_goals_last_{window_len}'] = master_df.groupby('home_team')['home_goals'].transform(lambda x: x.shift().rolling(window_len, min_periods=1).mean())
master_df[f'avg_away_goals_last_{window_len}'] = master_df.groupby('away_team')['away_goals'].transform(lambda x: x.shift().rolling(window_len, min_periods=1).mean())

# olling average home and away points per game
master_df[f'home_ppg_last_{window_len}'] = master_df.groupby('home_team')['home_points'].transform(lambda x: x.shift().rolling(window_len, min_periods=1).mean())
master_df[f'away_ppg_last_{window_len}'] = master_df.groupby('away_team')['away_points'].transform(lambda x: x.shift().rolling(window_len, min_periods=1).mean())

### Elo rating

In [388]:
elo_ratings, elo_df = utils.calculate_elo(master_df)
master_df = master_df.merge(elo_df[["date", "home_team", "away_team", "home_elo_before", "away_elo_before", "home_elo_after", "away_elo_after"]], on=["date", "home_team", "away_team"])

In [389]:
master_df['date'] = pd.to_datetime(master_df['date'])

### Add rest days

In [390]:
master_df = utils.add_rest_days(master_df)

## 5- Prepare Training Dataset

In [391]:
team_list = master_df['home_team'].unique().tolist()
home_team_cols = ['home_team_' + team for team in team_list]
away_team_cols = ['away_team_' + team for team in team_list]

In [392]:
feat_cols = [f'avg_home_goals_last_{window_len}', f'avg_away_goals_last_{window_len}',
             f'home_ppg_last_{window_len}', f'away_ppg_last_{window_len}',
             'home_elo_before', 'away_elo_before',
             'days_rest_home_team', 'days_rest_away_team',
             'date', 'result',
             'home_prob', 'draw_prob',
             'away_prob', 'round_number'] + home_team_cols + away_team_cols

feature_df = master_df[feat_cols]

In [393]:
train_test_data = feature_df.loc[feature_df['round_number'] != round_number].dropna()

predict_data = master_df.loc[master_df['round_number'] == round_number]
predict_data = predict_data[[f'avg_home_goals_last_{window_len}', f'avg_away_goals_last_{window_len}',
                             f'home_ppg_last_{window_len}', f'away_ppg_last_{window_len}',
                             'home_elo_before', 'away_elo_before',
                             'days_rest_home_team', 'days_rest_away_team',
                             'home_prob', 'draw_prob', 
                             'away_prob'] + home_team_cols + away_team_cols].reset_index(drop=True)

In [394]:
# Time based split for improved accuracy
split_date = pd.to_datetime('31-07-2024')

x_train = train_test_data[train_test_data['date'] < split_date].drop(['date', 'result', 'round_number'], axis=1).reset_index(drop=True)
y_train = train_test_data[train_test_data['date'] < split_date].reset_index(drop=True)
x_test  = train_test_data[train_test_data['date'] >= split_date].drop(['date', 'result', 'round_number'], axis=1).reset_index(drop=True)
y_test  = train_test_data[train_test_data['date'] >= split_date].reset_index(drop=True)

y_train = y_train[['result']]
y_test = y_test[['result']]

## 6- Train Dataset

### Hyperparameter tuning

In [395]:
param_grid = {'n_estimators': [100, 200, 300],  
              'max_depth': [None, 10, 20, 30],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 5],           
              'max_features': ['sqrt', 'log2']}        

# rf = RandomForestClassifier(random_state=42, class_weight='balanced')
# tscv = TimeSeriesSplit(n_splits=10)

# grid_search = GridSearchCV(estimator=rf,
#                            param_grid=param_grid,
#                            cv=tscv,                 
#                            scoring='accuracy',    
#                            n_jobs=1,             
#                            verbose=2)

# Fit
#grid_search.fit(x_train, y_train)

In [396]:
# Best parameters
print("Best params:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best params: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
Best score: 0.543014705882353


### Fit model using tuned parameters

In [397]:
best_params = grid_search.best_params_ 

#Best params: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}

rf = RandomForestClassifier(max_depth=30,
                            max_features='sqrt',
                            min_samples_leaf=2,
                            min_samples_split=5,
                            n_estimators=300,
                            random_state=42)

rf.fit(x_train, y_train)

print("RF Log Loss:", round(log_loss(y_test, rf.predict_proba(x_test)),4))
print("RF Accuracy:", round(accuracy_score(y_test, rf.predict(x_test)),4))

RF Log Loss: 0.974
RF Accuracy: 0.5407


### Diagnosis - Feature importance

In [398]:
importances = rf.feature_importances_

feature_names = x_train.columns

feat_imp = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values(by="Importance", ascending=False)

print(feat_imp.head(11))

                  Feature  Importance
8               home_prob    0.132040
10              away_prob    0.131674
4         home_elo_before    0.094164
5         away_elo_before    0.092257
9               draw_prob    0.087903
0   avg_home_goals_last_3    0.049742
2         home_ppg_last_3    0.044452
1   avg_away_goals_last_3    0.044301
6     days_rest_home_team    0.042024
3         away_ppg_last_3    0.041271
7     days_rest_away_team    0.040406


## 7 - Predict this weeks results

### Add this week's odds

In [444]:
#Due to lack of free downloadable odds adding in manually. Odds are fractional
predict_data.loc[0, ['home_odds', 'draw_odds', 'away_odds']] = [10/21, 39/10, 6/1]
predict_data.loc[1, ['home_odds', 'draw_odds', 'away_odds']] = [6/4, 13/5, 112/100]
predict_data.loc[2, ['home_odds', 'draw_odds', 'away_odds']] = [63/50, 27/10, 205/100]
predict_data.loc[3, ['home_odds', 'draw_odds', 'away_odds']] = [49/20, 49/20, 6/5]
predict_data.loc[4, ['home_odds', 'draw_odds', 'away_odds']] = [41/20, 5/2, 29/20]
predict_data.loc[5, ['home_odds', 'draw_odds', 'away_odds']] = [17/10, 11/5, 182/100]
predict_data.loc[6, ['home_odds', 'draw_odds', 'away_odds']] = [37/20, 14/5, 148/100]
predict_data.loc[7, ['home_odds', 'draw_odds', 'away_odds']] = [21/20, 5/2, 3/1]
predict_data.loc[8, ['home_odds', 'draw_odds', 'away_odds']] = [13/5, 5/2, 3/1]
predict_data.loc[9, ['home_odds', 'draw_odds', 'away_odds']] = [19/20, 14/5, 3/1]

predict_data['home_prob'] = 1/(predict_data['home_odds']+1)
predict_data['draw_prob'] = 1/(predict_data['draw_odds']+1)
predict_data['away_prob'] = 1/(predict_data['away_odds']+1)
predict_data.drop(['home_odds', 'draw_odds', 'away_odds'], axis=1, inplace=True)

In [445]:
fixtures_this_week = fixtures.loc[fixtures['round_number']==round_number]
fixtures_this_week = fixtures_this_week[['home_team', 'away_team']].reset_index(drop=True)
fixtures_this_week['prediction'] = rf.predict(predict_data)

In [446]:
probabilities = pd.DataFrame(rf.predict_proba(predict_data), columns=['away_prob', 'draw_prob', 'home_prob'])
probabilities = probabilities.round(2)
fixtures_this_week = pd.concat([fixtures_this_week, probabilities], axis=1)
fixtures_this_week = fixtures_this_week[['home_team', 'away_team', 'prediction', 'home_prob', 'draw_prob', 'away_prob']]

In [447]:
fixtures_this_week

Unnamed: 0,home_team,away_team,prediction,home_prob,draw_prob,away_prob
0,Liverpool,Everton,H,0.75,0.18,0.08
1,Bournemouth,Newcastle,H,0.37,0.29,0.35
2,Brighton,Tottenham,H,0.39,0.24,0.37
3,Burnley,Nott'm Forest,A,0.3,0.21,0.49
4,West Ham,Crystal Palace,A,0.33,0.31,0.36
5,Wolves,Leeds,D,0.34,0.39,0.26
6,Man United,Chelsea,D,0.32,0.34,0.33
7,Fulham,Brentford,H,0.53,0.19,0.29
8,Sunderland,Aston Villa,H,0.39,0.25,0.36
9,Arsenal,Man City,H,0.49,0.25,0.27
