In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import log_loss, accuracy_score
import warnings

import sys
sys.path.append("..")

import utils

warnings.filterwarnings("ignore")

In [2]:
# Round (gameweek) to predict. Later cells use it to pick this round's fixtures
# and to separate the rows to predict from the rows used for training/testing.
round_number = 3

## 1- Import data

In [3]:
# Season labels, newest first; each season's CSV lives under ../data and is named after its label.
season_labels = [
    '2025-26', '2024-25', '2023-24', '2022-23', '2021-22',
    '2020-21', '2019-20', '2018-19', '2017-18', '2016-17',
]
seasons = {label: f'../data/{label}.csv' for label in season_labels}

# Combine all season files into a single frame (the helper also saves it to save_path).
master_df = utils.build_master_df(seasons, save_path="../data/all_seasons_data.csv")

Loading 2025-26...
Loading 2024-25...
Loading 2023-24...
Loading 2022-23...
Loading 2021-22...
Loading 2020-21...
Loading 2019-20...
Loading 2018-19...
Loading 2017-18...
Loading 2016-17...
Saved master dataset to ../data/all_seasons_data.csv (Total matches: 3440).


### Add betting market consensus probabilities

In [4]:
# Bookmaker column prefixes kept because >90% of their odds are populated. Odds are decimal.
odds = ['B365', 'BW', 'IW', 'VC', 'WH']

# Per-outcome column names: prefix + 'H' (home), 'D' (draw) or 'A' (away).
home_odds, draw_odds, away_odds = (
    [f'{bookmaker}{outcome}' for bookmaker in odds] for outcome in 'HDA'
)

In [5]:
# Impute missing odds row by row: a missing bookmaker price is replaced with the
# mean of the prices that are present for the same match and outcome.
def _fill_with_row_mean(row):
    """Return the row with its NaNs replaced by the row's own mean."""
    return row.fillna(row.mean())


for outcome_cols in (home_odds, draw_odds, away_odds):
    master_df[outcome_cols] = master_df[outcome_cols].apply(_fill_with_row_mean, axis=1)

In [6]:
# Market-consensus implied probability per outcome: the reciprocal of the mean
# decimal odds across bookmakers. No normalisation across outcomes is applied here.
for side, outcome_cols in (('home', home_odds), ('draw', draw_odds), ('away', away_odds)):
    consensus_odds = master_df[outcome_cols].mean(axis=1)
    master_df[f'{side}_prob'] = 1 / consensus_odds

### Adding fixtures for 2025/26

In [7]:
# Load the fixture list. Later cells read its 'Date', 'Home Team', 'Away Team'
# and 'Round Number' columns.
fixtures = pd.read_csv('../data/fixtures.csv')

In [8]:
# Parse the fixture dates and tag every fixture with the current season label.
fixtures = fixtures.assign(
    Date=pd.to_datetime(fixtures['Date']),
    season='2025-26',
)

In [9]:
# Team names in the fixture list that differ from the names used in the results data.
team_map = {'Spurs': 'Tottenham',
            'Man Utd': 'Man United'}

# Fixture-file column names -> the snake_case names used in master_df.
fixture_column_map = {'Date': 'date',
                      'Home Team': 'home_team',
                      'Away Team': 'away_team',
                      'Round Number': 'round_number'}

# Rename first so the cell can be re-run safely (rename ignores already-renamed columns).
fixtures = fixtures.rename(columns=fixture_column_map)
fixtures = fixtures.assign(
    home_team=fixtures['home_team'].replace(team_map),
    away_team=fixtures['away_team'].replace(team_map),
)

# Keep only the selected round. 'season' is retained so the fixture rows carry the
# season label assigned earlier instead of becoming NaN once appended to master_df.
fixtures = fixtures.loc[
    fixtures['round_number'] == round_number,
    ['date', 'home_team', 'away_team', 'round_number', 'season'],
].copy()

In [10]:
# Raw results-file columns that get a new snake_case name.
source_renames = {
    'Date': 'date',
    'HomeTeam': 'home_team',
    'AwayTeam': 'away_team',
    'FTHG': 'home_goals',
    'FTAG': 'away_goals',
    'FTR': 'result',
}
# Columns that keep their name; listed so they survive the column selection that uses this map.
kept_as_is = ['home_prob', 'draw_prob', 'away_prob', 'season']

master_column_map = {**source_renames, **{name: name for name in kept_as_is}}

In [11]:
# Rename to snake_case, keep only the mapped columns, and mark every historical
# match with round_number 0 so it never matches the round being predicted.
master_df = (
    master_df
    .rename(columns=master_column_map)
    .loc[:, list(master_column_map.values())]
    .assign(round_number=0)
)

In [12]:
# Append this round's fixtures (no goals/result yet) below the historical matches.
# ignore_index gives the combined frame a fresh unique index; without it the fixture
# rows keep their original labels, which can duplicate labels already in master_df.
# NOTE: re-running this cell appends the fixtures again.
master_df = pd.concat([master_df, fixtures], ignore_index=True)

## 2 - Encoding Teams

In [13]:
# Fit the encoder on every team seen in either column. Fitting on home teams only
# makes transform() raise on a team that appears solely as an away side.
team_encoder = LabelEncoder()
team_encoder.fit(pd.concat([master_df['home_team'], master_df['away_team']]))
master_df['home_id'] = team_encoder.transform(master_df['home_team'])
master_df['away_id'] = team_encoder.transform(master_df['away_team'])

## 3 - Adding Features

### Rolling features

In [14]:
window_len = 3

# Points earned by each side for a given full-time result code.
points_map_home = {"H": 3, "D": 1, "A": 0}
points_map_away = {"H": 0, "D": 1, "A": 3}
master_df["home_points"] = master_df["result"].map(points_map_home)
master_df["away_points"] = master_df["result"].map(points_map_away)


def _prior_rolling_mean(values):
    """Mean over up to `window_len` preceding rows; shift() keeps the current row out."""
    return values.shift().rolling(window_len, min_periods=1).mean()


# (output column, grouping column, value column). Home features use a team's home
# matches only and away features its away matches only.
# NOTE(review): "previous" means previous in row order - confirm master_df is in
# chronological order at this point.
rolling_specs = [
    # Rolling average home and away goals
    (f'avg_home_goals_last_{window_len}', 'home_team', 'home_goals'),
    (f'avg_away_goals_last_{window_len}', 'away_team', 'away_goals'),
    # Rolling average home and away points per game
    (f'home_ppg_last_{window_len}', 'home_team', 'home_points'),
    (f'away_ppg_last_{window_len}', 'away_team', 'away_points'),
]
for feature_name, team_col, value_col in rolling_specs:
    master_df[feature_name] = master_df.groupby(team_col)[value_col].transform(_prior_rolling_mean)

### Elo rating

In [15]:
elo_ratings, elo_df = utils.calculate_elo(master_df)

# Attach the per-match Elo columns, matching rows on date and the two teams.
match_keys = ["date", "home_team", "away_team"]
elo_cols = ["home_elo_before", "away_elo_before", "home_elo_after", "away_elo_after"]
master_df = master_df.merge(elo_df[match_keys + elo_cols], on=match_keys)

In [16]:
# Make sure 'date' is a datetime column; the train/test split later takes a quantile of it.
# NOTE(review): this runs after the Elo merge, which joins on 'date' - confirm both
# sides of that merge already share a dtype.
master_df['date'] = pd.to_datetime(master_df['date'])

### Add rest days

In [17]:
# Delegates to utils.add_rest_days. Presumably this adds the 'days_rest_home_team' and
# 'days_rest_away_team' columns selected as features below - confirm in utils.
master_df = utils.add_rest_days(master_df)

## 5- Prepare Training Dataset

In [18]:
# Rolling-form columns created earlier; their names depend on window_len.
rolling_feats = [
    f'avg_home_goals_last_{window_len}', f'avg_away_goals_last_{window_len}',
    f'home_ppg_last_{window_len}', f'away_ppg_last_{window_len}',
]

# Model inputs plus the bookkeeping columns (date, result, round_number) needed to
# split the data and supply the target.
feat_cols = [
    'home_id', 'away_id',
    *rolling_feats,
    'home_elo_before', 'away_elo_before',
    'days_rest_home_team', 'days_rest_away_team',
    'date', 'result',
    'home_prob', 'draw_prob',
    'away_prob', 'round_number',
]

feature_df = master_df.loc[:, feat_cols]

In [19]:
# Columns in feat_cols that are bookkeeping rather than model inputs.
non_feature_cols = ['date', 'result', 'round_number']
# Derive the model inputs from feat_cols so the prediction frame always has the same
# columns, in the same order, as the training features (no second hand-copied list).
model_inputs = [col for col in feat_cols if col not in non_feature_cols]

# Rows with a known outcome and complete features: every round except the one being predicted.
train_test_data = feature_df.loc[feature_df['round_number'] != round_number].dropna()

# Rows to predict: this round's fixtures, model inputs only.
is_target_round = master_df['round_number'] == round_number
predict_data = master_df.loc[is_target_round, model_inputs].reset_index(drop=True)

In [20]:
# Chronological hold-out: matches up to the 80th-percentile date train the model,
# later matches are held back for testing.
split_date = train_test_data['date'].quantile(0.8)
label_and_meta = ['date', 'result', 'round_number']

train_rows = train_test_data.loc[train_test_data['date'] <= split_date].reset_index(drop=True)
test_rows = train_test_data.loc[train_test_data['date'] > split_date].reset_index(drop=True)

x_train = train_rows.drop(columns=label_and_meta)
x_test = test_rows.drop(columns=label_and_meta)

# Targets are kept as single-column frames.
y_train = train_rows[['result']]
y_test = test_rows[['result']]

## 6- Train Model

In [21]:
rf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
# Flatten the single-column target to 1-D, the shape scikit-learn expects for a
# single-output classifier (a column vector triggers a conversion warning).
rf.fit(x_train, np.ravel(y_train))

y_test_labels = np.ravel(y_test)
y_pred_proba_rf = rf.predict_proba(x_test)
# Pass the classifier's own class order so the probability columns are matched to the
# right labels even if the test period happens to lack one of the outcomes.
print("RF Log Loss:", log_loss(y_test_labels, y_pred_proba_rf, labels=rf.classes_))
print("RF Accuracy:", accuracy_score(y_test_labels, rf.predict(x_test)))

RF Log Loss: 0.9841340338112698
RF Accuracy: 0.551622418879056


## 7 - Predict this week's results

### Add this week's odds

In [22]:
# Due to lack of free downloadable odds, these are entered manually. Odds are fractional.
# One (home, draw, away) tuple per row of predict_data, in predict_data's row order.
fractional_odds = [
    (8/15, 18/5, 11/2),
    (3/8, 17/4, 8/1),
    (2/1, 23/10, 154/100),
    (8/11, 10/3, 7/2),
    (164/100, 9/4, 19/10),
    (5/2, 47/18, 23/20),
    (16/5, 3/1, 9/10),
    (8/11, 35/12, 9/2),
    (6/5, 13/5, 40/17),
    (19/20, 13/5, 16/5),
]

# Fail loudly on a mismatch rather than silently adding rows or leaving fixtures without odds.
if len(fractional_odds) != len(predict_data):
    raise ValueError(
        f"Expected odds for {len(predict_data)} fixtures, got {len(fractional_odds)}"
    )

odds_this_week = pd.DataFrame(
    fractional_odds,
    columns=['home_odds', 'draw_odds', 'away_odds'],
    index=predict_data.index,
)

# Fractional odds f correspond to decimal odds f + 1, so the implied probability is
# 1 / (f + 1). The existing *_prob columns are overwritten in place (same position).
predict_data = predict_data.assign(
    home_prob=1 / (odds_this_week['home_odds'] + 1),
    draw_prob=1 / (odds_this_week['draw_odds'] + 1),
    away_prob=1 / (odds_this_week['away_odds'] + 1),
)

In [23]:
# Take the team labels from the same master_df rows (same filter, same order) that
# produced predict_data, so each prediction is attached to the fixture it was computed
# for, even if earlier merges or helpers reordered rows relative to the fixtures file.
is_target_round = master_df['round_number'] == round_number
fixtures_this_week = master_df.loc[is_target_round, ['home_team', 'away_team']].reset_index(drop=True)

if len(fixtures_this_week) != len(predict_data):
    raise ValueError(
        f"{len(fixtures_this_week)} fixtures but {len(predict_data)} prediction rows"
    )

fixtures_this_week['prediction'] = rf.predict(predict_data)

In [24]:
# Display each fixture of the selected round alongside its predicted result label.
fixtures_this_week

Unnamed: 0,home_team,away_team,prediction
0,Chelsea,Fulham,H
1,Man United,Burnley,H
2,Sunderland,Brentford,A
3,Tottenham,Bournemouth,H
4,Wolves,Everton,D
5,Leeds,Newcastle,A
6,Brighton,Man City,H
7,Nott'm Forest,West Ham,H
8,Liverpool,Arsenal,D
9,Aston Villa,Crystal Palace,H


## Rolling cross-validation

In [25]:
# Sketch (currently disabled): rolling cross-validation over the real, time-ordered match data
# from sklearn.model_selection import TimeSeriesSplit
# n_samples = 100
# x = pd.concat([x_train, x_test]).reset_index(drop=True)   # features
# y = pd.concat([y_train, y_test]).reset_index(drop=True)  # match result target (H/D/A)

# # Define rolling cross-validation
# tscv = TimeSeriesSplit(n_splits=5)  # 5 folds

# clf = RandomForestClassifier(random_state=42)

# accuracies = []

# for train_idx, test_idx in tscv.split(x):
#     # expanding window: train on everything before test, test on the next block
#     # (the splitter yields positional indices, so rows are selected with .iloc)
#     X_train, X_test = x.iloc[train_idx], x.iloc[test_idx]
#     y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

#     clf.fit(X_train, y_train)
#     y_pred = clf.predict(X_test)
    
#     acc = accuracy_score(y_test, y_pred)
#     accuracies.append(acc)
#     print(f"Train size: {len(train_idx)}, Test size: {len(test_idx)}, Accuracy: {acc:.3f}")