In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import log_loss, accuracy_score
import warnings

import helpers

warnings.filterwarnings("ignore")

In [2]:
round_number = 2

## 1- Import data

In [3]:
seasons = {
    '2025-26': './data/2025-26.csv',
    '2024-25': './data/2024-25.csv',
    '2023-24': './data/2023-24.csv',
    '2022-23': './data/2022-23.csv',
    '2021-22': './data/2021-22.csv',
    '2020-21': './data/2020-21.csv',
    '2019-20': './data/2019-20.csv',
    '2018-19': './data/2018-19.csv',
    '2017-18': './data/2017-18.csv',
    '2016-17': './data/2016-17.csv'
}

master_df = helpers.build_master_df(seasons)

Loading 2025-26...
Loading 2024-25...
Loading 2023-24...
Loading 2022-23...
Loading 2021-22...
Loading 2020-21...
Loading 2019-20...
Loading 2018-19...
Loading 2017-18...
Loading 2016-17...
Saved master dataset to ./data/all_seasons_data.csv (Total matches: 3430).


### Adding fixtures for 2025/26

In [4]:
fixtures = pd.read_csv('./data/fixtures.csv')

In [5]:
fixtures['Date'] = pd.to_datetime(fixtures['Date'])

In [6]:
team_map = {'Spurs': 'Tottenham',
            'Man Utd': 'Man United'}

fixture_column_map = {'Date': 'date',
                      'Home Team': 'home_team',
                      'Away Team': 'away_team',
                      'Round Number': 'round_number'}

fixtures['Home Team'] = fixtures['Home Team'].map(lambda x: team_map.get(x, x))
fixtures['Away Team'] = fixtures['Away Team'].map(lambda x: team_map.get(x, x))

fixtures = fixtures.rename(columns=fixture_column_map)
fixtures = fixtures[['date', 'home_team', 'away_team', 'round_number']]
fixtures = fixtures.loc[fixtures['round_number']==2]

In [7]:
master_column_map = {'Date': 'date',
                     'HomeTeam': 'home_team',
                     'AwayTeam': 'away_team',
                     'FTHG': 'home_goals',
                     'FTAG': 'away_goals',
                     'FTR': 'result'}

In [8]:
master_df = master_df.rename(columns=master_column_map)
master_df = master_df[list(master_column_map.values())]
master_df = master_df.assign(round_number=0)

In [9]:
master_df = pd.concat([master_df, fixtures])

## 2 - Encoding Teams

In [10]:
team_encoder = LabelEncoder()
master_df['home_id'] = team_encoder.fit_transform(master_df['home_team'])
master_df['away_id'] = team_encoder.transform(master_df['away_team'])

## 3 - Adding Features

In [11]:
window_len = 5

# Home points and Away points
points_map_home = {"H": 3, "D": 1, "A": 0}
points_map_away = {"H": 0, "D": 1, "A": 3}
master_df["home_points"] = master_df["result"].map(points_map_home)
master_df["away_points"] = master_df["result"].map(points_map_away)

# Rolling average home and away goals
master_df[f'avg_home_goals_last_{window_len}'] = master_df.groupby('home_team')['home_goals'].transform(lambda x: x.rolling(window_len, min_periods=1).mean())
master_df[f'avg_away_goals_last_{window_len}'] = master_df.groupby('away_team')['away_goals'].transform(lambda x: x.rolling(window_len, min_periods=1).mean())

# Points per game last 5
master_df[f'home_ppg_last_{window_len}'] = master_df.groupby('home_team')['home_points'].transform(lambda x: x.rolling(window_len, min_periods=1).mean())
master_df[f'away_ppg_last_{window_len}'] = master_df.groupby('away_team')['away_points'].transform(lambda x: x.rolling(window_len, min_periods=1).mean())

### Elo rating

In [12]:
elo_ratings, elo_df = helpers.calculate_elo(master_df)
master_df = master_df.merge(elo_df[["date","home_team", "away_team", "home_elo_before", "away_elo_before", "home_elo_after", "away_elo_after"]], on=["date", "home_team", "away_team"])

In [13]:
master_df['date'] = pd.to_datetime(master_df['date'])

In [14]:
master_df.dtypes

date                     datetime64[ns]
home_team                        object
away_team                        object
home_goals                      float64
away_goals                      float64
result                           object
round_number                      int64
home_id                           int64
away_id                           int64
home_points                     float64
away_points                     float64
avg_home_goals_last_5           float64
avg_away_goals_last_5           float64
home_ppg_last_5                 float64
away_ppg_last_5                 float64
home_elo_before                 float64
away_elo_before                 float64
home_elo_after                  float64
away_elo_after                  float64
dtype: object

In [15]:
master_df["days_rested_home"] = master_df.groupby("home_team")["date"].diff().dt.days
master_df["days_rested_away"] = master_df.groupby("away_team")["date"].diff().dt.days

## 5- Prepare Training Dataset

In [16]:
feat_cols = ['home_id', 'away_id',
             'avg_home_goals_last_5', 'avg_away_goals_last_5',
             'home_ppg_last_5', 'away_ppg_last_5',
             'home_elo_before', 'away_elo_before',
             'days_rested_home', 'days_rested_away',
             'date', 'result',
             'round_number']

feature_df = master_df[feat_cols]

In [17]:
train_test_data = feature_df.loc[feature_df['round_number']!=round_number].dropna()
predict_data = master_df.loc[master_df['round_number']==round_number]
predict_data = predict_data[['home_id', 'away_id', 
                             'avg_home_goals_last_5', 'avg_away_goals_last_5',
                             'home_ppg_last_5', 'away_ppg_last_5',
                             'home_elo_before', 'away_elo_before',
                             'days_rested_home', 'days_rested_away']]

In [18]:
# Time based split for improved accuracy

split_date = train_test_data['date'].quantile(0.8)
x_train = train_test_data[train_test_data['date'] <= split_date].drop(['date', 'result', 'round_number'], axis=1)
y_train = train_test_data[train_test_data['date'] <= split_date]
x_test  = train_test_data[train_test_data['date'] > split_date].drop(['date', 'result', 'round_number'], axis=1)
y_test  = train_test_data[train_test_data['date'] > split_date]

y_train = y_train[['result']]
y_test = y_test[['result']]

In [19]:
predict_data

Unnamed: 0,home_id,away_id,avg_home_goals_last_5,avg_away_goals_last_5,home_ppg_last_5,away_ppg_last_5,home_elo_before,away_elo_before,days_rested_home,days_rested_away
3430,32,7,1.25,0.75,0.75,1.75,1506.292347,1662.061672,96.0,89.0
3431,18,29,2.75,1.0,3.0,0.25,1774.411638,1510.014208,95.0,99.0
3432,2,33,1.0,1.25,1.75,1.5,1558.465776,1501.811371,90.0,95.0
3433,3,1,2.5,1.25,1.75,1.5,1568.004073,1656.307663,97.0,90.0
3434,5,27,1.0,0.75,0.5,0.75,1402.168882,1380.815222,461.0,3016.0
3435,0,14,1.25,1.0,1.25,0.0,1762.129492,1450.061681,97.0,825.0
3436,8,23,1.75,1.5,2.0,1.75,1603.291968,1583.2931,96.0,98.0
3437,9,4,1.25,2.25,1.25,1.5,1558.156589,1610.190165,98.0,91.0
3438,10,19,1.25,1.25,0.75,0.25,1559.161599,1540.052219,91.0,100.0
3439,21,16,2.5,1.5,2.25,0.75,1646.311291,1774.053919,92.0,98.0


## 6- Train Dataset

In [20]:
rf = RandomForestClassifier(n_estimators=300, max_depth=None, random_state=42)
rf.fit(x_train, y_train)

y_pred_proba_rf = rf.predict_proba(x_test)
print("RF Log Loss:", log_loss(y_test, y_pred_proba_rf))
print("RF Accuracy:", accuracy_score(y_test, rf.predict(x_test)))

RF Log Loss: 0.7716075873060149
RF Accuracy: 0.654357459379616


## 7 - Predict this weeks results

In [21]:
fixtures_this_week = fixtures.loc[fixtures['round_number']==2]
fixtures_this_week = fixtures_this_week[['home_team', 'away_team']].reset_index(drop=True)
fixtures_this_week['prediction'] = rf.predict(predict_data)

In [22]:
fixtures_this_week

Unnamed: 0,home_team,away_team,prediction
0,West Ham,Chelsea,A
1,Man City,Tottenham,H
2,Bournemouth,Wolves,H
3,Brentford,Aston Villa,A
4,Burnley,Sunderland,A
5,Arsenal,Leeds,H
6,Crystal Palace,Nott'm Forest,D
7,Everton,Brighton,A
8,Fulham,Man United,D
9,Newcastle,Liverpool,H
