In [73]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix
)

pd.set_option("display.max_columns", None)
RANDOM_STATE = 1

In [74]:
files = [
    "data/2025.csv",
    "data/2024.csv",
    "data/2023.csv",
    "data/2022.csv",
    "data/2021.csv",
    "data/2020.csv",
    "data/2019.csv",
    "data/2018.csv"
]

df_list = [pd.read_csv(f) for f in files]
matches_raw = pd.concat(df_list, ignore_index=True)
matches_raw.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC,AC,HY,AY,HR,AR,B365H,B365D,B365A,BFDH,BFDD,BFDA,BMGMH,BMGMD,BMGMA,BVH,BVD,BVA,BWH,BWD,BWA,CLH,CLD,CLA,LBH,LBD,LBA,PSH,PSD,PSA,MaxH,MaxD,MaxA,AvgH,AvgD,AvgA,BFEH,BFED,BFEA,B365>2.5,B365<2.5,P>2.5,P<2.5,Max>2.5,Max<2.5,Avg>2.5,Avg<2.5,BFE>2.5,BFE<2.5,AHh,B365AHH,B365AHA,PAHH,PAHA,MaxAHH,MaxAHA,AvgAHH,AvgAHA,BFEAHH,BFEAHA,B365CH,B365CD,B365CA,BFDCH,BFDCD,BFDCA,BMGMCH,BMGMCD,BMGMCA,BVCH,BVCD,BVCA,BWCH,BWCD,BWCA,CLCH,CLCD,CLCA,LBCH,LBCD,LBCA,PSCH,PSCD,PSCA,MaxCH,MaxCD,MaxCA,AvgCH,AvgCD,AvgCA,BFECH,BFECD,BFECA,B365C>2.5,B365C<2.5,PC>2.5,PC<2.5,MaxC>2.5,MaxC<2.5,AvgC>2.5,AvgC<2.5,BFEC>2.5,BFEC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA,BFECAHH,BFECAHA,BFH,BFD,BFA,WHH,WHD,WHA,1XBH,1XBD,1XBA,BFCH,BFCD,BFCA,WHCH,WHCD,WHCA,1XBCH,1XBCD,1XBCA,IWH,IWD,IWA,VCH,VCD,VCA,IWCH,IWCD,IWCA,VCCH,VCCD,VCCA,Bb1X2,BbMxH,BbAvH,BbMxD,BbAvD,BbMxA,BbAvA,BbOU,BbMx>2.5,BbAv>2.5,BbMx<2.5,BbAv<2.5,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA
0,E0,15/08/2025,20:00,Liverpool,Bournemouth,4,2,H,1,0,H,A Taylor,19,10,10,3,7,10,6,7,1,2,0,0,1.3,6.0,8.5,1.3,6.0,9.5,1.29,6.5,9.0,1.3,6.0,8.5,1.32,5.5,7.5,1.33,5.75,8.0,1.33,5.75,7.5,1.28,6.56,9.07,1.34,6.5,9.5,1.31,5.96,8.31,1.34,6.6,9.4,1.36,3.2,1.37,3.26,1.38,3.3,1.35,3.13,1.4,3.4,-1.5,1.83,2.03,1.9,2.03,1.83,2.06,1.78,1.99,1.9,2.08,1.29,6.25,9.0,1.3,6.0,9.5,1.3,6.25,9.0,1.29,6.0,9.0,1.31,5.75,8.0,1.3,6.0,8.0,1.3,5.75,8.0,1.29,6.55,9.75,1.31,6.6,9.5,1.29,6.02,8.68,1.32,6.8,10.0,1.36,3.2,1.41,2.95,1.4,3.2,1.36,3.05,1.41,3.4,-1.75,2.03,1.78,2.07,1.85,2.03,1.88,1.94,1.76,2.14,1.86,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,E0,16/08/2025,12:30,Aston Villa,Newcastle,0,0,D,0,0,D,C Pawson,3,16,3,3,13,11,3,6,1,1,1,0,2.25,3.5,2.9,2.25,3.75,3.1,2.38,3.55,2.95,2.3,3.5,3.0,2.3,3.6,2.87,2.3,3.6,2.87,2.3,3.6,2.87,2.24,3.72,3.13,2.38,3.75,3.1,2.3,3.56,2.94,2.4,3.75,3.1,1.62,2.3,1.65,2.33,1.65,2.33,1.61,2.25,1.72,2.36,-0.25,2.0,1.85,1.96,1.94,2.0,1.87,1.9,1.78,2.08,1.9,2.45,3.4,2.8,2.38,3.5,3.1,2.4,3.45,2.95,2.38,3.3,3.0,2.4,3.4,2.85,2.37,3.5,2.87,2.37,3.5,2.87,2.32,3.63,3.07,2.45,3.5,3.1,2.38,3.41,2.93,2.48,3.55,3.1,1.83,2.03,1.66,2.31,1.83,2.18,1.75,2.04,1.87,2.12,-0.25,2.05,1.8,2.02,1.89,2.06,1.8,1.95,1.74,2.14,1.86,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,E0,16/08/2025,15:00,Brighton,Fulham,1,1,D,0,0,D,S Barrott,10,7,4,2,16,15,4,3,3,3,0,0,1.91,3.6,4.0,1.95,3.75,4.0,1.92,3.65,4.0,1.91,3.6,3.9,1.9,3.6,3.9,1.91,3.6,3.9,1.91,3.6,3.9,1.94,3.73,3.98,1.95,3.75,4.0,1.92,3.62,3.91,1.99,3.85,4.2,1.73,2.1,1.76,2.15,1.74,2.15,1.71,2.08,1.78,2.22,-0.5,1.93,1.93,1.95,1.95,1.93,1.93,1.87,1.88,1.99,2.0,1.8,3.7,4.33,1.83,3.75,4.33,1.87,3.65,4.3,1.83,3.6,4.33,1.85,3.6,4.1,1.85,3.6,4.2,1.85,3.6,4.0,1.92,3.66,4.27,1.87,3.75,4.4,1.83,3.64,4.26,1.9,3.8,4.6,1.88,1.98,1.91,2.0,1.91,2.04,1.84,1.93,1.93,2.06,-0.5,1.83,2.03,1.93,2.0,1.84,2.03,1.8,1.96,1.91,2.08,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,E0,16/08/2025,15:00,Sunderland,West Ham,3,0,H,0,0,D,R Jones,10,12,5,4,8,10,5,7,0,1,0,0,3.25,3.4,2.25,3.2,3.5,2.3,3.25,3.5,2.25,3.2,3.3,2.3,3.1,3.4,2.25,3.1,3.4,2.25,3.1,3.4,2.25,3.27,3.43,2.29,3.25,3.5,2.3,3.17,3.38,2.26,3.35,3.5,2.34,1.93,1.93,1.98,1.91,1.94,1.93,1.88,1.88,2.02,1.96,0.25,1.9,1.95,1.93,1.98,1.9,1.95,1.84,1.81,1.97,2.01,3.4,3.2,2.25,3.4,3.3,2.3,3.55,3.1,2.3,3.4,3.13,2.25,3.3,3.25,2.25,3.3,3.25,2.25,3.25,3.2,2.25,3.55,3.24,2.29,3.55,3.3,2.31,3.4,3.17,2.25,3.6,3.4,2.32,2.1,1.73,2.24,1.7,2.25,1.75,2.11,1.69,2.2,1.8,0.25,1.95,1.9,1.97,1.95,1.95,1.94,1.86,1.78,2.02,1.97,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,E0,16/08/2025,15:00,Tottenham,Burnley,3,0,H,1,0,H,M Oliver,16,14,6,4,14,8,6,5,0,0,0,0,1.38,4.75,8.5,1.36,5.0,9.5,1.37,5.0,9.0,1.33,4.6,8.0,1.39,4.8,8.0,1.4,4.8,8.0,1.4,4.8,8.0,1.4,4.71,8.79,1.4,5.0,9.5,1.37,4.8,8.3,1.42,5.2,9.6,1.73,2.1,1.74,2.17,1.73,2.3,1.66,2.16,1.75,2.28,-1.25,1.88,1.98,1.89,2.01,1.88,1.98,1.75,1.95,1.93,2.03,1.55,3.9,6.25,1.57,4.0,6.5,1.55,4.1,6.5,1.53,3.9,5.75,1.55,4.1,6.0,1.55,4.0,6.0,1.55,4.0,6.0,1.56,4.2,6.7,1.58,4.1,6.5,1.55,4.01,6.16,1.61,4.3,6.8,1.93,1.93,1.91,2.0,1.93,1.93,1.88,1.88,1.99,2.0,-1.0,1.98,1.88,1.99,1.93,1.98,1.91,1.88,1.83,2.07,1.92,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [75]:
matches_raw.columns = [c.strip().lower() for c in matches_raw.columns]

if "date" in matches_raw.columns:
    matches_raw["date"] = pd.to_datetime(matches_raw["date"], dayfirst=True, errors="coerce")

matches_raw = matches_raw.dropna(subset=["date"])
matches_raw.sort_values("date", inplace=True)
matches_raw.reset_index(drop=True, inplace=True)

matches_raw[["date", "hometeam", "awayteam", "fthg", "ftag", "ftr"]].head()

Unnamed: 0,date,hometeam,awayteam,fthg,ftag,ftr
0,2018-08-10,Man United,Leicester,2,1,H
1,2018-08-11,Bournemouth,Cardiff,2,0,H
2,2018-08-11,Fulham,Crystal Palace,0,2,A
3,2018-08-11,Huddersfield,Chelsea,0,3,A
4,2018-08-11,Newcastle,Tottenham,1,2,A


In [76]:
matches_raw["match_id"] = np.arange(len(matches_raw))

In [77]:
matches_raw = matches_raw.dropna(subset=["b365h", "b365d", "b365a"])

In [78]:
import numpy as np

def make_team_rows(df):
    result_map_home = {"H": "W", "D": "D", "A": "L"}
    result_map_away = {"H": "L", "D": "D", "A": "W"}

    home = pd.DataFrame({
        "date": df["date"],
        "team": df["hometeam"],
        "opponent": df["awayteam"],
        "goals_for": df["fthg"],
        "goals_against": df["ftag"],
        "venue": "H",
        "ftr": df["ftr"],          # original result code
        "b365h": df["b365h"],
        "b365d": df["b365d"],
        "b365a": df["b365a"],
    })
    home["result"] = df["ftr"].map(result_map_home)

    away = pd.DataFrame({
        "date": df["date"],
        "team": df["awayteam"],
        "opponent": df["hometeam"],
        "goals_for": df["ftag"],
        "goals_against": df["fthg"],
        "venue": "A",
        "ftr": df["ftr"],
        "b365h": df["b365h"],
        "b365d": df["b365d"],
        "b365a": df["b365a"],
    })
    away["result"] = df["ftr"].map(result_map_away)

    return pd.concat([home, away], ignore_index=True)

matches = make_team_rows(matches_raw)
matches.head()

Unnamed: 0,date,team,opponent,goals_for,goals_against,venue,ftr,b365h,b365d,b365a,result
0,2018-08-10,Man United,Leicester,2,1,H,H,1.57,3.9,7.5,W
1,2018-08-11,Bournemouth,Cardiff,2,0,H,H,1.9,3.6,4.5,W
2,2018-08-11,Fulham,Crystal Palace,0,2,H,A,2.5,3.4,3.0,L
3,2018-08-11,Huddersfield,Chelsea,0,3,H,A,6.5,4.0,1.61,L
4,2018-08-11,Newcastle,Tottenham,1,2,H,A,3.9,3.5,2.04,L


In [80]:
K = 30

team_elos = {}

def expected_score(elo_a, elo_b):
    return 1 / (1 + 10 ** ((elo_b - elo_a) / 400))

elos = []

for idx, row in matches.iterrows():
    team = row["team"]
    opp = row["opponent"]
    
    if team not in team_elos:
        team_elos[team] = 1500
    if opp not in team_elos:
        team_elos[opp] = 1500

    elo_t = team_elos[team]
    elo_o = team_elos[opp]

    exp_t = expected_score(elo_t, elo_o)
    
    if row["result"] == "W":
        actual = 1.0
    elif row["result"] == "D":
        actual = 0.5
    else:
        actual = 0.0

    team_elos[team] = elo_t + K * (actual - exp_t)
    team_elos[opp] = elo_o + K * ((1 - actual) - (1 - exp_t))

    elos.append(team_elos[team])

matches["elo"] = elos

In [81]:
matches = matches.sort_values(["team", "date"]).reset_index(drop=True)

matches["opp_elo"] = matches["elo"].shift(1)

matches["opp_elo"] = matches.groupby("team")["elo"].shift(1)

In [82]:
matches["match_id"] = np.repeat(matches_raw["match_id"].values, 2)

In [83]:
matches["p_home_win"] = 1 / matches["b365h"]
matches["p_draw"] = 1 / matches["b365d"]
matches["p_away_win"] = 1 / matches["b365a"]

prob_sum = matches["p_home_win"] + matches["p_draw"] + matches["p_away_win"]
matches["p_home_win"] /= prob_sum
matches["p_draw"] /= prob_sum
matches["p_away_win"] /= prob_sum

# From the *team's* perspective: what is P(this team wins)?
matches["p_team_win"] = np.where(
    matches["venue"] == "H",
    matches["p_home_win"],
    matches["p_away_win"],
)

matches["p_opp_win"] = np.where(
    matches["venue"] == "H",
    matches["p_away_win"],
    matches["p_home_win"],
)

In [84]:
odds_cols = [c for c in matches_raw.columns if c.startswith("b365")]
odds_cols

['b365h',
 'b365d',
 'b365a',
 'b365>2.5',
 'b365<2.5',
 'b365ahh',
 'b365aha',
 'b365ch',
 'b365cd',
 'b365ca',
 'b365c>2.5',
 'b365c<2.5',
 'b365cahh',
 'b365caha']

In [85]:
matches_raw.columns

Index(['div', 'date', 'time', 'hometeam', 'awayteam', 'fthg', 'ftag', 'ftr',
       'hthg', 'htag',
       ...
       'bbav>2.5', 'bbmx<2.5', 'bbav<2.5', 'bbah', 'bbahh', 'bbmxahh',
       'bbavahh', 'bbmxaha', 'bbavaha', 'match_id'],
      dtype='object', length=181)

In [86]:
matches["target"] = (matches["result"] == "W").astype(int)

matches["venue_code"] = matches["venue"].astype("category").cat.codes
matches["team_code"] = matches["team"].astype("category").cat.codes
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

baseline = matches["target"].mean()
print("Baseline win rate (always predict win):", baseline)

Baseline win rate (always predict win): 0.38826916156890967


In [87]:
matches = matches.sort_values(["team", "date"]).reset_index(drop=True)

matches["points"] = matches["result"].map({"W": 3, "D": 1, "L": 0})

matches["form_rolling"] = (
    matches.groupby("team")["points"]
    .transform(lambda s: s.shift().rolling(5, min_periods=1).mean())
)

matches[["date", "team", "opponent", "result", "points", "form_rolling"]].head(10)

Unnamed: 0,date,team,opponent,result,points,form_rolling
0,2018-08-12,Arsenal,Man City,L,0,
1,2018-08-18,Arsenal,Chelsea,L,0,0.0
2,2018-08-25,Arsenal,West Ham,W,3,0.0
3,2018-09-02,Arsenal,Cardiff,W,3,1.0
4,2018-09-15,Arsenal,Newcastle,W,3,1.5
5,2018-09-23,Arsenal,Everton,W,3,1.8
6,2018-09-29,Arsenal,Watford,W,3,2.4
7,2018-10-07,Arsenal,Fulham,W,3,3.0
8,2018-10-22,Arsenal,Leicester,W,3,3.0
9,2018-10-28,Arsenal,Crystal Palace,D,1,3.0


In [88]:
# Build match-level date table
match_dates = matches_raw[["match_id", "date"]].drop_duplicates()
match_dates = match_dates.sort_values("date")

unique_matches = match_dates["match_id"].values
cutoff = int(len(unique_matches) * 0.8)

train_ids = unique_matches[:cutoff]
test_ids  = unique_matches[cutoff:]

train = matches[matches["match_id"].isin(train_ids)]
test  = matches[matches["match_id"].isin(test_ids)]

overlap = set(train["match_id"]).intersection(test["match_id"])
print("Overlap:", len(overlap))

Overlap: 0


In [89]:
predictors = [
    "venue_code",
    "elo",
    "opp_elo",
    "p_team_win",
    "p_opp_win",
    "form_rolling"
]
data = matches.dropna(subset=predictors + ["target", "date"]).copy()

SPLIT_DATE = pd.Timestamp("2023-08-01")  

train = data[data["date"] < SPLIT_DATE]
test = data[data["date"] >= SPLIT_DATE]

print("Train size:", len(train))
print("Test size:", len(test))

Train size: 3773
Test size: 1755


In [95]:
rf = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=10,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

rf.fit(train[predictors], train["target"])

preds = rf.predict(test[predictors])
probs = rf.predict_proba(test[predictors])[:, 1]

acc = accuracy_score(test["target"], preds)
prec = precision_score(test["target"], preds)
rec = recall_score(test["target"], preds)
f1 = f1_score(test["target"], preds)

print("Accuracy:", acc)
print("Precision:", prec)
print("Recall:", rec)
print("F1:", f1)

Accuracy: 0.784045584045584
Precision: 0.7195301027900147
Recall: 0.7227138643067846
F1: 0.7211184694628403


In [91]:
print("Classification report:\n")
print(classification_report(test["target"], preds))

print("Confusion matrix:\n")
print(confusion_matrix(test["target"], preds))

Classification report:

              precision    recall  f1-score   support

           0       0.82      0.82      0.82      1077
           1       0.72      0.72      0.72       678

    accuracy                           0.78      1755
   macro avg       0.77      0.77      0.77      1755
weighted avg       0.78      0.78      0.78      1755

Confusion matrix:

[[886 191]
 [188 490]]


In [92]:
results = test.copy()
results["pred_win"] = preds
results["pred_win_prob"] = probs

results[["date", "team", "opponent", "result", "target", "pred_win", "pred_win_prob"]].head(20)

Unnamed: 0,date,team,opponent,result,target,pred_win,pred_win_prob
190,2023-08-12,Arsenal,Nott'm Forest,W,1,1,0.949793
191,2023-08-21,Arsenal,Crystal Palace,W,1,1,0.524151
192,2023-08-26,Arsenal,Fulham,D,0,1,0.864372
193,2023-09-03,Arsenal,Man United,W,1,1,0.687139
194,2023-09-17,Arsenal,Everton,W,1,1,0.676259
195,2023-09-24,Arsenal,Tottenham,D,0,1,0.636411
196,2023-09-30,Arsenal,Bournemouth,W,1,0,0.491376
197,2023-10-08,Arsenal,Man City,W,1,0,0.313667
198,2023-10-21,Arsenal,Chelsea,D,0,1,0.66369
199,2023-10-28,Arsenal,Sheffield United,W,1,1,0.692427


In [93]:
print(train["match_id"].nunique())
print(test["match_id"].nunique())

overlap = set(train["match_id"]).intersection(set(test["match_id"]))
len(overlap)

1900
885


6