# 2025 Model Run
This notebook applies the trained models to the 2025 Tour de France startlist to generate Top 20, Top 10, and Top 5 finish predictions.

## Key Steps:
- Loads the 2025 startlist and feature data
- Applies trained models to calculate probabilities and predictions
- Merges results with rider names and exports final predictions to CSV

## Import Libraries

In [1]:
from pathlib import Path
import sys
import joblib
import pandas as pd
import unidecode

## Set Folder Path and Read CSVs

In [2]:
def find_project_root(start: Path, anchor_dirs=("src", "Data")) -> Path:
    """
    Find the closest directory at or above ``start`` that holds every anchor.

    Parameters
    ----------
    start : Path
        Where the search begins; it is resolved to an absolute path first.
    anchor_dirs : iterable of str
        Sub-directory names that must all exist directly inside a folder
        for it to count as the project root (default: 'src' and 'Data').

    Returns
    -------
    Path
        The first qualifying directory, checking ``start`` itself and then
        each of its parents in turn.

    Raises
    ------
    FileNotFoundError
        If neither ``start`` nor any of its parents qualifies.
    """
    origin = start.resolve()
    for candidate in (origin, *origin.parents):
        for anchor in anchor_dirs:
            if not (candidate / anchor).is_dir():
                break  # an anchor is missing at this level; try the parent
        else:
            # No break happened, so every anchor directory exists here.
            return candidate
    raise FileNotFoundError("Could not locate project root")

In [3]:
# Locate the project root regardless of notebook depth
project_root = find_project_root(Path.cwd())

# ----- Code modules --------------------------------------------------
# Put the project's src/ folder on sys.path (once) so its modules can be
# imported from this notebook.
src_path = project_root / "src"
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

from data_prep import preprocess_tdf_data   # import data preproc function

# ----- Data ----------------------------------------------------------
data_path = project_root / "Data" / "Processed"
raw_path = project_root / "Data" / "Raw"
# Label each folder as what it is: data_path is the Processed folder, so it
# must not be reported as the raw-data folder.
print("Processed data folder:", data_path)
print("Raw data folder:", raw_path)


Raw data folder: C:\Users\Shaun Ricketts\Documents\Projects\Cycling\Tour de France Predictor - 2025\Data\Processed\2025


In [5]:
# Race data: tdf_prepared.csv from the processed-data folder
df = pd.read_csv(filepath_or_buffer=data_path / "tdf_prepared.csv")

In [6]:
# 2025 TDF startlist, read from the raw-data folder
startlist = pd.read_csv(filepath_or_buffer=raw_path / "TDF_Startlist_2025.csv")

In [7]:
# Rankings of riders per year by First Cycling's points system. Only the id
# and name columns are loaded, then reduced to one row per rider_id so the
# result can serve as a separate rider-name lookup.
fc_rank = pd.read_csv(
    raw_path / "fc_rankings.csv",
    usecols=["rider_id", "Rider"],
)
rider_names = fc_rank.drop_duplicates(subset=["rider_id"])  # default keeps the first row per id

In [8]:
# import missing_value_handler
from missing_value_handler import FillWithSentinel

In [9]:
# Fit the project's FillWithSentinel handler on the race data and replace df
# with the transformed result.
# NOTE(review): presumably this fills missing values with a sentinel (the
# tables shown further down contain 999.0 entries) — confirm in
# missing_value_handler.
cleaner = FillWithSentinel()
df = cleaner.fit_transform(df)

In [10]:
# Restrict the race data to the 2025 rows only
in_2025 = df["Year"] == 2025
df = df[in_2025]

In [11]:
# Pull the digits that follow "r=" out of each startlist URL and store them
# as an integer Rider_ID. A URL without such a part yields a missing value,
# which the int cast rejects with an error.
startlist["Rider_ID"] = (
    startlist["url"]
    .str.extract(r"r=(\d+)", expand=False)
    .astype(int)
)

In [12]:
# Keep only the riders who are on the Tour de France startlist.
# .copy() makes df an independent frame: new columns are assigned to it
# further down, and writing into a filtered slice of another DataFrame can
# raise pandas' SettingWithCopyWarning.
df = df[df['Rider_ID'].isin(startlist['Rider_ID'])].copy()

In [13]:
# Input columns selected for the Top 20 model
features_top20 = [
    'Best_Pos_BT_UWT',
    'Best_Pos_BT_PT',
    'FC_Pos_YB',
    'best_recent_tdf_result',
    'best_recent_other_gt_result',
    'rode_giro',
]

In [14]:
# Input columns selected for the Top 5 model (the Top 20 list without
# 'rode_giro'); the Top 10 inputs are built from this same list.
features_top5 = [
    'Best_Pos_BT_UWT',
    'Best_Pos_BT_PT',
    'FC_Pos_YB',
    'best_recent_tdf_result',
    'best_recent_other_gt_result',
]

In [15]:
# Input matrix for the Top 20 model: every row, Top 20 feature columns only
test_top20 = df.loc[:, features_top20]

In [16]:
# Input matrix for the Top 10 model. It reuses features_top5 because this
# notebook defines no separate features_top10 list.
# NOTE(review): confirm the Top 10 model was trained on these five columns
# (i.e. without 'rode_giro').
test_top10 = df.loc[:, features_top5]

In [17]:
# Input matrix for the Top 5 model: every row, Top 5 feature columns only
test_top5 = df.loc[:, features_top5]

## Run the Models

In [18]:
# Folder that holds the saved model files
model_dir = Path(project_root, "Models")
# Create the folder (and any missing parents) if absent; a no-op when it exists
model_dir.mkdir(exist_ok=True, parents=True)

In [19]:
# Load best models from GridSearchCV (already trained and saved), in the
# order top20, top10, top5.
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary
# code — only load model files that come from a trusted source.
model_top20, model_top10, model_top5 = (
    joblib.load(model_dir / filename)
    for filename in (
        "final_model_top20.pkl",
        "final_model_top10.pkl",
        "final_model_top5.pkl",
    )
)

In [20]:
# Probabilities for class 1 from each model (top 20, top 10 and top 5 finish
# respectively): column index 1 of every predict_proba result.
probs_top20, probs_top10, probs_top5 = (
    model.predict_proba(inputs)[:, 1]
    for model, inputs in (
        (model_top20, test_top20),
        (model_top10, test_top10),
        (model_top5, test_top5),
    )
)

# Class predictions (0 or 1) from the same three models and inputs
preds_top20, preds_top10, preds_top5 = (
    model.predict(inputs)
    for model, inputs in (
        (model_top20, test_top20),
        (model_top10, test_top10),
        (model_top5, test_top5),
    )
)

In [21]:
# Add each model's probability and class prediction to df as new columns,
# in the order top20, top10, top5.
for column_name, column_values in (
    ("top20_probability", probs_top20),
    ("top20_prediction", preds_top20),
    ("top10_probability", probs_top10),
    ("top10_prediction", preds_top10),
    ("top5_probability", probs_top5),
    ("top5_prediction", preds_top5),
):
    df[column_name] = column_values

In [22]:
# rider_names keys its rows by 'rider_id' while df uses 'Rider_ID';
# rename so both frames share the same key column name.
rider_names = rider_names.rename(columns={"rider_id": "Rider_ID"})

# Attach the rider name to each prediction row. A left join keeps every row
# of df, including riders that have no entry in rider_names.
df = df.merge(
    right=rider_names,
    how="left",
    on="Rider_ID",
)

In [23]:
# Keep the identifier, rider name and the three probabilities, followed by the
# Top 20 feature columns for reference.
# .copy() makes the selection an independent frame: the Rider column is
# overwritten afterwards, and assigning into a column subset of another
# DataFrame can raise pandas' SettingWithCopyWarning.
df = df[
    [
        "Rider_ID",
        "Rider",
        "top20_probability",
        "top10_probability",
        "top5_probability",
        *features_top20,
    ]
].copy()

In [24]:
def clean_and_reorder(name):
    """
    Convert a "Surname  Firstname" string to ASCII "Firstname Surname".

    The input is split on a double space, the pieces are reversed and joined
    with a single space, and the result is passed through unidecode to strip
    accents and other special characters.

    Parameters
    ----------
    name : str or any
        Rider name with surname and first name separated by two spaces.

    Returns
    -------
    str or any
        The reordered, transliterated name. Non-string input (e.g. the NaN a
        left merge leaves for riders with no name match) is returned
        unchanged instead of raising AttributeError on ``.split``.
    """
    if not isinstance(name, str):
        return name
    # Split by double space to separate surname and first name
    parts = name.split("  ")
    # Reverse order: first name + last name
    reordered = " ".join(parts[::-1])
    # Remove special characters
    return unidecode.unidecode(reordered)

In [25]:
# Run every rider name through clean_and_reorder (reorders the name parts and
# removes special characters)
df["Rider"] = df["Rider"].map(clean_and_reorder)

In [26]:
# Up to 20 riders with the highest Top 20 probability, renumbered from 0
(
    df.sort_values("top20_probability", ascending=False)
    .head(20)
    .reset_index(drop=True)
)

Unnamed: 0,Rider_ID,Rider,top20_probability,top10_probability,top5_probability,Best_Pos_BT_UWT,Best_Pos_BT_PT,FC_Pos_YB,best_recent_tdf_result,best_recent_other_gt_result,rode_giro
0,45992,Tadej Pogacar,0.837129,0.790385,0.780886,1.0,999.0,1,1.0,1.0,0.0
1,84019,Remco Evenepoel,0.822488,0.746009,0.200891,4.0,999.0,2,3.0,1.0,0.0
2,38195,Jonas Vingegaard,0.808906,0.784846,0.771078,2.0,1.0,6,1.0,2.0,0.0
3,1527,Enric Mas,0.762726,0.554411,0.446888,2.0,18.0,11,19.0,2.0,0.0
4,84926,Carlos Rodriguez,0.748783,0.667273,0.052539,6.0,6.0,13,5.0,7.0,0.0
5,50303,Joao Almeida,0.742168,0.537586,0.317106,1.0,2.0,30,4.0,3.0,0.0
6,68206,Matteo Jorgenson,0.7379,0.543332,0.259159,1.0,999.0,14,8.0,999.0,0.0
7,42186,Ben O'Connor,0.7175,0.457225,0.233038,7.0,10.0,5,17.0,2.0,0.0
8,81835,Santiago Buitrago,0.571216,0.097603,0.011811,13.0,1.0,56,10.0,10.0,0.0
9,37281,Aleksandr Vlasov,0.546545,0.362666,0.026463,21.0,17.0,15,5.0,7.0,0.0


In [27]:
# Up to 20 riders with the highest Top 10 probability, renumbered from 0
(
    df.sort_values("top10_probability", ascending=False)
    .head(20)
    .reset_index(drop=True)
)

Unnamed: 0,Rider_ID,Rider,top20_probability,top10_probability,top5_probability,Best_Pos_BT_UWT,Best_Pos_BT_PT,FC_Pos_YB,best_recent_tdf_result,best_recent_other_gt_result,rode_giro
0,45992,Tadej Pogacar,0.837129,0.790385,0.780886,1.0,999.0,1,1.0,1.0,0.0
1,38195,Jonas Vingegaard,0.808906,0.784846,0.771078,2.0,1.0,6,1.0,2.0,0.0
2,84019,Remco Evenepoel,0.822488,0.746009,0.200891,4.0,999.0,2,3.0,1.0,0.0
3,84926,Carlos Rodriguez,0.748783,0.667273,0.052539,6.0,6.0,13,5.0,7.0,0.0
4,1527,Enric Mas,0.762726,0.554411,0.446888,2.0,18.0,11,19.0,2.0,0.0
5,68206,Matteo Jorgenson,0.7379,0.543332,0.259159,1.0,999.0,14,8.0,999.0,0.0
6,50303,Joao Almeida,0.742168,0.537586,0.317106,1.0,2.0,30,4.0,3.0,0.0
7,18655,Primoz Roglic,0.545751,0.460801,0.467931,1.0,8.0,7,999.0,1.0,1.0
8,42186,Ben O'Connor,0.7175,0.457225,0.233038,7.0,10.0,5,17.0,2.0,0.0
9,20147,Adam Yates,0.538491,0.408806,0.033159,12.0,1.0,18,3.0,12.0,1.0


In [28]:
# Up to 10 riders with the highest Top 5 probability, renumbered from 0
(
    df.sort_values("top5_probability", ascending=False)
    .head(10)
    .reset_index(drop=True)
)

Unnamed: 0,Rider_ID,Rider,top20_probability,top10_probability,top5_probability,Best_Pos_BT_UWT,Best_Pos_BT_PT,FC_Pos_YB,best_recent_tdf_result,best_recent_other_gt_result,rode_giro
0,45992,Tadej Pogacar,0.837129,0.790385,0.780886,1.0,999.0,1,1.0,1.0,0.0
1,38195,Jonas Vingegaard,0.808906,0.784846,0.771078,2.0,1.0,6,1.0,2.0,0.0
2,18655,Primoz Roglic,0.545751,0.460801,0.467931,1.0,8.0,7,999.0,1.0,1.0
3,1527,Enric Mas,0.762726,0.554411,0.446888,2.0,18.0,11,19.0,2.0,0.0
4,50303,Joao Almeida,0.742168,0.537586,0.317106,1.0,2.0,30,4.0,3.0,0.0
5,68206,Matteo Jorgenson,0.7379,0.543332,0.259159,1.0,999.0,14,8.0,999.0,0.0
6,42186,Ben O'Connor,0.7175,0.457225,0.233038,7.0,10.0,5,17.0,2.0,0.0
7,84019,Remco Evenepoel,0.822488,0.746009,0.200891,4.0,999.0,2,3.0,1.0,0.0
8,41249,Jhonnatan Narvaez,0.267659,0.087959,0.069258,1.0,999.0,51,999.0,28.0,0.0
9,159196,Lenny Martinez,0.286946,0.082541,0.06678,2.0,11.0,35,124.0,24.0,0.0


In [29]:
# Write the final predictions table to the processed-data folder, omitting
# the row index
df.to_csv(path_or_buf=data_path / "TDF_2025_Predictions.csv", index=False)