# Final Model - Top 10
This notebook trains the final model to predict Top 10 finishes for the 2025 Tour de France, using selected features and data. It then saves the best-performing model for future use or deployment.

## Key Steps:
- Loads the cleaned dataset and filters the 2025 startlist
- Trains a GradientBoostingClassifier using a pipeline and GridSearchCV
- Builds the 2025 hold-out set from the startlist and saves the best model to disk

## Import Libraries

In [1]:
import sys
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

## Set Folder Path and Read CSVs

In [2]:
def find_project_root(start: Path, anchor_dirs=("src", "Data")) -> Path:
    """
    Return the closest directory, starting at ``start`` and moving upwards,
    that contains every folder named in ``anchor_dirs``.

    Parameters
    ----------
    start : Path
        Directory where the search begins; it is resolved before searching.
    anchor_dirs : iterable of str
        Folder names that must all exist directly inside the project root.

    Returns
    -------
    Path
        The first directory (``start`` itself or one of its ancestors)
        holding all anchor folders.

    Raises
    ------
    FileNotFoundError
        If neither ``start`` nor any ancestor contains all anchor folders.
    """
    origin = start.resolve()
    # Nearest candidate first: the start directory, then each ancestor in turn.
    candidates = (origin, *origin.parents)
    match = next(
        (
            candidate
            for candidate in candidates
            if all((candidate / name).is_dir() for name in anchor_dirs)
        ),
        None,
    )
    if match is None:
        raise FileNotFoundError("Could not locate project root")
    return match

In [3]:
# Locate the project root regardless of notebook depth
project_root = find_project_root(Path.cwd())

# ----- Code modules --------------------------------------------------
# Put the project's `src` folder on sys.path (once) so the local modules
# imported below can be resolved.
src_path = project_root / "src"
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# This import has to come after the sys.path tweak above.
from data_prep import preprocess_tdf_data   # import data preproc function

# ----- Data ----------------------------------------------------------
data_path = project_root / "Data" / "Processed" / "2025"
raw_path = project_root / "Data" / "Raw" / "2025"
# Label each folder with what it actually is (the processed path used to be
# printed under the "Raw data folder" label).
print("Processed data folder:", data_path)
print("Raw data folder:", raw_path)


Raw data folder: C:\Users\Shaun Ricketts\Documents\Projects\Cycling\Tour de France Predictor - 2025\Data\Processed\2025


In [4]:
# Resolve the project root with the shared helper rather than assuming the
# notebook sits exactly two levels below it, so this cell agrees with the
# root used to build the data paths.
project_root = find_project_root(Path.cwd())
src_path = project_root / 'src'

# Add to sys.path if not already there
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# Now you can import your function
from data_prep import preprocess_tdf_data

In [5]:
# Race data: load the prepared CSV from the processed-data folder
df = pd.read_csv(data_path.joinpath("tdf_prepared_2011_2025.csv"))

In [6]:
# 2025 TDF Startlist, read from the same processed-data folder
startlist = pd.read_csv(data_path.joinpath("TDF_Startlist_2025.csv"))

In [7]:
# Rankings of riders per year by First Cycling's points system. Only the id and
# name columns are read; the frame is then reduced to the first row seen for
# each rider_id so rider names live in a separate df.
fc_rank = pd.read_csv(raw_path / "fc_rankings.csv", usecols=["rider_id", "Rider"])
rider_names = fc_rank.drop_duplicates(subset=["rider_id"])

In [8]:
# import missing_value_handler
from missing_value_handler import FillWithSentinel

In [9]:
# Run the project's FillWithSentinel transformer over the race data and keep
# the transformed frame under the same name.
# NOTE(review): which columns/values get filled is defined in
# missing_value_handler — confirm whether TDF_Pos is affected, since later
# cells rely on dropna(subset=['TDF_Pos']) and `TDF_Pos <= 10`.
cleaner = FillWithSentinel()
df = cleaner.fit_transform(df)

In [10]:
# Pull the digits that follow "r=" out of each startlist URL and store them
# as an integer rider id.
rider_id_text = startlist['url'].str.extract(r"r=(\d+)", expand=False)
startlist['Rider_ID'] = rider_id_text.astype(int)

In [11]:
# Keep only rows whose rider appears on the Tour de France startlist
on_startlist = df['Rider_ID'].isin(startlist['Rider_ID'])
df_2025 = df[on_startlist]

In [12]:
# Restrict the startlist riders to their 2025 rows. Take an explicit copy so
# that adding columns to df_2025 later writes to an independent frame instead
# of a slice of df (which would raise SettingWithCopyWarning).
df_2025 = df_2025[df_2025["Year"] == 2025].copy()

In [13]:
# Drop rows whose TDF_Pos is one of the non-finish markers DNF / DSQ
not_classified = df['TDF_Pos'].isin(['DNF', 'DSQ'])
final_df = df[~not_classified]

In [14]:
# Keep only rows that have a TDF_Pos value
final_df = final_df.loc[final_df['TDF_Pos'].notna()]

In [15]:
# Convert TDF_Pos to numeric. `assign` returns a new frame, so nothing is
# written into a filtered slice of df (avoids SettingWithCopyWarning).
# Any remaining non-numeric value still raises here rather than being hidden.
final_df = final_df.assign(TDF_Pos=pd.to_numeric(final_df['TDF_Pos']))

# 1 if TDF_Pos <= 10, else 0
final_df = final_df.assign(is_top10=(final_df['TDF_Pos'] <= 10).astype(int))

# df_2025 comes straight from df and never had TDF_Pos converted, so coerce it
# for the comparison: non-numeric or missing positions become NaN, and
# NaN <= 10 is False, giving is_top10 = 0 for those riders.
tdf_pos_2025 = pd.to_numeric(df_2025['TDF_Pos'], errors='coerce')
df_2025 = df_2025.assign(is_top10=(tdf_pos_2025 <= 10).astype(int))

In [16]:
# Keep seasons from 2015 onwards, leaving out 2020 and 2021
recent_enough = final_df['Year'].ge(2015)
excluded_season = final_df['Year'].isin([2020, 2021])
final_df = final_df[recent_enough & ~excluded_season]

In [17]:
# Columns fed to the model as predictors, listed one per line
features = [
    'Best_Pos_BT_UWT',
    'Best_Pos_BT_PT',
    'FC_Pos_YB',
    'best_recent_tdf_result',
    'best_recent_other_gt_result',
]

# Run The Model

In [18]:
# --------------------------------------------
# 0.  Prep – train/test masks + splitter
# --------------------------------------------
# Training rows: every season up to and including 2024
train_mask = final_df['Year'].le(2024)
# Test rows: the 2025 season only
test_mask = df_2025['Year'].eq(2025)

# Five stratified folds, shuffled with a fixed seed
cv_splitter = StratifiedKFold(shuffle=True, random_state=42, n_splits=5)

In [19]:
# Select the masked rows once per frame, then split each into the feature
# matrix and the is_top10 target.
train_rows = final_df.loc[train_mask]
test_rows = df_2025.loc[test_mask]

X_train, y_train = train_rows[features], train_rows['is_top10']
X_test, y_test = test_rows[features], test_rows['is_top10']

In [20]:
# Two-step pipeline: median-impute any missing feature values, then fit a
# gradient-boosting classifier with a fixed seed.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

# Hyper-parameter grid; the `classifier__` prefix routes each entry to the
# pipeline's 'classifier' step. 2*2*2*2*2*1*1*2 = 64 candidates.
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__learning_rate': [0.01, 0.05],
    'classifier__max_depth': [2, 3],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
    'classifier__subsample': [0.8],
    'classifier__max_features': ['sqrt'],
    'classifier__min_impurity_decrease': [0.0, 0.01]
}

# Use the shuffled, seeded StratifiedKFold prepared earlier (cv_splitter)
# instead of a bare `cv=5`, which would build unshuffled folds and leave the
# prepared splitter unused.
gs = GridSearchCV(
    pipeline,
    param_grid,
    scoring='roc_auc',
    cv=cv_splitter,
    n_jobs=-1,
    verbose=2
)

# Train with new top-end target
gs.fit(X_train, y_train)


Fitting 5 folds for each of 64 candidates, totalling 320 fits


In [21]:
# Use existing function to find the project root
project_root = find_project_root(Path.cwd())

# Define the save path
model_path = project_root / "Models" / "final_model_top10.pkl"

# Create the target folder if needed; joblib.dump does not create missing
# directories and would fail on a fresh checkout.
model_path.parent.mkdir(parents=True, exist_ok=True)

# Save the model (the best pipeline found by the grid search)
joblib.dump(gs.best_estimator_, model_path)

print(f"Model saved to: {model_path}")

Model saved to: C:\Users\Shaun Ricketts\Documents\Projects\Cycling\Tour de France Predictor - 2025\Models\top20_likelihood\final_model_top5.pkl
