# 07_Final_Model.ipynb

This notebook loads the best model from hyperparameter tuning, optionally evaluates it on the test set, and saves the final model to disk for future use or deployment.


## Import Libraries

In [1]:
from pathlib import Path
import sys
import joblib
import pandas as pd
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

## Set Folder Path and Read CSVs

In [2]:
def find_project_root(start: Path, anchor_dirs=("src", "Data")) -> Path:
    """
    Return the nearest directory, starting at ``start`` and moving upward,
    that contains every folder named in ``anchor_dirs``.

    Parameters
    ----------
    start : Path
        Directory to begin the search from; it is resolved to an absolute
        path before searching and is itself the first candidate.
    anchor_dirs : iterable of str
        Sub-directory names that must all exist directly inside the root.

    Returns
    -------
    Path
        The first (deepest) candidate that holds all anchor directories.

    Raises
    ------
    FileNotFoundError
        If neither ``start`` nor any of its ancestors qualifies.
    """
    origin = start.resolve()
    # Lazily test the starting folder first, then each ancestor in turn.
    qualifying = (
        candidate
        for candidate in (origin, *origin.parents)
        if all(candidate.joinpath(name).is_dir() for name in anchor_dirs)
    )
    root = next(qualifying, None)
    if root is None:
        raise FileNotFoundError("Could not locate project root")
    return root

In [3]:
# Locate the project root regardless of notebook depth
# (find_project_root walks upward until it finds a folder holding 'src' and 'Data').
project_root = find_project_root(Path.cwd())

# ----- Code modules --------------------------------------------------
# Make the project's top20_likelihood package folder importable from this notebook.
src_path = project_root / "src" / "top20_likelihood"
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# Must come after the sys.path update above, otherwise the module is not found.
from data_prep import preprocess_tdf_data   # import data preproc function

# ----- Data ----------------------------------------------------------
# This notebook reads from (and later writes to) the Processed folder, so the
# label printed below names that folder rather than "Raw".
data_path = project_root / "Data" / "Processed"
print("Processed data folder:", data_path)


Raw data folder: C:\Users\Shaun Ricketts\Documents\Projects\Cycling\Tour de France Predictor - 2025\Data\Raw


In [4]:
# Resolve the project root with the anchor-based lookup instead of assuming the
# notebook sits exactly two levels below it. A hard-coded Path.cwd().parents[1]
# would overwrite the root found earlier and, from any other working directory,
# point the model and output folders somewhere different from data_path.
project_root = find_project_root(Path.cwd())
src_path = project_root / 'src' / 'top20_likelihood'

# Add to sys.path if not already there
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# Project-local import; only resolvable once src_path is on sys.path
from data_prep import preprocess_tdf_data

In [5]:
# Load the prepared Tour de France dataset from the processed-data folder
# (the file name indicates it spans the 2011-2024 editions).
df = pd.read_csv(data_path.joinpath("tdf_prepared_2011_2024.csv"))

In [6]:
# Project-local import kept here rather than in the top import cell because it
# can only resolve after src_path has been appended to sys.path
# (presumably the module lives in src/top20_likelihood - confirm).
from missing_value_handler import FillWithSentinel

In [7]:
# Fit the project's FillWithSentinel transformer on the whole frame and replace
# df with its output.
# NOTE(review): judging by the class name this fills missing values with a
# sentinel; which columns are touched and what the sentinel is are defined in
# missing_value_handler - confirm before relying on later NaN checks.
cleaner = FillWithSentinel()
df = cleaner.fit_transform(df)

In [8]:
# Keep only rows whose TDF_Pos is not one of the non-finishing codes
# 'DNF' / 'DSQ', since those rows carry no finishing position.
df = df.loc[~df['TDF_Pos'].isin(['DNF', 'DSQ'])]

In [9]:
# Discard rows that have no TDF_Pos value at all.
# NOTE(review): FillWithSentinel ran earlier; if it also fills TDF_Pos this
# filter may no longer remove anything - confirm.
df = df[df['TDF_Pos'].notna()]

In [10]:
# Build both derived columns in one pass:
#   TDF_Pos  -> parsed to a numeric dtype (raises if any value is not numeric)
#   is_top20 -> 1 when the numeric position is 20 or better, otherwise 0
# The second lambda sees the already-converted TDF_Pos from the first.
df = df.assign(
    TDF_Pos=lambda frame: pd.to_numeric(frame['TDF_Pos']),
    is_top20=lambda frame: frame['TDF_Pos'].le(20).astype(int),
)

In [11]:
# Restrict the modelling window: editions from 2015 onward, leaving out the
# 2020 and 2021 editions (presumably because those seasons were atypical - confirm).
df = df.loc[df['Year'].ge(2015) & ~df['Year'].isin([2020, 2021])]

In [12]:
# Predictor columns fed to the model, in the order used for X_train / X_test.
features = [
    'Best_Pos_BT_UWT',
    'Best_Pos_BT_PT',
    'FC_Pos_YB',
    'best_recent_tdf_result',
    'best_recent_other_gt_result',
    'rode_giro',
]

# Temporal split: every edition up to and including 2023 is training data,
# the 2024 edition is held out as the test set.
train_mask = df['Year'].le(2023)
test_mask = df['Year'].eq(2024)

# Feature matrix and binary target (is_top20) for each side of the split.
X_train, y_train = df.loc[train_mask, features], df.loc[train_mask, 'is_top20']
X_test, y_test = df.loc[test_mask, features], df.loc[test_mask, 'is_top20']

## Run The Model

In [13]:
# Folder holding the serialized top-20 likelihood model; create it (and any
# missing parents) so later load/save calls have a directory to work with.
model_dir = project_root.joinpath("Models", "top20_likelihood")
model_dir.mkdir(parents=True, exist_ok=True)

In [14]:
# Deserialize the previously trained and saved estimator from model_dir.
# joblib files are pickle-based, so only load artifacts produced by this project.
model = joblib.load(model_dir.joinpath("final_model.pkl"))


In [15]:
# Hard class predictions for the held-out rows.
y_pred = model.predict(X_test)
# Second column of predict_proba, used as the score for ROC AUC
# (assumes model.classes_ is ordered [0, 1] so this is P(is_top20 == 1) - confirm).
class_probabilities = model.predict_proba(X_test)
y_proba = class_probabilities[:, 1]

# Per-class precision / recall / F1 on the test set.
report_text = classification_report(y_test, y_pred)
print(report_text)

# Ranking quality of the predicted probabilities.
auc_value = roc_auc_score(y_test, y_proba)
print("ROC AUC:", auc_value)

# Rows are true classes, columns are predicted classes.
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion)


              precision    recall  f1-score   support

           0       0.95      0.96      0.95       121
           1       0.74      0.70      0.72        20

    accuracy                           0.92       141
   macro avg       0.84      0.83      0.84       141
weighted avg       0.92      0.92      0.92       141

ROC AUC: 0.9599173553719008
Confusion Matrix:
 [[116   5]
 [  6  14]]


In [18]:
# Create <project_root>/Data/Processed (with parents) if it is not there yet,
# so files can be written into it without a missing-folder error.
project_root.joinpath("Data", "Processed").mkdir(parents=True, exist_ok=True)


In [16]:
# Serialize the evaluated estimator to the model folder and report the location.
# NOTE(review): this is the same path the model was loaded from earlier in the
# notebook, so the call rewrites that file in place - confirm this is intended.
joblib.dump(value=model, filename=model_dir / "final_model.pkl")
print(f"Final model saved to: {model_dir / 'final_model.pkl'}")

Final model saved to: C:\Users\Shaun Ricketts\Documents\Projects\Cycling\Tour de France Predictor - 2025\Models\top20_likelihood\final_model.pkl


In [None]:
# Attach the model outputs to the held-out (Year == 2024) rows of df.
# The probability column is written first, then the hard label, which fixes the
# order in which the two new columns are appended.
# Rows outside test_mask get no value in these new columns (pandas leaves them NaN).
df.loc[test_mask, 'predicted_top20_proba'] = y_proba
df.loc[test_mask, 'predicted_top20'] = y_pred

In [19]:
# Export only the held-out rows, with whatever columns df holds at this point,
# to a CSV in the processed-data folder; the DataFrame index is not written.
output_path = project_root.joinpath("Data", "Processed", "2024_predictions.csv")
df[test_mask].to_csv(output_path, index=False)


## Conclusion

The final model has been saved for deployment or further use. It was selected after careful hyperparameter tuning and evaluation. Future work may involve testing the model on new race editions or extending the feature set.

Model path: `Models/top20_likelihood/final_model.pkl`
