In [1]:
import sys
import os
from pathlib import Path

In [2]:
def find_project_root(start: Path, anchor_dirs=("src", "Data")) -> Path:
    """
    Return the closest ancestor of ``start`` (``start`` itself included)
    that contains every directory named in ``anchor_dirs``.

    The search begins at the resolved form of ``start`` and climbs one
    level at a time until the filesystem root has been checked.

    Raises:
        FileNotFoundError: if no directory on the way up holds all of
            the anchor directories.
    """
    candidate = start.resolve()
    while True:
        has_all_anchors = True
        for name in anchor_dirs:
            if not (candidate / name).is_dir():
                has_all_anchors = False
                break
        if has_all_anchors:
            return candidate
        # The filesystem root is its own parent: nothing left to climb.
        if candidate.parent == candidate:
            raise FileNotFoundError("Could not locate project root")
        candidate = candidate.parent

In [3]:
# Find the project root by climbing up from the current working directory,
# so the lookup does not depend on how deep the notebook sits.
project_root = find_project_root(Path.cwd())

# ----- Code modules --------------------------------------------------
# Register <project_root>/src/top20_likelihood on sys.path (only once).
src_path = project_root.joinpath("src", "top20_likelihood")
if sys.path.count(str(src_path)) == 0:
    sys.path.append(str(src_path))

In [4]:
# Add the src directory to sys.path.
# The path is anchored on project_root instead of os.getcwd()/'..' so it
# stays correct no matter how many levels below the root the notebook runs.
# src_path is kept as a plain string, as before.
src_path = str(project_root / "src")
if src_path not in sys.path:
    sys.path.append(src_path)

In [5]:
from data_prep import preprocess_tdf_data

In [6]:
# ----- Data ----------------------------------------------------------
# Raw and processed folders for 2025, both built from project_root.
raw_data_path = project_root.joinpath("Data", "Raw", "2025")
processed_data_path = project_root.joinpath("Data", "Processed", "2025")
print("Raw data folder:", raw_data_path)
print("Processed data folder:", processed_data_path)

Raw data folder: C:\Users\Shaun Ricketts\Documents\Projects\Cycling\Tour de France Predictor - 2025\Data\Raw\2025
Processed data folder: C:\Users\Shaun Ricketts\Documents\Projects\Cycling\Tour de France Predictor - 2025\Data\Processed\2025


In [7]:
def run_tdf_preprocessing(output_filename="tdf_prepared_2011_2025.csv"):
    """
    Run the preprocessing step and return its result.

    Builds the output path from the notebook-level ``processed_data_path``
    and ``output_filename``, makes sure that folder exists, then passes the
    notebook-level ``raw_data_path`` and the output path to
    ``preprocess_tdf_data``.

    Args:
        output_filename: File name of the prepared dataset inside
            ``processed_data_path``. Defaults to the name that was
            previously hard-coded.

    Returns:
        Whatever ``preprocess_tdf_data`` returns.
    """
    # Create the target folder up front so a checkout without the processed
    # data folder does not fail at write time (presumably
    # preprocess_tdf_data writes to output_full_path -- verify).
    os.makedirs(processed_data_path, exist_ok=True)
    output_full_path = os.path.join(processed_data_path, output_filename)

    df = preprocess_tdf_data(raw_data_path, output_full_path)
    print(f"Data preprocessing complete. Output saved to:\n{output_full_path}")
    return df

In [8]:
# Run the preprocessing step and keep its return value for the checks below.
df = run_tdf_preprocessing()

Wrote prepared data to C:\Users\Shaun Ricketts\Documents\Projects\Cycling\Tour de France Predictor - 2025\Data\Processed\2025\tdf_prepared_2011_2025.csv
Data preprocessing complete. Output saved to:
C:\Users\Shaun Ricketts\Documents\Projects\Cycling\Tour de France Predictor - 2025\Data\Processed\2025\tdf_prepared_2011_2025.csv


In [9]:
# Display the frame returned by the preprocessing step.
df

Unnamed: 0,Rider_ID,Year,Age,TDF_Pos,Best_Pos_BT_UWT,Best_Pos_BT_PT,Best_Pos_AT_UWT_YB,Best_Pos_AT_PT_YB,Best_Pos_UWT_YB,Best_Pos_PT_YB,...,gt_debut,rode_giro,FC_Points,FC_Pos,Best_Pos_AT_UWT,Best_Pos_AT_PT,Best_Pos_UWT,Best_Pos_PT,Best_Pos_BT_UWT_YB,Best_Pos_BT_PT_YB
0,2,2011,40,,67.0,,,,,,...,,,0.0,507,,,67.0,,,
1,3,2011,29,5.0,1.0,,,,,,...,,0.0,2685.0,2,,,1.0,,,
2,3,2012,30,,,,,,1.0,,...,,,1719.0,12,1.0,,1.0,,1.0,
3,3,2013,31,4.0,3.0,2.0,1.0,,1.0,,...,,0.0,1622.0,13,,,3.0,2.0,,
4,3,2014,32,DNF,1.0,,,,3.0,2.0,...,,0.0,2893.0,2,1.0,,1.0,,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21229,220860,2023,19,,,,,,,,...,,,0.0,507,,999.0,,999.0,,
21230,229373,2024,19,,,,,,,,...,,,0.0,519,,999.0,,999.0,,
21231,230418,2024,20,,,,,,,,...,,,0.0,519,,999.0,,999.0,,
21232,231012,2024,20,,,,,,,,...,,,1.0,518,999.0,93.0,999.0,93.0,,


In [10]:
# Distinct rode_giro values among the rows with Year == 2025.
# NOTE(review): the recorded output is an empty array, i.e. no rows matched
# Year == 2025 in that run -- confirm whether 2025 rows are expected here.
df.loc[df["Year"] == 2025, "rode_giro"].unique()

array([], dtype=float64)

In [11]:
# Show every row recorded for Rider_ID 20147.
df.loc[df["Rider_ID"] == 20147]

Unnamed: 0,Rider_ID,Year,Age,TDF_Pos,Best_Pos_BT_UWT,Best_Pos_BT_PT,Best_Pos_AT_UWT_YB,Best_Pos_AT_PT_YB,Best_Pos_UWT_YB,Best_Pos_PT_YB,...,gt_debut,rode_giro,FC_Points,FC_Pos,Best_Pos_AT_UWT,Best_Pos_AT_PT,Best_Pos_UWT,Best_Pos_PT,Best_Pos_BT_UWT_YB,Best_Pos_BT_PT_YB
13650,20147,2014,22,,6.0,1.0,,,,,...,,,803.0,70,82.0,,6.0,1.0,,
13651,20147,2015,23,50.0,9.0,,82.0,,6.0,1.0,...,,0.0,741.0,79,,,9.0,,6.0,1.0
13652,20147,2016,24,4.0,7.0,,,,9.0,,...,,0.0,1215.0,36,,,7.0,,9.0,
13653,20147,2017,25,,4.0,,,,7.0,,...,,,1337.0,34,5.0,,4.0,,7.0,
13654,20147,2018,26,29.0,2.0,,5.0,,4.0,,...,,0.0,1070.0,56,45.0,,2.0,,4.0,
13655,20147,2019,27,29.0,2.0,5.0,45.0,,2.0,,...,,0.0,1719.0,17,,,2.0,5.0,2.0,
13656,20147,2020,28,9.0,1.0,,,,2.0,5.0,...,,0.0,829.0,42,,,1.0,,2.0,5.0
13657,20147,2021,29,,1.0,,,,1.0,,...,,,1818.0,8,4.0,96.0,1.0,96.0,1.0,
13658,20147,2022,30,9.0,2.0,,4.0,96.0,1.0,96.0,...,,0.0,1374.0,28,,1.0,2.0,1.0,1.0,
13659,20147,2023,31,3.0,1.0,,,1.0,2.0,1.0,...,,0.0,2326.0,6,,3.0,1.0,3.0,2.0,


In [12]:
# Show every row recorded for Rider_ID 20147.
# NOTE(review): this cell repeats the previous cell's lookup verbatim;
# consider removing one of the two.
df[df["Rider_ID"]==20147]

Unnamed: 0,Rider_ID,Year,Age,TDF_Pos,Best_Pos_BT_UWT,Best_Pos_BT_PT,Best_Pos_AT_UWT_YB,Best_Pos_AT_PT_YB,Best_Pos_UWT_YB,Best_Pos_PT_YB,...,gt_debut,rode_giro,FC_Points,FC_Pos,Best_Pos_AT_UWT,Best_Pos_AT_PT,Best_Pos_UWT,Best_Pos_PT,Best_Pos_BT_UWT_YB,Best_Pos_BT_PT_YB
13650,20147,2014,22,,6.0,1.0,,,,,...,,,803.0,70,82.0,,6.0,1.0,,
13651,20147,2015,23,50.0,9.0,,82.0,,6.0,1.0,...,,0.0,741.0,79,,,9.0,,6.0,1.0
13652,20147,2016,24,4.0,7.0,,,,9.0,,...,,0.0,1215.0,36,,,7.0,,9.0,
13653,20147,2017,25,,4.0,,,,7.0,,...,,,1337.0,34,5.0,,4.0,,7.0,
13654,20147,2018,26,29.0,2.0,,5.0,,4.0,,...,,0.0,1070.0,56,45.0,,2.0,,4.0,
13655,20147,2019,27,29.0,2.0,5.0,45.0,,2.0,,...,,0.0,1719.0,17,,,2.0,5.0,2.0,
13656,20147,2020,28,9.0,1.0,,,,2.0,5.0,...,,0.0,829.0,42,,,1.0,,2.0,5.0
13657,20147,2021,29,,1.0,,,,1.0,,...,,,1818.0,8,4.0,96.0,1.0,96.0,1.0,
13658,20147,2022,30,9.0,2.0,,4.0,96.0,1.0,96.0,...,,0.0,1374.0,28,,1.0,2.0,1.0,1.0,
13659,20147,2023,31,3.0,1.0,,,1.0,2.0,1.0,...,,0.0,2326.0,6,,3.0,1.0,3.0,2.0,


In [13]:
# List all column names; the table displays above elide the middle columns.
df.columns

Index(['Rider_ID', 'Year', 'Age', 'TDF_Pos', 'Best_Pos_BT_UWT',
       'Best_Pos_BT_PT', 'Best_Pos_AT_UWT_YB', 'Best_Pos_AT_PT_YB',
       'Best_Pos_UWT_YB', 'Best_Pos_PT_YB', 'FC_Points_YB', 'FC_Pos_YB',
       'best_tdf_result', 'best_other_gt_result', 'best_recent_tdf_result',
       'best_recent_other_gt_result', 'tdf_debut', 'gt_debut', 'rode_giro',
       'FC_Points', 'FC_Pos', 'Best_Pos_AT_UWT', 'Best_Pos_AT_PT',
       'Best_Pos_UWT', 'Best_Pos_PT', 'Best_Pos_BT_UWT_YB',
       'Best_Pos_BT_PT_YB'],
      dtype='object')

In [14]:
# Rider_ID 20147 -- the rider filtered on in the cells above