In [1]:
import sys
import os
from pathlib import Path

In [None]:
def find_project_root(start: Path, anchor_dirs=("src", "Data")) -> Path:
    """
    Return the nearest directory, at or above ``start``, that holds every
    folder named in ``anchor_dirs`` (by default 'src' and 'Data').

    ``start`` is resolved to an absolute path first; the search then checks
    that directory and each of its ancestors in turn.

    Raises FileNotFoundError when no directory on the way up to the
    filesystem root contains all of the anchor folders.
    """
    origin = start.resolve()
    # The starting directory itself is examined before any of its ancestors.
    for candidate in (origin, *origin.parents):
        for name in anchor_dirs:
            if not (candidate / name).is_dir():
                # One anchor is missing here; move on to the next ancestor.
                break
        else:
            # Every anchor folder was found in this candidate.
            return candidate
    raise FileNotFoundError("Could not locate project root")

In [None]:
# Find the project root by walking upwards from the working directory,
# so this works at any notebook nesting depth.
project_root = find_project_root(Path.cwd())

# ----- Code modules --------------------------------------------------
# Put src/top20_likelihood on the import search path, adding it only once.
src_path = project_root.joinpath("src", "top20_likelihood")
if sys.path.count(str(src_path)) == 0:
    sys.path.append(str(src_path))

In [2]:
# Add the top-level src directory to sys.path.
# The path is anchored on project_root (found by find_project_root in an
# earlier cell) instead of os.getcwd(): a cwd-relative '../src' is only right
# when the kernel starts exactly one level below the project root.
# NOTE: this rebinds src_path, which an earlier cell set to the
# src/top20_likelihood folder; it stays a plain string here, as before.
src_path = str(project_root / "src")
if src_path not in sys.path:
    sys.path.append(src_path)

In [None]:
from data_prep import preprocess_tdf_data

In [None]:
# ----- Data ----------------------------------------------------------
# Input and output folders, both located under <project_root>/Data.
raw_data_path, processed_data_path = (
    project_root / "Data" / folder_name
    for folder_name in ("Raw", "Processed")
)

# Echo the resolved locations so the reader can confirm them at a glance.
print("Raw data folder:", raw_data_path)
print("Processed data folder:", processed_data_path)

In [3]:
def run_tdf_preprocessing(output_filename="tdf_prepared_2011_2024.csv"):
    """
    Run the TDF preprocessing step and return its result.

    Relies on the notebook-level names ``raw_data_path``,
    ``processed_data_path`` and ``preprocess_tdf_data`` having been defined
    by earlier cells.

    Parameters
    ----------
    output_filename : str, optional
        File name, inside ``processed_data_path``, that is handed to
        ``preprocess_tdf_data`` as the output location. The default keeps
        existing zero-argument calls behaving exactly as before.

    Returns
    -------
    Whatever ``preprocess_tdf_data`` returns.
    """
    # os.path.join accepts the Path object and yields a plain string, so the
    # helper keeps receiving the same argument type as before.
    output_full_path = os.path.join(processed_data_path, output_filename)

    df = preprocess_tdf_data(raw_data_path, output_full_path)
    print(f"Data preprocessing complete. Output saved to:\n{output_full_path}")
    return df


In [4]:
# Run the preprocessing step and keep its return value for inspection below.
df = run_tdf_preprocessing()

Wrote prepared data to C:\Users\Shaun Ricketts\Documents\Projects\Cycling\Tour de France Predictor - 2025\Data\Processed\tdf_prepared_2011_2024.csv
Data preprocessing complete. Output saved to:
C:\Users\Shaun Ricketts\Documents\Projects\Cycling\Tour de France Predictor - 2025\Data\Processed\tdf_prepared_2011_2024.csv


In [5]:
# Show the prepared frame; the rendered output below elides the middle rows
# and columns with "...".
df

Unnamed: 0,Rider_ID,Year,Age,TDF_Pos,Best_Pos_BT_UWT,Best_Pos_BT_PT,Best_Pos_AT_UWT_YB,Best_Pos_AT_PT_YB,Best_Pos_UWT_YB,Best_Pos_PT_YB,...,gt_debut,rode_giro,FC_Points,FC_Pos,Best_Pos_AT_UWT,Best_Pos_AT_PT,Best_Pos_UWT,Best_Pos_PT,Best_Pos_BT_UWT_YB,Best_Pos_BT_PT_YB
0,2,2011,40,,67.0,,,,,,...,,,0.0,507,,,67.0,,,
1,3,2011,29,5.0,1.0,,,,,,...,,0.0,2685.0,2,,,1.0,,,
2,3,2012,30,,,,,,1.0,,...,,,1719.0,12,1.0,,1.0,,1.0,
3,3,2013,31,4.0,3.0,2.0,1.0,,1.0,,...,,0.0,1622.0,13,,,3.0,2.0,,
4,3,2014,32,DNF,1.0,,,,3.0,2.0,...,,0.0,2893.0,2,1.0,,1.0,,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21229,220860,2023,19,,,,,,,,...,,,0.0,507,,999.0,,999.0,,
21230,229373,2024,19,,,,,,,,...,,,0.0,519,,999.0,,999.0,,
21231,230418,2024,20,,,,,,,,...,,,0.0,519,,999.0,,999.0,,
21232,231012,2024,20,,,,,,,,...,,,1.0,518,999.0,93.0,999.0,93.0,,
