# Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import sys
import os,re,json

from sqlalchemy import create_engine
from dotenv import load_dotenv

import warnings
warnings.filterwarnings('ignore')

import joblib
from sklearn.preprocessing import LabelEncoder

In [2]:
load_dotenv()
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://postgres:postgres@localhost:5432/f1_db")

engine = create_engine(DATABASE_URL.replace("postgresql+asyncpg://", "postgresql://"))

### Loading raw data

In [3]:
print("\nLoading data...")

query = """
SELECT 
    rr.id as result_id, rr.position, rr.grid_position, rr.points, rr.laps_completed, rr.fastest_lap_rank,
    rr.status, rr.is_sprint,
    d.id as driver_id, d.code as driver_code, d.driver_number, d.nationality as driver_nationality,
    EXTRACT(YEAR FROM AGE(r.race_date, d.date_of_birth)) as driver_age,
    d.championships as driver_championships,
    t.id as team_id, t.name as team_name,
    r.id as race_id, r.season, r.round_number, r.race_name, r.circuit_location, r.country,
    r.circuit_type, r.laps as total_laps, r.circuit_length, r.has_sprint, r.race_date,
    qr.position as quali_position, qr.q1_time, qr.q2_time, qr.q3_time
FROM race_results rr
JOIN drivers d ON rr.driver_id = d.id
JOIN teams t ON rr.team_id = t.id
JOIN races r ON rr.race_id = r.id
LEFT JOIN qualifying_results qr ON rr.race_id = qr.race_id AND rr.driver_id = qr.driver_id
WHERE rr.is_sprint = False
ORDER BY r.season, r.round_number, rr.position;
"""

df = pd.read_sql(query, engine)
print(f"‚úÖ Loaded {len(df)} records with qualifying data")
print(f"   Qualifying data coverage: {df['quali_position'].notna().sum()} / {len(df)}")


Loading data...
‚úÖ Loaded 1699 records with qualifying data
   Qualifying data coverage: 1699 / 1699


In [4]:
circuit_data_map = {
    # 2022-2025 Tracks (with all known aliases)
    "Sakhir": ("Permanent", 57, 5.412),
    "Jeddah": ("Street", 50, 6.174),
    "Melbourne": ("Street", 58, 5.278),
    "Suzuka": ("Permanent", 53, 5.807),
    "Shanghai": ("Permanent", 56, 5.451),
    "Miami Gardens": ("Street", 57, 5.412),   
    "Miami": ("Street", 57, 5.412),          
    "Imola": ("Permanent", 63, 4.909),
    "Monaco": ("Street", 78, 3.337),
    "Montr√©al": ("Street", 70, 4.361),
    "Barcelona": ("Permanent", 66, 4.657),
    "Spielberg": ("Permanent", 71, 4.318),
    "Silverstone": ("Permanent", 52, 5.891),
    "Budapest": ("Permanent", 70, 4.381),
    "Spa-Francorchamps": ("Permanent", 44, 7.004),
    "Zandvoort": ("Permanent", 72, 4.259),
    "Monza": ("Permanent", 53, 5.793),
    "Baku": ("Street", 51, 6.003),
    "Marina Bay": ("Street", 62, 4.940),
    "Austin": ("Permanent", 56, 5.513),
    "Mexico City": ("Permanent", 71, 4.304),
    "S√£o Paulo": ("Permanent", 71, 4.309),
    "Las Vegas": ("Street", 50, 6.201),
    "Lusail": ("Permanent", 57, 5.419),
    "Yas Island": ("Permanent", 58, 5.281),
    "Le Castellet": ("Permanent", 53, 5.842) # For 2022 French GP
}

key_column = 'circuit_location' 

mapped_data = df[key_column].map(circuit_data_map)

df['circuit_type'] = df['circuit_type'].fillna(mapped_data.str[0])
df['total_laps'] = df['total_laps'].fillna(mapped_data.str[1])
df['circuit_length'] = df['circuit_length'].fillna(mapped_data.str[2])

print("Fill complete. Checking for remaining missing values:")
print(f"circuit_type: {df['circuit_type'].isna().sum()} missing")
print(f"total_laps: {df['total_laps'].isna().sum()} missing")
print(f"circuit_length: {df['circuit_length'].isna().sum()} missing")

# Verification
all_db_locations = set(df['circuit_location'].unique())
all_map_keys = set(circuit_data_map.keys())
mismatched_keys = all_db_locations - all_map_keys

if len(mismatched_keys) > 0:
    print(f"\n‚ö†Ô∏è WARNING: Mismatched keys found!")
    print(f"   These locations are in your database but NOT in the map:")
    print(f"   {mismatched_keys}")
else:
    print("\n‚úÖ Success! All circuit locations were matched and filled.")

Fill complete. Checking for remaining missing values:
circuit_type: 0 missing
total_laps: 0 missing
circuit_length: 0 missing

‚úÖ Success! All circuit locations were matched and filled.


In [5]:
df.head()

Unnamed: 0,result_id,position,grid_position,points,laps_completed,fastest_lap_rank,status,is_sprint,driver_id,driver_code,...,country,circuit_type,total_laps,circuit_length,has_sprint,race_date,quali_position,q1_time,q2_time,q3_time
0,1037,1.0,1.0,26.0,57,1.0,Finished,False,4,LEC,...,Bahrain,Permanent,57,5.412,0,2022-03-20,1,0 days 00:01:31.471000,0 days 00:01:30.932000,0 days 00:01:30.558000
1,1038,2.0,3.0,18.0,57,3.0,Finished,False,3,SAI,...,Bahrain,Permanent,57,5.412,0,2022-03-20,3,0 days 00:01:31.567000,0 days 00:01:30.787000,0 days 00:01:30.687000
2,1039,3.0,5.0,15.0,57,5.0,Finished,False,7,HAM,...,Bahrain,Permanent,57,5.412,0,2022-03-20,5,0 days 00:01:32.285000,0 days 00:01:31.048000,0 days 00:01:31.238000
3,1040,4.0,9.0,12.0,57,6.0,Finished,False,5,RUS,...,Bahrain,Permanent,57,5.412,0,2022-03-20,9,0 days 00:01:32.269000,0 days 00:01:31.252000,0 days 00:01:32.216000
4,1041,5.0,7.0,10.0,57,8.0,Finished,False,12,MAG,...,Bahrain,Permanent,57,5.412,0,2022-03-20,7,0 days 00:01:31.955000,0 days 00:01:31.461000,0 days 00:01:31.808000


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1699 entries, 0 to 1698
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   result_id             1699 non-null   int64  
 1   position              1697 non-null   float64
 2   grid_position         1659 non-null   float64
 3   points                1699 non-null   float64
 4   laps_completed        1699 non-null   int64  
 5   fastest_lap_rank      1654 non-null   float64
 6   status                1699 non-null   object 
 7   is_sprint             1699 non-null   bool   
 8   driver_id             1699 non-null   int64  
 9   driver_code           1699 non-null   object 
 10  driver_number         1699 non-null   int64  
 11  driver_nationality    0 non-null      object 
 12  driver_age            0 non-null      object 
 13  driver_championships  1699 non-null   int64  
 14  team_id               1699 non-null   int64  
 15  team_name            

## Temporal Weighting : 
- Give recent races higher weight using exponential decay 
- Makes model focus on recent driver/team performance
<pre>
1) First calculate days since last race
2) Exponential delay function to get temporal weights:
        Half life of 180 days
        i.e. A race of 0 days ago will have weight of 1
             A race of 180 days ago will have weight of 0.5  
3) Calculate season weights as recent season performances are more important
        Recent 2025 season has weight 1
        2024 season has weight 0.7
        Rest all (if any) have weight 0.5
4) Multiply temporal and season weights to get final sample weights                     

In [7]:
df['race_date'] = pd.to_datetime(df['race_date'])
most_recent_race = df['race_date'].max()
df['days_since_race'] = (most_recent_race - df['race_date']).dt.days

# 1. Exponential decay based on time (Half-life of 180 days)
half_life = 180
df['temporal_weight'] = np.exp(-np.log(2) * df['days_since_race'] / half_life)

# 2. UPDATED Season-specific weights
# We have data from 2022, 2023, 2024, and 2025.
season_weight_map = {
    2025: 1.0,  # Most important (Test set)
    2024: 0.7,  # Very important (Recent)
    2023: 0.5,  # Relevant
    2022: 0.3   # Less relevant (different car regulations)
}
df['season_weight'] = df['season'].apply(lambda x: season_weight_map.get(x, 0.2))
print("Season weights applied:")
print(df['season_weight'].value_counts())

# 3. Final sample weight
df['sample_weight'] = df['temporal_weight'] * df['season_weight']

print(df[['season','race_date','temporal_weight','season_weight','sample_weight']].sample(10))

Season weights applied:
season_weight
0.7    460
0.5    435
0.3    419
1.0    385
Name: count, dtype: int64
      season  race_date  temporal_weight  season_weight  sample_weight
72      2022 2022-04-24         0.007206            0.3       0.002162
222     2022 2022-07-24         0.010230            0.3       0.003069
1244    2024 2024-11-03         0.252905            0.7       0.177033
551     2023 2023-06-04         0.034408            0.5       0.017204
389     2022 2022-11-13         0.015746            0.3       0.004724
104     2022 2022-05-22         0.008026            0.3       0.002408
985     2024 2024-05-19         0.132433            0.7       0.092703
44      2022 2022-04-10         0.006827            0.3       0.002048
312     2022 2022-10-02         0.013394            0.3       0.004018
982     2024 2024-05-19         0.132433            0.7       0.092703


### Qualifying feature Engineering:
- Incorporates qualifying performance (Q1‚ÄìQ3), grid penalties, and relative speed to pole.

<pre>
1) Clean data : If any driver is found having a mmissing quali position (happens if they crashed in q1/q2 or didnt set a time),
                it is filled with their grid position.
2) Grid penalty is calculated 
3) Normalize qualifying position.
        Pole Position (P1) becomes 1/20 = 0.05 (the "best" score).
        Last Place (P20) becomes 20/20 = 1.0 (the "worst" score).   
4) Conver lap time strings to numerical format for calculation
5) Calculate gap_to_pole to measure pure,relative pace against the fastest car                     

In [8]:
df['quali_position'].fillna(df['grid_position'], inplace=True)
df['grid_penalty'] = df['grid_position'] - df['quali_position']
df['quali_position_normalized'] = df['quali_position'] / 20 # Assuming ~20 cars

laptime_pattern = re.compile(r'(\d{2}):(\d{2}):(\d{2}\.\d+)')

def parse_laptime_from_string(time_str):
    if pd.isna(time_str):
        return None
    try:
        match = laptime_pattern.search(str(time_str))
        if match:
            hours = int(match.group(1))
            minutes = int(match.group(2))
            seconds = float(match.group(3))
            return (hours * 3600) + (minutes * 60) + seconds
        else:
            return None
    except:
        return None

df['q3_seconds'] = df['q3_time'].apply(parse_laptime_from_string)
df['q3_gap_to_pole'] = df.groupby('race_id')['q3_seconds'].transform(
    lambda x: x - x.min() if x.notna().sum() > 0 else np.nan
)
print("‚úÖ Qualifying features created.")

‚úÖ Qualifying features created.


In [9]:
print(df[['quali_position','grid_position','grid_penalty','q3_time','q3_seconds','q3_gap_to_pole']].head())


   quali_position  grid_position  grid_penalty                 q3_time  \
0               1            1.0           0.0  0 days 00:01:30.558000   
1               3            3.0           0.0  0 days 00:01:30.687000   
2               5            5.0           0.0  0 days 00:01:31.238000   
3               9            9.0           0.0  0 days 00:01:32.216000   
4               7            7.0           0.0  0 days 00:01:31.808000   

   q3_seconds  q3_gap_to_pole  
0      90.558           0.000  
1      90.687           0.129  
2      91.238           0.680  
3      92.216           1.658  
4      91.808           1.250  


### Circuit Features
<pre>
1) Checks if street circuit or not
    Why : Street circuits (like Monaco, Baku, or Singapore) are fundamentally different from permanent tracks (like Silverstone or Suzuka).
          They are often bumpier, have lower grip, and punish mistakes with walls instead of gravel.
          This leads to more safety cars, different setup priorities, and can favor certain "specialist" drivers.

2) Fills any null circuit length values with median of length

3) Checks if circuit is >5km.
    Why : Instead of passing individual length for each track just long/nnt long helps model learn simpler rules

In [10]:
df['is_street_circuit'] = df['circuit_type'].fillna('').str.contains('Street', case=False).astype(int)
df['circuit_length'].fillna(df['circuit_length'].median(), inplace=True)
df['is_long_circuit'] = (df['circuit_length'] > 5.0).astype(int)
df['has_sprint'] = df['has_sprint'].fillna(0).astype(int)
print("‚úÖ Circuit features created.")

‚úÖ Circuit features created.


In [11]:
print(df[['circuit_location','is_street_circuit','is_long_circuit','has_sprint']].sample(5))


     circuit_location  is_street_circuit  is_long_circuit  has_sprint
514             Miami                  1                1           0
1214           Austin                  0                1           0
1513        Spielberg                  0                0           0
439            Jeddah                  1                1           0
488              Baku                  1                1           0


In [12]:
print(f"Original missing positions: {df['position'].isnull().sum()}")
# Fill NaN in 'position' with 20 (last place)
df['position'].fillna(20, inplace=True)
print(f"Missing positions after fill: {df['position'].isnull().sum()}")
print("‚úÖ Target variable 'position' imputed successfully.")

Original missing positions: 2
Missing positions after fill: 0
‚úÖ Target variable 'position' imputed successfully.


### Advanced Historical Features

1) Drivers:
- Two helper columns(podium and dnf) to measure success and reliability
- Weighted average finishing position over the last 5 races
- Weighted average total points over the last 5 races
- Weighted average podiums(eg : 0.6 = 60% podium rate) over last 5 races
- Weighted DNF rate over last 5 matches
- Recent form (A "hotter," more sensitive version of average position, looking at only the last 3 races.)
- Unweighted avg of quali position ober last 5 races

2) Teams: (Gives "car performance feature")
- Weighted avg finishing position
- Weighted avg total points

In [13]:
def calculate_weighted_rolling_stats(group, window=5):
    group = group.sort_values(['season', 'round_number'])
    weights = group['sample_weight']

    # --- ROBUST DNF FIX ---
    dnf_statuses = [
        'Accident', 'Collision', 'Engine', 'Gearbox', 'Retired', 'Disqualified',
        'Chassis', 'Brakes', 'Clutch', 'Hydraulics', 'Electrical', 'Suspension',
        'Puncture', 'Wheel', 'Overheating', 'Fuel system', 'Water pressure', 'Oil leak'
    ]
    group['dnf'] = group['status'].isin(dnf_statuses).astype(int)
    # --- END FIX ---
    
    group['podium'] = (group['position'] <= 3).astype(int)

    def weighted_avg(x):
        return np.average(x, weights=weights[x.index])
    
    group['weighted_avg_position_5'] = group['position'].rolling(
        window=window, min_periods=1
    ).apply(weighted_avg, raw=False).shift(1)
    
    group['weighted_points_5'] = group['points'].rolling(
        window=window, min_periods=1
    ).apply(weighted_avg, raw=False).shift(1)
    
    group['weighted_podium_rate_5'] = group['podium'].rolling(
        window=window, min_periods=1
    ).apply(weighted_avg, raw=False).shift(1)
    
    group['weighted_dnf_rate_5'] = group['dnf'].rolling(
        window=window, min_periods=1
    ).apply(weighted_avg, raw=False).shift(1)
    
    # --- recent_form_3 BUG FIX ---
    group['recent_form_3'] = group['position'].rolling(
        window=3, min_periods=1  # Corrected to 3
    ).apply(weighted_avg, raw=False).shift(1)
    
    if 'quali_position' in group.columns:
        group['avg_quali_position_5'] = group['quali_position'].rolling(
            window=window, min_periods=1
        ).mean().shift(1)

    new_cols = [
        'weighted_avg_position_5', 'weighted_points_5', 'weighted_podium_rate_5',
        'weighted_dnf_rate_5', 'recent_form_3', 'avg_quali_position_5'
    ]
    final_cols = [col for col in new_cols if col in group.columns]
    return group[final_cols]


def calculate_team_weighted_stats(group, window=5):
    group = group.sort_values(['season', 'round_number'])
    weights = group['sample_weight']

    def weighted_avg(x):
        return np.average(x, weights=weights[x.index])
    
    group['team_weighted_avg_position_5'] = group['position'].rolling(
        window=window, min_periods=1
    ).apply(weighted_avg, raw=False).shift(1)
    
    group['team_weighted_points_5'] = group['points'].rolling(
        window=window, min_periods=1
    ).apply(weighted_avg, raw=False).shift(1)
    
    return group[['team_weighted_avg_position_5', 'team_weighted_points_5']]

# --- Use "separate and join" pattern to prevent bugs ---
print("Calculating driver weighted rolling stats...")
driver_stats = df.groupby('driver_id', group_keys=False).apply(calculate_weighted_rolling_stats)

print("Calculating team weighted rolling stats...")
team_stats = df.groupby('team_id', group_keys=False).apply(calculate_team_weighted_stats)

print("Merging stats back to main DataFrame...")
df = df.join(driver_stats)
df = df.join(team_stats)

print("\nAll leak-proof rolling features calculated and merged successfully.")

Calculating driver weighted rolling stats...
Calculating team weighted rolling stats...
Merging stats back to main DataFrame...

All leak-proof rolling features calculated and merged successfully.


### Head to head & teammate comparision
<pre>
It pairs each driver with their teammate in the same race,
computes who beat whom and qualifying gap, then calculates a rolling teammate battle win rate (last 5 races) ‚Äî
shifted by one race to avoid data leakage ‚Äî and merges these features back into the main DataFrame.

In [14]:
teammate_df = df[['race_id','team_id','driver_id','position','quali_position','points']].copy()
teammate_df.columns = ['race_id','team_id','teammate_id','teammate_position','teammate_quali','teammate_points']

teammate_battles_df = df[['race_id', 'team_id', 'driver_id', 'position', 'quali_position', 'season', 'round_number']]

teammate_battles_df = teammate_battles_df.merge(
    teammate_df,
    on=['race_id', 'team_id'],
    how='left'
)

teammate_battles_df = teammate_battles_df[
    teammate_battles_df['driver_id'] != teammate_battles_df['teammate_id']
]

teammate_battles_df['beat_teammate'] = (teammate_battles_df['position'] < teammate_battles_df['teammate_position']).astype(float)
teammate_battles_df['quali_gap_to_teammate'] = teammate_battles_df['quali_position'] - teammate_battles_df['teammate_quali']

teammate_battles_df = teammate_battles_df.sort_values(['driver_id', 'season', 'round_number'])
teammate_battles_df['teammate_battle_rate_5'] = teammate_battles_df.groupby('driver_id')['beat_teammate'].transform(
    lambda x: x.rolling(window=5, min_periods=1).mean().shift(1)
)

features_to_join = [
    'race_id', 
    'driver_id', 
    'teammate_id', 
    'beat_teammate', 
    'quali_gap_to_teammate', 
    'teammate_battle_rate_5'
]

df = df.merge(
    teammate_battles_df[features_to_join],
    on=['race_id', 'driver_id'],
    how='left'
)

print(f"‚úÖ Robust teammate features created and merged.")
print(f"   Teammate data available: {df['teammate_id'].notna().sum()} records")

‚úÖ Robust teammate features created and merged.
   Teammate data available: 1638 records


### Circuit Specific Performance


In [15]:
le_circuit = LabelEncoder()
df['circuit_encoded'] = le_circuit.fit_transform(df['circuit_location'])
df = df.sort_values(['season','round_number'])

def weighted_expanding_average(group_data, val_col, w_col):
    # --- BUG FIX: Use variables, not strings ---
    vals = group_data[val_col]
    weights = group_data[w_col]
    # --- END FIX ---

    expanding_sum = (vals*weights).expanding().sum()
    expanding_w_sum = weights.expanding().sum()

    return(expanding_sum/expanding_w_sum).shift(1)

print("   Calculating leak-proof career fallback stats...")
df['career_weighted_pos'] = df.groupby('driver_id').apply(
    lambda x: weighted_expanding_average(x, 'position', 'sample_weight')
).reset_index(level=0, drop=True)

df['career_weighted_points'] = df.groupby('driver_id').apply(
    lambda x: weighted_expanding_average(x, 'points', 'sample_weight')
).reset_index(level=0, drop=True)

print("   Calculating circuit-specific stats...")
grouped = df.groupby(['circuit_encoded', 'driver_id'])

df['driver_circuit_weighted_pos'] = grouped.apply(
    lambda x: weighted_expanding_average(x, 'position', 'sample_weight')
).reset_index(level=[0,1], drop=True)

df['driver_circuit_weighted_points'] = grouped.apply(
    lambda x: weighted_expanding_average(x, 'points', 'sample_weight')
).reset_index(level=[0,1], drop=True)

print("   Filling missing circuit-specific stats...")
df['driver_circuit_weighted_pos'].fillna(df['career_weighted_pos'], inplace=True)
df['driver_circuit_weighted_points'].fillna(df['career_weighted_points'], inplace=True)

# Fallback for a driver's first-ever race
df['driver_circuit_weighted_pos'].fillna(df['position'].expanding().mean().shift(1), inplace=True)
df['driver_circuit_weighted_points'].fillna(0, inplace=True)

df = df.drop(columns=['career_weighted_pos', 'career_weighted_points'])

print(f"‚úÖ Leak-proof circuit-specific features created")

   Calculating leak-proof career fallback stats...
   Calculating circuit-specific stats...
   Filling missing circuit-specific stats...
‚úÖ Leak-proof circuit-specific features created


### Championship position features

In [16]:
df['championship_points_before_race'] = df.groupby(['season', 'driver_id'])['points'].transform(
    lambda x: x.cumsum().shift(1)
).fillna(0)

print("   Calculating championship rank...")
df['championship_position_before_race'] = df.groupby(['season', 'round_number'])['championship_points_before_race'].rank(
    method='min', ascending=False
)

print("   Calculating gap to leader...")
df['points_to_leader'] = df.groupby(['season', 'round_number'])['championship_points_before_race'].transform('max') - df['championship_points_before_race']

print(f"‚úÖ Championship features created")

   Calculating championship rank...
   Calculating gap to leader...
‚úÖ Championship features created


<hr>
<hr>

### Encoding

In [17]:
print("Label Encoding high cardinality categorical features(circuit(done before), nationality)...")

le_nationality = LabelEncoder()
df['nationality_encoded'] = le_nationality.fit_transform(df['driver_nationality'].fillna('Unknown'))

print("One hot encoding low cardinality categorical features(driver,team)...")
df = pd.get_dummies(
    df,
    columns = ['driver_code','team_name'],
    prefix = ['driver','team'],
    dtype = int
)

print("   Saving encoders for pipeline...")
os.makedirs('ml/models/encoders_new', exist_ok=True)
joblib.dump(le_nationality, 'ml/models/encoders_new/nationality_encoder.pkl')
joblib.dump(le_circuit, 'ml/models/encoders_new/circuit_encoder.pkl')

print("‚úÖ Encoding complete. New dummy columns created.")

Label Encoding high cardinality categorical features(circuit(done before), nationality)...
One hot encoding low cardinality categorical features(driver,team)...
   Saving encoders for pipeline...
‚úÖ Encoding complete. New dummy columns created.


### Creating target variables

In [18]:
df['win'] = (df['position'] == 1).astype(int)
df['podium'] = (df['position'] <= 3).astype(int)
df['top5'] = (df['position'] <= 5).astype(int)
df['points_finish'] = (df['position'] <= 10).astype(int)
print("‚úÖ Target variables created.")

‚úÖ Target variables created.


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1699 entries, 0 to 1698
Columns: 104 entries, result_id to points_finish
dtypes: bool(1), datetime64[ns](1), float64(29), int64(63), object(10)
memory usage: 1.3+ MB


### Final feature selection 

In [20]:
print("Filling NaN values with intelligent defaults...")

# --- 1. Special Imputation Rules ---
# These NaNs have a specific meaning.

# 'q3_gap_to_pole': NaN means "didn't reach Q3". Fill with a large penalty.
df['q3_gap_to_pole'].fillna(99, inplace = True)
# 'teammate_battle_rate_5': NaN (from .shift(1)) means no prior history. Fill with a neutral 0.5 (50/50).
df['teammate_battle_rate_5'].fillna(0.5,inplace = True)
# 'quali_gap_to_teammate': NaN means no teammate. Fill with a neutral 0.0 (no gap).
df['quali_gap_to_teammate'].fillna(0.0,inplace = True)
# 'avg_quali_position_5': NaN (from .shift(1)) means no prior history. Fill with a neutral "mid-pack" 10.0.
df['avg_quali_position_5'].fillna(10.0,inplace=True)

# Define the columns we just handled
special_cols = ['q3_gap_to_pole', 'teammate_battle_rate_5', 'quali_gap_to_teammate', 'avg_quali_position_5']

# --- 2. General Median Imputation (The Catch-All) ---
# These are all other numeric features. NaNs here are from .shift(1) or missing DB data.
# We will fill them with the median *of the training set* to prevent data leakage.

# Define the *complete* list of base features first
base_features = [
    'grid_position',
    'quali_position',
    'driver_championships',
    'circuit_encoded',    
    'round_number',
    
    'grid_penalty',
    'q3_gap_to_pole',
    'avg_quali_position_5',
    
    'is_street_circuit',
    'is_long_circuit',
    'has_sprint',
    
    'weighted_avg_position_5',
    'weighted_points_5',
    'weighted_podium_rate_5',
    'weighted_dnf_rate_5',
    'recent_form_3',
    
    'team_weighted_avg_position_5', 
    'team_weighted_points_5',
    
    'quali_gap_to_teammate',
    'teammate_battle_rate_5',
    
    'driver_circuit_weighted_pos',
    'driver_circuit_weighted_points',
    
    'championship_position_before_race',
    'points_to_leader',
]

# Get all numeric columns from base_features *except* the ones we just handled
numeric_cols_to_fill = [
    col for col in base_features 
    if col not in special_cols and df[col].dtype in ['float64', 'int64']
]

print(f"   Imputing {len(numeric_cols_to_fill)} other numeric columns with their training median...")

for col in numeric_cols_to_fill:
    # Calculate median *only* from the training data (seasons < 2025)
    median_val = df[df['season'] < 2025][col].median()
    
    if pd.isna(median_val):
        # Fallback if the whole column is NaN (unlikely, but safe)
        median_val = 0
        
    # Fill NaNs in the *entire* DataFrame (train and test) with this value
    df[col].fillna(median_val, inplace=True)

# --- 3. Dynamically build the feature list ---

# Re-filter base_features to only include ones that actually exist in the df
base_features = [col for col in base_features if col in df.columns]

ohe_driver_cols = [col for col in df.columns if col.startswith('driver_')]
ohe_team_cols = [col for col in df.columns if col.startswith('team_')]

feature_columns = base_features + ohe_driver_cols + ohe_team_cols 
feature_columns = [col for col in feature_columns if col not in ['driver_age', 'driver_nationality']]
target_columns = ['win', 'podium', 'points_finish', 'top5', 'position']

# --- 4. Final Check ---
final_nan_count = df[feature_columns].isnull().sum().sum()
if final_nan_count > 0:
    print(f"\n‚ö†Ô∏è WARNING: {final_nan_count} NaNs still found in feature columns.")
    # Also print *which* columns still have NaNs
    print("Columns with remaining NaNs:")
    print(df[feature_columns].isnull().sum()[df[feature_columns].isnull().sum() > 0])
else:
    print("\n‚úÖ All NaNs successfully imputed.")

print(f"\n‚úÖ Feature set prepared:")
print(f"   Total Features: {len(feature_columns)}")
print(f"   Base Features: {len(base_features)}")
print(f"   One-Hot Driver Features: {len(ohe_driver_cols)}")
print(f"   One-Hot Team Features: {len(ohe_team_cols)}")

Filling NaN values with intelligent defaults...
   Imputing 20 other numeric columns with their training median...

‚úÖ All NaNs successfully imputed.

‚úÖ Feature set prepared:
   Total Features: 74
   Base Features: 24
   One-Hot Driver Features: 36
   One-Hot Team Features: 16


### Train/Test Split (With Weights)

In [21]:
train_df = df[df['season'] < 2025].copy()
test_df = df[df['season'] == 2025].copy()

train_weights = train_df['sample_weight'].values
test_weights = test_df['sample_weight'].values

print(f"‚úÖ Time-based split with temporal weighting:")
print(f"   Training set (Seasons 2022-2024): {len(train_df)} samples")
print(f"   - Avg weight: {train_weights.mean():.3f}")
print(f"   - Weight range: {train_weights.min():.3f} - {train_weights.max():.3f}")
print(f"   Test set (2025): {len(test_df)} samples")
print(f"   - Avg weight: {test_weights.mean():.3f}")

‚úÖ Time-based split with temporal weighting:
   Training set (Seasons 2022-2024): 1314 samples
   - Avg weight: 0.051
   - Weight range: 0.002 - 0.203
   Test set (2025): 385 samples
   - Avg weight: 0.655


### Saving data

In [22]:
output_dir = "ml/data/processed/"
os.makedirs(output_dir, exist_ok=True)

# We'll save these as v3 to show they are the new multi-season files
data_path = os.path.join(output_dir, 'full_data_v3.parquet')
train_path = os.path.join(output_dir, 'train_data_v3.parquet')
test_path = os.path.join(output_dir, 'test_data_v3.parquet')
train_weights_path = os.path.join(output_dir, 'train_weights_v3.npy')
test_weights_path = os.path.join(output_dir, 'test_weights_v3.npy')
metadata_path = os.path.join(output_dir, 'metadata_v3.json')
features_path = "../models/encoders_new/feature_columns_v3.joblib"

print(f"   Saving DataFrames to {output_dir}...")
df.to_parquet(data_path, index=False, engine='fastparquet')
train_df.to_parquet(train_path, index=False, engine='fastparquet')
test_df.to_parquet(test_path, index=False, engine='fastparquet')

print(f"   Saving weights to {output_dir}...")
np.save(train_weights_path, train_weights)
np.save(test_weights_path, test_weights)

print(f"   Saving metadata to {output_dir}...")
metadata = {
    'feature_columns': feature_columns,
    'target_columns': target_columns,
    'train_size': len(train_df),
    'test_size': len(test_df),
    'train_seasons': [2022, 2023, 2024],
    'test_season': 2025,
    'temporal_weighting': {
        '2022_weight': 0.3,
        '2023_weight': 0.5,
        '2024_weight': 0.7,
        '2025_weight': 1.0,
        'half_life_days': 180
    }
}
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2)

# Save the feature list for the pipeline
joblib.dump(feature_columns, features_path)
print(f"   Feature list saved to {features_path}")

print("\n‚úÖ All data saved (V3)!")
print("\nüí° NEXT STEP: Go to your modeling notebooks (LGBM, XGBoost) and update them to load `_v3` files.")

   Saving DataFrames to ml/data/processed/...
   Saving weights to ml/data/processed/...
   Saving metadata to ml/data/processed/...


FileNotFoundError: [Errno 2] No such file or directory: '../models/encoders_new/feature_columns_v3.joblib'