# Data Cleaning

In [2]:
import pandas as pd
import numpy as np
import sys
import os,re

from sqlalchemy import create_engine
from dotenv import load_dotenv

import warnings
warnings.filterwarnings('ignore')

import joblib
from sklearn.preprocessing import LabelEncoder

In [3]:
load_dotenv()
DATABASE_URL = os.getenv("DATABASE_URL", "postgresql://postgres:postgres@localhost:5432/f1_db")

engine = create_engine(DATABASE_URL.replace("postgresql+asyncpg://", "postgresql://"))

### Loading raw data

In [4]:
print("\nLoading data...")

query = """
SELECT 
    rr.id as result_id, rr.position, rr.grid_position, rr.points, rr.laps_completed, rr.fastest_lap_rank,
    rr.status, rr.is_sprint,
    d.id as driver_id, d.code as driver_code, d.driver_number, d.nationality as driver_nationality,
    EXTRACT(YEAR FROM AGE(r.race_date, d.date_of_birth)) as driver_age,
    d.championships as driver_championships,
    t.id as team_id, t.name as team_name,
    r.id as race_id, r.season, r.round_number, r.race_name, r.circuit_location, r.country,
    r.circuit_type, r.laps as total_laps, r.circuit_length, r.has_sprint, r.race_date,
    qr.position as quali_position, qr.q1_time, qr.q2_time, qr.q3_time
FROM race_results rr
JOIN drivers d ON rr.driver_id = d.id
JOIN teams t ON rr.team_id = t.id
JOIN races r ON rr.race_id = r.id
LEFT JOIN qualifying_results qr ON rr.race_id = qr.race_id AND rr.driver_id = qr.driver_id
WHERE rr.is_sprint = False
ORDER BY r.season, r.round_number, rr.position;
"""

df = pd.read_sql(query, engine)
print(f"✅ Loaded {len(df)} records with qualifying data")
print(f"   Qualifying data coverage: {df['quali_position'].notna().sum()} / {len(df)}")



Loading data...
✅ Loaded 845 records with qualifying data
   Qualifying data coverage: 845 / 845


In [5]:
## Adding circuit features

circuit_data_map = {
    "Sakhir": ("Permanent", 57, 5.412),
    "Jeddah": ("Street", 50, 6.174),
    "Melbourne": ("Street", 58, 5.278),
    "Suzuka": ("Permanent", 53, 5.807),
    "Shanghai": ("Permanent", 56, 5.451),
    "Miami Gardens": ("Street", 57, 5.412),   
    "Miami": ("Street", 57, 5.412),           
    "Imola": ("Permanent", 63, 4.909),
    "Monaco": ("Street", 78, 3.337),
    "Montréal": ("Street", 70, 4.361),
    "Barcelona": ("Permanent", 66, 4.657),
    "Spielberg": ("Permanent", 71, 4.318),
    "Silverstone": ("Permanent", 52, 5.891),
    "Budapest": ("Permanent", 70, 4.381),
    "Spa-Francorchamps": ("Permanent", 44, 7.004),
    "Zandvoort": ("Permanent", 72, 4.259),
    "Monza": ("Permanent", 53, 5.793),
    "Baku": ("Street", 51, 6.003),
    "Marina Bay": ("Street", 62, 4.940),
    "Austin": ("Permanent", 56, 5.513),
    "Mexico City": ("Permanent", 71, 4.304),
    "São Paulo": ("Permanent", 71, 4.309),
    "Las Vegas": ("Street", 50, 6.201),
    "Lusail": ("Permanent", 57, 5.419),
    "Yas Island": ("Permanent", 58, 5.281)
}

key_column = 'circuit_location' 

mapped_data = df[key_column].map(circuit_data_map)

df['circuit_type'] = df['circuit_type'].fillna(mapped_data.str[0])
df['total_laps'] = df['total_laps'].fillna(mapped_data.str[1])
df['circuit_length'] = df['circuit_length'].fillna(mapped_data.str[2])

print("Fill complete. Checking for remaining missing values:")
print(f"circuit_type: {df['circuit_type'].isna().sum()} missing")
print(f"total_laps: {df['total_laps'].isna().sum()} missing")
print(f"circuit_length: {df['circuit_length'].isna().sum()} missing")

all_db_locations = set(df['circuit_location'].unique())
all_map_keys = set(circuit_data_map.keys())
mismatched_keys = all_db_locations - all_map_keys

if len(mismatched_keys) > 0:
    print(f"\n⚠️ WARNING: Mismatched keys found!")
    print(f"   These locations are in your database but NOT in the map:")
    print(f"   {mismatched_keys}")
    print(f"   Please add/correct them in 'circuit_data_map' and re-run.")
else:
    print("\n✅ Success! All circuit locations were matched and filled.")

Fill complete. Checking for remaining missing values:
circuit_type: 0 missing
total_laps: 0 missing
circuit_length: 0 missing

✅ Success! All circuit locations were matched and filled.


In [6]:
df.head()

Unnamed: 0,result_id,position,grid_position,points,laps_completed,fastest_lap_rank,status,is_sprint,driver_id,driver_code,...,country,circuit_type,total_laps,circuit_length,has_sprint,race_date,quali_position,q1_time,q2_time,q3_time
0,1,1,1,26.0,57,1.0,Finished,False,1,VER,...,Bahrain,Permanent,57,5.412,0,2024-03-02,1,0 days 00:01:30.031000,0 days 00:01:29.374000,0 days 00:01:29.179000
1,2,2,5,18.0,57,4.0,Finished,False,2,PER,...,Bahrain,Permanent,57,5.412,0,2024-03-02,5,0 days 00:01:30.221000,0 days 00:01:29.932000,0 days 00:01:29.537000
2,3,3,4,15.0,57,6.0,Finished,False,3,SAI,...,Bahrain,Permanent,57,5.412,0,2024-03-02,4,0 days 00:01:29.909000,0 days 00:01:29.573000,0 days 00:01:29.507000
3,4,4,2,12.0,57,2.0,Finished,False,4,LEC,...,Bahrain,Permanent,57,5.412,0,2024-03-02,2,0 days 00:01:30.243000,0 days 00:01:29.165000,0 days 00:01:29.407000
4,5,5,3,10.0,57,12.0,Finished,False,5,RUS,...,Bahrain,Permanent,57,5.412,0,2024-03-02,3,0 days 00:01:30.350000,0 days 00:01:29.922000,0 days 00:01:29.485000


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 845 entries, 0 to 844
Data columns (total 31 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   result_id             845 non-null    int64  
 1   position              845 non-null    int64  
 2   grid_position         845 non-null    int64  
 3   points                845 non-null    float64
 4   laps_completed        845 non-null    int64  
 5   fastest_lap_rank      821 non-null    float64
 6   status                845 non-null    object 
 7   is_sprint             845 non-null    bool   
 8   driver_id             845 non-null    int64  
 9   driver_code           845 non-null    object 
 10  driver_number         845 non-null    int64  
 11  driver_nationality    845 non-null    object 
 12  driver_age            845 non-null    float64
 13  driver_championships  845 non-null    int64  
 14  team_id               845 non-null    int64  
 15  team_name             8

## Temporal Weighting : 
- Give recent races higher weight using exponential decay 
- Makes model focus on recent driver/team performance
<pre>
1) First calculate days since last race
2) Exponential delay function to get temporal weights:
        Half life of 180 days
        i.e. A race of 0 days ago will have weight of 1
             A race of 180 days ago will have weight of 0.5  
3) Calculate season weights as recent season performances are more important
        Recent 2025 season has weight 1
        2024 season has weight 0.7
        Rest all (if any) have weight 0.5
4) Multiply temporal and season weights to get final sample weights                     

In [8]:
df['race_date'] = pd.to_datetime(df['race_date'])
most_recent_race = df['race_date'].max()
df['days_since_race'] = (most_recent_race - df['race_date']).dt.days

half_life = 180
df['temporal_weight'] = np.exp(-np.log(2) * df['days_since_race'] / half_life)
df['season_weight'] = df['season'].apply(lambda x: {2024:0.7 , 2025:1}.get(x,0.5))
df['sample_weight'] = df['temporal_weight'] * df['season_weight']

In [9]:
print(df[['season','temporal_weight','season_weight','sample_weight']].head())

   season  temporal_weight  season_weight  sample_weight
0    2024         0.098073            0.7       0.068651
1    2024         0.098073            0.7       0.068651
2    2024         0.098073            0.7       0.068651
3    2024         0.098073            0.7       0.068651
4    2024         0.098073            0.7       0.068651


### Qualifying feature Engineering:
- Incorporates qualifying performance (Q1–Q3), grid penalties, and relative speed to pole.

<pre>
1) Clean data : If any driver is found having a mmissing quali position (happens if they crashed in q1/q2 or didnt set a time),
                it is filled with their grid position.
2) Grid penalty is calculated 
3) Normalize qualifying position.
        Pole Position (P1) becomes 1/20 = 0.05 (the "best" score).
        Last Place (P20) becomes 20/20 = 1.0 (the "worst" score).   
4) Conver lap time strings to numerical format for calculation
5) Calculate gap_to_pole to measure pure,relative pace against the fastest car                     

In [10]:
df['quali_position'].fillna(df['grid_position'],inplace=True)
df['grid_penalty'] = df['grid_position'] - df['quali_position']
df['quali_position_normalized'] = df['quali_position']/20

laptime_pattern = re.compile(r'(\d{2}):(\d{2}):(\d{2}\.\d+)')

def parse_laptime_from_string(time_str):
    if pd.isna(time_str):
        return None
    
    try:
        match = laptime_pattern.search(str(time_str))
        
        if match:
            hours = int(match.group(1))   
            minutes = int(match.group(2)) 
            seconds = float(match.group(3)) 
            
            return (hours * 3600) + (minutes * 60) + seconds
        else:
            return None
    except:
        return None

df['q3_seconds'] = df['q3_time'].apply(parse_laptime_from_string)
df['q3_gap_to_pole'] = df.groupby('race_id')['q3_seconds'].transform(
    lambda x: x - x.min() if x.notna().sum() > 0 else np.nan
)

In [11]:
print(df[['quali_position','grid_position','grid_penalty','q3_time','q3_seconds','q3_gap_to_pole']].head())


   quali_position  grid_position  grid_penalty                 q3_time  \
0               1              1             0  0 days 00:01:29.179000   
1               5              5             0  0 days 00:01:29.537000   
2               4              4             0  0 days 00:01:29.507000   
3               2              2             0  0 days 00:01:29.407000   
4               3              3             0  0 days 00:01:29.485000   

   q3_seconds  q3_gap_to_pole  
0      89.179           0.000  
1      89.537           0.358  
2      89.507           0.328  
3      89.407           0.228  
4      89.485           0.306  


### Circuit Features
<pre>
1) Checks if street circuit or not
    Why : Street circuits (like Monaco, Baku, or Singapore) are fundamentally different from permanent tracks (like Silverstone or Suzuka).
          They are often bumpier, have lower grip, and punish mistakes with walls instead of gravel.
          This leads to more safety cars, different setup priorities, and can favor certain "specialist" drivers.

2) Fills any null circuit length values with median of length

3) Checks if circuit is >5km.
    Why : Instead of passing individual length for each track just long/nnt long helps model learn simpler rules

In [12]:
df['is_street_circuit'] = df['circuit_type'].fillna('').str.contains('Street', case=False).astype(int)
df['circuit_length'].fillna(df['circuit_length'].median(), inplace=True)
df['is_long_circuit'] = (df['circuit_length'] > 5.0).astype(int)
df['has_sprint'] = df['has_sprint'].fillna(0).astype(int)

In [13]:
print(df[['circuit_location','is_street_circuit','is_long_circuit','has_sprint']].sample(5))


      circuit_location  is_street_circuit  is_long_circuit  has_sprint
576      Miami Gardens                  1                1           0
696  Spa-Francorchamps                  0                1           0
151             Monaco                  1                0           0
570      Miami Gardens                  1                1           0
208          Spielberg                  0                0           0


### Advanced Historical Features

1) Drivers:
- Two helper columns(podium and dnf) to measure success and reliability
- Weighted average finishing position over the last 5 races
- Weighted average total points over the last 5 races
- Weighted average podiums(eg : 0.6 = 60% podium rate) over last 5 races
- Weighted DNF rate over last 5 matches
- Recent form (A "hotter," more sensitive version of average position, looking at only the last 3 races.)
- Unweighted avg of quali position ober last 5 races

2) Teams: (Gives "car performance feature")
- Weighted avg finishing position
- Weighted avg total points

In [14]:
def calculate_weighted_rolling_stats(group, window=5):
    group = group.sort_values(['season', 'round_number'])
    weights = group['sample_weight']

    group['dnf'] = (group['status'] == 'retired').astype(int)
    group['podium'] = (group['position'] <= 3).astype(int)

    def weighted_avg(x):
        return np.average(x, weights=weights[x.index])
    
    group['weighted_avg_position_5'] = group['position'].rolling(
        window=window, min_periods=1
    ).apply(weighted_avg, raw=False).shift(1)
    
    group['weighted_points_5'] = group['points'].rolling(
        window=window, min_periods=1
    ).apply(weighted_avg, raw=False).shift(1)
    
    group['weighted_podium_rate_5'] = group['podium'].rolling(
        window=window, min_periods=1
    ).apply(weighted_avg, raw=False).shift(1)
    
    group['weighted_dnf_rate_5'] = group['dnf'].rolling(
        window=window, min_periods=1
    ).apply(weighted_avg, raw=False).shift(1)
    
    group['recent_form_3'] = group['position'].rolling(
        window=3, min_periods=1  
    ).apply(weighted_avg, raw=False).shift(1)
    
    if 'quali_position' in group.columns:
        group['avg_quali_position_5'] = group['quali_position'].rolling(
            window=window, min_periods=1
        ).mean().shift(1)

    new_cols = [
        'weighted_avg_position_5', 'weighted_points_5', 'weighted_podium_rate_5',
        'weighted_dnf_rate_5', 'recent_form_3', 'avg_quali_position_5'
    ]
    final_cols = [col for col in new_cols if col in group.columns]
    return group[final_cols]


def calculate_team_weighted_stats(group, window=5):
    group = group.sort_values(['season', 'round_number'])
    weights = group['sample_weight']

    def weighted_avg(x):
        return np.average(x, weights=weights[x.index])
    
    group['team_weighted_avg_position_5'] = group['position'].rolling(
        window=window, min_periods=1
    ).apply(weighted_avg, raw=False).shift(1)
    
    group['team_weighted_points_5'] = group['points'].rolling(
        window=window, min_periods=1
    ).apply(weighted_avg, raw=False).shift(1)
    
    return group[['team_weighted_avg_position_5', 'team_weighted_points_5']]


print("Calculating driver weighted rolling stats...")
driver_stats = df.groupby('driver_id', group_keys=False).apply(calculate_weighted_rolling_stats)

print("Calculating team weighted rolling stats...")
team_stats = df.groupby('team_id', group_keys=False).apply(calculate_team_weighted_stats)

print("Merging stats back to main DataFrame...")
df = df.join(driver_stats)
df = df.join(team_stats)

print("\nAll leak-proof rolling features calculated and merged successfully.")

Calculating driver weighted rolling stats...
Calculating team weighted rolling stats...
Merging stats back to main DataFrame...

All leak-proof rolling features calculated and merged successfully.


### Head to head & teammate comparision
<pre>
It pairs each driver with their teammate in the same race,
computes who beat whom and qualifying gap, then calculates a rolling teammate battle win rate (last 5 races) —
shifted by one race to avoid data leakage — and merges these features back into the main DataFrame.

In [15]:
teammate_df = df[['race_id','team_id','driver_id','position','quali_position','points']].copy()
teammate_df.columns = ['race_id','team_id','teammate_id','teammate_position','teammate_quali','teammate_points']

teammate_battles_df = df[['race_id', 'team_id', 'driver_id', 'position', 'quali_position', 'season', 'round_number']]

teammate_battles_df = teammate_battles_df.merge(
    teammate_df,
    on=['race_id', 'team_id'],
    how='left'
)

teammate_battles_df = teammate_battles_df[
    teammate_battles_df['driver_id'] != teammate_battles_df['teammate_id']
]

teammate_battles_df['beat_teammate'] = (teammate_battles_df['position'] < teammate_battles_df['teammate_position']).astype(float)
teammate_battles_df['quali_gap_to_teammate'] = teammate_battles_df['quali_position'] - teammate_battles_df['teammate_quali']

teammate_battles_df = teammate_battles_df.sort_values(['driver_id', 'season', 'round_number'])
teammate_battles_df['teammate_battle_rate_5'] = teammate_battles_df.groupby('driver_id')['beat_teammate'].transform(
    lambda x: x.rolling(window=5, min_periods=1).mean().shift(1)
)

features_to_join = [
    'race_id', 
    'driver_id', 
    'teammate_id', 
    'beat_teammate', 
    'quali_gap_to_teammate', 
    'teammate_battle_rate_5'
]

df = df.merge(
    teammate_battles_df[features_to_join],
    on=['race_id', 'driver_id'],
    how='left'
)

print(f"Robust teammate features created and merged.")
print(f"Teammate data available: {df['teammate_id'].notna().sum()} records")

Robust teammate features created and merged.
Teammate data available: 810 records


### Circuit Specific Performance


In [16]:
le_circuit = LabelEncoder()

df['circuit_encoded'] = le_circuit.fit_transform(df['circuit_location'])
df = df.sort_values(['season','round_number'])

def weighted_expanding_average(group_data, val_col, w_col):

    vals = group_data[val_col]
    weights = group_data[w_col]

    expanding_sum = (vals*weights).expanding().sum()

    expanding_w_sum = weights.expanding().sum()

    return(expanding_sum/expanding_w_sum).shift(1)

print("   Calculating leak-proof career fallback stats...")
df['career_weighted_pos'] = df.groupby('driver_id').apply(
    lambda x: weighted_expanding_average(x, 'position', 'sample_weight')
).reset_index(level=0, drop=True)

df['career_weighted_points'] = df.groupby('driver_id').apply(
    lambda x: weighted_expanding_average(x, 'points', 'sample_weight')
).reset_index(level=0, drop=True)


print("   Calculating circuit-specific stats...")
grouped = df.groupby(['circuit_encoded', 'driver_id'])


df['driver_circuit_weighted_pos'] = grouped.apply(
    lambda x: weighted_expanding_average(x, 'position', 'sample_weight')
).reset_index(level=[0,1], drop=True)

df['driver_circuit_weighted_points'] = grouped.apply(
    lambda x: weighted_expanding_average(x, 'points', 'sample_weight')
).reset_index(level=[0,1], drop=True)

print("   Filling missing circuit-specific stats...")
df['driver_circuit_weighted_pos'].fillna(df['career_weighted_pos'], inplace=True)
df['driver_circuit_weighted_points'].fillna(df['career_weighted_points'], inplace=True)

df['driver_circuit_weighted_pos'].fillna(df['position'].expanding().mean().shift(1), inplace=True)
df['driver_circuit_weighted_points'].fillna(0, inplace=True)

df = df.drop(columns=['career_weighted_pos', 'career_weighted_points'])

print(f"✅ Leak-proof circuit-specific features created")

   Calculating leak-proof career fallback stats...
   Calculating circuit-specific stats...
   Filling missing circuit-specific stats...
✅ Leak-proof circuit-specific features created


### Championship position features

In [17]:
df['championship_points_before_race'] = df.groupby(['season', 'driver_id'])['points'].transform(
    lambda x: x.cumsum().shift(1)
).fillna(0)

print("   Calculating championship rank...")
df['championship_position_before_race'] = df.groupby(['season', 'round_number'])['championship_points_before_race'].rank(
    method='min', ascending=False
)

print("   Calculating gap to leader...")
df['points_to_leader'] = df.groupby(['season', 'round_number'])['championship_points_before_race'].transform('max') - df['championship_points_before_race']

print(f"✅ Championship features created")

   Calculating championship rank...
   Calculating gap to leader...
✅ Championship features created


<hr>
<hr>

### Encoding

In [18]:
print("Label Encoding high cardinality categorical features(circuit(done before), nationality)...")

le_nationality = LabelEncoder()
df['nationality_encoded'] = le_nationality.fit_transform(df['driver_nationality'].fillna('Unknown'))

print("One hot encoding low cardinality categorical features(driver,team)...")
df = pd.get_dummies(
    df,
    columns = ['driver_code','team_name'],
    prefix = ['driver','team'],
    dtype = int
)

print("   Saving encoders for pipeline...")
os.makedirs('ml/models/encoders', exist_ok=True)
joblib.dump(le_nationality, 'ml/models/encoders/nationality_encoder.pkl')
joblib.dump(le_circuit, 'ml/models/encoders/circuit_encoder.pkl')

print("✅ Encoding complete. New dummy columns created.")

Label Encoding high cardinality categorical features(circuit(done before), nationality)...
One hot encoding low cardinality categorical features(driver,team)...
   Saving encoders for pipeline...
✅ Encoding complete. New dummy columns created.


### Creating target variables

In [19]:
df['win'] = (df['position'] == 1).astype(int)
df['podium'] = (df['position'] <= 3).astype(int)
df['top5'] = (df['position'] <= 5).astype(int)
df['points_finish'] = (df['position'] <= 10).astype(int)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 845 entries, 0 to 844
Data columns (total 99 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   result_id                          845 non-null    int64         
 1   position                           845 non-null    int64         
 2   grid_position                      845 non-null    int64         
 3   points                             845 non-null    float64       
 4   laps_completed                     845 non-null    int64         
 5   fastest_lap_rank                   821 non-null    float64       
 6   status                             845 non-null    object        
 7   is_sprint                          845 non-null    bool          
 8   driver_id                          845 non-null    int64         
 9   driver_number                      845 non-null    int64         
 10  driver_nationality                 845

### Final feature selection 

In [21]:
print("Filling NaN values")

df['q3_gap_to_pole'].fillna(99, inplace = True)
df['teammate_battle_rate_5'].fillna(0.5,inplace = True)
df['quali_gap_to_teammate'].fillna(0.0,inplace = True)
df['avg_quali_position_5'].fillna(10.0,inplace=True)

impute_with_median = [
    'weighted_avg_position_5', 'weighted_points_5', 'weighted_podium_rate_5',
    'weighted_dnf_rate_5', 'recent_form_3', 'team_weighted_avg_position_5',
    'team_weighted_points_5', 'driver_circuit_weighted_pos', 'driver_circuit_weighted_points'
]
for col in impute_with_median:
    if col in df.columns:
        df[col].fillna(df[col].median(), inplace=True)


base_features = [
    'grid_position',
    'quali_position',
    'driver_age',
    'driver_championships',
    'nationality_encoded',  
    'circuit_encoded',    
    'round_number',
    
    'grid_penalty',
    'q3_gap_to_pole',
    'avg_quali_position_5',
    
    'is_street_circuit',
    'is_long_circuit',
    'has_sprint',
    
    'weighted_avg_position_5',
    'weighted_points_5',
    'weighted_podium_rate_5',
    'weighted_dnf_rate_5',
    'recent_form_3',
    
    'team_weighted_avg_position_5', 
    'team_weighted_points_5',
    
    'quali_gap_to_teammate',
    'teammate_battle_rate_5',
    
    'driver_circuit_weighted_pos',
    'driver_circuit_weighted_points',
    
    'championship_position_before_race',
    'points_to_leader',
]

ohe_driver_cols = [col for col in df.columns if col.startswith('driver_')]
ohe_team_cols = [col for col in df.columns if col.startswith('team_')]

feature_columns = base_features + ohe_driver_cols + ohe_team_cols

target_columns = ['win', 'podium', 'points_finish', 'top5', 'position']

final_nan_count = df[feature_columns].isnull().sum().sum()
if final_nan_count > 0:
    print(f"\n⚠️ WARNING: {final_nan_count} NaNs still found in feature columns.")
else:
    print("\n✅ All NaNs successfully imputed.")

print(f"\n✅ Feature set prepared:")
print(f"   Total Features: {len(feature_columns)}")
print(f"   Base Features: {len(base_features)}")
print(f"   One-Hot Driver Features: {len(ohe_driver_cols)}")
print(f"   One-Hot Team Features: {len(ohe_team_cols)}")

Filling NaN values

✅ All NaNs successfully imputed.

✅ Feature set prepared:
   Total Features: 73
   Base Features: 26
   One-Hot Driver Features: 33
   One-Hot Team Features: 14


### Train/Test Split (With Weights)

In [22]:
train_df = df[df['season'] == 2024].copy()
test_df = df[df['season'] == 2025].copy()

train_weights = train_df['sample_weight'].values
test_weights = test_df['sample_weight'].values

print(f"✅ Time-based split with temporal weighting:")
print(f"   Training set (2024): {len(train_df)} samples")
print(f"   - Avg weight: {train_weights.mean():.3f}")
print(f"   - Weight range: {train_weights.min():.3f} - {train_weights.max():.3f}")
print(f"   Test set (2025): {len(test_df)} samples")
print(f"   - Avg weight: {test_weights.mean():.3f}")

✅ Time-based split with temporal weighting:
   Training set (2024): 460 samples
   - Avg weight: 0.123
   - Weight range: 0.069 - 0.203
   Test set (2025): 385 samples
   - Avg weight: 0.655


### Saving data

In [None]:
import json

output_dir = "ml/data/processed/"
os.makedirs(output_dir, exist_ok=True)

print(f"   Saving DataFrames to {output_dir}...")
df.to_parquet(
    os.path.join(output_dir, 'full_data_v2.parquet'), 
    index=False, 
    engine='fastparquet' 
)
train_df.to_parquet(
    os.path.join(output_dir, 'train_data_v2.parquet'), 
    index=False, 
    engine='fastparquet'  
)
test_df.to_parquet(
    os.path.join(output_dir, 'test_data_v2.parquet'), 
    index=False, 
    engine='fastparquet'  
)

print(f"   Saving weights to {output_dir}...")
np.save(os.path.join(output_dir, 'train_weights.npy'), train_weights)
np.save(os.path.join(output_dir, 'test_weights.npy'), test_weights)

print(f"   Saving metadata to {output_dir}...")
metadata = {
    'feature_columns': feature_columns,
    'target_columns': target_columns,
    'train_size': len(train_df),
    'test_size': len(test_df),
    'has_qualifying_data': True,
    'has_temporal_weights': True,
    'has_teammate_features': True,
    'temporal_weighting': {
        '2024_weight': 0.7,
        '2025_weight': 1.0,
        'half_life_days': 180
    }
}
with open(os.path.join(output_dir, 'metadata_v2.json'), 'w') as f:
    json.dump(metadata, f, indent=2)

print("\n✅ All data saved (V2)!")
print("\n🎯 Key Improvements:")
print("   1. ✅ Qualifying data included")
print("   2. ✅ Temporal weighting (recent > old)")
print("   3. ✅ Season weighting (2025 > 2024)")
print("   4. ✅ Teammate comparison features")
print("   5. ✅ Circuit characteristics")
print("   6. ✅ Championship position features")
print("   7. ✅ Weighted rolling statistics")
print("\n💡 Models will now use sample_weight parameter for training!")

   Saving DataFrames to ml/data/processed/...
   Saving weights to ml/data/processed/...
   Saving metadata to ml/data/processed/...

✅ All data saved (V2)!

🎯 Key Improvements:
   1. ✅ Qualifying data included
   2. ✅ Temporal weighting (recent > old)
   3. ✅ Season weighting (2025 > 2024)
   4. ✅ Teammate comparison features
   5. ✅ Circuit characteristics
   6. ✅ Championship position features
   7. ✅ Weighted rolling statistics

💡 Models will now use sample_weight parameter for training!
