In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import joblib

print("Imports loaded ✔")


Imports loaded ✔


In [2]:
BASE = Path("C:/Users/HP/Desktop/f1/f1_world_championship_data/processed")

results = pd.read_csv(BASE / "results.csv")
races = pd.read_csv(BASE / "races.csv")
drivers = pd.read_csv(BASE / "drivers.csv")
constructors = pd.read_csv(BASE / "constructors.csv")
pitstops = pd.read_csv(BASE / "pit_stops.csv")
qualifying = pd.read_csv(BASE / "qualifying.csv")
driver_stand = pd.read_csv(BASE / "driver_standings.csv")
constructor_stand = pd.read_csv(BASE / "constructor_standings.csv")
circuits = pd.read_csv(BASE / "circuits.csv")

print("All CSVs loaded ✔")


All CSVs loaded ✔


In [3]:
df = results.copy()

df = df.merge(races[['raceId','year','round','circuitId','name']], on="raceId", how="left")
df = df.merge(drivers[['driverId','surname']], on='driverId', how='left')
df = df.merge(constructors[['constructorId','name']], on='constructorId', how='left', suffixes=("", "_constructor"))

print(df.head())
print("Base merge done ✔")


   resultId  raceId  driverId  constructorId number  grid position  \
0         1      18         1              1     22     1        1   
1         2      18         2              2      3     5        2   
2         3      18         3              3      7     7        3   
3         4      18         4              4      5    11        4   
4         5      18         5              1     23     3        5   

  positionText  positionOrder  points  ...  rank fastestLapTime  \
0            1              1    10.0  ...     2       1:27.452   
1            2              2     8.0  ...     3       1:27.739   
2            3              3     6.0  ...     5       1:28.090   
3            4              4     5.0  ...     7       1:28.603   
4            5              5     4.0  ...     1       1:27.418   

  fastestLapSpeed statusId  year round circuitId                   name  \
0         218.300        1  2008     1         1  Australian Grand Prix   
1         217.586        1

In [4]:
df['position'] = pd.to_numeric(df['position'], errors='coerce')
df['top10'] = (df['position'] <= 10).astype(int)

print(df[['position','top10']].head())
print("Target prepared ✔")


   position  top10
0       1.0      1
1       2.0      1
2       3.0      1
3       4.0      1
4       5.0      1
Target prepared ✔


In [5]:
qual_small = qualifying[['raceId','driverId','position']].rename(columns={'position':'qual_pos'})
df = df.merge(qual_small, on=['raceId','driverId'], how='left')

df['grid'] = df['grid'].fillna(99)
df['qual_pos'] = df['qual_pos'].fillna(df['grid'])

print(df[['grid','qual_pos']].head())
print("Qualifying merged ✔")


   grid  qual_pos
0     1       1.0
1     5       5.0
2     7       7.0
3    11      12.0
4     3       3.0
Qualifying merged ✔


In [6]:
df = df.sort_values(["driverId","year","round"])

df['recent_mean_finish'] = df.groupby("driverId")['position'].transform(lambda x: x.rolling(3, min_periods=1).mean())
df['recent_top10_rate'] = df.groupby("driverId")['top10'].transform(lambda x: x.rolling(3, min_periods=1).mean())

df['recent_mean_finish'] = df['recent_mean_finish'].fillna(20)
df['recent_top10_rate'] = df['recent_top10_rate'].fillna(0)

print("Rolling stats added ✔")


Rolling stats added ✔


In [7]:
df = df.sort_values(["constructorId","year","round"])

df['constr_recent_mean_finish'] = df.groupby("constructorId")['position'].transform(lambda x: x.rolling(3, min_periods=1).mean())
df['constr_recent_mean_finish'] = df['constr_recent_mean_finish'].fillna(20)

print("Constructor rolling stats ✔")


Constructor rolling stats ✔


In [8]:
pit_agg = pitstops.groupby(['raceId','driverId']).agg(
    pit_stop_count=('stop','count'),
    total_pit_time=('milliseconds','sum'),
    avg_pit_time=('milliseconds','mean')
).reset_index()

df = df.merge(pit_agg, on=['raceId','driverId'], how='left')

df['pit_stop_count'] = df['pit_stop_count'].fillna(0)
df['total_pit_time'] = df['total_pit_time'].fillna(0)
df['avg_pit_time'] = df['avg_pit_time'].fillna(0)

df['slow_pit_flag'] = (df['avg_pit_time'] > 30000).astype(int)

print("Pit stop features added ✔")


Pit stop features added ✔


In [9]:
ds = driver_stand.groupby(['raceId','driverId']).agg(
    driver_points_so_far=('points','max'),
    driver_rank_season=('position','min')
).reset_index()

df = df.merge(ds, on=['raceId','driverId'], how='left')

df['driver_points_so_far'] = df['driver_points_so_far'].fillna(0)
df['driver_rank_season'] = df['driver_rank_season'].fillna(20)

print("Driver standings merged ✔")


Driver standings merged ✔


In [10]:
cs = constructor_stand.groupby(['raceId','constructorId']).agg(
    constructor_points_so_far=('points','max'),
    constructor_rank_season=('position','min')
).reset_index()

df = df.merge(cs, on=['raceId','constructorId'], how='left')

df['constructor_points_so_far'] = df['constructor_points_so_far'].fillna(0)
df['constructor_rank_season'] = df['constructor_rank_season'].fillna(10)

print("Constructor standings merged ✔")


Constructor standings merged ✔


In [13]:
# --- Inspect circuits to see available columns (debug) ---
print("circuits columns:", list(circuits.columns))
display(circuits.head(8))

# --- If 'is_street' missing, try to create/infer it ---
if 'is_street' not in circuits.columns:
    print("`is_street` column not found in circuits. Creating an inferred `is_street` column.")
    # Lowercase helper for text matching
    def infer_is_street_row(row):
        text = ""
        for c in ['name','circuitRef','location','country']:
            if c in row and pd.notna(row[c]):
                text += str(row[c]).lower() + " "
        # common substrings that indicate street circuits
        street_indicators = [
            "street", "street circuit", "city", "harbour", "harbor", "downtown",
            "monaco", "baku", "singapore", "jeddah", "saudi", "marina", "urban"
        ]
        # if any indicator appears in combined text -> mark as street
        for s in street_indicators:
            if s in text:
                return True
        return False

    # Apply inference
    circuits['is_street'] = circuits.apply(infer_is_street_row, axis=1).astype(bool)

    # OPTIONAL: override some false positives/negatives with a small manual list
    # (add circuit names or circuitId values if you know them)
    manual_street_names = [
        "monaco", "baku", "singapore", "jeddah", "brazil (saint ?)","marina bay"
    ]
    circuits['name_lower'] = circuits['name'].astype(str).str.lower()
    for nm in manual_street_names:
        circuits.loc[circuits['name_lower'].str.contains(nm, na=False), 'is_street'] = True
    circuits.drop(columns=['name_lower'], inplace=True)

else:
    # ensure boolean
    circuits['is_street'] = circuits['is_street'].astype(bool)
    print("`is_street` exists; converted to bool.")

# Quick check
print("Summary of inferred is_street values:")
print(circuits['is_street'].value_counts(dropna=False))
display(circuits[['circuitId','name','location','country','is_street']].head(12))

# --- Now merge into df (safe) ---
# keep only needed circuit columns
cir_small = circuits[['circuitId','is_street','name','location','country']].drop_duplicates(subset=['circuitId'])

# If df doesn't already have circuitId, try to merge race->circuit mapping first
if 'circuitId' not in df.columns:
    if 'raceId' in df.columns and 'circuitId' in races.columns:
        df = df.merge(races[['raceId','circuitId']], on='raceId', how='left')
    else:
        print("Warning: df does not have 'circuitId' and races doesn't provide it — skipping circuit merge.")
        
# Merge
if 'circuitId' in df.columns:
    df = df.merge(cir_small, on='circuitId', how='left', suffixes=('','_c'))
    # If merge leaves NaNs, fill with False (non-street)
    df['is_street'] = df['is_street'].fillna(False).astype(int)
    print("Merged circuit features into df. Missing values filled with 0.")
else:
    # create column with default 0 so downstream feature-list won't break
    df['is_street'] = 0
    print("circuitId not available; created df['is_street']=0 (default).")

# --- Ensure features exist before building X ---
for feat in [
    'grid','qual_pos','recent_mean_finish','recent_top10_rate','constr_recent_mean_finish',
    'pit_stop_count','total_pit_time','avg_pit_time','slow_pit_flag',
    'is_street','driver_points_so_far','driver_rank_season','constructor_points_so_far','constructor_rank_season'
]:
    if feat not in df.columns:
        print(f"Note: feature '{feat}' missing in df — creating default zeros.")
        df[feat] = 0

# Now build X and y safely
features = [
    'grid','qual_pos',
    'recent_mean_finish','recent_top10_rate',
    'constr_recent_mean_finish',
    'pit_stop_count','total_pit_time','avg_pit_time','slow_pit_flag',
    'is_street',
    'driver_points_so_far','driver_rank_season',
    'constructor_points_so_far','constructor_rank_season'
]

X = df[features].fillna(0)
y = df['top10']

print("Features dataframe X shape:", X.shape)
print("Done — you can continue training/evaluating.")


circuits columns: ['circuitId', 'circuitRef', 'name', 'location', 'country', 'lat', 'lng', 'alt', 'url']


Unnamed: 0,circuitId,circuitRef,name,location,country,lat,lng,alt,url
0,1,albert_park,Albert Park Grand Prix Circuit,Melbourne,Australia,-37.8497,144.968,10,http://en.wikipedia.org/wiki/Melbourne_Grand_P...
1,2,sepang,Sepang International Circuit,Kuala Lumpur,Malaysia,2.76083,101.738,18,http://en.wikipedia.org/wiki/Sepang_Internatio...
2,3,bahrain,Bahrain International Circuit,Sakhir,Bahrain,26.0325,50.5106,7,http://en.wikipedia.org/wiki/Bahrain_Internati...
3,4,catalunya,Circuit de Barcelona-Catalunya,Montmeló,Spain,41.57,2.26111,109,http://en.wikipedia.org/wiki/Circuit_de_Barcel...
4,5,istanbul,Istanbul Park,Istanbul,Turkey,40.9517,29.405,130,http://en.wikipedia.org/wiki/Istanbul_Park
5,6,monaco,Circuit de Monaco,Monte-Carlo,Monaco,43.7347,7.42056,7,http://en.wikipedia.org/wiki/Circuit_de_Monaco
6,7,villeneuve,Circuit Gilles Villeneuve,Montreal,Canada,45.5,-73.5228,13,http://en.wikipedia.org/wiki/Circuit_Gilles_Vi...
7,8,magny_cours,Circuit de Nevers Magny-Cours,Magny Cours,France,46.8642,3.16361,228,http://en.wikipedia.org/wiki/Circuit_de_Nevers...


`is_street` column not found in circuits. Creating an inferred `is_street` column.
Summary of inferred is_street values:
is_street
False    65
True     12
Name: count, dtype: int64


  circuits.loc[circuits['name_lower'].str.contains(nm, na=False), 'is_street'] = True


Unnamed: 0,circuitId,name,location,country,is_street
0,1,Albert Park Grand Prix Circuit,Melbourne,Australia,False
1,2,Sepang International Circuit,Kuala Lumpur,Malaysia,False
2,3,Bahrain International Circuit,Sakhir,Bahrain,False
3,4,Circuit de Barcelona-Catalunya,Montmeló,Spain,False
4,5,Istanbul Park,Istanbul,Turkey,False
5,6,Circuit de Monaco,Monte-Carlo,Monaco,True
6,7,Circuit Gilles Villeneuve,Montreal,Canada,False
7,8,Circuit de Nevers Magny-Cours,Magny Cours,France,False
8,9,Silverstone Circuit,Silverstone,UK,False
9,10,Hockenheimring,Hockenheim,Germany,False


Merged circuit features into df. Missing values filled with 0.
Features dataframe X shape: (26759, 14)
Done — you can continue training/evaluating.


In [None]:
train_df = df[df["year"] < df["year"].max()]
test_df = df[df["year"] == df["year"].max()]

X_train = train_df[features]
y_train = train_df['top10']
X_test = test_df[features]
y_test = test_df['top10']

model = RandomForestClassifier(
    n_estimators=500,
    max_depth=12,
    random_state=42,
    class_weight="balanced"
)

model.fit(X_train, y_train)

pred_proba = model.predict_proba(X_test)[:, 1]

roc = roc_auc_score(y_test, pred_proba)
roc


In [None]:
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(exist_ok=True)

joblib.dump(model, MODEL_DIR / "rf_top10_model.joblib")
joblib.dump({"features": features}, MODEL_DIR / "feature_list.joblib")

print("Model + features saved ✔")
