In [8]:
# F1 Race Winner Prediction Project - Input-Based Win Probability Estimator

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load CSVs
races = pd.read_csv("races.csv")
results = pd.read_csv("results.csv")
status = pd.read_csv("status.csv")
drivers = pd.read_csv("drivers.csv")
circuits = pd.read_csv("circuits.csv")

# Rename circuit name column to avoid conflict with races name
circuits.rename(columns={'name': 'circuit_name'}, inplace=True)

# Merge core datasets
merged_df = results.merge(
    races[['raceId', 'year', 'round', 'name', 'circuitId', 'date']],
    on='raceId', how='left'
)
merged_df = merged_df.merge(status, on='statusId', how='left')

# Drop unnecessary columns
columns_to_drop = [
    'resultId', 'number', 'position', 'positionText', 'time', 'milliseconds',
    'fastestLap', 'rank', 'fastestLapTime', 'fastestLapSpeed', 'statusId', 'url'
]
merged_df.drop(columns=[col for col in columns_to_drop if col in merged_df.columns], inplace=True)

# Create target column
merged_df['winner'] = (merged_df['positionOrder'] == 1).astype(int)
merged_df['date'] = pd.to_datetime(merged_df['date'], errors='coerce')
merged_df.sort_values(by=['year', 'round'], inplace=True)

# Feature Engineering: Win Rates
df = merged_df.copy()
df['driver_total_races'] = df.groupby('driverId').cumcount() + 1
df['driver_total_wins'] = df.groupby('driverId')['winner'].cumsum()
df['driver_win_rate'] = df['driver_total_wins'] / df['driver_total_races']
df['constructor_total_races'] = df.groupby('constructorId').cumcount() + 1
df['constructor_total_wins'] = df.groupby('constructorId')['winner'].cumsum()
df['constructor_win_rate'] = df['constructor_total_wins'] / df['constructor_total_races']

# Join driver and circuit names
drivers['fullName'] = drivers['forename'] + ' ' + drivers['surname']
df = df.merge(drivers[['driverId', 'fullName']], on='driverId', how='left')
df = df.merge(circuits[['circuitId', 'circuit_name', 'location', 'country']], on='circuitId', how='left')

# Filter only above-average drivers and circuits
avg_driver_win_rate = df['driver_win_rate'].mean()
avg_circuit_count = df['circuit_name'].value_counts().mean()

valid_drivers = df[df['driver_win_rate'] >= avg_driver_win_rate]['fullName'].unique()
valid_circuits = df['circuit_name'].value_counts()
valid_circuits = valid_circuits[valid_circuits >= avg_circuit_count].index.tolist()

# Filter dataset accordingly
df = df[df['fullName'].isin(valid_drivers) & df['circuit_name'].isin(valid_circuits)]

# Encode categorical variables
le_driver = LabelEncoder()
le_circuit = LabelEncoder()
df['driver_encoded'] = le_driver.fit_transform(df['fullName'])
df['circuit_encoded'] = le_circuit.fit_transform(df['circuit_name'])

# Feature columns
features = ['grid', 'driver_win_rate', 'constructor_win_rate', 'driver_encoded', 'circuit_encoded']
X = df[features]
y = df['winner']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Prediction Function
def predict_win_probability():
    print("\nAvailable Drivers:")
    for name in sorted(df['fullName'].unique()):
        print(" -", name)

    print("\nAvailable Circuits:")
    for name in sorted(df['circuit_name'].unique()):
        print(" -", name)

    driver_name = input("\nEnter Driver Name exactly as shown: ")
    circuit_name = input("Enter Circuit Name exactly as shown: ")
    grid_position = int(input("Enter Grid Position (e.g., 1 for pole): "))

    if driver_name not in le_driver.classes_:
        print("\n❌ Driver not found. Please check the name.")
        return

    if circuit_name not in le_circuit.classes_:
        print("\n❌ Circuit not found. Please check the name.")
        return

    driver_encoded = le_driver.transform([driver_name])[0]
    circuit_encoded = le_circuit.transform([circuit_name])[0]

    # Average win rates as fallback values
    driver_win_rate = df[df['fullName'] == driver_name]['driver_win_rate'].mean()
    constructor_win_rate = df[df['fullName'] == driver_name]['constructor_win_rate'].mean()

    input_data = pd.DataFrame({
        'grid': [grid_position],
        'driver_win_rate': [driver_win_rate],
        'constructor_win_rate': [constructor_win_rate],
        'driver_encoded': [driver_encoded],
        'circuit_encoded': [circuit_encoded]
    })

    win_prob = model.predict_proba(input_data)[0][1]
    print(f"\n🔍 {driver_name} at {circuit_name} starting from grid {grid_position} has a {win_prob*100:.2f}% chance of winning.")

# Run the prediction with user inputs
predict_win_probability()


Available Drivers:
 - Alain Prost
 - Alan Jones
 - Alberto Ascari
 - Ayrton Senna
 - Bill Vukovich
 - Bob Sweikert
 - Bruce McLaren
 - Carlos Reutemann
 - Charles Leclerc
 - Clay Regazzoni
 - Damon Hill
 - Dan Gurney
 - Daniel Ricciardo
 - David Coulthard
 - Denny Hulme
 - Didier Pironi
 - Eddie Irvine
 - Emerson Fittipaldi
 - Felipe Massa
 - Fernando Alonso
 - François Cevert
 - Gerhard Berger
 - Giancarlo Baghetti
 - Gilles Villeneuve
 - Graham Hill
 - Gunnar Nilsson
 - Heikki Kovalainen
 - Innes Ireland
 - Jack Brabham
 - Jackie Stewart
 - Jacky Ickx
 - Jacques Laffite
 - Jacques Villeneuve
 - James Hunt
 - Jean-Pierre Jabouille
 - Jenson Button
 - Jim Clark
 - Jim Rathmann
 - Jimmy Bryan
 - Jo Bonnier
 - Jochen Mass
 - Jochen Rindt
 - Jody Scheckter
 - John Surtees
 - John Watson
 - Johnnie Parsons
 - José Froilán González
 - Juan Fangio
 - Juan Pablo Montoya
 - Keke Rosberg
 - Kimi Räikkönen
 - Lee Wallard
 - Lewis Hamilton
 - Lorenzo Bandini
 - Ludovico Scarfiotti
 - Luigi Fagio