In [1]:
from IPython import get_ipython
from IPython.display import display
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

# --- Training Data Configuration ---
# Selects which seasons are used to fit the model:
#   1: every season before 2025
#   2: seasons 2000 through 2024
#   3: seasons 2020 through 2024
TRAINING_DATA_RANGE = 2

# --- 1. Load Data ---
# 'f1_main_table_canada_full.csv' is expected in the working directory and is
# read as a ';'-delimited file.
try:
    # low_memory=False avoids pandas' DtypeWarning while parsing.
    df_full = pd.read_csv('f1_main_table_canada_full.csv', sep=';', low_memory=False)
except FileNotFoundError:
    print("Error: 'f1_main_table_canada_full.csv' not found. Please ensure the file is in the correct directory.")
    # Raise SystemExit instead of calling the interactive-only exit() helper:
    # it works without the site module, reports a non-zero status to the
    # shell, and only aborts the current cell when run inside a notebook.
    raise SystemExit(1)
except Exception as e:
    # Top-level boundary: report any other load failure and stop.
    print(f"Error loading CSV: {e}")
    raise SystemExit(1)

# --- 2. Preprocess Data ---
# Columns the rest of the script reads; every other CSV column is ignored.
relevant_cols = [
    'year', 'round', 'circuitRef', 'driverRef', 'constructorRef',
    'race_grid', 'race_position', 'qualy_position', 'statusId'
]
df = df_full[relevant_cols].copy()

# Keep only rows for the Canadian GP (circuitRef 'villeneuve').
df = df[df['circuitRef'] == 'villeneuve'].copy()
if df.empty:
    print("Error: No data found for circuitRef 'villeneuve' (Canadian GP).")
    # SystemExit(1) rather than the interactive-only exit() helper, so the
    # failure yields a non-zero status and does not depend on the site module.
    raise SystemExit(1)

# Coerce to numeric; values that cannot be parsed become NaN.
numeric_cols = ['race_grid', 'race_position', 'qualy_position', 'year', 'round']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop rows missing any value the target or the key features depend on.
# 'round' is not in this list, so it may still contain NaN afterwards.
df.dropna(subset=['race_grid', 'race_position', 'qualy_position', 'driverRef', 'constructorRef', 'year'], inplace=True)

# Target variable: 1 when the driver finished first, 0 otherwise.
df['is_winner'] = np.where(df['race_position'] == 1, 1, 0)

# NaN years were dropped above, so the integer cast is safe; integer years
# make the season filters below straightforward.
df['year'] = df['year'].astype(int)

# --- 3. Feature Engineering & Encoding ---
features = ['year', 'round', 'driverRef', 'constructorRef', 'race_grid', 'qualy_position']
categorical_features = ['driverRef', 'constructorRef']

# Turn each categorical column into pandas 'category' dtype and add a
# '<column>_code' companion holding its integer codes, which is what the
# model is trained on.
for col in categorical_features:
    df[col] = df[col].astype('category')
    df[f'{col}_code'] = df[col].cat.codes

# code -> label lookup per categorical column, kept so that prediction rows
# can be encoded with the same categories as the training rows.
category_mappings = {
    name: dict(enumerate(df[name].cat.categories))
    for name in categorical_features
}

# Same order as `features`, with each categorical column swapped for its
# '_code' counterpart.
encoded_features = [
    f'{name}_code' if name in categorical_features else name
    for name in features
]
X = df[encoded_features]
y = df['is_winner']

# --- 4. Train LightGBM Model ---
# Build a single boolean row mask for the configured season window and apply
# it to both X and y, so features and target are always filtered identically.
season = df['year']
if TRAINING_DATA_RANGE == 1:
    train_mask = season < 2025
    range_message = "\nTraining on all historical data before 2025."
elif TRAINING_DATA_RANGE == 2:
    train_mask = (season >= 2000) & (season < 2025)
    range_message = "\nTraining on data from 2000 to 2024."
elif TRAINING_DATA_RANGE == 3:
    train_mask = (season >= 2020) & (season < 2025)
    # The message must describe the 2020-2024 window selected by the mask.
    range_message = "\nTraining on data from 2020 to 2024."
else:
    print("\nInvalid TRAINING_DATA_RANGE specified. Please use 1, 2, or 3.")
    # SystemExit(1) rather than the interactive-only exit() helper, so a bad
    # configuration is reported with a non-zero status.
    raise SystemExit(1)

X_train = X[train_mask]
y_train = y[train_mask]
print(range_message)

if X_train.empty:
    print("Error: No training data available for the selected range.")
    print("Please check the 'f1_main_table_canada_full.csv' file and the TRAINING_DATA_RANGE setting.")
    raise SystemExit(1)

# LightGBM Classifier
lgbm_model = lgb.LGBMClassifier(random_state=42, force_col_wise=True)
try:
    # Only the fit call is guarded; the integer-coded driver/constructor
    # columns are declared categorical so they are not treated as ordinal.
    lgbm_model.fit(X_train, y_train, categorical_feature=['driverRef_code', 'constructorRef_code'])
except Exception as e:
    # Top-level boundary: report the training failure and stop.
    print(f"Error during model training: {e}")
    print("This can happen if categorical features have issues or data is too sparse for the selected training range.")
    raise SystemExit(1)
else:
    print(f"Model trained successfully on {len(X_train)} samples.")

# --- 5. Prepare 2025 Scenario Data ---
# Helper function to get the category code for a value - Not strictly needed anymore with astype(pd.CategoricalDtype)
# def get_code(value, col_name, mappings):
#     for code, cat_val in mappings[col_name].items():
#         if cat_val == value:
#             return code
#     return -1 # Or handle as unknown, LightGBM might need specific handling for this

# Helper function to find the most recent constructor for a driver
def get_latest_constructor(driver_ref, historical_df, before_year=2025):
    """Return the constructor a driver most recently raced for.

    Only rows for ``driver_ref`` whose season is strictly earlier than
    ``before_year`` and whose ``constructorRef`` is present are considered;
    the row with the highest (year, round) wins.

    Args:
        driver_ref: Value to match against the ``driverRef`` column.
        historical_df: DataFrame with ``driverRef``, ``year``, ``round`` and
            ``constructorRef`` columns. ``year`` and ``round`` may be numeric
            or strings; they are coerced here, so a raw, un-preprocessed
            table is acceptable. The frame is not modified.
        before_year: Exclusive upper bound on the season (default 2025).

    Returns:
        The ``constructorRef`` of the most recent matching row, or ``None``
        when the driver has no usable row before ``before_year``.
    """
    rows = historical_df.loc[
        historical_df['driverRef'] == driver_ref,
        ['year', 'round', 'constructorRef'],
    ]
    # Coerce so string-typed seasons/rounds compare and sort numerically
    # (otherwise '9' would sort after '10'); unparsable values become NaN.
    rows = rows.assign(
        year=pd.to_numeric(rows['year'], errors='coerce'),
        round=pd.to_numeric(rows['round'], errors='coerce'),
    )
    # NaN years fail the comparison and are excluded. Rows without a
    # constructor are skipped so the caller gets None, never NaN.
    rows = rows[(rows['year'] < before_year) & rows['constructorRef'].notna()]
    if rows.empty:
        return None
    latest = rows.sort_values(by=['year', 'round'], ascending=[False, False])
    return latest.iloc[0]['constructorRef']

# Define drivers for scenarios (short label -> driverRef value in the dataset)
drivers_scenarios = {
    "VER": "max_verstappen",
    "NOR": "norris",
    "HAM": "hamilton"
}

predictions_2025 = {}

# Hypothetical race inputs shared by every scenario. The printed assumptions
# further down are derived from these values so the two cannot drift apart.
scenario_grid = 1   # driver starts P1
scenario_qualy = 1  # driver qualifies P1

# Loop-invariant inputs, built once instead of once per driver.
# The constructor lookup uses the whole loaded table rather than the training
# subset, so it does not depend on TRAINING_DATA_RANGE.
history_df = df_full[relevant_cols].copy()
known_drivers = list(category_mappings['driverRef'].values())
known_constructors = list(category_mappings['constructorRef'].values())
# Dtypes built from the recorded categories so that scenario rows receive the
# same integer codes as the rows the model was trained on.
driver_dtype = pd.CategoricalDtype(categories=known_drivers)
constructor_dtype = pd.CategoricalDtype(categories=known_constructors)

print("\n--- Predicting for 2025 Canadian GP ---")

for driver_short_name, driver_full_ref in drivers_scenarios.items():
    print(f"\nScenario for {driver_short_name} ({driver_full_ref}):")

    # A driver with no recorded category cannot be encoded; skip the scenario.
    if driver_full_ref not in known_drivers:
        print(f"  Driver '{driver_full_ref}' not found in full historical data categories. Cannot make prediction.")
        predictions_2025[driver_short_name] = "N/A (Driver not in full historical data)"
        continue

    # Assume the driver stays with the most recent constructor found before 2025.
    latest_constructor = get_latest_constructor(driver_full_ref, history_df)
    if latest_constructor is None:
        print(f"  Could not determine latest constructor for {driver_full_ref} from historical data before 2025. Using a placeholder 'unknown_constructor'.")
        latest_constructor = 'unknown_constructor'

    # Warn early when the constructor has no recorded category: it will be
    # encoded as -1 below.
    # NOTE(review): LightGBM's docs say negative categorical values are
    # treated as missing - confirm for the installed version.
    if latest_constructor not in known_constructors and latest_constructor != 'unknown_constructor':
        print(f"  Constructor '{latest_constructor}' for {driver_full_ref} was not in the full historical training data categories. Prediction might be unreliable or impossible depending on model handling of unseen categories.")

    scenario_data = {
        'year': [2025],
        'round': [10],  # Hypothetical round number for Canadian GP in 2025
        'driverRef': [driver_full_ref],
        'constructorRef': [latest_constructor],
        'race_grid': [scenario_grid],
        'qualy_position': [scenario_qualy]
    }
    scenario_df = pd.DataFrame(scenario_data)

    # Encode the categorical columns; only the conversion itself is guarded.
    try:
        scenario_df['driverRef_code'] = scenario_df['driverRef'].astype(driver_dtype).cat.codes
        scenario_df['constructorRef_code'] = scenario_df['constructorRef'].astype(constructor_dtype).cat.codes
    except Exception as e:
        # Best effort: record the failure for this driver and move on.
        print(f"  Error encoding scenario data for {driver_full_ref}: {e}")
        predictions_2025[driver_short_name] = f"N/A (Encoding error: {e})"
        continue

    # A code of -1 means the value is not among the recorded categories.
    if scenario_df['driverRef_code'].iloc[0] == -1:
        print(f"  Warning: Driver '{driver_full_ref}' was not in the full original historical categories after mapping. Prediction may be inaccurate.")
    if scenario_df['constructorRef_code'].iloc[0] == -1:
        print(f"  Warning: Constructor '{latest_constructor}' was not in the full original historical categories after mapping. Prediction may be inaccurate.")

    # Column order and names must match the training matrix.
    X_scenario = scenario_df[encoded_features]

    # --- 6. Make Predictions ---
    try:
        # predict_proba yields one column per class; column 1 is class 1 (win).
        win_probability = lgbm_model.predict_proba(X_scenario)[:, 1]
        predictions_2025[driver_short_name] = f"{win_probability[0]*100:.2f}%"
        print(f"  Assumed Constructor for 2025: {latest_constructor}")
        print(f"  Assumed Starting Grid: P{scenario_grid}")
        print(f"  Assumed Qualifying position: P{scenario_qualy}")
        print(f"  Predicted Win Probability: {predictions_2025[driver_short_name]}")
    except Exception as e:
        # Best effort: one failed scenario must not abort the others.
        print(f"  Error during prediction for {driver_full_ref}: {e}")
        predictions_2025[driver_short_name] = f"N/A (Prediction error: {e})"


# --- 7. Output ---
# Header: which training window was used, plus a legend of the options.
summary_header = (
    "\n--- Summary of 2025 Canadian GP Win Predictions (assuming P1 start) ---",
    f"Note: Model trained using TRAINING_DATA_RANGE = {TRAINING_DATA_RANGE}",
    "\n1: Use all available historical data before 2025",
    "2: Use data from 2000 to 2024",
    "3: Use data from 2020 to 2024\n\n",
)
print("\n".join(summary_header))

# One line per scenario, labelled with the driverRef rather than the short code.
for driver_name in predictions_2025:
    prob = predictions_2025[driver_name]
    print(f"  {drivers_scenarios[driver_name]}: {prob}")

# Descriptions are positional: entry i explains encoded_features[i].
feature_notes = (
    "The year of the race.",
    "The round number of the race within the season.",
    "A numerical code representing the driver.",
    "A numerical code representing the constructor (team).",
    "The driver's starting position on the grid for the race.",
    "The driver's qualifying position from the qualifying session.",
)
print("\n--- Explanation of Model Features ---")
print("The LightGBM model uses the following features to predict the probability of winning:")
for position, note in enumerate(feature_notes):
    print(f"- **{encoded_features[position]}**: {note}")

# Closing note and disclaimer about the limits of the model and the scenario.
closing_lines = (
    "\nNote: Categorical features (Driver and Constructor) are numerically encoded by pandas and LightGBM handles them efficiently.",
    "\nDisclaimer:",
    "These predictions are based on historical data and the specific features used in the model.",
    "Actual race outcomes can be influenced by numerous factors not captured in this model (e.g., real-time car performance, strategy decisions, driver form fluctuations, race incidents, weather conditions, rule changes, etc.).",
    "The assumed constructor for 2025 is based on the driver's most recent team in the dataset prior to 2025.",
    "The prediction scenario assumes the driver starts and qualifies in P1 (first position).",
)
print("\n".join(closing_lines))


Training on data from 2000 to 2024.
[LightGBM] [Info] Number of positive: 2098, number of negative: 29604
[LightGBM] [Info] Total Bins 189
[LightGBM] [Info] Number of data points in the train set: 31702, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.066179 -> initscore=-2.646925
[LightGBM] [Info] Start training from score -2.646925
Model trained successfully on 31702 samples.

--- Predicting for 2025 Canadian GP ---

Scenario for VER (max_verstappen):
  Assumed Constructor for 2025: red_bull
  Assumed Starting Grid: P1
  Assumed Qualifying position: P1
  Predicted Win Probability: 94.47%

Scenario for NOR (norris):
  Assumed Constructor for 2025: mclaren
  Assumed Starting Grid: P1
  Assumed Qualifying position: P1
  Predicted Win Probability: 0.01%

Scenario for HAM (hamilton):
  Assumed Constructor for 2025: mercedes
  Assumed Starting Grid: P1
  Assumed Qualifying position: P1
  Predicted Win Probability: 94.47%

--- Summary of 2025 Canadian GP Win Pr