In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from IPython.display import display

# Load the race table from the working directory (assumes the drive is already
# mounted and the notebook is running in the folder that holds the CSV).
try:
    # The file is ';'-separated and the literal string 'NULL' is read as missing.
    # low_memory=False parses the file in one pass so pandas does not infer
    # mixed dtypes per chunk (and does not emit the DtypeWarning).
    f1_main_table_barcelona_full = pd.read_csv(
        'f1_main_table_barcelona_full.csv',
        sep=';',
        na_values='NULL',
        low_memory=False,
    )
except FileNotFoundError:
    print("Ensure 'f1_main_table_barcelona_full.csv' is in the current directory.")
    # Nothing below can run without the data. Raise SystemExit directly instead
    # of calling the interactive-only exit() helper: it does not depend on the
    # `site` module, reports a non-zero status, and under IPython it stops the
    # cell without shutting the kernel down.
    raise SystemExit(1)

# Coerce 'race_position' to a numeric dtype; any value that cannot be parsed
# as a number becomes NaN.
f1_main_table_barcelona_full['race_position'] = pd.to_numeric(
    f1_main_table_barcelona_full['race_position'], errors='coerce'
)

# Binary target: 1 when the entry finished in 1st place, 0 otherwise.
# The comparison is vectorized; NaN == 1 evaluates to False, so rows without
# a usable position are labelled 0 (not the winner) with no explicit NaN check.
f1_main_table_barcelona_full['is_winner'] = (
    f1_main_table_barcelona_full['race_position'].eq(1).astype(int)
)

# Illustrative predictors, chosen as columns intended to be known before a race.
# 'year' is kept here only so the rows can be split by season below.
features = [
    'year', 'round',
    'driver_points', 'driver_wins',
    'constructors_points', 'constructors_wins',
    'driverRef', 'constructors_name',
]

# Target first, then the feature matrix; both are independent copies of the
# source frame's columns.
y = f1_main_table_barcelona_full['is_winner'].copy()
X = f1_main_table_barcelona_full[features].copy()

# Season split: every year before 2024 is history used for training, while the
# 2024 rows (the most recent season) stand in for the "2025" race.
season = X['year']
X_train = X.loc[season < 2024]
y_train = y.loc[season < 2024]
# Copied so that later modifications do not trigger SettingWithCopyWarning.
X_predict_2025 = X.loc[season == 2024].copy()

# Train a KNN winner classifier on the historical rows and report the most
# likely winner among the 2024 rows (used as a proxy for the "2025" race).
# Nothing is fitted when there are no 2024 rows to score.
if X_predict_2025.empty:
    print("No data found for the year 2024. Cannot make predictions for '2025' race based on 2024 data.")
else:
    # Keep the identifying columns for the final report; the index is preserved
    # so they can be re-joined with the predictions below.
    predict_drivers_constructors = X_predict_2025[['driverRef', 'constructors_name']].copy()

    # 'year' was only needed for the train/predict split; it is not a model input.
    X_train = X_train.drop(columns='year')
    X_predict_2025_processed = X_predict_2025.drop(columns='year')

    # Every remaining column that is not categorical is treated as numerical.
    categorical_features = ['driverRef', 'constructors_name']
    numerical_features = [col for col in X_train.columns if col not in categorical_features]

    # Single source of truth for k: used by the classifier AND by the printed
    # explanation, so the text cannot drift from the model configuration.
    n_neighbors = 5

    # Scale numerical columns and one-hot encode categorical ones.
    # handle_unknown='ignore' lets drivers/constructors that never appear in the
    # training years be encoded as all-zero instead of raising at predict time.
    # NOTE(review): KNeighborsClassifier rejects NaN inputs and no imputation
    # is done here; this assumes the selected columns have no missing values —
    # confirm against the data.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ]
    )

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', KNeighborsClassifier(n_neighbors=n_neighbors)),
    ])

    # Train the model using historical data (years < 2024)
    model.fit(X_train, y_train)

    # Score the 2024 rows with ONE neighbour search: the class probabilities
    # are computed once and the hard labels are derived from them (the class
    # with the highest neighbour share), instead of calling predict() and
    # predict_proba() separately over the same rows.
    classifier = model.named_steps['classifier']
    class_labels = list(classifier.classes_)
    predictions_proba_2025 = model.predict_proba(X_predict_2025_processed)
    predictions_2025 = classifier.classes_[predictions_proba_2025.argmax(axis=1)]

    # Look the "winner" column up by class label rather than assuming it is
    # column 1: if the training data contains no winners at all, the
    # probability matrix has a single column and [:, 1] would raise IndexError.
    if 1 in class_labels:
        winner_proba = predictions_proba_2025[:, class_labels.index(1)]
    else:
        winner_proba = [0.0] * len(X_predict_2025_processed)

    # Align the predictions with the original 2024 rows through their index.
    predictions_df = pd.DataFrame({
        'predicted_winner': predictions_2025,
        'winner_proba': winner_proba,  # Probability of being the winner
    }, index=X_predict_2025.index)

    # Driver / constructor labels side by side with the predictions.
    X_predict_2025_results = pd.concat([predict_drivers_constructors, predictions_df], axis=1)

    # Rows the model labels as winner, most confident first.
    predicted_winner_candidates = (
        X_predict_2025_results[X_predict_2025_results['predicted_winner'] == 1]
        .sort_values(by='winner_proba', ascending=False)
    )

    # --- Explanation Section ---
    print("--- Prediction Explanation ---")
    print("This output shows the predicted *winner* (1st place driver) for the '2025' Spain Grand Prix, based on a machine learning model trained on historical F1 data up to 2023 and applied to 2024 season data as a proxy for a future race.")
    print("\nFeatures considered for the prediction (available before a race):")
    print(f"- Numerical Features: {numerical_features} (These are scaled to have similar ranges)")
    print(f"- Categorical Features: {categorical_features} (These are converted into numerical format using One-Hot Encoding)")
    print("\nHow the prediction is made using K-Nearest Neighbors (KNN):")
    print("1.  **Data Preprocessing:**")
    print("    -   Numerical features (like points, wins, round number) are scaled so that features with larger values don't unfairly influence the distance calculation in KNN.")
    print("    -   Categorical features (like driver and constructor names) are converted into a numerical representation where each unique category becomes a new binary (0 or 1) column.")
    print("2.  **Model Training:** A KNN classifier is trained on historical data from years prior to 2024. The model learns the relationship between the processed features and whether a driver finished in 1st place.")
    print("3.  **Prediction:** For each driver entry in the 2024 data:")
    print(f"    -   The model finds the {n_neighbors} most 'similar' historical race entries from the training data based on the preprocessed features.")
    print(f"    -   It then predicts the outcome (Winner or Not Winner) for the 2024 driver based on the most frequent outcome among these {n_neighbors} neighbors.")
    print(f"4.  **Probability:** The 'Probability' score indicates the proportion of the {n_neighbors} nearest neighbors that were winners in their historical races. A higher probability suggests a stronger prediction for winning.")
    print("-" * 30)  # Separator line
    # --- End Explanation Section ---

    print("\nPredicted Winner for the '2025' Spain Grand Prix (based on 2024 data):")

    # Only the single most probable candidate is reported.
    predicted_winner = predicted_winner_candidates.head(1)

    if not predicted_winner.empty:
        winner_row = predicted_winner.iloc[0]
        print(f"Predicted Winner: {winner_row['driverRef']} ({winner_row['constructors_name']}) - Probability: {winner_row['winner_proba']:.4f}")
    else:
        # No 2024 row had a majority of winning neighbours.
        print("No driver predicted to be the winner.")

    # NOTE: to gauge how well the model fits the historical data, compare
    # model.predict(X_train) against y_train with classification_report
    # (optional; this measures training fit, not out-of-sample accuracy).

--- Prediction Explanation ---
This output shows the predicted *winner* (1st place driver) for the '2025' Spain Grand Prix, based on a machine learning model trained on historical F1 data up to 2023 and applied to 2024 season data as a proxy for a future race.

Features considered for the prediction (available before a race):
- Numerical Features: ['round', 'driver_points', 'driver_wins', 'constructors_points', 'constructors_wins'] (These are scaled to have similar ranges)
- Categorical Features: ['driverRef', 'constructors_name'] (These are converted into numerical format using One-Hot Encoding)

How the prediction is made using K-Nearest Neighbors (KNN):
1.  **Data Preprocessing:**
    -   Numerical features (like points, wins, round number) are scaled so that features with larger values don't unfairly influence the distance calculation in KNN.
    -   Categorical features (like driver and constructor names) are converted into a numerical representation where each unique category bec