In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the Miami Grand Prix data
f1_main_table_miami = pd.read_excel("f1_main_table_miami.xlsx", sheet_name='miami')

# Define drivers who will NOT participate in 2025
drivers_not_in_2025 = ['peres', 'bottas', 'kevin_magnussen', 'latifi', 'mick_schumacher', 'zhou', 'de_vries', 'sargeant'] # Replace with actual driverRefs

# Filter out the drivers not participating in 2025
f1_main_table_miami = f1_main_table_miami[~f1_main_table_miami['driverRef'].isin(drivers_not_in_2025)]

f1_main_table_miami['driver_points_last_race'] = f1_main_table_miami.groupby('driverRef')['driver_points'].shift(1)
f1_main_table_miami['position_last_race'] = f1_main_table_miami.groupby('driverRef')['position'].shift(1)

# Prepare the data
features = ['driver_points', 'driver_points_last_race', 'position', 'position_last_race', 'qualy_position', 'sprint_position', 'sprint_points', 'fastestLap', 'points', 'driverRef']  # Add more relevant features
target = 'position'

# Before dropping NaNs, make a copy to preserve 'driverRef'
X = f1_main_table_miami[features].copy()
X.dropna(inplace=True)
y = f1_main_table_miami.loc[X.index, target]

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X['driverRef'] = le.fit_transform(X['driverRef'])
X.info()
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Choose and train a model RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")

# Predict the winner for 2025 (example)
vDriver = 'max_verstappen'
new_data_2025 = pd.DataFrame({'driver_points': [100], 'driver_points_last_race': [24], 'position': [2], 'position_last_race': [1], 'qualy_position': [1], 'sprint_position': [2], 'sprint_points':[3], 'fastestLap': [1], 'points': [25], 'driverRef': [le.transform([vDriver])]})

# Prediction
predicted_position = model.predict(new_data_2025)[0]  # Get the predicted position
print(f"Predicted position for {vDriver} in 2025: {predicted_position}")

# Find the driverRef associated with the predicted position
predicted_driverRef = le.inverse_transform([X.loc[X['position'] == predicted_position, 'driverRef'].iloc[0]])[0]
print(f"Predicted Winner for 2025: {predicted_driverRef}")

<class 'pandas.core.frame.DataFrame'>
Index: 29 entries, 1 to 57
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   driver_points            29 non-null     int64  
 1   driver_points_last_race  29 non-null     float64
 2   position                 29 non-null     int64  
 3   position_last_race       29 non-null     float64
 4   qualy_position           29 non-null     int64  
 5   sprint_position          29 non-null     int64  
 6   sprint_points            29 non-null     int64  
 7   fastestLap               29 non-null     int64  
 8   points                   29 non-null     int64  
 9   driverRef                29 non-null     int64  
dtypes: float64(2), int64(8)
memory usage: 2.5 KB
Model Accuracy: 1.0
Predicted position for max_verstappen in 2025: 0
Predicted Winner for 2025: hamilton
