In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

file_path = './data/raceresults2024carp.csv'
race_data = pd.read_csv(file_path)

# Display the first 10 rows in the notebook
race_data.head(10)


Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId,carPerformance
0,26286,1121,830,9,33,1,1,1,1,26,57,31:44.7,5504742,39,1,01:32.6,210.383,1,1.0
1,26287,1121,815,9,11,5,2,2,2,18,57,22.457,5527199,40,4,01:34.4,206.468,1,1.0
2,26288,1121,832,6,55,4,3,3,3,15,57,25.11,5529852,44,6,01:34.5,206.156,1,0.96
3,26289,1121,844,6,16,2,4,4,4,12,57,39.669,5544411,36,2,01:34.1,207.069,1,0.96
4,26290,1121,847,131,63,3,5,5,5,10,57,46.788,5551530,40,12,01:35.1,204.946,1,0.95
5,26291,1121,846,1,4,7,6,6,6,8,57,48.458,5553200,1,5,01:34.5,206.223,1,0.97
6,26292,1121,1,131,44,9,7,7,7,6,57,50.324,5555066,39,7,01:34.7,205.688,1,0.95
7,26293,1121,857,1,81,8,8,8,8,4,57,56.082,5560824,1,11,01:35.0,205.123,1,0.97
8,26294,1121,4,117,14,6,9,9,9,2,57,+1:14.887,5579629,48,3,01:34.2,206.83,1,0.88
9,26295,1121,840,117,18,12,10,10,10,1,57,+1:33.216,5597958,30,16,01:35.6,203.73,1,0.88


In [2]:
latest_race_id = race_data['raceId'].max()

latest_race_id

latest_race_data = race_data[race_data['raceId'] == latest_race_id]

latest_drivers = latest_race_data['driverId'].unique()

latest_drivers  

# Filter data to include only the drivers from the most recent race
relevant_data = race_data[race_data['driverId'].isin(latest_drivers)]

# Feature Engineering: Calculate average finish position, total points, average starting grid, and top-10 finishes
driver_features = relevant_data.groupby('driverId').agg(
    driver_number = ('number', 'first'),
    avg_finish_position=('positionOrder', 'mean'),
    total_points=('points', 'sum'),
    avg_start_position=('grid', 'mean'),
    top_10_finishes=('positionOrder', lambda x: (x <= 10).sum()),  # Count finishes within top 10
    avg_car_performance=('carPerformance', 'mean')
).reset_index()

driver_features



Unnamed: 0,driverId,driver_number,avg_finish_position,total_points,avg_start_position,top_10_finishes,avg_car_performance
0,1,44,6.95,176,7.35,18,0.9575
1,4,14,10.15,62,9.25,12,0.865
2,807,27,11.35,28,11.65,8,0.9105
3,815,11,8.65,137,7.95,15,0.939
4,822,77,16.0,0,14.5,0,0.85
5,825,20,13.526316,12,14.0,4,0.909474
6,830,33,3.7,330,2.85,19,0.939
7,832,55,5.473684,221,5.210526,17,0.973684
8,839,31,13.8,5,12.7,4,0.866
9,840,18,12.75,24,12.4,6,0.8635


In [3]:
# Define features and target
X = driver_features[['total_points', 'avg_start_position', 'top_10_finishes', 'avg_car_performance']]
#X = driver_features[['total_points', 'avg_start_position', 'top_10_finishes']]
y = driver_features['avg_finish_position']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest Regressor
model = Ridge()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

In [4]:
# Evaluate the model using Mean Absolute Error (MAE)
mean_absolute_error(y_test, y_pred)

np.float64(0.5751701922097194)

In [5]:
#Predict the average finish position for the drivers in the next race
predicted_finish_positions = model.predict(X)
driver_features['predicted_avg_finish_position'] = predicted_finish_positions

# Sort the drivers based on predicted finish position
driver_features = driver_features.sort_values('predicted_avg_finish_position')
driver_features

#normalize the predicted finish positions from 1 to 20
driver_features['predicted_finish_rank'] = driver_features['predicted_avg_finish_position'].rank(method='min')
driver_features

Unnamed: 0,driverId,driver_number,avg_finish_position,total_points,avg_start_position,top_10_finishes,avg_car_performance,predicted_avg_finish_position,predicted_finish_rank
6,830,33,3.7,330,2.85,19,0.939,3.334597,1.0
12,846,4,4.0,300,3.5,19,0.9915,3.968668,2.0
11,844,16,4.75,272,4.8,17,0.973,5.015902,3.0
17,857,81,4.75,239,5.4,19,0.9915,5.357172,4.0
7,832,55,5.473684,221,5.210526,17,0.973684,6.035805,5.0
0,1,44,6.95,176,7.35,18,0.9575,6.976089,6.0
13,847,63,7.4,167,6.25,17,0.9605,7.191905,7.0
3,815,11,8.65,137,7.95,15,0.939,8.325871,8.0
1,4,14,10.15,62,9.25,12,0.865,10.4604,9.0
2,807,27,11.35,28,11.65,8,0.9105,12.08882,10.0


In [6]:
mean_squared_error(y_test, y_pred)

np.float64(0.5767975697416269)

In [7]:
r2_score(y_test, y_pred)

0.9499633424644002

In [8]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [9]:
from sklearn.model_selection import GridSearchCV

In [10]:
model_cv = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)

In [11]:
model_cv.fit(X_train, y_train)
y_pred = model_cv.predict(X_test)

ValueError: Invalid parameter 'max_depth' for estimator Ridge(). Valid parameters are: ['alpha', 'copy_X', 'fit_intercept', 'max_iter', 'positive', 'random_state', 'solver', 'tol'].

In [41]:
mean_absolute_error(y_test, y_pred)

np.float64(0.5968125000000075)

In [42]:
mean_squared_error(y_test, y_pred)

np.float64(0.4069593474290245)

In [43]:
r2_score(y_test, y_pred)

0.9646966517086077

In [44]:
# Predict the average finish position for the drivers in the next race
predicted_finish_positions = model_cv.predict(X)
driver_features['predicted_avg_finish_position'] = predicted_finish_positions

# Sort the drivers based on predicted finish position
driver_features = driver_features.sort_values('predicted_avg_finish_position')
driver_features

#normalize the predicted finish positions from 1 to 20
driver_features['predicted_finish_rank'] = driver_features['predicted_avg_finish_position'].rank(method='min')
driver_features

Unnamed: 0,driverId,driver_number,avg_finish_position,total_points,avg_start_position,top_10_finishes,avg_car_performance,predicted_avg_finish_position,predicted_finish_rank
15,852,22,13.75,21,11.55,7,0.889,4.193408,1.0
0,1,44,6.95,176,7.35,18,0.9575,4.431329,2.0
9,840,18,12.75,24,12.4,6,0.8635,4.867079,3.0
10,842,10,13.7,9,13.15,6,0.866,5.040961,4.0
3,815,11,8.65,137,7.95,15,0.939,5.49925,5.0
5,825,20,13.526316,12,14.0,4,0.909474,7.050816,6.0
12,846,4,4.0,300,3.5,19,0.9915,7.423487,7.0
17,857,81,4.75,239,5.4,19,0.9915,8.217947,8.0
6,830,33,3.7,330,2.85,19,0.939,10.994908,9.0
16,855,24,16.15,0,17.15,0,0.85,11.296697,10.0
