In [2]:
#import
# Data Processing
import pandas as pd
import numpy as np

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.preprocessing import LabelEncoder

# Tree Visualisation
from sklearn.tree import export_graphviz
from IPython.display import Image
#import graphviz

import copy




In [3]:

lap_data = pd.read_csv('output.csv')
lap_data.shape

(289962, 11)

In [4]:
lap_data.describe()

Unnamed: 0,lap_num,position_during_lap,age,start_pos
count,289962.0,289952.0,289962.0,289962.0
mean,30.254002,9.919956,27.957484,10.477163
std,18.281548,5.569781,5.358625,6.091154
min,1.0,1.0,17.0,0.0
25%,15.0,5.0,24.0,5.0
50%,29.0,10.0,27.0,10.0
75%,45.0,14.0,32.0,16.0
max,87.0,24.0,44.0,24.0


In [5]:
lap_data.values

array([['00:01:40.424', 1, 1.0, ..., 1, '2024-09-22', nan],
       ['00:01:38.48', 2, 1.0, ..., 1, '2024-09-22', nan],
       ['00:01:38.406', 3, 1.0, ..., 1, '2024-09-22', nan],
       ...,
       ['00:01:31.812', 56, 7.0, ..., 22, '2012-03-18', nan],
       ['00:01:32.353', 57, 7.0, ..., 22, '2012-03-18', nan],
       ['00:01:40.163', 58, 8.0, ..., 22, '2012-03-18', nan]],
      dtype=object)

In [6]:
print(lap_data['session_status'].unique())

['Finished' '+1 Lap' 'Power Unit' 'Accident' 'Retired' 'Overheating'
 '+2 Laps' 'Disqualified' 'Hydraulics' 'Water pressure' '+7 Laps'
 'Collision' 'Engine' 'Gearbox' 'Brakes' 'Collision damage' 'Withdrew'
 'Undertray' 'Radiator' 'Illness' 'Rear wing' 'Technical' 'Steering'
 'Oil leak' 'Power loss' 'Mechanical' 'Electrical' 'Water leak' 'Spun off'
 'Differential' '+6 Laps' 'Fuel pump' 'Vibrations' 'Turbo' 'Suspension'
 'Front wing' 'Fuel leak' 'Water pump' 'Cooling system' 'Fuel pressure'
 'Damage' 'Puncture' '+3 Laps' 'Wheel nut' 'Transmission' 'Debris'
 'Electronics' '+5 Laps' 'Wheel' 'Exhaust' 'Out of fuel' 'Battery' 'Tyre'
 'Spark plugs' 'Oil pressure' 'Seat' 'Driveshaft' '+4 Laps' 'Excluded'
 'Brake duct' 'Throttle' 'ERS' '+8 Laps' 'Drivetrain' 'Clutch'
 'Alternator' 'Pneumatics' '+11 Laps']


In [7]:
#Add field laps no nth since last pit of race start
lap_data['laps_since_last_pit_or_start'] = 0

last_pit_lap = 0
last_lap_num = 0;
for idx,  row in lap_data.iterrows():
    if row['lap_num'] == 1:
        last_pit_lap = 1
        last_lap_num = 1;
        lap_data.at[idx, 'laps_since_last_pit_or_start'] = last_pit_lap
        last_pit_lap += 1
    elif pd.notna(row['pitstop_duration']):
        lap_data.at[idx, 'laps_since_last_pit_or_start'] = last_pit_lap
        last_pit_lap = 1
    else:
        lap_data.at[idx, 'laps_since_last_pit_or_start'] = last_pit_lap
        last_pit_lap += (row['lap_num'] - last_lap_num)
    last_lap_num = row['lap_num'];
lap_data



Unnamed: 0,laptime,lap_num,position_during_lap,driver_ref,age,team_ref,circuit_ref,session_status,start_pos,race_date,pitstop_duration,laps_since_last_pit_or_start
0,00:01:40.424,1,1.0,norris,25,mclaren,marina_bay,Finished,1,2024-09-22,,1
1,00:01:38.48,2,1.0,norris,25,mclaren,marina_bay,Finished,1,2024-09-22,,2
2,00:01:38.406,3,1.0,norris,25,mclaren,marina_bay,Finished,1,2024-09-22,,3
3,00:01:38.446,4,1.0,norris,25,mclaren,marina_bay,Finished,1,2024-09-22,,4
4,00:01:37.938,5,1.0,norris,25,mclaren,marina_bay,Finished,1,2024-09-22,,5
...,...,...,...,...,...,...,...,...,...,...,...,...
289957,00:01:31.621,54,7.0,perez,22,sauber,albert_park,Finished,22,2012-03-18,,30
289958,00:01:31.559,55,7.0,perez,22,sauber,albert_park,Finished,22,2012-03-18,,31
289959,00:01:31.812,56,7.0,perez,22,sauber,albert_park,Finished,22,2012-03-18,,32
289960,00:01:32.353,57,7.0,perez,22,sauber,albert_park,Finished,22,2012-03-18,,33


In [8]:
#Clean data

#take the laps where position during lap is NOT null and drop pitstop_duration
data_to_clean = ['lap_num', 'position_during_lap', 'driver_ref', 'age', 'team_ref','circuit_ref','start_pos','race_date', 'laps_since_last_pit_or_start']
for label in data_to_clean:
    lap_data = lap_data[lap_data[label].notna()]

lap_data = lap_data.drop('pitstop_duration', axis = 1)


lap_data

Unnamed: 0,laptime,lap_num,position_during_lap,driver_ref,age,team_ref,circuit_ref,session_status,start_pos,race_date,laps_since_last_pit_or_start
0,00:01:40.424,1,1.0,norris,25,mclaren,marina_bay,Finished,1,2024-09-22,1
1,00:01:38.48,2,1.0,norris,25,mclaren,marina_bay,Finished,1,2024-09-22,2
2,00:01:38.406,3,1.0,norris,25,mclaren,marina_bay,Finished,1,2024-09-22,3
3,00:01:38.446,4,1.0,norris,25,mclaren,marina_bay,Finished,1,2024-09-22,4
4,00:01:37.938,5,1.0,norris,25,mclaren,marina_bay,Finished,1,2024-09-22,5
...,...,...,...,...,...,...,...,...,...,...,...
289957,00:01:31.621,54,7.0,perez,22,sauber,albert_park,Finished,22,2012-03-18,30
289958,00:01:31.559,55,7.0,perez,22,sauber,albert_park,Finished,22,2012-03-18,31
289959,00:01:31.812,56,7.0,perez,22,sauber,albert_park,Finished,22,2012-03-18,32
289960,00:01:32.353,57,7.0,perez,22,sauber,albert_park,Finished,22,2012-03-18,33


In [9]:
#Encode label



to_encode = ['driver_ref', 'team_ref', 'circuit_ref', 'session_status']
d ={}
for col in to_encode:
    d[col]=LabelEncoder().fit(lap_data[col])

for col in to_encode:
    lap_data[col] = d[col].transform(lap_data[col])


In [10]:
lap_data.describe()

Unnamed: 0,lap_num,position_during_lap,driver_ref,age,team_ref,circuit_ref,session_status,start_pos,laps_since_last_pit_or_start
count,289952.0,289952.0,289952.0,289952.0,289952.0,289952.0,289952.0,289952.0,289952.0
mean,30.254187,9.919956,34.79548,27.957393,11.187921,15.908354,20.042035,10.477065,13.459552
std,18.281511,5.569781,19.711102,5.358645,6.175307,10.299827,14.916492,6.091164,10.121645
min,1.0,1.0,0.0,17.0,0.0,0.0,0.0,0.0,1.0
25%,15.0,5.0,19.0,24.0,5.0,7.0,0.0,5.0,6.0
50%,29.0,10.0,36.0,27.0,12.0,15.0,30.0,10.0,11.0
75%,45.0,14.0,51.0,32.0,16.0,25.0,30.0,16.0,19.0
max,87.0,24.0,68.0,44.0,20.0,34.0,67.0,24.0,78.0


In [11]:


#test inverse transform
test_lap_data = copy.deepcopy(lap_data)
for col in to_encode:
    test_lap_data[col] = d[col].inverse_transform(test_lap_data[col])
test_lap_data

Unnamed: 0,laptime,lap_num,position_during_lap,driver_ref,age,team_ref,circuit_ref,session_status,start_pos,race_date,laps_since_last_pit_or_start
0,00:01:40.424,1,1.0,norris,25,mclaren,marina_bay,Finished,1,2024-09-22,1
1,00:01:38.48,2,1.0,norris,25,mclaren,marina_bay,Finished,1,2024-09-22,2
2,00:01:38.406,3,1.0,norris,25,mclaren,marina_bay,Finished,1,2024-09-22,3
3,00:01:38.446,4,1.0,norris,25,mclaren,marina_bay,Finished,1,2024-09-22,4
4,00:01:37.938,5,1.0,norris,25,mclaren,marina_bay,Finished,1,2024-09-22,5
...,...,...,...,...,...,...,...,...,...,...,...
289957,00:01:31.621,54,7.0,perez,22,sauber,albert_park,Finished,22,2012-03-18,30
289958,00:01:31.559,55,7.0,perez,22,sauber,albert_park,Finished,22,2012-03-18,31
289959,00:01:31.812,56,7.0,perez,22,sauber,albert_park,Finished,22,2012-03-18,32
289960,00:01:32.353,57,7.0,perez,22,sauber,albert_park,Finished,22,2012-03-18,33


In [12]:
#split the data

X = lap_data.drop(columns=['laptime'])
y = lap_data['laptime'];
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [13]:
#Train

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

ValueError: could not convert string to float: '2013-05-12'

In [None]:
#Test accuracy

y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)