In [106]:
import sys
sys.path.append('../../')

import math
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score

In [107]:
from src.transformer_utils import encode_labels

## 1. Load the data

### 1.1. The model

In [108]:
# Load the model from its pickle file.
# NOTE: unpickling can execute arbitrary code; only load model files from a trusted source.
with open("../../models/v0.1/modelv01.pkl", mode="rb") as model_file:
    loaded_model = pickle.load(model_file)

### 1.2. The databases

In [109]:
# Driver and constructor tables are read from data/db; race results and
# processed circuits are read from data/outputs.
csv_paths = (
    '../../data/db/01_driver.csv',
    '../../data/db/02_constructors.csv',
    '../../data/outputs/03_races_results.csv',
    '../../data/outputs/02_circuits_processed.csv',
)
df_drivers, df_constructors, df_races_results, df_circuits = (
    pd.read_csv(csv_path) for csv_path in csv_paths
)

## 2. Input data

In [110]:
# Scenario to predict: which race, and which driver/constructor pair.
year = 2024
race_round = 1
circuitId = 3        # its past races in the results table are the Bahrain / Sakhir Grand Prix
driverId = 847       # George Russell in the drivers table
constructorId = 131  # Mercedes in the constructors table

### 2.1. Get the driver's previous races on the circuit

In [115]:
# Past results of this driver, with this constructor, on this circuit,
# ordered from the most recent season to the oldest.
prev_races = (
    df_races_results
    .loc[
        (df_races_results['driverId'] == driverId)
        & (df_races_results['constructorId'] == constructorId)
        & (df_races_results['circuitId'] == circuitId)
    ]
    .sort_values(by='year', ascending=False)
)
prev_races

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,race_rank,points,laps,milliseconds,fastestLap,fastestLapTime,fastestLapSpeed,statusId,year,round,circuitId,name
25846,25852,1098,847,131,6,7,6.0,57,5692609,33,97221,200.401,1,2023,1,3,Bahrain Grand Prix
25403,25409,1074,847,131,9,4,12.0,57,5864795,56,96301,202.313,1,2022,1,3,Bahrain Grand Prix
24928,24934,1046,847,131,2,9,3.0,87,5493670,80,55404,230.214,1,2020,16,3,Sakhir Grand Prix


### 2.2. Fill the required race-result fields

In [116]:
# Build the placeholder race-result row for the race to predict.
# When the driver/constructor pair has history on this circuit (`prev_races`,
# most recent season first) the values come from that history; otherwise they
# fall back to statistics over every row of `df_races_results`.
has_history = prev_races.shape[0] > 0
stats_source = prev_races if has_history else df_races_results

if has_history:
    latest_race = prev_races.iloc[0]
    grid = latest_race['grid']
    race_rank = latest_race['race_rank']  # placeholder only: this is the value to predict
    points = prev_races['points'].mean()
    laps = prev_races['laps'].max()
else:
    grid = math.ceil(df_races_results['grid'].mean())
    race_rank = df_races_results['race_rank'].median()
    points = 0  # no history on this circuit: default to 0
    laps = df_races_results['laps'].median()

# These aggregates are computed the same way in both cases; only the source frame differs.
milliseconds = int(stats_source['milliseconds'].median())
fastestLap = int(stats_source['fastestLap'].median())
fastestLapTime = int(stats_source['fastestLapTime'].median())
fastestLapSpeed = stats_source['fastestLapSpeed'].mean()
statusId = 1  # assumed status for the race to predict

# Field order must match `col_name`. The two leading values are dummy
# resultId / raceId entries and the trailing string is a dummy race name.
data = [1, 2, driverId, constructorId, grid, race_rank, points, laps, milliseconds, fastestLap, fastestLapTime, fastestLapSpeed,
        statusId, year, race_round, circuitId, 'Circuit dummy name']

## 3. Compose the prediction dataset

In [111]:
# Rows of the drivers table that belong to the selected driver.
curr_driver_df = df_drivers.loc[df_drivers['driverId'] == driverId]

In [112]:
# Preview the selected driver's row(s).
curr_driver_df.head()

Unnamed: 0,driverId,number,nationality,driver_is_active,age,full_name,driver_avg_point,driver_avg_speed,race_end_bf_2015,race_end_in_2015,...,race_end_in_2017,race_end_in_2018,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10
845,847,63,British,1,26,George Russell,3.968085,209.749033,0,0,...,0,0,2,4,4,20,10,18,1,35


In [113]:
# Rows of the constructors table that belong to the selected constructor.
curr_constructor_df = df_constructors.loc[df_constructors['constructorId'] == constructorId]

In [114]:
# Preview the selected constructor's row(s).
curr_constructor_df.head()

Unnamed: 0,constructorId,constructor_name,constructor_country,constructor_is_active,constructor_races_won,constructor_avg_point,constructor_times_in_top_10
129,131,Mercedes,German,1,125,12.259658,483


In [117]:
# Column layout of a race-result row; the fields are listed in the same order
# as the values in `data`.
col_name = [
    'resultId', 'raceId', 'driverId', 'constructorId',
    'grid', 'race_rank', 'points', 'laps',
    'milliseconds', 'fastestLap', 'fastestLapTime', 'fastestLapSpeed',
    'statusId', 'year', 'round', 'circuitId', 'name',
]

In [118]:
# Single-row frame describing the race to predict.
df = pd.DataFrame(data=[data], columns=col_name)
df

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,race_rank,points,laps,milliseconds,fastestLap,fastestLapTime,fastestLapSpeed,statusId,year,round,circuitId,name
0,1,2,847,131,6,7,7.0,87,5692609,56,96301,210.976,1,2024,1,3,Circuit dummy name


In [119]:
# Attach the driver's features (inner join on driverId).
# NOTE: `df` is rebuilt from itself, so re-running this cell on its own merges
# the driver columns a second time; re-run the cell that creates `df` first.
df = pd.merge(df, df_drivers, on='driverId', how='inner')
df

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,race_rank,points,laps,milliseconds,fastestLap,...,race_end_in_2017,race_end_in_2018,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10
0,1,2,847,131,6,7,7.0,87,5692609,56,...,0,0,2,4,4,20,10,18,1,35


In [120]:
# Attach the constructor's features (inner join on constructorId), using the
# constructors table already filtered to the selected constructor.
# NOTE: `df` is rebuilt from itself, so this cell is not safe to re-run alone.
df = pd.merge(df, curr_constructor_df, on='constructorId', how='inner')
df

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,race_rank,points,laps,milliseconds,fastestLap,...,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,constructor_name,constructor_country,constructor_is_active,constructor_races_won,constructor_avg_point,constructor_times_in_top_10
0,1,2,847,131,6,7,7.0,87,5692609,56,...,10,18,1,35,Mercedes,German,1,125,12.259658,483


In [121]:
# Attach only the circuit's activity flag (inner join on circuitId).
# NOTE: `df` is rebuilt from itself, so this cell is not safe to re-run alone.
df = pd.merge(
    df,
    df_circuits.loc[:, ['circuitId', 'circuits_is_active']],
    on='circuitId',
    how='inner',
)
df

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,race_rank,points,laps,milliseconds,fastestLap,...,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,constructor_name,constructor_country,constructor_is_active,constructor_races_won,constructor_avg_point,constructor_times_in_top_10,circuits_is_active
0,1,2,847,131,6,7,7.0,87,5692609,56,...,18,1,35,Mercedes,German,1,125,12.259658,483,1


## 4. Process the data

In [122]:
# Base variables that are not strongly correlated; they are dropped here,
# as was done originally.
not_corr_cols = [
    'constructorId', 'raceId', 'resultId', 'year',
    'round', 'circuitId', 'age', 'driver_most_won_circuit_id',
]
df_pred = df.drop(columns=not_corr_cols)

# Drop every remaining object-dtype column as well.
cols = df_pred.select_dtypes(include=np.object_).columns.tolist()
df_pred = df_pred.drop(columns=cols)
df_pred

Unnamed: 0,driverId,grid,race_rank,points,laps,milliseconds,fastestLap,fastestLapTime,fastestLapSpeed,statusId,...,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_nber_of_races_won,driver_nber_of_times_in_top_10,constructor_is_active,constructor_races_won,constructor_avg_point,constructor_times_in_top_10,circuits_is_active
0,847,6,7,7.0,87,5692609,56,96301,210.976,1,...,4,20,10,1,35,1,125,12.259658,483,1


In [123]:
# Label-encode the listed columns with the project helper `encode_labels`.
# NOTE(review): in the output displayed for this cell, grid, race_rank, laps,
# fastestLap and statusId all become 0 for this single-row frame (they were
# 6, 7, 87, 56 and 1 before the call). Presumably the helper fits a fresh
# encoder on the frame it receives; if so, these codes will not match the ones
# the model was trained on. TODO confirm against `src.transformer_utils`.
df_pred = encode_labels(df_pred, ['grid', 'race_rank', 'laps', 'fastestLap', 'statusId'])
df_pred

Unnamed: 0,driverId,grid,race_rank,points,laps,milliseconds,fastestLap,fastestLapTime,fastestLapSpeed,statusId,...,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_nber_of_races_won,driver_nber_of_times_in_top_10,constructor_is_active,constructor_races_won,constructor_avg_point,constructor_times_in_top_10,circuits_is_active
0,847,0,0,7.0,0,5692609,0,96301,210.976,0,...,4,20,10,1,35,1,125,12.259658,483,1


## 5. Let's predict

In [124]:
# Make predictions with the loaded model.
# NOTE(review): the "true" race_rank used below is the placeholder filled in
# earlier (and then label-encoded), not an observed result, so this accuracy
# does not measure the model's quality.
target = 'race_rank'
features = [column for column in df_pred.columns if column != target]

loaded_predictions = loaded_model.predict(df_pred[features])
loaded_accuracy = accuracy_score(y_true=df_pred[target], y_pred=loaded_predictions)
print("Accuracy of loaded model:", loaded_accuracy)

Accuracy of loaded model: 0.0


In [125]:
# Display the raw prediction array for the composed row (same call as in the previous cell).
loaded_model.predict(df_pred[features])

array([6], dtype=int32)