In [1]:
import pandas as pd
from sklearn.preprocessing import Normalizer, LabelEncoder

In [2]:
from libs.predictor_util import make_classification, plot_confusion_matrix, analysing_feature_importance
from libs.transformer_utils import process_features_standardisation

## 1. Load the dataset

In [3]:
# Load the final dataset CSV (the path is relative to the notebook's working directory).
df = pd.read_csv('../data/outputs/06_final_dataset.csv')

In [4]:
# Keep only the rows whose race_rank is non-zero.
# NOTE(review): presumably 0 marks results without a classified finishing position - confirm.
# .copy() makes `df` an independent frame instead of a filtered selection of the loaded
# one, so the column assignments performed in later cells do not trigger
# SettingWithCopyWarning.
df = df[df['race_rank'] != 0].copy()
# Alternative kept for reference: drop only a random 10% sample of the rank-0 rows.
# df = df.drop(df[df['race_rank'] == 0].sample(frac=0.1).index)
df

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,race_rank,statusId,year,round,circuitId,...,race_end_bf_2019,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,age
0,1,18,1,1,1,2,1,2008,1,1,...,193,21,16,21,19,12,14,63,292,23
1,2,18,2,2,5,3,1,2008,1,1,...,66,0,0,0,0,0,2,2,131,31
2,3,18,3,3,7,5,1,2008,1,1,...,145,0,0,0,0,0,3,20,158,23
3,4,18,4,4,11,7,1,2008,1,1,...,202,0,0,11,12,12,9,22,285,27
4,5,18,5,1,3,1,1,2008,1,1,...,39,0,0,0,0,0,1,2,37,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26074,26080,1110,825,210,16,14,1,2023,12,13,...,28,6,3,0,9,4,15,2,62,31
26075,26081,1110,817,213,19,15,1,2023,12,13,...,93,8,12,13,14,1,1,16,128,34
26076,26082,1110,858,3,18,9,1,2023,12,13,...,0,0,0,0,0,4,0,0,2,23
26077,26083,1110,807,210,0,4,1,2023,12,13,...,83,7,2,0,2,5,15,2,91,36


In [5]:
# Re-encode the target as consecutive integer class labels 0..n_classes-1
# (LabelEncoder assigns codes following the sorted order of the unique values).
# Keep `le` around: le.inverse_transform(...) maps predicted codes back to the
# original race_rank values.
le = LabelEncoder()
df['race_rank'] = le.fit_transform(df['race_rank'])

## 2. Normalizing the numeric feature columns

In [6]:
# List the available columns before choosing which ones to rescale.
df.columns

Index(['resultId', 'raceId', 'driverId', 'constructorId', 'grid', 'race_rank',
       'statusId', 'year', 'round', 'circuitId', 'constructor_is_active',
       'constructor_races_won', 'constructor_avg_point',
       'constructor_times_in_top_10', 'driver_is_active', 'driver_avg_point',
       'driver_avg_speed', 'race_end_bf_2019', 'race_end_in_2019',
       'race_end_in_2020', 'race_end_in_2021', 'race_end_in_2022',
       'race_end_in_2023', 'driver_most_won_circuit_id',
       'driver_nber_of_races_won', 'driver_nber_of_times_in_top_10', 'age'],
      dtype='object')

In [7]:
# Columns handed to the scaling helper; every other column keeps its raw values.
cols = [
    'year',
    'driver_avg_speed',
    'constructor_avg_point',
    'driver_avg_point',
    'constructor_races_won',
    'race_end_bf_2019',
    'race_end_in_2019',
    'race_end_in_2020',
    'race_end_in_2021',
    'race_end_in_2022',
    'race_end_in_2023',
    'driver_nber_of_races_won',
    'driver_nber_of_times_in_top_10',
    'age',
]
# NOTE(review): sklearn's Normalizer rescales each row (sample) to unit norm; it does not
# scale each column independently. The year values shown afterwards (~0.98) are
# consistent with row-wise normalisation. If per-column scaling was intended,
# StandardScaler/MinMaxScaler is the usual choice - confirm what the helper does with
# the class it receives.
df[cols] = process_features_standardisation(df, cols, Normalizer)

In [8]:
# Remove the race/result identifier columns before modelling.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it once the columns are gone no longer raises KeyError.
df = df.drop(columns=['raceId', 'resultId'], errors='ignore')

In [9]:
# df['statusId'] = pd.Categorical(df['statusId'])

## Split df into two halves

In [10]:
# Split the DataFrame into two parts by row position
# (with an odd row count the second part receives the extra row).
midpoint = len(df) // 2
part1 = df.iloc[:midpoint]
part2 = df.iloc[midpoint:]

## 3. Apply the model

In [11]:
# Column the classifier must predict (label-encoded in an earlier cell).
target = 'race_rank'
# Every remaining column of df is used as an input feature.
# NOTE(review): this includes statusId; if that value is only known once the race is
# over, it leaks the outcome into the features - confirm.
features:list[str] = df.columns.to_list()
features.remove(target)

In [12]:
# Preview the frame that is passed to the model.
df.head()

Unnamed: 0,driverId,constructorId,grid,race_rank,statusId,year,round,circuitId,constructor_is_active,constructor_races_won,...,race_end_bf_2019,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,age
0,1,1,1,1,1,0.979134,1,1,1,0.024381,...,0.09411,0.01024,0.007802,0.01024,0.009265,0.005851,14,0.03072,0.142384,0.011215
1,2,2,5,2,1,0.992141,1,1,0,0.000988,...,0.03261,0.0,0.0,0.0,0.0,0.0,2,0.000988,0.064726,0.015317
2,3,3,7,4,1,0.989365,1,1,1,0.003449,...,0.071443,0.0,0.0,0.0,0.0,0.0,3,0.009854,0.077848,0.011332
3,4,4,11,6,1,0.980065,1,1,0,0.006833,...,0.098592,0.0,0.0,0.005369,0.005857,0.005857,9,0.010738,0.139103,0.013178
4,5,1,3,0,1,0.994413,1,1,1,0.024761,...,0.019314,0.0,0.0,0.0,0.0,0.0,1,0.00099,0.018323,0.013371


In [14]:
# Train and evaluate the classifier through the project helper (it prints the accuracy).
# NOTE(review): the result is indexed positionally in the cells below - [0] is used as
# the fitted estimator, [1] and [2] feed the confusion matrix, [3] feeds the
# feature-importance analysis and [4] is printed as the classification report.
# Confirm this layout against libs.predictor_util.
model = make_classification(df, features, target)

Accuracy: 0.12378094523630907


In [15]:
# Show the classification report (element 4 of the helper's result) under a heading.
print("Classification Report:", model[4], sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.24      0.28      0.26       138
           1       0.16      0.23      0.19       132
           2       0.13      0.15      0.14       136
           3       0.11      0.09      0.10       150
           4       0.11      0.09      0.10       138
           5       0.05      0.05      0.05       126
           6       0.13      0.09      0.11       132
           7       0.10      0.09      0.10       128
           8       0.05      0.04      0.05       113
           9       0.10      0.09      0.10       118
          10       0.05      0.05      0.05       131
          11       0.10      0.11      0.10       121
          12       0.10      0.07      0.08       138
          13       0.11      0.13      0.12       126
          14       0.08      0.07      0.07       133
          15       0.08      0.09      0.09       127
          16       0.14      0.13      0.14       138
    

In [16]:
# Plot the confusion matrix from elements 1 and 2 of the helper's result
# (presumably the true and predicted test labels - confirm in libs.predictor_util).
plot_confusion_matrix((model[1], model[2]))

In [17]:
# Analyse feature importance for the fitted estimator (model[0]).
# NOTE(review): model[3] is presumably the feature names/matrix used for training - confirm.
analysing_feature_importance(model[0], model[3])

In [23]:
# Display the first row (target included) as a one-row DataFrame.
df.head(1)

Unnamed: 0,driverId,constructorId,grid,race_rank,statusId,year,round,circuitId,constructor_is_active,constructor_races_won,...,race_end_bf_2019,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,age
0,1,1,1,1,1,0.979134,1,1,1,0.024381,...,0.09411,0.01024,0.007802,0.01024,0.009265,0.005851,14,0.03072,0.142384,0.011215


In [21]:
# Same first row with the target column removed, i.e. the model's input for that row.
df.head(1).drop(columns='race_rank')

Unnamed: 0,driverId,constructorId,grid,statusId,year,round,circuitId,constructor_is_active,constructor_races_won,constructor_avg_point,...,race_end_bf_2019,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,age
0,1,1,1,1,0.979134,1,1,1,0.024381,0.00164,...,0.09411,0.01024,0.007802,0.01024,0.009265,0.005851,14,0.03072,0.142384,0.011215


In [22]:
# Predict the class for the first row. The row belongs to the frame that was passed to
# make_classification, so this is a smoke test rather than an evaluation.
# The returned value is a LabelEncoder code; le.inverse_transform(...) recovers the
# original race_rank.
model[0].predict(df.head(1).drop(columns='race_rank'))

array([1])