In [1]:
import pandas as pd
from sklearn.preprocessing import Normalizer, LabelEncoder, RobustScaler

In [2]:
from libs.predictor_util import make_classification, plot_confusion_matrix, analysing_feature_importance
from libs.transformer_utils import process_features_standardisation

## 1. Load the Data set

In [3]:
df = pd.read_csv('../data/outputs/06_final_dataset.csv')

In [4]:
df.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,race_rank,statusId,year,round,circuitId,...,race_end_bf_2019,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,age
0,1,18,1,1,1,2,1,2008,1,1,...,179,21,16,21,19,12,14,61,276,23
1,2,18,2,2,5,3,1,2008,1,1,...,35,0,0,0,0,0,2,2,31,31
2,3,18,3,3,7,5,1,2008,1,1,...,132,0,0,0,0,0,1,19,136,23
3,4,18,4,4,11,7,1,2008,1,1,...,131,0,0,11,12,12,9,12,183,27
4,5,18,5,1,3,1,1,2008,1,1,...,28,0,0,0,0,0,1,2,25,27


In [5]:
le = LabelEncoder()
df['race_rank'] = le.fit_transform(df['race_rank'])

## 2. Normalizing the year column

In [6]:
df.columns

Index(['resultId', 'raceId', 'driverId', 'constructorId', 'grid', 'race_rank',
       'statusId', 'year', 'round', 'circuitId', 'constructor_is_active',
       'constructor_races_won', 'constructor_avg_point',
       'constructor_times_in_top_10', 'driver_is_active', 'driver_avg_point',
       'driver_avg_speed', 'race_end_bf_2019', 'race_end_in_2019',
       'race_end_in_2020', 'race_end_in_2021', 'race_end_in_2022',
       'race_end_in_2023', 'driver_most_won_circuit_id',
       'driver_nber_of_races_won', 'driver_nber_of_times_in_top_10', 'age'],
      dtype='object')

In [7]:
cols = ['year', 'driver_avg_speed', 'constructor_avg_point', 'driver_avg_point', 'constructor_races_won', 'race_end_bf_2019', 'race_end_in_2019', 'race_end_in_2020', 'race_end_in_2021', 'race_end_in_2022', 'race_end_in_2023', 'driver_nber_of_races_won', 'driver_nber_of_times_in_top_10', 'age']
df[cols] = process_features_standardisation(df, cols, Normalizer)

In [8]:
df.drop(['raceId', 'resultId', 'statusId', 'constructor_races_won', 'constructor_avg_point', 'constructor_times_in_top_10'], axis=1, inplace=True)

In [9]:
# df['statusId'] = pd.Categorical(df['statusId'])

In [10]:
df_val = df.iloc[:5]
df = df.iloc[5:]
df.head()


Unnamed: 0,driverId,constructorId,grid,race_rank,year,round,circuitId,constructor_is_active,driver_is_active,driver_avg_point,...,race_end_bf_2019,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,age
5,6,3,13,14,0.994916,1,1,1,0,0.000127,...,0.007432,0.0,0.0,0.0,0.0,0.0,0,0.0,0.002973,0.011396
6,7,5,17,12,0.994994,1,1,0,0,0.00011,...,0.005451,0.0,0.0,0.0,0.0,0.0,0,0.0,0.00446,0.01437
7,8,6,15,4,0.988964,1,1,1,0,0.003034,...,0.065504,0.002955,0.002955,0.002463,0.0,0.0,1,0.010343,0.080279,0.014283
8,9,2,2,9,0.994523,1,1,0,0,0.001473,...,0.019316,0.000991,0.0,0.000495,0.0,0.0,7,0.000495,0.018821,0.011887
9,10,7,18,13,0.995082,1,1,0,0,0.000267,...,0.011893,0.0,0.0,0.0,0.0,0.0,12,0.000496,0.010902,0.012885


## 3. Apply the model model

In [11]:
target = 'race_rank'
features:list[str] = df.columns.to_list()
features.remove(target)

In [12]:
df.head()

Unnamed: 0,driverId,constructorId,grid,race_rank,year,round,circuitId,constructor_is_active,driver_is_active,driver_avg_point,...,race_end_bf_2019,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,age
5,6,3,13,14,0.994916,1,1,1,0,0.000127,...,0.007432,0.0,0.0,0.0,0.0,0.0,0,0.0,0.002973,0.011396
6,7,5,17,12,0.994994,1,1,0,0,0.00011,...,0.005451,0.0,0.0,0.0,0.0,0.0,0,0.0,0.00446,0.01437
7,8,6,15,4,0.988964,1,1,1,0,0.003034,...,0.065504,0.002955,0.002955,0.002463,0.0,0.0,1,0.010343,0.080279,0.014283
8,9,2,2,9,0.994523,1,1,0,0,0.001473,...,0.019316,0.000991,0.0,0.000495,0.0,0.0,7,0.000495,0.018821,0.011887
9,10,7,18,13,0.995082,1,1,0,0,0.000267,...,0.011893,0.0,0.0,0.0,0.0,0.0,12,0.000496,0.010902,0.012885


In [13]:
model = make_classification(df, features, target, True)

Fitting 5 folds for each of 2187 candidates, totalling 10935 fits
[CV] END colsample_bytree=0.5, eta=0.01, learning_rate=0.01, max_depth=3, n_estimators=50, objective=multi:softmax, seed=12, subsample=0.8; total time=   2.2s
[CV] END colsample_bytree=0.5, eta=0.01, learning_rate=0.01, max_depth=3, n_estimators=50, objective=multi:softmax, seed=12, subsample=0.5; total time=   2.2s
[CV] END colsample_bytree=0.5, eta=0.01, learning_rate=0.01, max_depth=3, n_estimators=50, objective=multi:softmax, seed=12, subsample=0.5; total time=   2.2s
[CV] END colsample_bytree=0.5, eta=0.01, learning_rate=0.01, max_depth=3, n_estimators=50, objective=multi:softmax, seed=12, subsample=0.5; total time=   2.2s
[CV] END colsample_bytree=0.5, eta=0.01, learning_rate=0.01, max_depth=3, n_estimators=50, objective=multi:softmax, seed=12, subsample=0.5; total time=   2.2s
[CV] END colsample_bytree=0.5, eta=0.01, learning_rate=0.01, max_depth=3, n_estimators=50, objective=multi:softmax, seed=12, subsample=0.8;

In [14]:
print("Classification Report:")
print(model[4])

Classification Report:
              precision    recall  f1-score   support

           0       0.12      0.09      0.10        70
           1       0.17      0.19      0.18        79
           2       0.17      0.20      0.19        74
           3       0.12      0.14      0.13        71
           4       0.09      0.11      0.10        75
           5       0.11      0.11      0.11        80
           6       0.04      0.02      0.03        87
           7       0.04      0.04      0.04        77
           8       0.08      0.09      0.08        66
           9       0.05      0.06      0.05        71
          10       0.03      0.04      0.04        70
          11       0.02      0.01      0.01        78
          12       0.02      0.02      0.02        88
          13       0.04      0.04      0.04        71
          14       0.05      0.04      0.04        73
          15       0.10      0.09      0.10        74
          16       0.07      0.08      0.08        75
    

In [15]:
plot_confusion_matrix((model[1], model[2]))

In [16]:
analysing_feature_importance(model[0], model[3])

In [17]:
df_val

Unnamed: 0,driverId,constructorId,grid,race_rank,year,round,circuitId,constructor_is_active,driver_is_active,driver_avg_point,...,race_end_bf_2019,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,age
0,1,1,1,2,0.981035,1,1,1,1,0.007099,...,0.087453,0.01026,0.007817,0.01026,0.009283,0.005863,14,0.029802,0.134843,0.011237
1,2,2,5,3,0.994705,1,1,0,0,0.001156,...,0.017338,0.0,0.0,0.0,0.0,0.0,2,0.000991,0.015357,0.015357
2,3,3,7,5,0.990598,1,1,1,0,0.004531,...,0.065119,0.0,0.0,0.0,0.0,0.0,1,0.009373,0.067092,0.011346
3,4,4,11,7,0.98866,1,1,0,1,0.003183,...,0.064499,0.0,0.0,0.005416,0.005908,0.005908,9,0.005908,0.090102,0.013294
4,5,1,3,1,0.994865,1,1,1,0,0.000391,...,0.013873,0.0,0.0,0.0,0.0,0.0,1,0.000991,0.012386,0.013377


In [18]:
model[0].predict(df_val.drop('race_rank', axis=1))

array([ 3,  1,  6, 13,  6], dtype=int32)