In [1]:
import pandas as pd
from sklearn.preprocessing import Normalizer
import xgboost as xgb

In [2]:
from libs.predictor_util import modelfit
from libs.plot_utils import plot_confusion_matrix, analysing_feature_importance
from libs.transformer_utils import process_features_standardisation, encode_labels

## 1. Load the Data set

In [3]:
df = pd.read_csv('../data/outputs/06_final_dataset.csv')

In [4]:
df.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,race_rank,statusId,year,round,circuitId,...,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,age,season_age
0,1,18,1,1,1,2,1,2008,1,1,...,21,16,21,19,12,14,63,292,39,23
1,2,18,2,2,5,3,1,2008,1,1,...,0,0,0,0,0,2,2,131,47,31
2,3,18,3,3,7,5,1,2008,1,1,...,0,0,0,0,0,3,20,158,39,23
3,4,18,4,4,11,7,1,2008,1,1,...,0,0,11,12,12,9,22,285,43,27
4,5,18,5,1,3,1,1,2008,1,1,...,0,0,0,0,0,1,2,37,43,27


In [5]:
df.shape

(26080, 28)

## 2. Encode categorials variables

In [6]:
cat_cols = ['race_rank', 'constructorId', 'grid', 'driver_most_won_circuit_id', 'statusId', 'circuitId']

In [7]:
df = encode_labels(df, cat_cols)

## 3. Normalizing columns

In [8]:
cols = ['year', 'driver_avg_speed', 'constructor_avg_point', 'driver_avg_point', 'constructor_races_won', 'race_end_bf_2019', 
        'race_end_in_2019', 'race_end_in_2020', 'race_end_in_2021', 'race_end_in_2022', 'race_end_in_2023', 'driver_nber_of_races_won', 
        'driver_nber_of_times_in_top_10', 'age']
df[cols] = process_features_standardisation(df, cols, Normalizer)

## 4. Delete unused columns

In [9]:
df.drop(['raceId', 'resultId', 'age'], axis=1, inplace=True)

In [10]:
df_to_predict_later = df.iloc[:5]
df = df.iloc[5:]
df.head()


Unnamed: 0,driverId,constructorId,grid,race_rank,statusId,year,round,circuitId,constructor_is_active,constructor_races_won,...,race_end_bf_2019,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,season_age
5,6,2,13,14,10,0.994774,1,0,1,0.003468,...,0.007431,0.0,0.0,0.0,0.0,0.0,0,0.0,0.003468,23
6,7,4,17,12,4,0.994849,1,0,0,0.000495,...,0.00545,0.0,0.0,0.0,0.0,0.0,0,0.0,0.004459,29
7,8,5,15,4,4,0.979127,1,0,1,0.04486,...,0.099473,0.002926,0.002926,0.002438,0.0,0.0,1,0.02048,0.135556,29
8,9,1,2,9,3,0.994001,1,0,0,0.00099,...,0.026236,0.00099,0.0,0.000495,0.0,0.0,7,0.000495,0.026731,24
9,10,6,18,13,2,0.994918,1,0,0,0.001486,...,0.011891,0.0,0.0,0.0,0.0,0.0,9,0.000495,0.010901,26


## 5. Fit the model

In [11]:
target = 'race_rank'
id_col = 'resultId'
features = [x for x in df.columns if x not in [target, id_col]]

In [12]:
xgb_model = xgb.XGBClassifier(
    n_estimators=205,
    learning_rate=0.001,
    max_depth=5,
    eta= 0.01,
    subsample= 0.8,
    colsample_bytree= 0.8,
    objective= 'multi:softmax',
    eval_metric='merror',
    min_child_weight = 1,
    gamma = 0,
    n_jobs=-1,
    num_class= 25,
    enable_categorical=True
)
# best_params = {'colsample_bytree': 0.8, 'eta': 0.01, 'learning_rate': 0.05, 'max_depth': 12, 'n_estimators': 205, 'objective': 'multi:softmax', 'seed': 80, 'subsample': 0.5}

In [13]:
model = modelfit(df, features, target, xgb_model)


Model Report
Accuracy : 0.7335
AUC Score (Train): 0.954054
AUC Score (Test): 0.930038


## 6. Analysing the model output

### 6.1. Classification report

In [14]:
print("Classification Report:")
print(model[4])

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.99      0.95      4641
           1       0.20      0.43      0.27        83
           2       0.13      0.18      0.15        84
           3       0.08      0.07      0.08        81
           4       0.09      0.06      0.07       109
           5       0.06      0.02      0.04        81
           6       0.00      0.00      0.00       103
           7       0.08      0.07      0.08        83
           8       0.15      0.09      0.11        92
          10       0.09      0.06      0.07        78
          11       0.07      0.05      0.06        92
          12       0.13      0.06      0.09        93
          13       0.05      0.01      0.02        93
          14       0.10      0.05      0.07        95
          15       0.07      0.04      0.05       101
          16       0.08      0.10      0.09        94
          17       0.12      0.09      0.10       101
    

### 6.2. Confuxion Matrix

In [15]:
plot_confusion_matrix((model[1], model[2]))

### 6.4. Feature importance

In [16]:
feature_imp_serie = pd.Series(model[0].get_booster().get_fscore()).sort_values(ascending=False)

In [17]:
analysing_feature_importance(feature_imp_serie)

## 7. Test the model with specifics data

In [18]:
df_to_predict_later

Unnamed: 0,driverId,constructorId,grid,race_rank,statusId,year,round,circuitId,constructor_is_active,constructor_races_won,...,race_end_bf_2019,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,season_age
0,1,0,1,2,0,0.979019,1,0,1,0.024378,...,0.094099,0.010239,0.007801,0.010239,0.009264,0.005851,11,0.030716,0.142367,23
1,2,1,5,3,0,0.99199,1,0,0,0.000988,...,0.032605,0.0,0.0,0.0,0.0,0.0,2,0.000988,0.064717,31
2,3,2,7,5,0,0.989246,1,0,1,0.003449,...,0.071435,0.0,0.0,0.0,0.0,0.0,3,0.009853,0.077839,23
3,4,3,11,7,0,0.979935,1,0,0,0.006832,...,0.098579,0.0,0.0,0.005368,0.005856,0.005856,8,0.010736,0.139084,27
4,5,0,3,1,0,0.994276,1,0,1,0.024758,...,0.019311,0.0,0.0,0.0,0.0,0.0,1,0.00099,0.018321,27


In [19]:
model[0].predict(df_to_predict_later.drop('race_rank', axis=1))

array([2, 0, 0, 0, 7], dtype=int32)