In [16]:
import pandas as pd
from sklearn.preprocessing import Normalizer, LabelEncoder
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV

In [17]:
from libs.predictor_util import modelfit
from libs.transformer_utils import process_features_standardisation

## 1. Load the Data set

In [18]:
df = pd.read_csv('../data/outputs/06_final_dataset.csv')

In [19]:
df.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,race_rank,statusId,year,round,circuitId,...,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,age,season_age
0,1,18,1,1,1,2,1,2008,1,1,...,21,16,21,19,12,14,63,292,39,23
1,2,18,2,2,5,3,1,2008,1,1,...,0,0,0,0,0,2,2,131,47,31
2,3,18,3,3,7,5,1,2008,1,1,...,0,0,0,0,0,3,20,158,39,23
3,4,18,4,4,11,7,1,2008,1,1,...,0,0,11,12,12,9,22,285,43,27
4,5,18,5,1,3,1,1,2008,1,1,...,0,0,0,0,0,1,2,37,43,27


#### Get only the data from 2005

In [20]:
df = df[df['year'] > 2005]

In [21]:
le = LabelEncoder()
df['race_rank'] = le.fit_transform(df['race_rank'])

## 2. Normalizing the year column

In [22]:
df.columns

Index(['resultId', 'raceId', 'driverId', 'constructorId', 'grid', 'race_rank',
       'statusId', 'year', 'round', 'circuitId', 'constructor_is_active',
       'constructor_races_won', 'constructor_avg_point',
       'constructor_times_in_top_10', 'driver_is_active', 'driver_avg_point',
       'driver_avg_speed', 'race_end_bf_2019', 'race_end_in_2019',
       'race_end_in_2020', 'race_end_in_2021', 'race_end_in_2022',
       'race_end_in_2023', 'driver_most_won_circuit_id',
       'driver_nber_of_races_won', 'driver_nber_of_times_in_top_10', 'age',
       'season_age'],
      dtype='object')

In [23]:
cols = ['year', 'driver_avg_speed', 'constructor_avg_point', 'driver_avg_point', 'constructor_races_won', 'race_end_bf_2019', 'race_end_in_2019', 'race_end_in_2020', 'race_end_in_2021', 'race_end_in_2022', 'race_end_in_2023', 'driver_nber_of_races_won', 'driver_nber_of_times_in_top_10', 'age']
df[cols] = process_features_standardisation(df, cols, Normalizer)

In [24]:
# df.boxplot(column=['driver_avg_speed'], return_type='axes')

In [25]:
df.drop(['raceId', 'resultId', 'statusId', 'constructor_races_won', 'constructor_avg_point', 'constructor_times_in_top_10'], axis=1, inplace=True)

In [26]:
df_val = df.iloc[:5]
df = df.iloc[5:]
df.head()


Unnamed: 0,driverId,constructorId,grid,race_rank,year,round,circuitId,constructor_is_active,driver_is_active,driver_avg_point,...,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,age,season_age
5,6,3,13,14,0.994774,1,1,1,0,0.000124,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.003468,0.019321,23
6,7,5,17,12,0.994849,1,1,0,0,0.00011,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.004459,0.022295,29
7,8,6,15,4,0.979127,1,1,1,0,0.002595,...,0.002926,0.002926,0.002438,0.0,0.0,1,0.02048,0.135556,0.021943,29
8,9,2,2,9,0.994001,1,1,0,0,0.00137,...,0.00099,0.0,0.000495,0.0,0.0,7,0.000495,0.026731,0.019801,24
9,10,7,18,13,0.994918,1,1,0,0,0.000266,...,0.0,0.0,0.0,0.0,0.0,12,0.000495,0.010901,0.02081,26


## 3. Apply the model model

In [27]:
target = 'race_rank'
id_col = 'resultId'
features = [x for x in df.columns if x not in [target, id_col]]

In [28]:
# Splitting the dataset 
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [69]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [75]:
# Définir les paramètres du modèle
param = {
    'objective': 'multi:softmax',  # objectif de classification multiclasse
    'num_class': 25,  # nombre total de classes (de 0 à 24)
    'eval_metric': 'merror',  # métrique d'évaluation de l'erreur de classification
   
    'n_estimators':205,
    'learning_rate':0.001,
    'max_depth':5,
    'eta': 0.01,
    'subsample': 0.8,
    'colsample_bytree': 0.5,
    'booster':'gbtree',
    'min_child_weight': 1,
    'gamma': 0,
    'n_jobs':-1,
    'enable_categorical':True
}

In [76]:
# Entraîner le modèle
num_round = 100
bst = xgb.train(param, dtrain, num_round)

Parameters: { "enable_categorical", "n_estimators" } are not used.



In [77]:
from sklearn.metrics import accuracy_score
# Faire des prédictions sur les données de test
preds = bst.predict(dtest)

# Calculer l'exactitude des prédictions
accuracy = accuracy_score(y_test, preds)
print("Accuracy: %.2f%%" % (accuracy * 100))

Accuracy: 10.29%


In [29]:
xgb_model = xgb.XGBClassifier(
    n_estimators=205,
    learning_rate=0.001,
    max_depth=5,
    eta= 0.01,
    subsample= 0.8,
    colsample_bytree= 0.5,
    objective= 'multi:softmax',
    eval_metric='auc',
    booster='gbtree',
    min_child_weight = 1,
    gamma = 0,
    n_jobs=-1,
    num_class= 25,
    enable_categorical=True
)

In [80]:
#Grid seach on subsample and max_features
#Choose all predictors except target & IDcols
param_test1 = {
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch1 = GridSearchCV(
    estimator = xgb.XGBClassifier(n_estimators=205, learning_rate=0.001, max_depth=5, eta= 0.01, subsample= 0.8, colsample_bytree= 0.5, 
                                  objective= 'multi:softmax', eval_metric='auc', booster='gbtree', min_child_weight = 1, gamma = 0, 
                                  n_jobs=-1, num_class= 25, enable_categorical=True), 
    param_grid = param_test1, scoring='roc_auc', n_jobs=-1, cv=5)
gsearch1.fit(X_train,y_train)

Traceback (most recent call last):
  File "/Users/stephcyrille/Projects/Personal/project_ia_f1_gp_predictor/.venv/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 977, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/Users/stephcyrille/Projects/Personal/project_ia_f1_gp_predictor/.venv/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/Users/stephcyrille/Projects/Personal/project_ia_f1_gp_predictor/.venv/lib/python3.9/site-packages/sklearn/metrics/_scorer.py", line 350, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/Users/stephcyrille/Projects/Personal/project_ia_f1_gp_predictor/.venv/lib/python3.9/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/Users/stephcyrille/Projects/Personal/project_ia_f1_gp

In [81]:
f"Mean: {gsearch1.cv_results_['mean_test_score'].mean()}", f"Std: {gsearch1.cv_results_['std_test_score'].mean()}", gsearch1.best_params_, gsearch1.best_score_

('Mean: nan', 'Std: nan', {'colsample_bytree': 0.6, 'subsample': 0.6}, nan)