In [1]:
import pandas as pd
from sklearn.preprocessing import Normalizer, LabelEncoder
from xgboost.sklearn import XGBClassifier

In [2]:
from libs.predictor_util import make_classification, modelfit
from libs.transformer_utils import process_features_standardisation

## 1. Load the Data set

In [3]:
df = pd.read_csv('../data/outputs/06_final_dataset.csv')

In [4]:
df.head()

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,race_rank,statusId,year,round,circuitId,...,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,age,season_age
0,1,18,1,1,1,2,1,2008,1,1,...,21,16,21,19,12,14,63,292,39,23
1,2,18,2,2,5,3,1,2008,1,1,...,0,0,0,0,0,2,2,131,47,31
2,3,18,3,3,7,5,1,2008,1,1,...,0,0,0,0,0,3,20,158,39,23
3,4,18,4,4,11,7,1,2008,1,1,...,0,0,11,12,12,9,22,285,43,27
4,5,18,5,1,3,1,1,2008,1,1,...,0,0,0,0,0,1,2,37,43,27


#### Get only the data from 2005

In [5]:
df = df[df['year'] > 2005]

In [6]:
le = LabelEncoder()
df['race_rank'] = le.fit_transform(df['race_rank'])

## 2. Normalizing the year column

In [7]:
df.columns

Index(['resultId', 'raceId', 'driverId', 'constructorId', 'grid', 'race_rank',
       'statusId', 'year', 'round', 'circuitId', 'constructor_is_active',
       'constructor_races_won', 'constructor_avg_point',
       'constructor_times_in_top_10', 'driver_is_active', 'driver_avg_point',
       'driver_avg_speed', 'race_end_bf_2019', 'race_end_in_2019',
       'race_end_in_2020', 'race_end_in_2021', 'race_end_in_2022',
       'race_end_in_2023', 'driver_most_won_circuit_id',
       'driver_nber_of_races_won', 'driver_nber_of_times_in_top_10', 'age',
       'season_age'],
      dtype='object')

In [8]:
cols = ['year', 'driver_avg_speed', 'constructor_avg_point', 'driver_avg_point', 'constructor_races_won', 'race_end_bf_2019', 'race_end_in_2019', 'race_end_in_2020', 'race_end_in_2021', 'race_end_in_2022', 'race_end_in_2023', 'driver_nber_of_races_won', 'driver_nber_of_times_in_top_10', 'age']
df[cols] = process_features_standardisation(df, cols, Normalizer)

In [9]:
# df.boxplot(column=['driver_avg_speed'], return_type='axes')

In [10]:
df.drop(['raceId', 'resultId', 'statusId', 'constructor_races_won', 'constructor_avg_point', 'constructor_times_in_top_10'], axis=1, inplace=True)

In [11]:
df_val = df.iloc[:5]
df = df.iloc[5:]
df.head()


Unnamed: 0,driverId,constructorId,grid,race_rank,year,round,circuitId,constructor_is_active,driver_is_active,driver_avg_point,...,race_end_in_2019,race_end_in_2020,race_end_in_2021,race_end_in_2022,race_end_in_2023,driver_most_won_circuit_id,driver_nber_of_races_won,driver_nber_of_times_in_top_10,age,season_age
5,6,3,13,14,0.994774,1,1,1,0,0.000124,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.003468,0.019321,23
6,7,5,17,12,0.994849,1,1,0,0,0.00011,...,0.0,0.0,0.0,0.0,0.0,0,0.0,0.004459,0.022295,29
7,8,6,15,4,0.979127,1,1,1,0,0.002595,...,0.002926,0.002926,0.002438,0.0,0.0,1,0.02048,0.135556,0.021943,29
8,9,2,2,9,0.994001,1,1,0,0,0.00137,...,0.00099,0.0,0.000495,0.0,0.0,7,0.000495,0.026731,0.019801,24
9,10,7,18,13,0.994918,1,1,0,0,0.000266,...,0.0,0.0,0.0,0.0,0.0,12,0.000495,0.010901,0.02081,26


## 3. Apply the model model

In [12]:
target = 'race_rank'
id_col = 'resultId'
features = [x for x in df.columns if x not in [target, id_col]]

In [13]:
# model = make_classification(df, features, target)
# Accuracy: 10.730088495575222

In [14]:
xgb1 = XGBClassifier(
        learning_rate =0.01,
        n_estimators=1000,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective= 'binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27)
modelfit(xgb1, features, target)

NameError: name 'X' is not defined