In [1]:
import pandas as pd
from import_data import import_training_data
import numpy as np
from sklearn.model_selection import train_test_split
import time
import xgboost
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,KFold


In [2]:
# Load the formatted training data and separate the regression target.
df = import_training_data()
label = df.pop('finalLapTime')

# sessionUID is an object-dtype column (see the df.info() report) that the
# final-model cell also removes before training. It is excluded from the
# split here so the grid search is fitted on the same feature set as the
# final model. `df` itself is left untouched so later cells that inspect or
# drop the column keep working.
features = df.drop(columns=['sessionUID'], errors='ignore')

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, random_state=42)

# Only the last expression of a cell is rendered automatically, so the
# preview has to be displayed explicitly.
display(X_train.head())

df.info()

OperationalError: no such table: TrainingData

In [4]:
# Candidate values per XGBoost hyperparameter; the grid search evaluates
# every combination of these lists.
parameters_for_testing = dict(
    colsample_bytree=[0.4, 0.6, 0.8],
    gamma=[0, 0.03, 0.1, 0.3],
    min_child_weight=[1.5, 6, 10],
    learning_rate=[0.1, 0.07],
    max_depth=[3, 5],
    n_estimators=[500],
    subsample=[0.6, 0.95],
)


Next, we run a grid search over the parameter grid defined above to find the best hyperparameters for our XGBoost model.

In [5]:


# Base estimator for the search. Any hyperparameter that also appears in
# parameters_for_testing (e.g. n_estimators) is overridden per candidate by
# GridSearchCV; the values here only matter for parameters outside the grid.
xgb_model = xgboost.XGBRegressor(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    tree_method='gpu_hist',
    nthread=6,
    scale_pos_weight=1,
    seed=27,
)

# The `iid` argument was deprecated in scikit-learn 0.22 and removed in 0.24,
# where passing it raises a TypeError, so it is no longer supplied.
# cv=5 is spelled out to pin the 5-fold split the recorded run used
# ("Fitting 5 folds for each of 288 candidates").
gsearch1 = GridSearchCV(
    estimator=xgb_model,
    param_grid=parameters_for_testing,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=6,
    verbose=10,
)
gsearch1.fit(X_train, y_train)




Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:   49.4s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:   52.0s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:  2.5min
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:  3.4min
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:  4.6min
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:  6.7min
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:  8.9min
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed: 10.5min
[Parallel(n_jobs=6)]: Done  73 tasks      | elapsed: 12.7min
[Parallel(n_jobs=6)]: Done  86 tasks      | elapsed: 14.7min
[Parallel(n_jobs=6)]: Done 101 tasks      | elapsed: 17.1min
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed: 20.5min
[Parallel(n_jobs=6)]: Done 133 tasks      | elapsed: 22.8min
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed: 25.2min
[Parallel(n_jobs=6)]: Done 169 tasks      | elapsed: 29.2min
[Parallel(

GridSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None, colsample_bytree=0.8,
                                    gamma=0, gpu_id=None,
                                    importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=0.1, max_delta_step=None,
                                    max_depth=5, min_child_weight=1,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=1000, n_jobs=None, nthread=6,...
                                    scale_pos_weight=1, seed=27, subsample=0.8,
                                    tree_method='gpu_hist',
                                    validate_parameters=None, verbosity=None),
             iid=False, n_jobs=6,
             param_grid={'colsample_bytree': [0.4, 0

In [6]:
# Hyperparameter combination with the best mean cross-validated score
# (neg_mean_squared_error) found by gsearch1.
gsearch1.best_params_



{'colsample_bytree': 0.8,
 'gamma': 0.03,
 'learning_rate': 0.1,
 'max_depth': 5,
 'min_child_weight': 1.5,
 'n_estimators': 500,
 'subsample': 0.95}

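# Best Model Parameters

{'colsample_bytree': 0.8,
 'gamma': 0.1,
 'learning_rate': 0.1,
 'max_depth': 5,
 'min_child_weight': 1.5,
 'n_estimators': 500,
 'subsample': 0.6}

Note: this recorded set differs from the `best_params_` output shown above in `gamma` (0.1 vs. 0.03) and `subsample` (0.6 vs. 0.95). The tuned model trained below uses `gamma=0.03` and `subsample=0.95`.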

In [7]:
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" after the report.
df.info()

# sessionUID is an object-dtype column (see the report above) and is removed
# before training. errors='ignore' keeps the cell re-runnable: on a second
# execution the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['sessionUID'], errors='ignore')

X_train, X_test, y_train, y_test = train_test_split(df, label, test_size=0.2, random_state=42)

# Hyperparameters follow the grid-search result (gsearch1.best_params_).
# NOTE(review): n_estimators is 10000 here, whereas the search only
# evaluated 500 -- confirm this increase is intentional.
tuned_model = xgboost.XGBRegressor(
    colsample_bytree=0.8,
    gamma=0.03,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=1.5,
    n_estimators=10000,
    subsample=0.95,
    tree_method='gpu_hist',
)

# Fit on the training split and report the mean absolute error on the
# held-out 20% test split.
tuned_model.fit(X_train, y_train)
pred = tuned_model.predict(X_test)
err = mae(y_test, pred)
display(err)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1612247 entries, 0 to 96921
Data columns (total 48 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   lastLapTime                1612247 non-null  float64
 1   currentLapTime             1612247 non-null  float64
 2   currentLapNum              1612247 non-null  int64  
 3   lapDistance                1612247 non-null  float64
 4   carPosition                1612247 non-null  int64  
 5   sector                     1612247 non-null  int64  
 6   sessionUID                 1612247 non-null  object 
 7   worldPositionX             1612247 non-null  float64
 8   worldPositionY             1612247 non-null  float64
 9   worldPositionZ             1612247 non-null  float64
 10  worldVelocityX             1612247 non-null  float64
 11  worldVelocityY             1612247 non-null  float64
 12  worldVelocityZ             1612247 non-null  float64
 13  yaw           

0.019292900640292765

In [8]:
import pickle

# Persist the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling fails; the original passed
# an anonymous open() handle that was never closed explicitly.
file_name = 'xgboost_model.pkl'
with open(file_name, 'wb') as model_file:
    pickle.dump(tuned_model, model_file)

In [9]:
# Time a single prediction pass over the test split.
# Uses the `time` module imported at the top of the notebook; the previous
# `from time import time` rebound the name `time` to a function and thereby
# shadowed that module for every later cell. perf_counter() is the
# monotonic, high-resolution clock intended for measuring short durations.
start = time.perf_counter()
pred = tuned_model.predict(X_test)
print(time.perf_counter() - start)

# Mean absolute error of the timed predictions on the held-out test split.
err = mae(y_test, pred)
display(err)

2.532291889190674


0.019292900640292765