# XGBoost

### Sam Timmins

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('data.csv')
df

Unnamed: 0,Driver,LapNumber,Sector1Time,Sector2Time,Sector3Time,SpeedI1,SpeedI2,SpeedFL,Compound,TyreLife,...,Team,Track,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed,NextSectorTime
0,8.0,3.0,31.582,42.159,24.112,226.0,250.0,273.0,3.0,2.0,...,4.0,14.0,23.9,26.0,1010.4,0.0,29.0,13.0,0.3,31.635
1,8.0,3.0,42.159,24.112,31.635,226.0,250.0,273.0,3.0,2.0,...,4.0,14.0,23.9,26.0,1010.4,0.0,29.0,13.0,0.3,42.404
2,8.0,3.0,24.112,31.635,42.404,226.0,250.0,273.0,3.0,2.0,...,4.0,14.0,23.9,26.0,1010.4,0.0,29.0,13.0,0.3,24.233
3,8.0,4.0,31.635,42.404,24.233,227.0,252.0,273.0,3.0,3.0,...,4.0,14.0,23.8,27.0,1010.4,0.0,29.0,12.0,0.5,31.619
4,8.0,4.0,42.404,24.233,31.619,227.0,252.0,273.0,3.0,3.0,...,4.0,14.0,23.8,27.0,1010.4,0.0,29.0,12.0,0.5,42.482
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33013,18.0,64.0,32.486,24.300,23.602,264.0,273.0,280.0,0.0,27.0,...,9.0,2.0,21.5,69.0,1001.2,0.0,26.8,46.0,1.5,32.127
33014,18.0,64.0,24.300,23.602,32.127,264.0,273.0,280.0,0.0,27.0,...,9.0,2.0,21.5,69.0,1001.2,0.0,26.8,46.0,1.5,24.251
33015,18.0,65.0,23.602,32.127,24.251,279.0,278.0,280.0,0.0,28.0,...,9.0,2.0,21.5,69.0,1001.4,0.0,27.1,55.0,2.2,23.633
33016,18.0,65.0,32.127,24.251,23.633,279.0,278.0,280.0,0.0,28.0,...,9.0,2.0,21.5,69.0,1001.4,0.0,27.1,55.0,2.2,32.186


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

no_norm = ['Driver', 'Compound', 'Team', 'Track', 'FreshTyre', 'Sector1Time', 'Sector2Time', 'Sector3Time', 'NextSectorTime']
cols_to_norm = [col for col in df.columns if col not in no_norm]
ss = StandardScaler()
df[cols_to_norm] = ss.fit_transform(df[cols_to_norm].values)
X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'NextSectorTime'].to_numpy(),
                                                    df['NextSectorTime'].to_numpy(), test_size=0.2, random_state=42)

## XGBoost

In [3]:
from xgboost import XGBRegressor

xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05)
xgb.fit(X_train, y_train)

In [4]:
%%time
from sklearn.metrics import mean_squared_error, mean_absolute_error

pred = xgb.predict(X_test)
print('MSE:\t', mean_squared_error(y_test, pred))
print('MAE:\t', mean_absolute_error(y_test, pred))
for i in range(0, len(pred)):
    print('Predicted: ', round(pred[i], 3), '\tActual: ', y_test[i])

MSE:	 0.1098176097920513
MAE:	 0.2108057253864734
Predicted:  39.701 	Actual:  39.205
Predicted:  20.906 	Actual:  20.732
Predicted:  39.139 	Actual:  38.843
Predicted:  24.915 	Actual:  24.947
Predicted:  24.229 	Actual:  23.793
Predicted:  19.463 	Actual:  19.372
Predicted:  36.544 	Actual:  36.447
Predicted:  31.641 	Actual:  32.11
Predicted:  17.854 	Actual:  17.991
Predicted:  30.333 	Actual:  30.12
Predicted:  25.059 	Actual:  25.874
Predicted:  20.874 	Actual:  20.862
Predicted:  32.118 	Actual:  32.385
Predicted:  28.862 	Actual:  28.74
Predicted:  29.76 	Actual:  30.065
Predicted:  17.599 	Actual:  17.579
Predicted:  36.348 	Actual:  36.007
Predicted:  21.834 	Actual:  21.698
Predicted:  21.653 	Actual:  21.587
Predicted:  25.807 	Actual:  26.016
Predicted:  44.924 	Actual:  45.499
Predicted:  25.879 	Actual:  25.896
Predicted:  31.853 	Actual:  31.634
Predicted:  38.559 	Actual:  38.636
Predicted:  42.613 	Actual:  42.778
Predicted:  27.716 	Actual:  27.747
Predicted:  33.273

In [None]:
%%time
from sklearn.model_selection import GridSearchCV

# Define your parameter grid
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000],
    'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'eta': [0.1, 0.01, 0.001],
    'subsample': [0, 0.5, 1]
}

# Create an XGBoostRegressor object
xgb = XGBRegressor()

# Create a GridSearchCV object with scoring as 'neg_mean_squared_error'
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, verbose=4)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and best score
print("Best Hyperparameters: ", grid_search.best_params_)
print("Best Score (negative mean squared error): ", grid_search.best_score_)

Fitting 5 folds for each of 900 candidates, totalling 4500 fits
[CV 1/5] END eta=0.1, max_depth=1, n_estimators=100, subsample=0;, score=-863.096 total time=   0.0s
[CV 2/5] END eta=0.1, max_depth=1, n_estimators=100, subsample=0;, score=-858.410 total time=   0.0s
[CV 3/5] END eta=0.1, max_depth=1, n_estimators=100, subsample=0;, score=-859.952 total time=   0.0s
[CV 4/5] END eta=0.1, max_depth=1, n_estimators=100, subsample=0;, score=-856.020 total time=   0.0s
[CV 5/5] END eta=0.1, max_depth=1, n_estimators=100, subsample=0;, score=-864.428 total time=   0.0s
[CV 1/5] END eta=0.1, max_depth=1, n_estimators=100, subsample=0.5;, score=-0.215 total time=   0.0s
[CV 2/5] END eta=0.1, max_depth=1, n_estimators=100, subsample=0.5;, score=-0.220 total time=   0.0s
[CV 3/5] END eta=0.1, max_depth=1, n_estimators=100, subsample=0.5;, score=-0.235 total time=   0.0s
[CV 4/5] END eta=0.1, max_depth=1, n_estimators=100, subsample=0.5;, score=-0.239 total time=   0.0s
[CV 5/5] END eta=0.1, max_d