# XGBoost

### Sam Timmins

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('data.csv')
df

Unnamed: 0,Driver,LapNumber,Sector1Time,Sector2Time,Sector3Time,SpeedI1,SpeedI2,SpeedFL,Compound,TyreLife,...,Team,Track,AirTemp,Humidity,Pressure,Rainfall,TrackTemp,WindDirection,WindSpeed,NextSectorTime
0,21.0,3.0,27.804,40.703,33.189,205.0,179.0,199.0,2.0,2.0,...,8.0,0.0,28.9,52.0,991.5,0.0,34.6,92.0,3.9,27.910
1,21.0,3.0,40.703,33.189,27.910,205.0,179.0,199.0,2.0,2.0,...,8.0,0.0,28.9,52.0,991.5,0.0,34.6,92.0,3.9,40.636
2,21.0,3.0,33.189,27.910,40.636,205.0,179.0,199.0,2.0,2.0,...,8.0,0.0,28.9,52.0,991.5,0.0,34.6,92.0,3.9,33.347
3,21.0,4.0,27.910,40.636,33.347,202.0,180.0,197.0,2.0,3.0,...,8.0,0.0,28.9,50.0,991.7,0.0,34.6,112.0,4.3,28.030
4,21.0,4.0,40.636,33.347,28.030,202.0,180.0,197.0,2.0,3.0,...,8.0,0.0,28.9,50.0,991.7,0.0,34.6,112.0,4.3,40.805
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50872,3.0,69.0,31.591,21.384,17.560,299.0,230.0,273.0,2.0,31.0,...,1.0,17.0,22.8,53.0,934.0,0.0,31.3,59.0,0.8,31.624
50873,3.0,69.0,21.384,17.560,31.624,299.0,230.0,273.0,2.0,31.0,...,1.0,17.0,22.8,53.0,934.0,0.0,31.3,59.0,0.8,21.217
50874,3.0,70.0,17.560,31.624,21.217,298.0,231.0,273.0,2.0,32.0,...,1.0,17.0,22.7,53.0,934.0,0.0,31.5,27.0,0.8,17.679
50875,3.0,70.0,31.624,21.217,17.679,298.0,231.0,273.0,2.0,32.0,...,1.0,17.0,22.7,53.0,934.0,0.0,31.5,27.0,0.8,31.726


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

no_norm = ['Driver', 'Compound', 'Team', 'Track', 'FreshTyre', 'Sector1Time', 'Sector2Time', 'Sector3Time', 'NextSectorTime']
cols_to_norm = [col for col in df.columns if col not in no_norm]
ss = StandardScaler()
df[cols_to_norm] = ss.fit_transform(df[cols_to_norm].values)
X_train, X_test, y_train, y_test = train_test_split(df.loc[:, df.columns != 'NextSectorTime'].to_numpy(),
                                                    df['NextSectorTime'].to_numpy(), test_size=0.2, random_state=42)

## XGBoost

In [3]:
from xgboost import XGBRegressor

xgb = XGBRegressor(n_estimators=1000, learning_rate=0.05)
xgb.fit(X_train, y_train)

In [4]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

pred = xgb.predict(X_test)
print('MSE:\t', mean_squared_error(y_test, pred))
print('MAE:\t', mean_absolute_error(y_test, pred))
for i in range(0, len(pred)):
    print('Predicted: ', round(pred[i], 3), '\tActual: ', y_test[i])

MSE:	 0.1728191158986037
MAE:	 0.23550231022804788
Predicted:  17.527 	Actual:  17.425
Predicted:  30.368 	Actual:  30.32
Predicted:  35.157 	Actual:  37.538
Predicted:  28.231 	Actual:  28.18
Predicted:  30.03 	Actual:  30.384
Predicted:  25.015 	Actual:  25.078
Predicted:  32.59 	Actual:  32.689
Predicted:  23.585 	Actual:  25.623
Predicted:  23.472 	Actual:  23.438
Predicted:  43.548 	Actual:  43.378
Predicted:  24.771 	Actual:  24.789
Predicted:  26.328 	Actual:  25.891
Predicted:  41.612 	Actual:  41.467
Predicted:  20.83 	Actual:  20.646
Predicted:  31.058 	Actual:  30.789
Predicted:  21.6 	Actual:  22.033
Predicted:  30.05 	Actual:  29.802
Predicted:  25.066 	Actual:  24.487
Predicted:  29.922 	Actual:  30.23
Predicted:  32.072 	Actual:  32.094
Predicted:  25.611 	Actual:  25.772
Predicted:  23.091 	Actual:  23.378
Predicted:  32.758 	Actual:  33.478
Predicted:  25.824 	Actual:  25.91
Predicted:  43.326 	Actual:  43.545
Predicted:  25.299 	Actual:  25.316
Predicted:  42.825 	Act

In [5]:
# %%time
# from sklearn.model_selection import GridSearchCV

# # Define your parameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 10000],
#     'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
#     'eta': [0.1, 0.01, 0.001],
#     'subsample': [0.5, 1]
# }

# # Create an XGBoostRegressor object
# xgb = XGBRegressor()

# # Create a GridSearchCV object with scoring as 'neg_mean_squared_error'
# grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5, verbose=4)

# # Fit the GridSearchCV object to the data
# grid_search.fit(X_train, y_train)

# # Print the best hyperparameters and best score
# print("Best Hyperparameters: ", grid_search.best_params_)
# print("Best Score (negative mean squared error): ", grid_search.best_score_)

In [6]:
xgb = XGBRegressor(n_estimators=10000, eta=0.01, max_depth=8, subsample=0.5)
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)
print('MSE:\t', mean_squared_error(y_test, pred))
print('MAE:\t', mean_absolute_error(y_test, pred))
for i in range(0, len(pred)):
    print('Predicted: ', round(pred[i], 3), '\tActual: ', y_test[i])

MSE:	 0.15919964305635384
MAE:	 0.22144944445142206
Predicted:  17.566 	Actual:  17.425
Predicted:  30.332 	Actual:  30.32
Predicted:  35.19 	Actual:  37.538
Predicted:  28.165 	Actual:  28.18
Predicted:  30.012 	Actual:  30.384
Predicted:  25.0 	Actual:  25.078
Predicted:  32.747 	Actual:  32.689
Predicted:  25.009 	Actual:  25.623
Predicted:  23.538 	Actual:  23.438
Predicted:  43.442 	Actual:  43.378
Predicted:  24.766 	Actual:  24.789
Predicted:  26.504 	Actual:  25.891
Predicted:  41.514 	Actual:  41.467
Predicted:  20.645 	Actual:  20.646
Predicted:  31.023 	Actual:  30.789
Predicted:  21.703 	Actual:  22.033
Predicted:  29.983 	Actual:  29.802
Predicted:  25.062 	Actual:  24.487
Predicted:  29.92 	Actual:  30.23
Predicted:  31.903 	Actual:  32.094
Predicted:  25.708 	Actual:  25.772
Predicted:  23.325 	Actual:  23.378
Predicted:  33.268 	Actual:  33.478
Predicted:  25.877 	Actual:  25.91
Predicted:  43.366 	Actual:  43.545
Predicted:  25.311 	Actual:  25.316
Predicted:  42.952 	