# Exploration phase — further testing
Initial testing of different algorithms has already been completed. XGBRegressor and GradientBoostingRegressor, both tree-based boosting models are clearly the winners. In this phase of exploration, we will further optimize the two so as to train the best possible performance predicting models for each circuit.

In [1]:
import f1_pitstop_advisor.data_processing_utils as utils
import pandas as pd
import numpy as np
import pickle

from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

from sklearn.base import clone

import time

In [2]:
# Load in all sessions
with open("ig_sessions.pickle", "rb") as file:
    sessions = pickle.load(file)

In [3]:
# Aggregate laps from all sessions to the circuit they were performed on.

# We end up with a dictionary where the keys are circuit names, and the values 
# are DataFrames containing data for all laps for each circuit.
circuits = set()
for session in sessions:
    circuits.add(session.session_info["Meeting"]["Circuit"]["ShortName"])

dfs = {}
for circuit in circuits:
    dfs[circuit] = utils.get_refined_lap_data_with_z_score_for_circuit(sessions, circuit)

In [None]:
with open("ig_dfs_by_circuit.pickle", "wb") as file:
    pickle.dump(dfs, file)

In [None]:
with open("ig_initial_models.pickle", "rb") as file:
    tested_models = pickle.load(file)



In [None]:
# Show data point count for each circuit
circuit_sizes = {}
for circuit, df in dfs.items():
    circuit_sizes[circuit] = df.shape[0]

circuit_sizes = pd.DataFrame({"DataPointCount": circuit_sizes}).sort_values(by="DataPointCount", ascending=False)
circuit_sizes

Unnamed: 0,DataPointCount
Zandvoort,5486
Hungaroring,5357
Monte Carlo,5314
Montreal,5189
Catalunya,5054
Sakhir,4414
Singapore,4304
Monza,3898
Mexico City,3843
Melbourne,3780


In [24]:
best_params = {}
for model_type in ["GradientBoostingRegressor", "XGBRegressor"]:
    best_params[model_type] = {}
    for circuit in dfs.keys():
        best_params[model_type][circuit] = tested_models[model_type][circuit].best_params_

In [28]:
for model_type, params in best_params.items():
    best_params[model_type] = pd.DataFrame(params).T

## Exploring parameters from previous testing
Below are the best parameters from initial exploration for GradientBoostingRegressor, for each circuit. Based on this we will determine the parameter value ranges to explore when further optimizing the models.

In [29]:
best_params["GradientBoostingRegressor"]

Unnamed: 0,learning_rate,max_depth,n_estimators,subsample
Miami,0.1,3.0,200.0,0.8
Austin,0.1,3.0,200.0,0.8
Melbourne,0.1,5.0,200.0,1.0
Las Vegas,0.1,5.0,200.0,0.8
Mexico City,0.1,3.0,200.0,1.0
Jeddah,0.1,5.0,200.0,0.8
Yas Marina Circuit,0.1,3.0,100.0,1.0
Sakhir,0.1,3.0,200.0,0.8
Singapore,0.1,3.0,200.0,0.8
Imola,0.05,3.0,200.0,0.8


### Same for XGBRegressor

In [32]:
best_params["XGBRegressor"]

Unnamed: 0,colsample_bytree,learning_rate,max_depth,n_estimators,subsample
Miami,0.8,0.3,3.0,400.0,0.8
Austin,1.0,0.1,3.0,400.0,0.8
Melbourne,0.8,0.1,6.0,400.0,0.8
Las Vegas,0.8,0.3,3.0,400.0,0.8
Mexico City,0.8,0.01,10.0,400.0,0.8
Jeddah,0.8,0.3,3.0,400.0,1.0
Yas Marina Circuit,0.8,0.1,3.0,100.0,1.0
Sakhir,0.8,0.1,6.0,100.0,0.8
Singapore,1.0,0.1,3.0,400.0,1.0
Imola,1.0,0.3,3.0,200.0,1.0


In [30]:
param_ranges = {}
for model_type, param_df in best_params.items():
    param_ranges[model_type] = pd.DataFrame({
        "Min": param_df.min(axis="index"),
        "Max": param_df.max(axis="index")
    })

Below we look at the exact parameter value ranges for both regression algorithms:

In [31]:
param_ranges["GradientBoostingRegressor"]

Unnamed: 0,Min,Max
learning_rate,0.05,0.1
max_depth,3.0,5.0
n_estimators,100.0,200.0
subsample,0.8,1.0


In [33]:
param_ranges["XGBRegressor"]

Unnamed: 0,Min,Max
colsample_bytree,0.8,1.0
learning_rate,0.01,0.3
max_depth,3.0,10.0
n_estimators,100.0,400.0
subsample,0.8,1.0


## Parameter search grids
Based on the above, I came up with the grid searches below. These will be trained to find the best
parameters for each model, for each circuit.

In [34]:
model_searches = {
    "GradientBoostingRegressor": GridSearchCV(
        GradientBoostingRegressor(random_state=42),
        {
            "n_estimators": [100, 150, 200],
            "learning_rate": [0.05, 0.1],
            "max_depth": [3, 4, 5],
            "subsample": [0.8, 0.9, 1.0],
            "min_samples_leaf": [1, 3]
        }
    ),

    "XGBRegressor": GridSearchCV(
        XGBRegressor(random_state=42, n_jobs=-1, objective="reg:squarederror", verbosity=0),
        {
            "n_estimators": [100, 200, 300, 400],
            "max_depth": [3, 6, 10],
            "learning_rate": [0.01, 0.1, 0.3],
            "subsample": [0.8, 1.0],
            "colsample_bytree": [0.8, 1.0],
            "min_child_weight": [1, 3]
        }
    ),
}

In [35]:
# Fit every single circuit/GridSearch configuration
models_and_circuits = {}

for name in model_searches.keys():
    models_and_circuits[name] = {}

for circuit, data in dfs.items():
    print(f"Fitting models for {circuit}")
    circuit_start = time.time()
    
    X, y = data.drop(["LapTimeZScore"], axis="columns"), data["LapTimeZScore"]
    for name, model_search in model_searches.items():
        print(f"Fitting {name};".ljust(50), end="")
        model_start = time.time()

        model_search_copy = clone(model_search)
        model_search_copy.fit(X, y)
        models_and_circuits[name][circuit] = model_search_copy

        print(f"took {round(time.time() - model_start, 2)} seconds")
    
    print(f"Took a total of {round(time.time() - circuit_start, 2)} seconds to fit all models for circuit \"{circuit}\"")
    print()

Fitting models for Miami
Fitting GradientBoostingRegressor;                took 231.22 seconds
Fitting XGBRegressor;                             took 539.47 seconds
Took a total of 770.69 seconds to fit all models for circuit "Miami"

Fitting models for Austin
Fitting GradientBoostingRegressor;                took 129.77 seconds
Fitting XGBRegressor;                             took 455.44 seconds
Took a total of 585.2 seconds to fit all models for circuit "Austin"

Fitting models for Melbourne
Fitting GradientBoostingRegressor;                took 648.63 seconds
Fitting XGBRegressor;                             took 747.75 seconds
Took a total of 1396.38 seconds to fit all models for circuit "Melbourne"

Fitting models for Las Vegas
Fitting GradientBoostingRegressor;                took 309.06 seconds
Fitting XGBRegressor;                             took 506.54 seconds
Took a total of 815.61 seconds to fit all models for circuit "Las Vegas"

Fitting models for Mexico City
Fitting Gra

In [36]:
with open("ig_second_models.pickle", "wb") as file:
    pickle.dump(models_and_circuits, file)

## Results
Below are the results of our testing. R^2 is used for score. XGBRegressor generally performs better, but for some circuits, GradientBoostingRegressor gets better scores.

In [None]:
# Show scores for each GridSearch and circuit
all_scores = {}
for key in models_and_circuits.keys():
    scores = {}
    for circuit, model in models_and_circuits[key].items():
        scores[circuit] = model.best_score_
    all_scores[key] = scores

all_scores: pd.DataFrame = pd.DataFrame(all_scores)

all_scores

Unnamed: 0,GradientBoostingRegressor,XGBRegressor
Miami,0.817214,0.826405
Austin,0.920322,0.920174
Melbourne,0.864192,0.868024
Las Vegas,0.85197,0.854329
Mexico City,0.829232,0.854387
Jeddah,0.840661,0.844275
Yas Marina Circuit,0.825207,0.81798
Sakhir,0.725395,0.740442
Singapore,0.558237,0.647718
Imola,0.842252,0.841116


Overall statistics for both algorithms. XGBRegressor is generally better.

In [38]:
# Show score statistics for each model
# MinScore is very important. A good model should perform reasonably well for all tracks.
model_scores_df = pd.DataFrame({
    "MeanScore": all_scores.mean(axis="index"),
    "MedianScore": all_scores.median(axis="index"),
    "ScoreVariance": all_scores.var(axis="index"),
    "MinScore": all_scores.min(axis="index")
})

model_scores_df.sort_values(by=["MeanScore"], ascending=False)

Unnamed: 0,MeanScore,MedianScore,ScoreVariance,MinScore
XGBRegressor,0.824751,0.842696,0.006097,0.647718
GradientBoostingRegressor,0.814336,0.834946,0.00829,0.558237


In [None]:

# Drop BestModelType if it already exists
all_scores.drop(labels=["BestModelType"], axis="columns", inplace=True, errors="ignore")
all_scores["BestModelType"] = all_scores.idxmax(axis="columns")
all_scores

Unnamed: 0,GradientBoostingRegressor,XGBRegressor,BestModelType
Miami,0.817214,0.826405,XGBRegressor
Austin,0.920322,0.920174,GradientBoostingRegressor
Melbourne,0.864192,0.868024,XGBRegressor
Las Vegas,0.85197,0.854329,XGBRegressor
Mexico City,0.829232,0.854387,XGBRegressor
Jeddah,0.840661,0.844275,XGBRegressor
Yas Marina Circuit,0.825207,0.81798,GradientBoostingRegressor
Sakhir,0.725395,0.740442,XGBRegressor
Singapore,0.558237,0.647718,XGBRegressor
Imola,0.842252,0.841116,GradientBoostingRegressor


Finally, we export the chosen models to a file for further use. The file contains a Python dictionary, where the keys are circuit names, and the values are the best model for each track. For most circuits, that will be an XGBRegressor, for others — a GradientBoostingRegressor.

In [52]:
final_regressor_dictionary = {}
for circuit in all_scores.index:
    best_model_type = all_scores.loc[circuit, "BestModelType"]
    final_regressor_dictionary[circuit] = models_and_circuits[best_model_type][circuit].best_estimator_
    
    

In [None]:
with open("ig_best_models_by_circuit.pickle", "wb") as file:
    pickle.dump(final_regressor_dictionary, file)