In [32]:
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns
from seaborn import set_style
set_style("whitegrid")

In [33]:
def powerset(s):
    power_set = [[]]
    for x in s:
        power_set += [s0+[x] for s0 in power_set]
    return power_set[1:]

In [34]:
df = pd.read_csv('WORKINGF1Data.csv')

In [35]:
F1_master_test = df[df['Season'] >= 2021]
F1_2020 = df[df['Season'] <= 2020]
F1 = F1_2020[F1_2020['Qualifying Time'] > 30]

In [36]:
potential_features = ['FP1 Time', 'FP1 Gap', 'FP1 Laps',
                      'FP2 Time', 'FP2 Gap', 'FP2 Laps',
                      'FP3 Time', 'FP3 Gap', 'FP3 Laps']

all_models = ['baseline']
all_models.extend(powerset(potential_features))

In [37]:
from sklearn.model_selection import train_test_split

In [38]:
F1_train, F1_test = train_test_split(F1, test_size = .2, random_state = 949, shuffle = True)

In [39]:
## import KFold, Linear Regression, and kNeighborsRegressor here
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

## import mean_squared_error
from sklearn.metrics import mean_squared_error

In [40]:
kfold = KFold(5, random_state = 818, shuffle = True)
mses = np.zeros((len(all_models), 5))

In [41]:
j = 0
for train_index, test_index in kfold.split(F1_train):
    F1_tt = F1_train.iloc[train_index]
    F1_ho = F1_train.iloc[test_index]
    
    i = 0
    for model in all_models:
        if model == "baseline":
            pred = F1_tt['Qualifying Time'].mean() * np.ones(len(F1_ho))
            mses[i, j] = mean_squared_error(F1_ho['Qualifying Time'], pred)
        else:
            if len(model) == 1:
                reg = LinearRegression()
                reg.fit(F1_tt[model].values.reshape(-1,1), F1_tt['Qualifying Time'])
                pred = reg.predict(F1_ho[model].values.reshape(-1,1))
                mses[i, j] = mean_squared_error(F1_ho['Qualifying Time'], pred)
            else:
                reg = LinearRegression()
                reg.fit(F1_tt[model].values, F1_tt['Qualifying Time'])
                pred = reg.predict(F1_ho[model].values)
                mses[i, j] = mean_squared_error(F1_ho['Qualifying Time'], pred)
        i = i + 1
    j = j + 1

In [42]:
np.min(np.mean(mses, axis = 1))

15.72329993956129

In [43]:
best_reg_model = all_models[np.argmin(np.mean(mses, axis=1))]

In [44]:
best_reg = LinearRegression()
best_reg.fit(F1_tt[best_reg_model].values, F1_tt['Qualifying Time'])
pred_reg = best_reg.predict(F1_test[best_reg_model].values)
mean_squared_error(F1_test['Qualifying Time'], pred_reg)

17.460150027942223

In [45]:
j = 0
for train_index, test_index in kfold.split(F1_train):
    F1_tt = F1_train.iloc[train_index]
    F1_ho = F1_train.iloc[test_index]
    
    i = 0
    for model in all_models:
        if model == "baseline":
            pred = F1_tt['Qualifying Time'].mean() * np.ones(len(F1_ho))
            mses[i, j] = mean_squared_error(F1_ho['Qualifying Time'], pred)
        else:
            if len(model) == 1:
                knn = KNeighborsRegressor(10)
                knn.fit(F1_tt[model].values.reshape(-1,1), F1_tt['Qualifying Time'])
                pred = knn.predict(F1_ho[model].values.reshape(-1,1))
                mses[i, j] = mean_squared_error(F1_ho['Qualifying Time'], pred)
            else:
                knn = KNeighborsRegressor(10)
                knn.fit(F1_tt[model].values, F1_tt['Qualifying Time'])
                pred = knn.predict(F1_ho[model].values)
                mses[i, j] = mean_squared_error(F1_ho['Qualifying Time'], pred)
        i = i + 1
    j = j + 1

In [46]:
np.min(np.mean(mses, axis = 1))

10.512633136187137

In [47]:
best_knn_model = all_models[np.argmin(np.mean(mses, axis=1))]

In [51]:
best_knn = KNeighborsRegressor(10)
best_knn.fit(F1_tt[best_knn_model].values, F1_tt['Qualifying Time'])
pred_knn = best_knn.predict(F1_test[best_knn_model].values)
mean_squared_error(F1_test['Qualifying Time'], pred_knn)

13.609621319804772

In [52]:
master_pred_knn = best_knn.predict(F1_master_test[best_knn_model].values)
mean_squared_error(F1_master_test['Qualifying Time'], master_pred_knn)

16.80922974336538