# Project - leading up to presentation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import sqrt

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Load our data into dataframes from the CSV.
# Every file is read from the relative "data/" directory, so the notebook
# has to be run from the folder that contains it.

# Only useful for converting circuitIds to names
df_circuits = pd.read_csv("data/circuits.csv")
df_constructors = pd.read_csv("data/constructors.csv")
df_constructor_standings = pd.read_csv("data/constructor_standings.csv")
# The original spelling of this name was missing an "n"; keep it as an alias
# of the same frame so any code that still uses the old name keeps working.
df_costructor_standings = df_constructor_standings
df_constructor_results = pd.read_csv("data/constructor_results.csv")
# Only useful for converting IDs to names.
df_drivers = pd.read_csv("data/drivers.csv")
df_driver_standings = pd.read_csv("data/driver_standings.csv")
df_lap_times = pd.read_csv("data/lap_times.csv")
df_pit_stops = pd.read_csv("data/pit_stops.csv")
df_qualifying = pd.read_csv("data/qualifying.csv")
df_races = pd.read_csv("data/races.csv")
df_results = pd.read_csv("data/results.csv")
# Status of race - useful for seeing how racers ended a race.
df_status = pd.read_csv("data/status.csv")

In [None]:
# Strip the columns that do not tell us anything interesting for this
# analysis. Each frame is rebound to the trimmed copy returned by drop().
df_circuits = df_circuits.drop(["lat", "lng", "alt", "url"], axis="columns")
df_drivers = df_drivers.drop(["number", "url"], axis="columns")
df_races = df_races.drop(
    [
        "url",
        "fp1_date", "fp1_time",
        "fp2_date", "fp2_time",
        "fp3_date", "fp3_time",
        "quali_date", "quali_time",
        "sprint_date", "sprint_time",
    ],
    axis="columns",
)

In [None]:
def get_status(df_status, df_results, index: int) -> pd.Series:
    """Return the ``df_status`` row describing how a given result ended.

    Rows are matched on their ID columns rather than on their position in
    the frame, so the lookup stays correct after rows have been removed
    from ``df_results`` or when the IDs are not contiguous.

    Args:
        df_status: Frame with a ``statusId`` column.
        df_results: Frame with ``resultId`` and ``statusId`` columns.
        index: The ``resultId`` of the result to look up.

    Returns:
        The matching row of ``df_status`` as a Series.

    Raises:
        IndexError: If no result has that ``resultId``, or its ``statusId``
            is not present in ``df_status``.
    """
    status_ids = df_results.loc[df_results["resultId"] == index, "statusId"]
    if status_ids.empty:
        raise IndexError(f"no result with resultId {index}")
    status_id = status_ids.iloc[0]

    matches = df_status[df_status["statusId"] == status_id]
    if matches.empty:
        raise IndexError(f"no status with statusId {status_id}")
    return matches.iloc[0]

def get_drivers(df_drivers, df_results, raceId):
    """Return the ``df_drivers`` rows for every result of one race.

    Drivers are resolved through their ``driverId`` value instead of
    assuming that driver N sits at row N-1. The output keeps the order of
    the race's result rows and the original index labels of ``df_drivers``.

    Args:
        df_drivers: Frame with a ``driverId`` column (values must be unique).
        df_results: Frame with ``raceId`` and ``driverId`` columns.
        raceId: ID of the race whose drivers are wanted.

    Returns:
        A DataFrame with one row per result of the race; empty when the
        race has no results.

    Raises:
        IndexError: If a result references a ``driverId`` that is missing
            from ``df_drivers``.
    """
    race_results = df_results[df_results["raceId"] == raceId]
    wanted_ids = race_results["driverId"]

    # get_indexer maps each wanted ID to its row position, or -1 if unknown.
    positions = pd.Index(df_drivers["driverId"]).get_indexer(wanted_ids)
    unknown = positions < 0
    if unknown.any():
        # A -1 passed on to iloc would silently select the last driver.
        missing_ids = sorted(set(wanted_ids[unknown]))
        raise IndexError(f"driverId(s) not present in df_drivers: {missing_ids}")
    return df_drivers.iloc[positions]

# Races past certain year
def get_recent_races(df_races, df_results, min_year):
    """Return the result rows of every race held in ``min_year`` or later.

    Args:
        df_races: Frame with ``raceId`` and ``year`` columns.
        df_results: Frame with a ``raceId`` column.
        min_year: Earliest season (inclusive) to keep.

    Returns:
        The subset of ``df_results`` whose ``raceId`` belongs to a race with
        ``year >= min_year``; empty when no race qualifies.
    """
    recent_race_ids = df_races.loc[df_races["year"] >= min_year, "raceId"]
    # Filter rows by membership. Indexing df_results directly with the IDs
    # would treat them as column labels and raise KeyError.
    return df_results[df_results["raceId"].isin(recent_race_ids)]

In [None]:
#get_status(df_status, df_results, 1)
#get_drivers(df_drivers, df_results, 1)
#df_results.head()

In [None]:
# Marker that the comparisons below treat as an "empty" cell.
MISSING = "\\N"


def _lap_time_to_seconds(lap_time):
    """Convert a colon-separated time string to seconds.

    Handles "m:ss.sss" (e.g. "1:23.456" -> 83.456) as well as plain seconds
    or "h:mm:ss.sss". Returns NaN for NaN input or the MISSING marker.
    """
    if pd.isna(lap_time) or lap_time == MISSING:
        return float("nan")
    seconds = 0.0
    # Fold left to right: each further colon promotes the running total by 60.
    for part in str(lap_time).split(":"):
        seconds = seconds * 60 + float(part)
    return seconds


# Remove rows where "position" is empty
# NOTE: Should this have been done after creating "merge_attempt"?
# This mutates df_results in place, so later cells see the filtered frame.
df_results.drop(df_results[df_results["position"] == MISSING].index, inplace=True)
# No results captured for the 2023 season, so use 2022.
df_modify = df_results[df_results["raceId"].isin(df_races[df_races["year"] >= 2022]["raceId"].values)]
# Remove additional useless columns
df_prep = df_modify.drop(columns=["resultId", "number", "positionText", "fastestLap"])
# Convert lap time from text to float seconds so that it can be used as a
# numeric feature. Rows without a fastest lap get NaN here.
df_prep["fastestLapTime_seconds"] = df_prep["fastestLapTime"].apply(_lap_time_to_seconds)

# Attach the qualifying result of the same driver in the same race. Both
# frames have a "position" column, which pandas suffixes as "position_x"
# (race result) and "position_y" (qualifying).
merge_attempt = pd.merge(df_prep, df_qualifying[["raceId", "driverId", "position", "q1", "q2", "q3"]], on=["raceId", "driverId"])

# Remove rows where "q1", "q2", "q3" are empty. Rows without a fastest lap
# time/speed are removed as well, because they cannot be converted to
# numbers below.
qualifying_columns = ["q1", "q2", "q3"]
required_columns = ["fastestLapTime", "fastestLapSpeed", *qualifying_columns]
is_missing = merge_attempt[required_columns].isna() | merge_attempt[required_columns].eq(MISSING)
# .copy() so that the column assignments below write to an independent frame.
merge_attempt = merge_attempt[~is_missing.any(axis=1)].copy()

# More converting time to floats: "q1" -> "q1s", "q2" -> "q2s", "q3" -> "q3s"
for column in qualifying_columns:
    merge_attempt[f"{column}s"] = merge_attempt[column].apply(_lap_time_to_seconds)

# Convert fastestLapSpeed from "object" to "numeric" -> required for classification.
merge_attempt["fastestLapSpeed"] = pd.to_numeric(merge_attempt["fastestLapSpeed"])

In [None]:
#plt.scatter(df_prep["grid"], df_prep["position"])
#plt.scatter(df_prep["fastestLapTime_seconds"], df_prep["grid"])

# # Initial Test
# X = df_prep["grid"]
# y = df_prep["position"]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# X_train = X_train.values.reshape(-1,1)
# y_train = y_train.values.reshape(-1,)

# X_test = X_test.values.reshape(-1,1)
# y_test = y_test.values.reshape(-1,)

# model = KNeighborsClassifier()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# accuracy_score(y_test, y_pred)

In [None]:
# Preview the first rows of the merged results/qualifying frame.
merge_attempt.head()

In [None]:
def classifier_fit_train(model, X_train, X_test, y_train, y_test):
    """Fit a classifier, score it on the test split and print the metrics.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A ``(accuracy, mae)`` tuple. ``mae`` is NaN when the labels cannot
        be interpreted as numbers, since an absolute error is undefined then.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    try:
        # Coerce both label arrays to float before measuring the error.
        # NOTE(review): the failure previously seen with Naive Bayes was
        # presumably caused by string-typed labels - confirm.
        mae = mean_absolute_error(
            np.asarray(y_test, dtype=float),
            np.asarray(y_pred, dtype=float),
        )
    except (TypeError, ValueError):
        # Non-numeric labels: report "not a number" rather than a made-up
        # value that could be mistaken for a real error.
        mae = float("nan")
    print(f"Accuracy: {acc*100:.2f}%")
    print(f"MAE: {mae}")
    return acc, mae

# Candidate classifiers, each instantiated with its default settings.
# Original note, attached to GradientBoostingClassifier: "Currently does not
# converge and will show a warning when run."
# NOTE(review): confirm which estimator actually emits that warning.
classifiers = [
    estimator_class()
    for estimator_class in (
        KNeighborsClassifier,
        DecisionTreeClassifier,
        LogisticRegression,
        SVC,
        GaussianNB,
        RandomForestClassifier,
        GradientBoostingClassifier,
    )
]

# Predict the race finishing position ("position_x") from the grid slot, the
# qualifying position ("position_y") and the lap/qualifying timings.
X = merge_attempt[
    [
        "grid",
        "position_y",
        "fastestLapSpeed",
        "fastestLapTime_seconds",
        "q1s",
        "q2s",
        "q3s",
    ]
]
y = merge_attempt["position_x"]

# 70/30 split with a fixed seed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Hand the labels over as flat 1-D arrays.
y_train = y_train.values.ravel()
y_test = y_test.values.ravel()

for model in classifiers:
    print(f"====== {type(model).__name__} =======")
    classifier_fit_train(model, X_train, X_test, y_train, y_test)

In [None]:
# # TODO: See when we have no information of actual race results
# X = merge_attempt[["grid", "position_y", "q1s", "q2s", "q3s"]]
# y = merge_attempt["fastestLapTime_seconds"]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
def regressor_fit_train(model, X_train, X_test, y_train, y_test):
    """Fit a regressor and print RMSE, MAE and R2 for the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # RMSE is the square root of the mean squared error.
    rmse = sqrt(mean_squared_error(y_test, predictions))
    print("RMSE:", rmse)

    mae = mean_absolute_error(y_test, predictions)
    print("MAE", mae)

    r2 = r2_score(y_test, predictions)
    print("R2", r2)

# Candidate regressors, each instantiated with its default settings. The
# same instances are re-fitted for every target below.
regressors = [
    estimator_class()
    for estimator_class in (
        KNeighborsRegressor,
        LinearRegression,
        Ridge,
        Lasso,
        DecisionTreeRegressor,
        SVR,
        RandomForestRegressor,
        GradientBoostingRegressor,
    )
]

# One entry per experiment: (heading to print, feature columns, target column).
regression_runs = [
    (
        "\nFastest Lap Time\n",
        ["grid", "position_x", "position_y", "fastestLapSpeed", "q1s", "q2s", "q3s"],
        "fastestLapTime_seconds",
    ),
    (
        "\nFastest Lap Speed\n",
        ["grid", "position_x", "position_y", "fastestLapTime_seconds", "q1s", "q2s", "q3s"],
        "fastestLapSpeed",
    ),
]

for heading, feature_columns, target_column in regression_runs:
    print(heading)

    X = merge_attempt[feature_columns]
    y = merge_attempt[target_column]

    # 70/30 split with a fixed seed so the split is the same on every run.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    # Hand the targets over as flat 1-D arrays.
    y_train = y_train.values.ravel()
    y_test = y_test.values.ravel()

    for model in regressors:
        print(f"====== {type(model).__name__} =======")
        regressor_fit_train(model, X_train, X_test, y_train, y_test)