# Track Stats

In this notebook, some parts of the code are reused from the previous notebook. The objective is to compute some statistics about the track, such as the distance covered at full throttle, the number of gear changes per lap, and so on.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import matplotlib.pyplot as plt
import seaborn as sns

from itertools import groupby

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Target dtype for each telemetry column; applied column by column after the CSV is loaded.
dtypes = {
     'sessionTime' : "float32",
     'frameIdentifier' : "uint32",
     'pilot_index' : "uint8",
     'worldPositionX' : "float32",
     'worldPositionY' : "float32",
     'worldPositionZ' : "float32",
     'worldVelocityX' : "float32",
     'worldVelocityY' : "float32",
     'worldVelocityZ' : "float32",
     'worldForwardDirX' : "int32",
     'worldForwardDirY' : "int32",
     'worldForwardDirZ' : "int32",
     'worldRightDirX' : "int32",
     'worldRightDirY' : "int32",
     'worldRightDirZ' : "int32",
     'gForceLateral' : "float32",
     'gForceLongitudinal' : "float32",
     'gForceVertical' : "float32",
     'yaw' : "float32",
     'pitch' : "float32",
     'roll' : "float32",
     'speed' : "float32",
     'throttle' : "float32",
     'steer' : "float32",
     'brake' : "float32",
     'clutch': "uint8",
     'gear': "uint8",
     'engineRPM' : "uint32",
     'drs' : "bool",
     'engineTemperature': "uint8",
     'fuelMix': "uint8",
     'pitLimiterStatus': "bool",
     'fuelInTank' : "float32",
     'fuelRemainingLaps' : "float32",
     'ersStoreEnergy' : "uint32",
     'ersDeployMode' : "uint32",
     'ersHarvestedThisLapMGUK' : "uint32",
     'ersHarvestedThisLapMGUH' : "uint32",
     'ersDeployedThisLap' : "uint32",
     'carPosition' : "uint8",
     'currentLapTime' : "float32",
     'currentLapNum' : "uint8",
     'sector': "uint8",
     'lapDistance' : "float32",
     'totalDistance' : "float32",
}

# Replacement values for missing entries, applied before the dtype cast below.
# NOTE(review): presumably these are the columns that contain NaNs in the raw CSV; columns
# that are not listed here are cast without any fill - confirm they never hold missing values.
fillnas = {
    'clutch' : 0,
    'gear' : 0,
    'engineRPM': 0,
    "engineTemperature" : 0,
    "fuelMix": 1,
    "pitLimiterStatus" : False,
    "ersStoreEnergy" : 4e7,
    "ersDeployMode" : 1,
    "ersHarvestedThisLapMGUK" : 0,
    "ersHarvestedThisLapMGUH" : 0,
    "ersDeployedThisLap" : 0,
    "sector" : 0
}

# Raw telemetry table for the session.
df = pd.read_csv("/kaggle/input/f1-2020-race-data/TelemetryData_3335673977098133433.csv")

# Fill missing values first (where a fill value is defined), then cast to the target dtype.
for col, dtype in dtypes.items():
    if col in fillnas:
        df[col] = df[col].fillna(fillnas[col])
    df[col] = df[col].astype(dtype)

In [None]:
# Participant table; merged on "pilot_index" further down to attach the driver/team columns
# (e.g. "teamId", "driverId") used in the plots.
pilot = pd.read_csv("/kaggle/input/f1-2020-race-data/ParticipantData_3335673977098133433.csv")

In [None]:
# Session metadata: only the first row is kept, as a plain dict ("trackLength" is read from it later).
session = pd.read_csv("/kaggle/input/f1-2020-race-data/SessionData_3335673977098133433.csv").iloc[0].to_dict()
print(session)

In [None]:
# Per-lap timing table; its "currentLapNum", "LapTime" and "pilot_index" columns are used below.
race = pd.read_csv("/kaggle/input/f1-2020-race-data/RaceTimeData_3335673977098133433.csv")

In [None]:
def remove_flashbacks(df, pilot=19, jump_dist_sq=1000, lookback=500):
    """Drop the telemetry frames that were replayed because of a flashback.

    A flashback is detected on the reference ``pilot`` as two consecutive
    samples whose squared world-position distance exceeds ``jump_dist_sq``.
    For each jump, the sample closest to the post-jump position is searched
    among the (at most) ``lookback`` samples preceding the jump. Every frame
    after that closest sample, up to and including the first frame after the
    jump, is removed from ``df`` (for all pilots, since the filter is applied
    on ``frameIdentifier``).

    Parameters
    ----------
    df : pandas.DataFrame
        Telemetry with the columns ``pilot_index``, ``frameIdentifier``,
        ``worldPositionX``, ``worldPositionY`` and ``worldPositionZ``.
    pilot : int, default 19
        ``pilot_index`` of the car used to detect the flashbacks.
    jump_dist_sq : float, default 1000
        Squared-distance threshold between consecutive samples above which
        the move is considered a flashback.
    lookback : int, default 500
        Number of samples before the jump that are searched for the position
        the car came back to. Must be at least 1.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the frames belonging to the discarded attempts.

    Raises
    ------
    ValueError
        If ``lookback`` is smaller than 1.
    """
    if lookback < 1:
        raise ValueError("lookback must be at least 1")

    reference = df[df["pilot_index"] == pilot]
    frame = reference["frameIdentifier"].values
    X = reference[["worldPositionX", "worldPositionY", "worldPositionZ"]].values

    dist_sq = ((X[1:, :] - X[:-1, :]) ** 2).sum(axis=1)
    # dist_sq[k] compares samples k and k+1, hence the +1 to get the first sample after the jump.
    idx_frame_after_flashback = np.flatnonzero(dist_sq > jump_dist_sq) + 1

    for idx_pos in idx_frame_after_flashback:
        # Clamp the window start: a negative start would make the slice and the
        # frame lookup below address different samples for early flashbacks.
        window_start = max(idx_pos - lookback, 0)
        d = ((X[window_start:idx_pos] - X[idx_pos]) ** 2).sum(axis=1)
        start = frame[window_start + np.argmin(d)]
        stop = frame[idx_pos]
        df = df[(df["frameIdentifier"] > stop) | (df["frameIdentifier"] <= start)]

    return df

# Flashbacks are detected on pilot 19; the matching frames are dropped from the whole table.
df = remove_flashbacks(df, pilot=19)

# Percent flat out and braking

Let's look at how often we are at full throttle and braking

In [None]:
# Telemetry of pilot 19 on lap 2, restricted to the columns needed for the throttle/brake study.
# An explicit copy is taken because new columns ("flat_out", "braking_zone") are assigned to
# subdf afterwards; assigning to a chained slice of df raises SettingWithCopyWarning.
lap_mask = (df["pilot_index"] == 19) & (df["currentLapNum"] == 2)
subdf = df.loc[lap_mask, ["worldPositionX", "worldPositionZ", "throttle", "brake", "lapDistance"]].copy()

In [None]:
# Summary (columns, dtypes, non-null counts) of the single-lap selection.
subdf.info()

In [None]:
# Track map for the selected lap: blue where throttle > 0.95, red elsewhere.
c = ["b" if flat_out else "r" for flat_out in subdf["throttle"] > 0.95]

plt.figure(figsize=(20, 12))
plt.scatter(subdf["worldPositionZ"], subdf["worldPositionX"], marker="o", s=1, c=c)
plt.axis('equal')
# Optional zoom on one part of the track:
# plt.xlim(-500, -300)
# plt.ylim(-200, 0)
plt.show()

In [None]:
# Track map for the selected lap: blue where brake > 0.1, red elsewhere.
c = ["b" if braking else "r" for braking in subdf["brake"] > 0.1]

plt.figure(figsize=(20, 12))
plt.scatter(subdf["worldPositionZ"], subdf["worldPositionX"], marker="o", s=1, c=c)
plt.axis('equal')
# Optional zoom on one part of the track:
# plt.xlim(-500, -300)
# plt.ylim(-200, 0)
plt.show()

In [None]:
# Boolean flags per sample: full throttle (> 0.95) and braking (> 0.30).
subdf["flat_out"] = subdf["throttle"].gt(0.95)
subdf["braking_zone"] = subdf["brake"].gt(0.30)

The measure will be expressed as a percentage so that tracks can be compared, and as a percentage of the lap distance rather than of the lap time, to be more consistent.

In [None]:
def get_distance(df, feature):
    """Sum the lap distance covered while the boolean column ``feature`` is true.

    The rows of ``df`` are split into consecutive runs of identical ``feature``
    values. For every truthy run, the difference between the ``lapDistance`` of
    its last and first rows is added to the total.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``lapDistance`` and the ``feature`` column, in driving order.
    feature : str
        Name of the flag column.

    Returns
    -------
    number
        Accumulated distance; ``0`` when no row has the flag set.
    """
    lap_dist = df["lapDistance"].values

    total = 0
    run_start = 0
    for is_active, run in groupby(df[feature].values):
        run_stop = run_start + sum(1 for _ in run)
        if is_active:
            total += lap_dist[run_stop - 1] - lap_dist[run_start]
        run_start = run_stop

    return total

In [None]:
# Distance flat out, then distance under braking, for the selected lap.
for flag_column in ("flat_out", "braking_zone"):
    print(get_distance(subdf, flag_column))

OK, now we have the code for one pilot and one lap. Let's do it for each pilot and lap to be able to see the spread and find an average.

In [None]:
# Same flags as for the single lap, now on the whole telemetry table.
df["flat_out"] = df["throttle"].gt(0.95)
df["braking_zone"] = df["brake"].gt(0.30)

In [None]:
# Distance covered flat out / under braking for every (pilot, lap) pair.
# The resulting series are named explicitly (as done for the speed metrics further down), so
# that reset_index() yields meaningful column names instead of two columns both called 0.
flat = df.groupby(["pilot_index", "currentLapNum"]).apply(get_distance, feature="flat_out").rename("flat_out").reset_index()
brake = df.groupby(["pilot_index", "currentLapNum"]).apply(get_distance, feature="braking_zone").rename("braking_zone").reset_index()

In [None]:
# Join both metrics on (pilot, lap), name the columns, then attach the participant info.
lap_keys = ["pilot_index", "currentLapNum"]
ans = flat.merge(brake, how="left", on=lap_keys)
ans.columns = ["pilot_index", "currentLapNum", "flat_out", "braking_zone"]
ans = ans.merge(pilot, how="left", on=["pilot_index"])

We need to remove the last laps, in case I have lapped the slowest cars (their final laps would then be incomplete).

In [None]:
# Keep only the laps numbered below 28.
ans = ans.loc[ans["currentLapNum"].lt(28)]

In [None]:
# Preview of the per-lap table.
ans.head()

In [None]:
# Median of the two metrics only: `ans` also holds the participant columns merged in from
# `pilot`, and DataFrame.quantile no longer skips non-numeric columns by default (pandas >= 2.0).
ans[["flat_out", "braking_zone"]].quantile(0.5)

In [None]:
# One box plot per metric, grouped by team, side by side.
fig, axes = plt.subplots(1, 2, figsize=(30, 12))
for metric, axis in zip(("flat_out", "braking_zone"), axes):
    sns.boxplot(x="teamId", y=metric, data=ans, ax=axis)
plt.show()

Here it is: we can see that Mercedes covers a lot more distance at full throttle than Alfa Romeo (around +300 m per lap). On the other hand, they often brake over a shorter distance per lap (probably because they are slower or in traffic).
Now we can do similar work for speed.

# percent lap over 300 km/h & below 150

In [None]:
def get_distance(df, feature):
    """Sum the lap distance covered while the boolean column ``feature`` is true.

    Consecutive rows sharing the same ``feature`` value form a run; each truthy
    run contributes the ``lapDistance`` of its last row minus that of its first.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``lapDistance`` and the ``feature`` column, in driving order.
    feature : str
        Name of the flag column.

    Returns
    -------
    number
        Accumulated distance; ``0`` when no row has the flag set.
    """
    lap_dist = df["lapDistance"].values

    total = 0
    run_start = 0
    for is_active, run in groupby(df[feature].values):
        run_stop = run_start + sum(1 for _ in run)
        if is_active:
            total += lap_dist[run_stop - 1] - lap_dist[run_start]
        run_start = run_stop

    return total

# Flag the samples above 300 and below 150 (speed units as stored in the "speed" column),
# then accumulate the matching distance for every (pilot, lap) pair.
df["above300"] = df["speed"].gt(300)
df["below150"] = df["speed"].lt(150)
above300 = (
    df.groupby(["pilot_index", "currentLapNum"])
    .apply(get_distance, "above300")
    .rename("above300")
    .reset_index()
)
below150 = (
    df.groupby(["pilot_index", "currentLapNum"])
    .apply(get_distance, "below150")
    .rename("below150")
    .reset_index()
)

In [None]:
# Join both speed metrics on (pilot, lap), attach the participant info, keep laps below 28.
ans = above300.merge(below150, how="left", on=["pilot_index", "currentLapNum"])
ans = ans.merge(pilot, how="left", on=["pilot_index"])
ans = ans.loc[ans["currentLapNum"].lt(28)]

# Median distance per lap for each metric.
ans[["above300", "below150"]].median()

In [None]:
# One box plot per speed metric, grouped by team, side by side.
fig, axes = plt.subplots(1, 2, figsize=(30, 12))
for metric, axis in zip(("above300", "below150"), axes):
    sns.boxplot(x="teamId", y=metric, data=ans, ax=axis)
plt.show()

Strangely, the distance per lap at more than 300 km/h is not very consistent. The race is Bahrain, which contains a lot of straight lines. However, the distance at less than 150 km/h is tightly grouped, except for outliers due to pit stops.

# gear change

One fun metric provided during a real race is the average number of gear changes per lap. This is quite simple to compute:

In [None]:
def get_number_gear_change(df):
    """Count how many times the gear changes over the rows of ``df``.

    Consecutive rows with the same ``gear`` value form a run; a gear change is
    a transition between two runs, so the result is the number of runs minus
    one (the first run is the gear the car already had, not a change).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``gear`` column, in driving order.

    Returns
    -------
    int
        Number of gear changes; ``0`` for an empty frame or a constant gear.
    """
    runs = sum(1 for _ in groupby(df["gear"]))
    # max() keeps the empty case at 0 instead of -1.
    return max(runs - 1, 0)

In [None]:
# Gear-change count for every (pilot, lap) pair, with participant info, laps below 28 only.
gear_counts = df.groupby(["pilot_index", "currentLapNum"]).apply(get_number_gear_change)
gear = gear_counts.rename("gear_change").reset_index()
gear = gear.merge(pilot, how="left", on=["pilot_index"])
gear = gear.loc[gear["currentLapNum"].lt(28)]

In [None]:
# Preview of the per-lap gear-change table.
gear.head()

In [None]:
# Median of the per-lap gear-change metric over all pilots and laps kept above.
gear["gear_change"].median()

In [None]:
# Gear changes per lap: grouped by team (left) and by lap number (right).
fig, axes = plt.subplots(1, 2, figsize=(30, 12))
for grouping, axis in zip(("teamId", "currentLapNum"), axes):
    sns.boxplot(x=grouping, y="gear_change", data=gear, ax=axis)
plt.show()

We can see 3 or 4 laps with more gear changes: the first one is due to the start (+8 gear changes) and the others to the pit stops (+5 gear changes on average).

In [None]:
# Position, gear and speed of pilot 19 on lap 2.
lap_mask = (df["pilot_index"] == 19) & (df["currentLapNum"] == 2)
subdf = df.loc[lap_mask, ["worldPositionX", "worldPositionZ", "gear", "speed"]]

In [None]:
# Track map for the selected lap, coloured by speed.
plt.figure(figsize=(20, 12))
plt.scatter(
    subdf["worldPositionZ"],
    subdf["worldPositionX"],
    marker="o",
    s=1,
    c=subdf["speed"],
    cmap="cool",
)
plt.title("Speed on track")
plt.axis('equal')
# Optional zoom on one part of the track:
# plt.xlim(-500, -300)
# plt.ylim(-200, 0)
plt.show()

# Lap Time / Fastest Lap average speed

Easy one: let's look at the average speed on the track and also the average speed of the fastest lap. Due to outliers caused by pit stops and the last lap, the median will be used instead of the mean.

In [None]:
# Lap time of every pilot per lap, with the overall median lap time drawn as a horizontal line.
fig, ax = plt.subplots(1, figsize=(30, 12))
sns.scatterplot(x="currentLapNum", y="LapTime", hue="pilot_index", data=race, ax=ax)
median_lap_time = race["LapTime"].median()
ax.hlines(median_lap_time, 0, 30)
plt.show()

In [None]:
# Fastest lap among the laps numbered below 28.
counted_lap_times = race.loc[race["currentLapNum"] < 28, "LapTime"]
fastest_lap = counted_lap_times.min()
# NOTE(review): the 3.6 factor presumably converts m/s to km/h, i.e. trackLength in metres
# and LapTime in seconds - confirm against the data description.
fastest_avg_speed = session["trackLength"] / fastest_lap * 3.6
print(f"Fastest Average Speed : {fastest_avg_speed:.2f}km/h")

In [None]:
# Average speed derived from the median lap time (all laps of the timing table).
median_lap_time = race["LapTime"].median()
speed = session["trackLength"] / median_lap_time * 3.6
print(f"Average Speed : {speed:.2f}km/h")

# Max Speed

Similarly, we can have a look at the max speed per lap and keep track of whether it was reached with or without DRS (in principle, DRS should give a 15 km/h advantage).

In [None]:
def get_max_speed(df):
    """Return the ``drs`` and ``speed`` values of the row where ``speed`` is highest.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``drs`` and ``speed`` columns.

    Returns
    -------
    pandas.Series
        Two entries, ``drs`` and ``speed``, taken from the fastest row.
    """
    fastest_pos = df["speed"].argmax()
    fastest_row = df.iloc[fastest_pos]
    return fastest_row[["drs", "speed"]]

# Top speed (and DRS state at that moment) for every (pilot, lap) pair, laps below 28 only,
# with the participant info attached.
speed = (
    df.groupby(["pilot_index", "currentLapNum"])
    .apply(get_max_speed)
    .reset_index()
)
speed = speed.loc[speed["currentLapNum"].lt(28)]
speed = speed.merge(pilot, how="left", on=["pilot_index"])

In [None]:
# Top speed per lap, grouped by team and split by DRS state.
plt.figure(figsize=(20, 12))
sns.boxplot(data=speed, x="teamId", y="speed", hue="drs")
plt.show()

We can see a difference of around 15 km/h with variation based on the team as they don't have the same engine.

# Gs

This one is a bit more tricky: I wanted to get the max G-force a driver is taking.

In [None]:
# Position, G-forces and lap distance of pilot 19 on lap 2.
lap_mask = (df["pilot_index"] == 19) & (df["currentLapNum"] == 2)
subdf = df.loc[lap_mask, ["worldPositionX", "worldPositionZ", 'gForceLateral', 'gForceLongitudinal', 'gForceVertical', "lapDistance"]]

In [None]:
# Lateral G-force along the selected lap, plotted against the lap distance.
plt.plot(subdf["lapDistance"], subdf["gForceLateral"])
plt.show()

In [None]:
# All laps of pilot 19, keeping the lap number so that each lap can be drawn separately.
subdf = df.loc[
    df["pilot_index"] == 19,
    ["worldPositionX", "worldPositionZ", 'gForceLateral', 'gForceLongitudinal', 'gForceVertical', "lapDistance", "currentLapNum"],
]

# Lateral G-force against lap distance, one line per lap number in 0..19.
plt.figure(figsize=(20, 12))
for lap_number in range(20):
    lap_data = subdf[subdf["currentLapNum"] == lap_number]
    plt.plot(lap_data["lapDistance"], lap_data["gForceLateral"])
plt.show()

In [None]:
# Longitudinal G-force against lap distance, one line per lap number in 0..19.
plt.figure(figsize=(20, 12))
for lap_number in range(20):
    lap_data = subdf[subdf["currentLapNum"] == lap_number]
    plt.plot(lap_data["lapDistance"], lap_data["gForceLongitudinal"])
plt.ylim(-5, 3)
plt.show()

In [None]:
def get_max_g_lat(df):
    """Return the largest value of the ``gForceLateral`` column of ``df``.

    NOTE(review): this is the signed maximum, not the maximum magnitude; if the
    column is signed by cornering direction, only one direction is captured -
    confirm this is the intent.
    """
    lateral_g = df["gForceLateral"]
    return lateral_g.max()

# Maximum lateral G-force for every (pilot, lap) pair, stored in the "max_lat_acc" column.
acc = df.groupby(["pilot_index", "currentLapNum"]).apply(get_max_g_lat).rename("max_lat_acc").reset_index()

In [None]:
# Keep the laps numbered below 28 and attach the participant info.
acc = acc.loc[acc["currentLapNum"].lt(28)]
acc = acc.merge(pilot, how="left", on=["pilot_index"])

In [None]:
# Preview of the per-lap lateral G-force table.
acc.head()

In [None]:
# Per-lap maximum lateral G-force for every driver; labels are tilted to stay readable.
fig, ax = plt.subplots(figsize=(20, 12))
sns.boxplot(data=acc, x="driverId", y="max_lat_acc", ax=ax)
driver_labels = ax.get_xticklabels()
ax.set_xticklabels(driver_labels, rotation=30)
plt.show()

To my surprise, I (driving as Valtteri Bottas) take a lot less lateral force than my teammate. Let's filter on the two of us and look at the result.

In [None]:
# Same plot restricted to the rows whose teamId is "Mercedes".
mercedes_acc = acc[acc["teamId"] == "Mercedes"]
fig, ax = plt.subplots(figsize=(20, 12))
sns.boxplot(data=mercedes_acc, x="driverId", y="max_lat_acc", ax=ax)
driver_labels = ax.get_xticklabels()
ax.set_xticklabels(driver_labels, rotation=30)
plt.show()

That means I'm probably driving too smoothly and I should be able to push the car more in the corners. This is what I see when I drive against the A.I. at difficulty 100. I'll maybe record other laps against an A.I. that is better than me, to understand where I'm braking too much and improve my lap time. On this data, I'm already faster than the A.I. (except in Spain), so it makes no sense to explore it here.

I hope you enjoyed it! More to come.