In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from itertools import groupby
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Target dtype of each telemetry column. Every column listed here is cast with
# astype() in the conversion loop further down, so each key must exist in the
# loaded frame.
dtypes = {
     'sessionTime' : "float32",
     'frameIdentifier' : "uint32",
     'pilot_index' : "uint8",
     'worldPositionX' : "float32",
     'worldPositionY' : "float32",
     'worldPositionZ' : "float32",
     'worldVelocityX' : "float32",
     'worldVelocityY' : "float32",
     'worldVelocityZ' : "float32",
     'worldForwardDirX' : "int32",
     'worldForwardDirY' : "int32",
     'worldForwardDirZ' : "int32",
     'worldRightDirX' : "int32",
     'worldRightDirY' : "int32",
     'worldRightDirZ' : "int32",
     'gForceLateral' : "float32",
     'gForceLongitudinal' : "float32",
     'gForceVertical' : "float32",
     'yaw' : "float32",
     'pitch' : "float32",
     'roll' : "float32",
     'speed' : "float32",
     'throttle' : "float32",
     'steer' : "float32",
     'brake' : "float32",
     'clutch': "uint8",
     'gear': "uint8",
     'engineRPM' : "uint32",
     'drs' : "bool",
     'engineTemperature': "uint8",
     'fuelMix': "uint8",
     'pitLimiterStatus': "bool",
     'fuelInTank' : "float32",
     'fuelRemainingLaps' : "float32",
     'ersStoreEnergy' : "uint32",
     'ersDeployMode' : "uint32",
     'ersHarvestedThisLapMGUK' : "uint32",
     'ersHarvestedThisLapMGUH' : "uint32",
     'ersDeployedThisLap' : "uint32",
     'carPosition' : "uint8",
     'currentLapTime' : "float32",
     'currentLapNum' : "uint8",
     'sector': "uint8",
     'lapDistance' : "float32",
     'totalDistance' : "float32",
}

# Replacement value for missing entries, per column. The conversion loop applies
# these with fillna() before the astype() cast; columns not listed here are cast
# without any fill.
# NOTE(review): 'drs' and 'carPosition' are cast to bool / uint8 without a fill
# value -- confirm they never contain NaN in the raw data.
fillnas = {
    'clutch' : 0,
    'gear' : 0,
    'engineRPM': 0,
    "engineTemperature" : 0,
    "fuelMix": 1,
    "pitLimiterStatus" : False,
    "ersStoreEnergy" : 4e7,
    "ersDeployMode" : 1,
    "ersHarvestedThisLapMGUK" : 0,
    "ersHarvestedThisLapMGUH" : 0,
    "ersDeployedThisLap" : 0,
    "sector" : 0
}

In [None]:
# Load the telemetry CSV of the session from the Kaggle dataset.
df = pd.read_csv("/kaggle/input/f1-2020-race-data/TelemetryData_3335673977098133433.csv")

In [None]:
# Load the participant CSV of the same session and render it in full.
participants = pd.read_csv("/kaggle/input/f1-2020-race-data/ParticipantData_3335673977098133433.csv")
display(participants)

In [None]:
# Fill the columns that have a default value, then cast every column listed in
# `dtypes` to its compact dtype.
for col, dtype in dtypes.items():
    column = df[col]
    if col in fillnas:
        column = column.fillna(fillnas[col])
    df[col] = column.astype(dtype)

In [None]:
# Preview the first rows after the dtype conversion.
df.head()

In [None]:
# Summary of the columns, their dtypes and non-null counts after conversion.
df.info()

# remove flashback

In case of an accident or a driving mistake, the pilot can use a flashback to rewind and redo the action. However, a flashback does not delete the frames that were already recorded (I expected them to be overwritten, but they are not), so let's find them and remove them.

In [None]:
def remove_flashbacks(df, pilot=19, jump_threshold_sq=1000, lookback=500):
    """Drop the frames that were replayed after a flashback (rewind).

    A flashback is detected on ``pilot``'s trajectory as a jump between two
    consecutive samples whose squared distance exceeds ``jump_threshold_sq``.
    For each jump, the sample closest to the post-flashback position is looked
    up among the (at most) ``lookback`` samples preceding the jump. Every frame
    after that sample, up to and including the first post-flashback frame, is
    removed. The removal is done on ``frameIdentifier``, so it applies to the
    rows of every pilot, not only ``pilot``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pilot_index``, ``frameIdentifier`` and
        ``worldPositionX`` / ``worldPositionY`` / ``worldPositionZ``.
    pilot : int, default 19
        ``pilot_index`` whose trajectory is used to detect the flashbacks.
    jump_threshold_sq : float, default 1000
        Squared distance between consecutive samples above which a jump is
        considered a flashback.
    lookback : int, default 500
        Maximum number of samples before the jump in which the rewind point
        is searched.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the replayed frames (``df`` itself when no flashback
        is detected).
    """
    pilot_rows = df[df["pilot_index"] == pilot]
    frame = pilot_rows["frameIdentifier"].values
    X = pilot_rows[["worldPositionX", "worldPositionY", "worldPositionZ"]].values

    dist_sq = ((X[1:, :] - X[:-1, :]) ** 2).sum(axis=1)
    # dist_sq[k] compares samples k and k+1, so +1 gives the index of the
    # first sample recorded after the flashback was validated.
    idx_frame_after_flashback = np.flatnonzero(dist_sq > jump_threshold_sq) + 1

    for idx_pos in idx_frame_after_flashback:
        # Clamp the window start: a negative start would wrap around the end
        # of the array (or give an empty window) when the flashback happens
        # within the first `lookback` samples.
        window_start = max(idx_pos - lookback, 0)
        d = ((X[window_start:idx_pos] - X[idx_pos]) ** 2).sum(axis=1)
        start = frame[window_start + np.argmin(d)]  # last frame kept before the rewind
        stop = frame[idx_pos]  # first frame after validating the flashback
        df = df[(df["frameIdentifier"] > stop) | (df["frameIdentifier"] <= start)]

    return df

In [None]:
# Trajectory of pilot 19 (Z on the horizontal axis, X on the vertical axis).
pilot_track = df[df["pilot_index"] == 19]

plt.figure(figsize=(20, 12))
plt.plot(pilot_track["worldPositionZ"], pilot_track["worldPositionX"])
plt.axis('equal')
plt.show()

In [None]:
# Drop the frames replayed after each flashback detected on pilot 19's trajectory.
df = remove_flashbacks(df, pilot=19)

In [None]:
# Trajectory of pilot 19 coloured by pit status:
# blue = no pit status recorded, red = any pit status.
pilot_track = df[df["pilot_index"] == 19]
# np.where avoids indexing a Python list with numpy booleans (deprecated in NumPy).
c = np.where(pilot_track["pitStatus"].isnull().values, "b", "r").tolist()

plt.figure(figsize=(20, 12))
plt.scatter(pilot_track["worldPositionZ"], pilot_track["worldPositionX"], marker="o", s=1, c=c)
plt.axis('equal')
# Optional zoom window:
# plt.xlim(-500, -300)
# plt.ylim(-200, 0)
plt.show()

# Extract Pit time per pilot and pit stop

The logic here is quite simple. A pilot is on track; when he enters the pit lane, there is a first "pitting" phase, during which he drives through the pit lane at a limited speed (60 km/h in general). When he reaches his stand, he stops the car and the "in pit area" status starts. This is when tyres are changed. Then he exits the pit lane, still at limited speed (this is the second "pitting" phase).
In case of a drive-through penalty, a driver can be asked to go through the pit lane without ever being "in pit area".

In [None]:
# Number of frames per pit status (value_counts leaves missing values out by default).
df["pitStatus"].value_counts()

In [None]:
def get_pit_durations(df):
    """Measure the pit-lane and pit-stand durations of a single pilot.

    The ``pitStatus`` column is split into runs of identical consecutive
    values (a missing status is treated as "on track"):

    * a pit-lane duration runs from the first sample of the first "pitting"
      run to the first sample of the next "on track" run; an "in pit area"
      run in between is not required (drive-through penalty);
    * a pit-stop duration runs from the first sample of an "in pit area" run
      to the first sample following it.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one pilot in chronological order, with ``sessionTime`` and
        ``pitStatus`` columns.

    Returns
    -------
    (list, list)
        ``(pit_stop_duration, pitting_duration)``: time spent in the pit area
        and time spent in the pit lane, one entry per occurrence.
    """
    sessionTime = df["sessionTime"].values
    pitStatus = df["pitStatus"].fillna("on track").values
    last_index = len(sessionTime) - 1

    pit_stop_duration = []
    pitting_duration = []

    run_start = 0           # index of the first sample of the current run
    lane_entry_time = None  # set while the pilot is inside the pit lane
    for status, run in groupby(pitStatus):
        run_length = sum(1 for _ in run)
        if status == "pitting" and lane_entry_time is None:
            lane_entry_time = sessionTime[run_start]
        elif status == "on track" and lane_entry_time is not None:
            # Back on track: close the pit-lane interval.
            pitting_duration.append(sessionTime[run_start] - lane_entry_time)
            lane_entry_time = None
        elif status == "in pit area":
            # The stop ends at the first sample after the run. If the data ends
            # while the car is still in the pit area, fall back to the last
            # available sample instead of indexing past the end.
            run_stop = min(run_start + run_length, last_index)
            pit_stop_duration.append(sessionTime[run_stop] - sessionTime[run_start])
        run_start += run_length

    return pit_stop_duration, pitting_duration

In [None]:
# Gather the pit-area and pit-lane durations of every pilot into two flat lists.
all_pilots = df["pilot_index"].unique()
pit_areas, pit_lanes = [], []
for pilot_id in all_pilots:
    pilot_rows = df.loc[df["pilot_index"] == pilot_id, ["sessionTime", "pilot_index", "pitStatus"]]
    area_times, lane_times = get_pit_durations(pilot_rows)
    pit_areas.extend(area_times)
    pit_lanes.extend(lane_times)

In [None]:
# Side-by-side box plots of the two duration samples.
fig, ax = plt.subplots(figsize=(20, 12))
ax.boxplot([pit_areas, pit_lanes])
ax.set_xticks([1, 2])
ax.set_xticklabels(['Time in pit area', 'Time in pit lane'])
ax.set_title("Time in pit")
plt.show()

Outliers are present when the front wing is broken and must be replaced.

In [None]:
import statistics

# Headline numbers: median of each sample and the quickest stop.
median_pit_area = statistics.median(pit_areas)
median_pit_lane = statistics.median(pit_lanes)
fastest_stop = min(pit_areas)

print(f"Median time in pit area : {median_pit_area:.3f}s")
print(f"Median time in pit lane : {median_pit_lane:.3f}s")

print(f"Fastest pit stop : {fastest_stop:.3f}s")

When all tracks are available, a more in-depth analysis will be done, such as:

- Team comparison for time in pit area 
- Track comparison in terms of pit lane 

# Extract time for the same portion without pit stop

Previously, we determined the time needed to go through the pit lane. To determine the time lost by a pit stop, we also need to know the time spent going from the pit entry to the pit exit while staying on the track. The easiest way would be to use the *ParticipantData* dataset, but this would be less accurate, as it means using the time of the in-lap + out-lap. Both laps can be faster or slower for several reasons.

The new version of the dataset contains the lap distance, so we can find a good approximation of the pit entry and exit based on lap distance, and then measure the time spent on that portion for all laps without a pit stop.

In [None]:
# Boolean flag: True for every frame that has a (non-null) pit status.
df['in_pit'] = df["pitStatus"].notnull()

In [None]:
# For every pilot, walk through the runs of consecutive in-pit / out-of-pit
# frames and record the lap distance at the first and last frame of each
# in-pit run.
all_pilots = df["pilot_index"].unique()
dists_entry = []
dists_exit = []
for pilot_id in all_pilots:
    # Read-only access: no need to copy the pilot's rows.
    sub_df = df[df["pilot_index"] == pilot_id]
    in_pit = sub_df['in_pit'].values
    lap_distance = sub_df['lapDistance'].values

    run_start = 0  # index of the first frame of the current run
    for pit_bool, seq in groupby(in_pit):
        run_stop = run_start + sum(1 for _ in seq)  # one past the last frame of the run
        if pit_bool:
            entry_distance = lap_distance[run_start]
            exit_distance = lap_distance[run_stop - 1]
            # entry is always at the end of a lap and a lap is more than 3km;
            # the exit must fall within the 300-1000 window of the next lap
            if entry_distance > 2000 and 300 < exit_distance < 1000:
                dists_entry.append(entry_distance)
                dists_exit.append(exit_distance)
        run_start = run_stop

In [None]:
# One box plot per panel: lap distance at pit entry (left) and at pit exit (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 12))

ax1.boxplot(dists_entry)
ax1.set_title('Distance before entry')

ax2.boxplot(dists_exit)
ax2.set_title('Distance for the exit')

plt.show()

Except for one data point, we can see that the spread of the pit entry / exit distance is very small (about 1 meter).

In [None]:
# Length covered between pit entry and pit exit: the remainder of the lap after
# the entry point plus the start of the next lap up to the exit point.
session_info = pd.read_csv("/kaggle/input/f1-2020-race-data/SessionData_3335673977098133433.csv")
lap_distance = session_info.iloc[0]["trackLength"]
median_entry = statistics.median(dists_entry)
median_exit = statistics.median(dists_exit)
pit_distance = lap_distance - median_entry + median_exit
print(f"The pit distance is {round(pit_distance, 2)}m")

In [None]:
# Keep only the (pilot, lap) groups in which no frame is flagged as in pit.
df_on_track = df.groupby(["pilot_index", "currentLapNum"]).filter(
    lambda lap: not lap["in_pit"].any()
)

In [None]:
# Keep only the columns needed to time each lap at a given lap distance.
sub_df = df_on_track[["pilot_index", "currentLapNum", "lapDistance", "currentLapTime"]]

In [None]:
# Preview the reduced on-track frame.
sub_df.head()

In [None]:
# Load the race time CSV; it is merged later on pilot_index / currentLapNum and provides LapTime.
lap_time = pd.read_csv("/kaggle/input/f1-2020-race-data/RaceTimeData_3335673977098133433.csv")

In [None]:
def _time_at_lap_distance(df, target_distance, tolerance=10):
    """Return ``currentLapTime`` at the sample closest to ``target_distance``.

    Returns ``None`` when ``df`` is empty or when the closest sample is
    ``tolerance`` or more away from the target (in ``lapDistance`` units),
    on either side.
    """
    x = df["lapDistance"].values
    t = df["currentLapTime"].values
    if x.size == 0:
        return None
    gap = np.abs(x - target_distance)
    idx_min = np.argmin(gap)
    # when I finish the lap, AI behind are directly stopped so we may not find
    # the real min: the closest sample can then be far *before* the target,
    # hence the check on the absolute gap rather than on the signed difference
    if gap[idx_min] < tolerance:
        return t[idx_min]
    return None


def get_time_eq_entry_pit(df):
    """Lap time at which the car passes the median pit-entry distance.

    ``df`` holds the samples of one lap (``lapDistance``, ``currentLapTime``).
    Returns ``None`` when the lap has no sample close enough to that distance.
    """
    return _time_at_lap_distance(df, statistics.median(dists_entry))


def get_time_eq_exit_pit(df):
    """Lap time at which the car passes the median pit-exit distance.

    ``df`` holds the samples of one lap (``lapDistance``, ``currentLapTime``).
    Returns ``None`` when the lap has no sample close enough to that distance.
    """
    return _time_at_lap_distance(df, statistics.median(dists_exit))

In [None]:
# One value per (pilot, lap): lap time at the pit-entry and at the pit-exit distance.
# NOTE: the name `exit` shadows the builtin; it is kept because the merge cell reads it.
laps_on_track = sub_df.groupby(["pilot_index", "currentLapNum"])
entry = laps_on_track.apply(get_time_eq_entry_pit).rename("entryTime").reset_index()
exit = laps_on_track.apply(get_time_eq_exit_pit).rename("exitTime").reset_index()

In [None]:
# Attach the official lap times and the exit times to the entry times,
# matching rows on (pilot, lap).
lap_keys = ["pilot_index", "currentLapNum"]
time_info = entry.merge(lap_time, how='left', on=lap_keys)
time_info = time_info.merge(exit, how='left', on=lap_keys)

In [None]:
# On-track time over the pit-lane portion: from the entry point to the end of
# lap i (LapTime - entryTime), plus from the start of lap i+1 to the exit point
# (exitTime of lap i+1), because we enter the pit in lap i and exit it in lap i+1.
# The shift is done per pilot so a row never picks up another pilot's lap, and
# the value is only used when the following row really is lap i+1 (laps with a
# pit stop were filtered out, so rows are not always consecutive laps).
next_lap = time_info.groupby("pilot_index")[["currentLapNum", "exitTime"]].shift(-1)
is_next_lap = next_lap["currentLapNum"] == time_info["currentLapNum"].astype("int64") + 1
time_info["pit_duration_in_track"] = (
    time_info["LapTime"] - time_info["entryTime"] + next_lap["exitTime"].where(is_next_lap)
)

In [None]:
# Box plot of the on-track time over the pit-lane portion, all pilots and laps together.
time_info["pit_duration_in_track"].plot(kind="box")
plt.show()

The time is of course highly dependent on the team, but also, for most tracks, on the DRS. The pit portion is often covered by a DRS zone, and DRS is disabled in the first 3 laps. The mean is a good estimate (after removing the outliers below 4 s).

In [None]:
# Mean on-track duration, keeping only the values above 4 (the comparison also drops NaN).
all_durations = time_info["pit_duration_in_track"].values
t = all_durations[all_durations > 4]
mean_duration = round(t.mean())
print(f"Out of the lap, the time to go the pit lane is {mean_duration}s")

# Now we are able to find, for each track, all the information we need. A summary will be done when all tracks are available :)