# This notebook is intended for EDA (with Points as the outcome variable)

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import seaborn as sns
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import FactorAnalysis
from factor_analyzer import FactorAnalyzer

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier 

from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import TruncatedSVD

import plotly.express as px
import plotly.graph_objs as go
import statsmodels.formula.api as smf
from plotly.graph_objects import Layout

In [2]:
# Load the dataset that every later cell works on.
DATA_FILE = "plotdata_points.csv"
df = pd.read_csv(DATA_FILE)

In [10]:
# Remove the "Unnamed: 0" column (presumably the row index written out when
# the CSV was saved — TODO confirm). errors="ignore" keeps this cell safe to
# re-run once the column is already gone instead of raising KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [11]:
# Recode a GridPosition of 0 as 20.
# NOTE(review): presumably 0 marks a start without a regular grid slot and 20
# is the back of the grid — confirm against the data source.
df["GridPosition"] = df["GridPosition"].mask(df["GridPosition"] == 0, 20)

In [12]:
# List the column names to see which variables are available.
df.columns

Index(['Abbreviation', 'TeamName', 'GridPosition', 'Points', 'RaceCountry',
       'Year', 'AgeAtGP', 'BestQualiTime', 'FLap', 'AvgLapTime', 'SDLapTime',
       'AvgSplitTime', 'AvgPitTime', 'PitstopNo', 'HARD', 'INTERMEDIATE',
       'MEDIUM', 'SOFT', 'WET', 'Engine', 'Rain', 'AverageSpeed', 'MaxSpeed',
       'AverageRPM', 'MaxRPM', 'AverageThrottle', 'MaxThrottlePct', 'Brake',
       'raceID', 'CircuitType', 'carIssue', 'driverIssue', 'after_2020'],
      dtype='object')

In [13]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

Unnamed: 0,GridPosition,Points,Year,AgeAtGP,BestQualiTime,FLap,AvgLapTime,SDLapTime,AvgSplitTime,AvgPitTime,...,MaxSpeed,AverageRPM,MaxRPM,AverageThrottle,MaxThrottlePct,Brake,raceID,carIssue,driverIssue,after_2020
count,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,...,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0
mean,10.515837,5.213316,2020.539754,27.425339,84.480341,0.051713,93.111168,7.609556,3.363785,99.787053,...,329.505495,6615.800697,12771.70459,47.755554,13.210693,22.347367,40.318035,0.111829,0.0181,0.53329
std,5.791235,7.236425,1.154413,5.63119,12.498625,0.221519,12.188857,4.38139,1.533585,238.448638,...,14.612562,1467.822931,302.875962,9.343254,14.885145,11.288978,23.190107,0.315258,0.133355,0.499052
min,1.0,0.0,2019.0,19.0,53.377,0.0,62.46896,0.143536,0.085312,0.0,...,274.0,560.874158,11884.0,14.53716,0.002291,0.0,1.0,0.0,0.0,0.0
25%,5.0,0.0,2019.0,23.0,76.178,0.0,84.38544,3.815307,2.336711,22.227834,...,323.0,6382.366028,12566.0,42.761592,1.574367,15.409331,20.0,0.0,0.0,0.0
50%,10.0,1.0,2021.0,26.0,83.005,0.0,92.534571,6.759234,3.223513,24.1485,...,331.0,6851.240848,12763.0,47.417343,8.794342,19.25638,40.0,0.0,0.0,1.0
75%,16.0,10.0,2022.0,32.0,92.354,0.0,101.897754,10.746647,4.209282,29.947,...,338.0,7586.920174,12974.5,51.82219,18.717936,25.440253,60.0,0.0,0.0,1.0
max,20.0,25.0,2022.0,42.0,141.611,1.0,137.035,23.708907,13.0312,3055.732,...,370.0,10551.3054,13897.0,95.539949,87.616601,89.233473,80.0,1.0,1.0,1.0


In [14]:
# Mean MaxSpeed for each value of the after_2020 flag.
(
    df
    .groupby("after_2020")["MaxSpeed"]
    .mean()
)

after_2020
0    331.909972
1    327.401212
Name: MaxSpeed, dtype: float64

In [15]:
# Mean BestQualiTime for each value of the after_2020 flag.
(
    df
    .groupby("after_2020")["BestQualiTime"]
    .mean()
)

after_2020
0    84.414349
1    84.538095
Name: BestQualiTime, dtype: float64

In [16]:
# Mean BestQualiTime per Year.
(
    df
    .groupby("Year")["BestQualiTime"]
    .mean()
)

Year
2019.0    84.880014
2020.0    83.788422
2021.0    82.228406
2022.0    86.722493
Name: BestQualiTime, dtype: float64

In [17]:
# Mean MaxSpeed per Year.
(
    df
    .groupby("Year")["MaxSpeed"]
    .mean()
)

Year
2019.0    331.971014
2020.0    331.827922
2021.0    329.822943
2022.0    325.110849
Name: MaxSpeed, dtype: float64

In [18]:
# Mean AverageSpeed for each value of the after_2020 flag.
(
    df
    .groupby("after_2020")["AverageSpeed"]
    .mean()
)

after_2020
0    135.182792
1    114.990237
Name: AverageSpeed, dtype: float64

In [19]:
# Mean AverageSpeed per Year.
(
    df
    .groupby("Year")["AverageSpeed"]
    .mean()
)

Year
2019.0    137.822057
2020.0    131.635209
2021.0    121.285631
2022.0    109.036339
Name: AverageSpeed, dtype: float64

In [20]:
# Mean AgeAtGP for each value of the after_2020 flag.
(
    df
    .groupby("after_2020")["AgeAtGP"]
    .mean()
)

after_2020
0    27.123269
1    27.689697
Name: AgeAtGP, dtype: float64

In [21]:
# Mean AgeAtGP per Year.
(
    df
    .groupby("Year")["AgeAtGP"]
    .mean()
)

Year
2019.0    27.060386
2020.0    27.207792
2021.0    27.638404
2022.0    27.738208
Name: AgeAtGP, dtype: float64

In [22]:
# Mean AvgPitTime for each value of the after_2020 flag.
(
    df
    .groupby("after_2020")["AvgPitTime"]
    .mean()
)

after_2020
0     61.158738
1    133.592681
Name: AvgPitTime, dtype: float64

In [23]:
# Mean AvgPitTime per Year.
(
    df
    .groupby("Year")["AvgPitTime"]
    .mean()
)

Year
2019.0     24.563519
2020.0    110.348415
2021.0    170.911820
2022.0     98.297930
Name: AvgPitTime, dtype: float64

In [24]:
# Mean PitstopNo for each value of the after_2020 flag.
(
    df
    .groupby("after_2020")["PitstopNo"]
    .mean()
)

after_2020
0    1.685596
1    1.932121
Name: PitstopNo, dtype: float64

In [25]:
# Mean PitstopNo per Year.
(
    df
    .groupby("Year")["PitstopNo"]
    .mean()
)

Year
2019.0    1.562802
2020.0    1.850649
2021.0    1.972569
2022.0    1.893868
Name: PitstopNo, dtype: float64

In [26]:
# Count of each carIssue value within each after_2020 group.
(
    df
    .groupby("after_2020")["carIssue"]
    .value_counts()
)

after_2020  carIssue
0           0           637
            1            85
1           0           737
            1            88
Name: carIssue, dtype: int64

In [27]:
# Count of each carIssue value within each Year.
(
    df
    .groupby("Year")["carIssue"]
    .value_counts()
)

Year    carIssue
2019.0  0           365
        1            49
2020.0  0           272
        1            36
2021.0  0           367
        1            34
2022.0  0           370
        1            54
Name: carIssue, dtype: int64

In [28]:
# Count of each driverIssue value within each after_2020 group.
(
    df
    .groupby("after_2020")["driverIssue"]
    .value_counts()
)

after_2020  driverIssue
0           0              708
            1               14
1           0              811
            1               14
Name: driverIssue, dtype: int64

In [29]:
# Count of each driverIssue value within each Year.
(
    df
    .groupby("Year")["driverIssue"]
    .value_counts()
)

Year    driverIssue
2019.0  0              407
        1                7
2020.0  0              301
        1                7
2021.0  0              393
        1                8
2022.0  0              418
        1                6
Name: driverIssue, dtype: int64

In [30]:
# Count of each driverIssue value for rows with and without Rain.
(
    df
    .groupby("Rain")["driverIssue"]
    .value_counts()
)

Rain   driverIssue
False  0              1465
       1                25
True   0                54
       1                 3
Name: driverIssue, dtype: int64

In [31]:
# Count of each carIssue value for rows with and without Rain.
(
    df
    .groupby("Rain")["carIssue"]
    .value_counts()
)

Rain   carIssue
False  0           1321
       1            169
True   0             53
       1              4
Name: carIssue, dtype: int64

In [32]:
# Rows whose Points value is 25, 18 or 15; the plots further down use this
# subset as the podium finishes.
# NOTE(review): podiums are identified purely by points value — confirm that
# no race in the data awards a different points total to the top three.
# .copy() makes p1 an independent frame, so changing it later does not raise
# pandas' SettingWithCopyWarning.
p1 = df[df["Points"].isin([25, 18, 15])].copy()
p1

Unnamed: 0,Abbreviation,TeamName,GridPosition,Points,RaceCountry,Year,AgeAtGP,BestQualiTime,FLap,AvgLapTime,...,AverageRPM,MaxRPM,AverageThrottle,MaxThrottlePct,Brake,raceID,CircuitType,carIssue,driverIssue,after_2020
9,VER,RedBullRacing,4.0,15.0,Australia,2019.0,21.0,81.320,0.0,88.658842,...,7722.791853,12461,50.394143,1.092949,13.533761,1,street,0,0,0
11,HAM,Mercedes,1.0,18.0,Australia,2019.0,33.0,80.486,0.0,88.654807,...,7700.247973,12811,51.783875,35.571123,11.876500,1,street,0,0,0
16,BOT,Mercedes,2.0,25.0,Australia,2019.0,29.0,80.598,1.0,88.307228,...,7738.053480,12428,51.196796,39.608873,11.091652,1,street,0,0,0
22,LEC,Ferrari,1.0,15.0,Bahrain,2019.0,21.0,87.866,1.0,99.427857,...,7500.414003,12641,54.121875,4.926349,20.056842,2,race,0,0,0
31,HAM,Mercedes,3.0,25.0,Bahrain,2019.0,33.0,88.190,0.0,98.348000,...,7501.931636,12843,48.209553,0.021392,15.136605,2,race,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1523,SAI,Ferrari,7.0,15.0,Brazil,2022.0,28.0,70.890,0.0,83.128571,...,6592.937587,12597,50.099952,12.594646,29.303082,79,race,0,0,1
1525,RUS,Mercedes,1.0,25.0,Brazil,2022.0,23.0,71.318,1.0,83.220985,...,6283.240829,12477,45.933721,10.213821,21.742028,79,race,0,0,1
1527,VER,RedBullRacing,1.0,25.0,AbuDhabi,2022.0,25.0,83.824,0.0,90.766947,...,6295.592029,12652,38.604095,0.852338,11.941373,80,race,0,0,1
1529,PER,RedBullRacing,2.0,15.0,AbuDhabi,2022.0,32.0,84.052,0.0,90.925543,...,6231.635424,12629,38.557936,0.279314,10.271251,80,race,0,0,1


In [33]:
# Keep only the rows flagged with a car issue (carIssue equal to 1).
has_car_issue = df["carIssue"] == 1
df1 = df[has_car_issue]
df1

Unnamed: 0,Abbreviation,TeamName,GridPosition,Points,RaceCountry,Year,AgeAtGP,BestQualiTime,FLap,AvgLapTime,...,AverageRPM,MaxRPM,AverageThrottle,MaxThrottlePct,Brake,raceID,CircuitType,carIssue,driverIssue,after_2020
8,RIC,Renault,12.0,0.0,Australia,2019.0,30.0,82.570,0.0,90.781814,...,4066.566842,12138,61.945190,34.844652,40.909386,1,street,1,0,0
13,SAI,McLaren,18.0,0.0,Australia,2019.0,25.0,83.084,0.0,91.466250,...,1561.997178,12079,71.075631,59.440877,63.193228,1,street,1,0,0
17,GRO,HaasF1Team,6.0,0.0,Australia,2019.0,32.0,81.826,0.0,91.456642,...,4204.766102,12715,61.395018,34.413310,42.427191,1,street,1,0,0
27,HUL,Renault,17.0,0.0,Bahrain,2019.0,31.0,90.034,0.0,98.449173,...,6953.410458,12386,51.185044,6.286291,20.564146,2,race,1,0,0
28,RIC,Renault,10.0,0.0,Bahrain,2019.0,30.0,89.488,0.0,98.710519,...,7031.573865,12481,50.352851,6.188497,19.833751,2,race,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1496,TSU,AlphaTauri,13.0,0.0,Mexico,2022.0,22.0,79.589,0.0,85.476918,...,4931.303959,12835,44.230054,17.758517,28.635096,78,race,1,0,1
1519,NOR,McLaren,6.0,0.0,Brazil,2022.0,23.0,71.377,0.0,82.431551,...,4658.892796,12675,56.223686,30.084249,41.143223,79,race,1,0,1
1530,ALO,Alpine,10.0,0.0,AbuDhabi,2022.0,41.0,85.096,0.0,92.898538,...,3166.917358,12585,51.827200,31.473163,38.018314,80,race,1,0,1
1540,HAM,Mercedes,5.0,0.0,AbuDhabi,2022.0,37.0,84.508,0.0,91.891166,...,6059.763793,13077,51.894322,12.338747,22.992974,80,race,1,0,1


In [34]:
# Number of car-issue rows per race country (df1 holds only carIssue == 1).
# plotly.express is already imported as px in the first cell.
agg_data = df1.groupby(['RaceCountry', 'carIssue']).size().reset_index(name='Count')
fig = px.histogram(
    agg_data,
    x="RaceCountry",
    y="Count",
    barmode="group",
    height=400,
    title="Car issues per race country",
)
fig.show()

In [35]:
# Keep only the rows flagged with a driver issue (driverIssue equal to 1).
has_driver_issue = df["driverIssue"] == 1
df2 = df[has_driver_issue]
df2

Unnamed: 0,Abbreviation,TeamName,GridPosition,Points,RaceCountry,Year,AgeAtGP,BestQualiTime,FLap,AvgLapTime,...,AverageRPM,MaxRPM,AverageThrottle,MaxThrottlePct,Brake,raceID,CircuitType,carIssue,driverIssue,after_2020
37,GRO,HaasF1Team,11.0,0.0,Bahrain,2019.0,32.0,89.015,0.0,100.586533,...,2395.289499,12651,67.95309,52.505959,57.860155,2,race,0,1,0
156,GRO,HaasF1Team,16.0,0.0,France,2019.0,33.0,91.626,0.0,98.359093,...,6652.291065,12878,49.03374,8.934855,21.694235,8,race,0,1,0
200,LEC,Ferrari,10.0,0.0,Germany,2019.0,21.0,72.229,0.0,95.925846,...,3638.888775,12969,64.652483,43.71701,54.152752,11,race,0,1,0
205,HUL,Renault,9.0,0.0,Germany,2019.0,31.0,72.766,0.0,101.109947,...,4920.110713,12483,54.658946,28.339438,39.919746,11,race,0,1,0
214,BOT,Mercedes,3.0,0.0,Germany,2019.0,29.0,72.129,0.0,98.842418,...,6571.997038,13282,43.217683,9.441991,24.719918,11,race,0,1,0
255,GIO,AlfaRomeoRacing,18.0,0.0,Belgium,2019.0,25.0,105.637,0.0,111.166315,...,7400.322208,12976,56.278213,8.235813,20.196058,13,race,0,1,0
344,NOR,McLaren,8.0,0.0,Mexico,2019.0,19.0,76.322,0.0,83.93,...,5260.725418,12598,51.078627,22.271436,34.844366,18,race,0,1,0
457,KVY,AlphaTauri,19.0,0.0,UK,2020.0,26.0,86.744,0.0,108.201666,...,2008.8082,12956,70.206629,56.91486,60.442978,24,race,0,1,0
474,MAG,HaasF1Team,17.0,0.0,UK,2020.0,28.0,88.236,0.0,95.081095,...,6461.174394,12526,51.639725,11.294602,22.561274,25,race,0,1,0
529,GIO,AlfaRomeoRacing,18.0,0.0,Belgium,2020.0,26.0,103.95,0.0,112.675625,...,1889.17973,12701,67.51212,53.838982,57.484851,27,race,0,1,0


In [36]:
# Number of driver-issue rows per race country (df2 holds only driverIssue == 1).
# plotly.express is already imported as px in the first cell.
agg_data = df2.groupby(['RaceCountry', 'driverIssue']).size().reset_index(name='Count')
fig = px.histogram(
    agg_data,
    x="RaceCountry",
    y="Count",
    barmode="group",
    height=400,
    title="Driver issues per race country",
)
fig.show()

## Number of podiums before and after 2020 (split by the `after_2020` flag)

In [37]:
# Podium rows per team, split by the after_2020 flag.
# This cell must run before the cell that renames after_2020 to "after" in p1.
# plotly.express is already imported as px in the first cell.
agg_data = p1.groupby(['TeamName', 'Points', 'after_2020']).size().reset_index(name='Count')
fig = px.histogram(
    agg_data,
    x="TeamName",
    y="Count",
    color="after_2020",
    barmode="group",
    height=400,
    title="Podium finishes per team, by after_2020",
)
fig.show()

In [38]:
# Shorten the flag's column name for the plots that follow. Rebinding p1 to
# the renamed frame (instead of inplace=True on a filtered slice of df) avoids
# pandas' SettingWithCopyWarning.
p1 = p1.rename(columns={"after_2020": "after"})



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [39]:
# Podium rows per team, split by the renamed "after" flag, with a fixed
# legend order and palette. plotly.express is already imported as px in the
# first cell.
agg_data = p1.groupby(['TeamName', 'Points', 'after']).size().reset_index(name='Count')

# Legend order for the flag and the colour for each value, in the same order.
# `colors` is reused by a later plotting cell.
category_order = [0, 1]
colors = ['#171C54', '#93B8D6']  # colours for after == 0 and after == 1

fig = px.histogram(
    agg_data,
    x="TeamName",
    y="Count",
    color="after",
    barmode="group",
    height=400,
    category_orders={"after": category_order},
    color_discrete_sequence=colors,
    title="Podium finishes per team, by after flag",
)

fig.show()


In [40]:
# Podium rows per driver, split by the "after" flag.
# category_orders pins the flag order so each value gets the same colour as in
# the per-team plot; without it colours follow the order in which the values
# first appear in agg_data. `category_order` and `colors` come from the
# per-team plotting cell; px is imported in the first cell.
agg_data = p1.groupby(['Abbreviation', 'Points', 'after']).size().reset_index(name='Count')
fig = px.histogram(
    agg_data,
    x="Abbreviation",
    y="Count",
    color="after",
    barmode="group",
    height=400,
    category_orders={"after": category_order},
    color_discrete_sequence=colors,
    title="Podium finishes per driver, by after flag",
)
fig.show()

In [42]:
# Points against GridPosition with an OLS trend line.
fig = px.scatter(
    df,
    x="GridPosition",
    y="Points",
    trendline="ols",
    title="Points vs. GridPosition",
)
fig.show()

In [43]:
# Points against AvgPitTime with an OLS trend line.
fig = px.scatter(
    df,
    x="AvgPitTime",
    y="Points",
    trendline="ols",
    title="Points vs. AvgPitTime",
)
fig.show()

In [44]:
# Points against BestQualiTime with an OLS trend line.
fig = px.scatter(
    df,
    x="BestQualiTime",
    y="Points",
    trendline="ols",
    title="Points vs. BestQualiTime",
)
fig.show()

In [46]:
# Count rows for every combination of after_2020, carIssue and driverIssue.
df_grouped = df.groupby(['after_2020', 'carIssue', 'driverIssue']).size().reset_index(name='count')

In [47]:
# Stacked bar chart of row counts per after_2020 value, coloured by carIssue.
# carIssue is cast to str so Plotly Express treats it as a discrete category
# (one legend entry per value) instead of mapping the integers onto a
# continuous colour scale.
fig = px.bar(
    df_grouped.assign(carIssue=df_grouped["carIssue"].astype(str)),
    x="after_2020",
    y="count",
    color="carIssue",
    barmode="stack",
)

fig.show()

In [48]:
# Display the grouped counts that feed the stacked bar chart.
df_grouped 

Unnamed: 0,after_2020,carIssue,driverIssue,count
0,0,0,0,623
1,0,0,1,14
2,0,1,0,85
3,1,0,0,723
4,1,0,1,14
5,1,1,0,88


## Box plot of AgeAtGP to show the distribution of ages for drivers.

In [49]:
# Box plot of AgeAtGP over all rows.
fig = px.box(data_frame=df, y="AgeAtGP")
fig.show()

In [85]:
# Box plot of AgeAtGP, one box per value of the after_2020 flag.
fig = px.box(data_frame=df, x="after_2020", y="AgeAtGP")
fig.show()

In [50]:
# AverageSpeed against MaxSpeed with an OLS trend line.
fig = px.scatter(
    df,
    x="MaxSpeed",
    y="AverageSpeed",
    trendline="ols",
    title="AverageSpeed vs. MaxSpeed",
)
fig.show()

In [51]:
# Box plot of BestQualiTime over all rows.
fig = px.box(data_frame=df, y="BestQualiTime")
fig.show()

In [54]:
# Podium rows per engine, split by the "after" flag.
# Uses the same legend order and palette as the per-team and per-driver podium
# plots so the flag colours match across figures. `category_order` and
# `colors` come from the per-team plotting cell; px is imported in the first
# cell.
agg_data = p1.groupby(['Engine', 'Points', 'after']).size().reset_index(name='Count')
fig = px.histogram(
    agg_data,
    x="Engine",
    y="Count",
    color="after",
    barmode="group",
    height=400,
    category_orders={"after": category_order},
    color_discrete_sequence=colors,
    title="Podium finishes per engine, by after flag",
)
fig.show()