# This notebook performs exploratory data analysis (EDA)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.utils import resample
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
import seaborn as sns
import numpy as np
import pickle
from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import FactorAnalysis
from factor_analyzer import FactorAnalyzer

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score

%matplotlib inline

from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
from lightgbm import LGBMClassifier 

from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import TruncatedSVD

import plotly.express as px
import plotly.graph_objs as go
import statsmodels.formula.api as smf
from plotly.graph_objects import Layout

In [26]:
# Load the dataset from a CSV file located relative to the working directory.
df = pd.read_csv(filepath_or_buffer="plotdata_position.csv")

In [27]:
# Recode a GridPosition of 0 as 20.
# NOTE(review): presumably 0 marks a start outside the regular grid (e.g. pit lane) — confirm.
df = df.assign(GridPosition=df['GridPosition'].replace(0, 20))

In [28]:
# Discard the "Unnamed: 0" column read from the CSV.
df = df.drop(columns=["Unnamed: 0"])

In [29]:
# List the column labels that remain after dropping "Unnamed: 0".
df.columns

Index(['Abbreviation', 'TeamName', 'Position', 'GridPosition', 'RaceCountry',
       'Year', 'AgeAtGP', 'BestQualiTime', 'FLap', 'SDLapTime', 'AvgPitTime',
       'PitstopNo', 'HARD', 'INTERMEDIATE', 'MEDIUM', 'SOFT', 'WET', 'Engine',
       'Rain', 'AverageSpeed', 'MaxSpeed', 'MaxRPM', 'AverageThrottle',
       'MaxThrottlePct', 'Brake', 'raceID', 'CircuitType', 'carIssue',
       'driverIssue', 'after_2020'],
      dtype='object')

In [30]:
from scipy.stats import pearsonr

# Split the rows on the after_2020 flag (1 -> a, 0 -> b).
a = df[df["after_2020"] == 1]
b = df[df["after_2020"] == 0]

# Pearson's correlation between the Position and GridPosition columns,
# first over all rows and then over each subset; one line is printed per frame.
for frame, template in (
    (df, 'Pearsons correlation general: %.3f'),
    (b, 'Pearsons correlation before: %.3f'),
    (a, 'Pearsons correlation after: %.3f'),
):
    corr, _ = pearsonr(frame["Position"], frame["GridPosition"])
    print(template % corr)


Pearsons correlation general: 0.628
Pearsons correlation before: 0.645
Pearsons correlation after: 0.613


In [31]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Position,GridPosition,Year,AgeAtGP,BestQualiTime,FLap,SDLapTime,AvgPitTime,PitstopNo,HARD,...,AverageSpeed,MaxSpeed,MaxRPM,AverageThrottle,MaxThrottlePct,Brake,raceID,carIssue,driverIssue,after_2020
count,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,...,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0,1547.0
mean,10.236587,10.515837,2020.539754,27.425339,84.480341,0.051713,7.609556,99.787053,1.817065,21.106012,...,124.4143,329.505495,12771.70459,47.755554,13.210693,22.347367,40.318035,0.111829,0.0181,0.53329
std,5.65213,5.791235,1.154413,5.63119,12.498625,0.221519,4.38139,238.448638,0.993911,18.519117,...,31.03673,14.612562,302.875962,9.343254,14.885145,11.288978,23.190107,0.315258,0.133355,0.499052
min,1.0,1.0,2019.0,19.0,53.377,0.0,0.143536,0.0,0.0,0.0,...,8.392944,274.0,11884.0,14.53716,0.002291,0.0,1.0,0.0,0.0,0.0
25%,5.0,5.0,2019.0,23.0,76.178,0.0,3.815307,22.227834,1.0,0.0,...,116.188451,323.0,12566.0,42.761592,1.574367,15.409331,20.0,0.0,0.0,0.0
50%,10.0,10.0,2021.0,26.0,83.005,0.0,6.759234,24.1485,2.0,24.0,...,128.946742,331.0,12763.0,47.417343,8.794342,19.25638,40.0,0.0,0.0,1.0
75%,15.0,16.0,2022.0,32.0,92.354,0.0,10.746647,29.947,2.0,36.0,...,144.945874,338.0,12974.5,51.82219,18.717936,25.440253,60.0,0.0,0.0,1.0
max,20.0,20.0,2022.0,42.0,141.611,1.0,23.708907,3055.732,6.0,70.0,...,190.53567,370.0,13897.0,95.539949,87.616601,89.233473,80.0,1.0,1.0,1.0


In [32]:
# Export the summary statistics so the file content matches its name.
# Previously the full raw frame was written to 'summarystats.csv'.
# NOTE(review): confirm nothing downstream expects the raw rows in this file.
df.describe().to_csv(r'summarystats.csv', index=True, header=True)

In [33]:
# Mean MaxSpeed for each value of the after_2020 flag.
df.groupby(by='after_2020')['MaxSpeed'].agg('mean')

after_2020
0    331.909972
1    327.401212
Name: MaxSpeed, dtype: float64

In [34]:
# Mean BestQualiTime for each value of the after_2020 flag.
df.groupby(by='after_2020')['BestQualiTime'].agg('mean')

after_2020
0    84.414349
1    84.538095
Name: BestQualiTime, dtype: float64

In [35]:
# Mean BestQualiTime per Year.
df.groupby(by='Year')['BestQualiTime'].agg('mean')

Year
2019.0    84.880014
2020.0    83.788422
2021.0    82.228406
2022.0    86.722493
Name: BestQualiTime, dtype: float64

In [40]:
# Mean MaxSpeed per Year.
df.groupby(by='Year')['MaxSpeed'].agg('mean')

Year
2019.0    331.971014
2020.0    331.827922
2021.0    329.822943
2022.0    325.110849
Name: MaxSpeed, dtype: float64

In [41]:
# Mean AverageSpeed for each value of the after_2020 flag.
df.groupby(by='after_2020')['AverageSpeed'].agg('mean')

after_2020
0    135.182792
1    114.990237
Name: AverageSpeed, dtype: float64

In [42]:
# Mean AverageSpeed per Year.
df.groupby(by='Year')['AverageSpeed'].agg('mean')

Year
2019.0    137.822057
2020.0    131.635209
2021.0    121.285631
2022.0    109.036339
Name: AverageSpeed, dtype: float64

In [43]:
# Mean AvgPitTime per Year.
# NOTE(review): the describe() output shows a max of 3055.732 against a 75th
# percentile of 29.947, so these means are pulled up by a few extreme values.
df.groupby(by='Year')['AvgPitTime'].agg('mean')

Year
2019.0     24.563519
2020.0    110.348415
2021.0    170.911820
2022.0     98.297930
Name: AvgPitTime, dtype: float64

In [44]:
# Mean AgeAtGP for each value of the after_2020 flag.
df.groupby(by='after_2020')['AgeAtGP'].agg('mean')

after_2020
0    27.123269
1    27.689697
Name: AgeAtGP, dtype: float64

In [45]:
# Mean AgeAtGP per Year.
df.groupby(by='Year')['AgeAtGP'].agg('mean')

Year
2019.0    27.060386
2020.0    27.207792
2021.0    27.638404
2022.0    27.738208
Name: AgeAtGP, dtype: float64

In [46]:
# Mean AvgPitTime for each value of the after_2020 flag.
# NOTE(review): AvgPitTime has extreme values (max 3055.732 in the describe()
# output), so the mean is sensitive to a handful of rows.
df.groupby(by='after_2020')['AvgPitTime'].agg('mean')

after_2020
0     61.158738
1    133.592681
Name: AvgPitTime, dtype: float64

In [47]:
# Mean AvgPitTime per Year.
# NOTE(review): this repeats the AvgPitTime-by-Year aggregation from an earlier cell.
df.groupby(by='Year')['AvgPitTime'].agg('mean')

Year
2019.0     24.563519
2020.0    110.348415
2021.0    170.911820
2022.0     98.297930
Name: AvgPitTime, dtype: float64

In [48]:
# Mean PitstopNo for each value of the after_2020 flag.
df.groupby(by='after_2020')['PitstopNo'].agg('mean')

after_2020
0    1.685596
1    1.932121
Name: PitstopNo, dtype: float64

In [49]:
# Mean PitstopNo per Year.
df.groupby(by='Year')['PitstopNo'].agg('mean')

Year
2019.0    1.562802
2020.0    1.850649
2021.0    1.972569
2022.0    1.893868
Name: PitstopNo, dtype: float64

In [50]:
# Mean of the HARD column for each value of the after_2020 flag.
df.groupby(by='after_2020')['HARD'].agg('mean')

after_2020
0    19.189751
1    22.783030
Name: HARD, dtype: float64

In [51]:
# Mean of the SOFT column for each value of the after_2020 flag.
df.groupby(by='after_2020')['SOFT'].agg('mean')

after_2020
0    13.013850
1     8.488485
Name: SOFT, dtype: float64

In [52]:
# Mean of the INTERMEDIATE column for each value of the after_2020 flag.
df.groupby(by='after_2020')['INTERMEDIATE'].agg('mean')

after_2020
0    2.286704
1    3.842424
Name: INTERMEDIATE, dtype: float64

In [53]:
# Number of rows per carIssue value within each after_2020 group.
df.groupby('after_2020')['carIssue'].value_counts()

after_2020  carIssue
0           0           637
            1            85
1           0           737
            1            88
Name: carIssue, dtype: int64

In [54]:
# Number of rows per carIssue value within each Year.
df.groupby('Year')['carIssue'].value_counts()

Year    carIssue
2019.0  0           365
        1            49
2020.0  0           272
        1            36
2021.0  0           367
        1            34
2022.0  0           370
        1            54
Name: carIssue, dtype: int64

In [55]:
# Number of rows per driverIssue value within each after_2020 group.
df.groupby('after_2020')['driverIssue'].value_counts()

after_2020  driverIssue
0           0              708
            1               14
1           0              811
            1               14
Name: driverIssue, dtype: int64

In [56]:
# Number of rows per driverIssue value within each Year.
df.groupby('Year')['driverIssue'].value_counts()

Year    driverIssue
2019.0  0              407
        1                7
2020.0  0              301
        1                7
2021.0  0              393
        1                8
2022.0  0              418
        1                6
Name: driverIssue, dtype: int64

In [57]:
# Number of rows per driverIssue value, split by the Rain flag.
df.groupby('Rain')['driverIssue'].value_counts()

Rain   driverIssue
False  0              1465
       1                25
True   0                54
       1                 3
Name: driverIssue, dtype: int64

In [58]:
# Number of rows per carIssue value, split by the Rain flag.
df.groupby('Rain')['carIssue'].value_counts()

Rain   carIssue
False  0           1321
       1            169
True   0             53
       1              4
Name: carIssue, dtype: int64

In [59]:
# Keep only the rows whose Position is 1, 2 or 3, then display the selection.
p1 = df.loc[df["Position"].isin([1, 2, 3])]
p1

Unnamed: 0,Abbreviation,TeamName,Position,GridPosition,RaceCountry,Year,AgeAtGP,BestQualiTime,FLap,SDLapTime,...,MaxSpeed,MaxRPM,AverageThrottle,MaxThrottlePct,Brake,raceID,CircuitType,carIssue,driverIssue,after_2020
9,VER,RedBullRacing,3.0,4.0,Australia,2019.0,21.0,81.320,0.0,2.614185,...,325,12461,50.394143,1.092949,13.533761,1,street,0,0,0
11,HAM,Mercedes,2.0,1.0,Australia,2019.0,33.0,80.486,0.0,2.666781,...,314,12811,51.783875,35.571123,11.876500,1,street,0,0,0
16,BOT,Mercedes,1.0,2.0,Australia,2019.0,29.0,80.598,1.0,2.628517,...,314,12428,51.196796,39.608873,11.091652,1,street,0,0,0
22,LEC,Ferrari,3.0,1.0,Bahrain,2019.0,21.0,87.866,1.0,8.081631,...,333,12641,54.121875,4.926349,20.056842,2,race,0,0,0
31,HAM,Mercedes,1.0,3.0,Bahrain,2019.0,33.0,88.190,0.0,7.211253,...,331,12843,48.209553,0.021392,15.136605,2,race,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1523,SAI,Ferrari,3.0,7.0,Brazil,2022.0,28.0,70.890,0.0,15.207823,...,330,12597,50.099952,12.594646,29.303082,79,race,0,0,1
1525,RUS,Mercedes,1.0,1.0,Brazil,2022.0,23.0,71.318,1.0,15.845356,...,324,12477,45.933721,10.213821,21.742028,79,race,0,0,1
1527,VER,RedBullRacing,1.0,1.0,AbuDhabi,2022.0,25.0,83.824,0.0,2.847126,...,323,12652,38.604095,0.852338,11.941373,80,race,0,0,1
1529,PER,RedBullRacing,3.0,2.0,AbuDhabi,2022.0,32.0,84.052,0.0,3.964186,...,335,12629,38.557936,0.279314,10.271251,80,race,0,0,1


In [60]:
# Rows flagged with carIssue == 1, displayed for inspection.
df1 = df.loc[df["carIssue"] == 1]
df1

Unnamed: 0,Abbreviation,TeamName,Position,GridPosition,RaceCountry,Year,AgeAtGP,BestQualiTime,FLap,SDLapTime,...,MaxSpeed,MaxRPM,AverageThrottle,MaxThrottlePct,Brake,raceID,CircuitType,carIssue,driverIssue,after_2020
8,RIC,Renault,19.0,12.0,Australia,2019.0,30.0,82.570,0.0,1.592151,...,317,12138,61.945190,34.844652,40.909386,1,street,1,0,0
13,SAI,McLaren,20.0,18.0,Australia,2019.0,25.0,83.084,0.0,0.451487,...,323,12079,71.075631,59.440877,63.193228,1,street,1,0,0
17,GRO,HaasF1Team,18.0,6.0,Australia,2019.0,32.0,81.826,0.0,5.051781,...,321,12715,61.395018,34.413310,42.427191,1,street,1,0,0
27,HUL,Renault,17.0,17.0,Bahrain,2019.0,31.0,90.034,0.0,4.166440,...,335,12386,51.185044,6.286291,20.564146,2,race,1,0,0
28,RIC,Renault,18.0,10.0,Bahrain,2019.0,30.0,89.488,0.0,3.186570,...,334,12481,50.352851,6.188497,19.833751,2,race,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1496,TSU,AlphaTauri,20.0,13.0,Mexico,2022.0,22.0,79.589,0.0,4.248924,...,350,12835,44.230054,17.758517,28.635096,78,race,1,0,1
1519,NOR,McLaren,18.0,6.0,Brazil,2022.0,23.0,71.377,0.0,12.796702,...,332,12675,56.223686,30.084249,41.143223,79,race,1,0,1
1530,ALO,Alpine,20.0,10.0,AbuDhabi,2022.0,41.0,85.096,0.0,3.730609,...,330,12585,51.827200,31.473163,38.018314,80,race,1,0,1
1540,HAM,Mercedes,18.0,5.0,AbuDhabi,2022.0,37.0,84.508,0.0,5.732930,...,330,13077,51.894322,12.338747,22.992974,80,race,1,0,1


In [61]:
import plotly.express as px

# Count the car-issue rows per RaceCountry and chart the counts as bars.
agg_data = (
    df1.groupby(['RaceCountry', 'carIssue'])
    .size()
    .reset_index(name='Count')
)
fig = px.histogram(
    agg_data,
    x="RaceCountry",
    y="Count",
    barmode='group',
    height=400,
)
fig.show()

In [62]:
# Rows flagged with driverIssue == 1, displayed for inspection.
df2 = df.loc[df["driverIssue"] == 1]
df2

Unnamed: 0,Abbreviation,TeamName,Position,GridPosition,RaceCountry,Year,AgeAtGP,BestQualiTime,FLap,SDLapTime,...,MaxSpeed,MaxRPM,AverageThrottle,MaxThrottlePct,Brake,raceID,CircuitType,carIssue,driverIssue,after_2020
37,GRO,HaasF1Team,20.0,11.0,Bahrain,2019.0,32.0,89.015,0.0,6.866675,...,300,12651,67.95309,52.505959,57.860155,2,race,0,1,0
156,GRO,HaasF1Team,20.0,16.0,France,2019.0,33.0,91.626,0.0,3.40715,...,333,12878,49.03374,8.934855,21.694235,8,race,0,1,0
200,LEC,Ferrari,17.0,10.0,Germany,2019.0,21.0,72.229,0.0,11.301776,...,323,12969,64.652483,43.71701,54.152752,11,race,0,1,0
205,HUL,Renault,16.0,9.0,Germany,2019.0,31.0,72.766,0.0,16.122998,...,322,12483,54.658946,28.339438,39.919746,11,race,0,1,0
214,BOT,Mercedes,15.0,3.0,Germany,2019.0,29.0,72.129,0.0,18.034042,...,332,13282,43.217683,9.441991,24.719918,11,race,0,1,0
255,GIO,AlfaRomeoRacing,18.0,18.0,Belgium,2019.0,25.0,105.637,0.0,2.546908,...,358,12976,56.278213,8.235813,20.196058,13,race,0,1,0
344,NOR,McLaren,20.0,8.0,Mexico,2019.0,19.0,76.322,0.0,3.671617,...,350,12598,51.078627,22.271436,34.844366,18,race,0,1,0
457,KVY,AlphaTauri,18.0,19.0,UK,2020.0,26.0,86.744,0.0,22.56347,...,323,12956,70.206629,56.91486,60.442978,24,race,0,1,0
474,MAG,HaasF1Team,20.0,17.0,UK,2020.0,28.0,88.236,0.0,4.651784,...,326,12526,51.639725,11.294602,22.561274,25,race,0,1,0
529,GIO,AlfaRomeoRacing,18.0,18.0,Belgium,2020.0,26.0,103.95,0.0,0.635618,...,345,12701,67.51212,53.838982,57.484851,27,race,0,1,0


In [63]:
import plotly.express as px

# Count the driver-issue rows per RaceCountry and chart the counts as bars.
agg_data = (
    df2.groupby(['RaceCountry', 'driverIssue'])
    .size()
    .reset_index(name='Count')
)
fig = px.histogram(
    agg_data,
    x="RaceCountry",
    y="Count",
    barmode='group',
    height=400,
)
fig.show()

## Number of podiums before and after 2020

In [64]:
import plotly.express as px

# Rows of p1 counted per team, Position and after_2020 flag; the chart then
# shows one bar per team and flag value, grouped side by side.
agg_data = (
    p1.groupby(['TeamName', 'Position', 'after_2020'])
    .size()
    .reset_index(name='Count')
)
fig = px.histogram(
    agg_data,
    x="TeamName",
    y="Count",
    color='after_2020',
    barmode='group',
    height=400,
)
fig.show()

In [65]:
import plotly.express as px

# Rows of p1 counted per driver abbreviation, Position and after_2020 flag,
# charted as grouped bars coloured by the flag.
agg_data = (
    p1.groupby(['Abbreviation', 'Position', 'after_2020'])
    .size()
    .reset_index(name='Count')
)
fig = px.histogram(
    agg_data,
    x="Abbreviation",
    y="Count",
    color='after_2020',
    barmode='group',
    height=400,
)
fig.show()

In [66]:
import plotly.express as px

# Rows of df counted per PitstopNo, Position and after_2020 flag, charted as
# grouped bars over PitstopNo and coloured by the flag.
agg_data = (
    df.groupby(['PitstopNo', 'Position', 'after_2020'])
    .size()
    .reset_index(name='Count')
)
fig = px.histogram(
    agg_data,
    x="PitstopNo",
    y="Count",
    color='after_2020',
    barmode='group',
    height=400,
)
fig.show()

In [67]:
# Rename the flag column to 'after' for the plots and summaries that follow.
# p1 is a filtered selection of df, and renaming such a selection in place can
# emit SettingWithCopyWarning; rebinding to the renamed copy avoids that.
p1 = p1.rename(columns={'after_2020': 'after'})

In [69]:
import plotly.express as px

# Rows of p1 counted per team, Position and 'after' flag.
agg_data = (
    p1.groupby(['TeamName', 'Position', 'after'])
    .size()
    .reset_index(name='Count')
)

# Fixed ordering and colours for the two 'after' values (0 first, then 1).
category_order = [0, 1]
colors = ['#171C54', '#93B8D6']

# Grouped bars per team, coloured by the 'after' flag.
fig = px.histogram(
    agg_data,
    x="TeamName",
    y="Count",
    color='after',
    barmode='group',
    height=400,
    category_orders={"after": category_order},
    color_discrete_sequence=colors,
)
fig.show()


In [70]:
import plotly.express as px

# Rows of p1 counted per driver abbreviation, Position and 'after' flag.
agg_data = (
    p1.groupby(['Abbreviation', 'Position', 'after'])
    .size()
    .reset_index(name='Count')
)

# Grouped bars per driver; `colors` is the palette defined in the previous cell.
fig = px.histogram(
    agg_data,
    x="Abbreviation",
    y="Count",
    color='after',
    barmode='group',
    height=400,
    color_discrete_sequence=colors,
)
fig.show()

In [71]:
# Position against GridPosition with an OLS trendline.
# No barmode is set: that layout option only applies to bar-type traces and
# has no effect on a scatter plot.
fig = px.scatter(df, y="Position", x="GridPosition", trendline="ols")
fig.show()

In [72]:
import plotly.express as px

# Position against GridPosition with an OLS trendline.
fig = px.scatter(df, y="Position", x="GridPosition", trendline="ols")

# Marker colour for the points.
color_palette = ['#171C54']

# Pass the colour as a scalar: a list is interpreted as one colour per data
# point, so a one-element list would only colour the first marker.
fig.update_traces(marker=dict(color=color_palette[0]))

# Show the plot
fig.show()

In [73]:
# Copy of df without the raceID, Year and after_2020 columns, used for the correlation heatmaps.
aux = df.drop(columns=["raceID", "Year", "after_2020"])

In [74]:
import plotly.graph_objects as go
import plotly.figure_factory as ff
import pandas as pd
import numpy as np

# Pairwise correlation matrix of the numeric columns of aux.
corr_matrix = aux.select_dtypes(include='number').corr()

# Every coefficient is drawn and annotated (rounded to two decimals); no
# threshold mask is applied. The mask that was previously built here was never
# passed to the figure, and its comment (0.6) disagreed with its code (0.5).
fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=list(corr_matrix.columns),
    y=list(corr_matrix.index),
    annotation_text=corr_matrix.round(2).values,
    colorscale='Viridis',
    hoverinfo='text',
    text=corr_matrix.round(2).values,
    showscale=True,
)

# Title and axis captions.
fig.update_layout(
    title='Correlation Heatmap',
    xaxis=dict(title='Variables'),
    yaxis=dict(title='Variables')
)

# Show the heatmap
fig.show()


In [75]:
import plotly.graph_objects as go
import plotly.figure_factory as ff
import pandas as pd
import numpy as np

# Pairwise correlation matrix of the numeric columns of aux.
corr_matrix = aux.select_dtypes(include='number').corr()

# Every coefficient is drawn and annotated (rounded to two decimals); no
# threshold mask is applied. The mask that was previously built here was never
# passed to the figure, and its comment (0.6) disagreed with its code (0.5).
fig = ff.create_annotated_heatmap(
    z=corr_matrix.values,
    x=list(corr_matrix.columns),
    y=list(corr_matrix.index),
    annotation_text=corr_matrix.round(2).values,
    colorscale='RdBu',
    hoverinfo='text',
    text=corr_matrix.round(2).values,
    showscale=True,
)

# Axis captions, a reversed y axis so the first variable is at the top, and a
# larger canvas so the annotations stay legible.
fig.update_layout(
    xaxis=dict(title='Variables'),
    yaxis=dict(title='Variables', autorange='reversed'),
    height=800,
    width=1000
)

# Show the heatmap
fig.show()

## Box plot of AgeAtGP to show the distribution of ages for drivers.

In [76]:
# Box plot of the AgeAtGP column over all rows.
fig = px.box(data_frame=df, y="AgeAtGP")
fig.show()

In [77]:
# One AgeAtGP box per value of the after_2020 flag.
fig = px.box(data_frame=df, x="after_2020", y="AgeAtGP")
fig.show()

In [78]:
# AverageSpeed against MaxSpeed with an OLS trendline.
# No barmode is set: that layout option only applies to bar-type traces and
# has no effect on a scatter plot.
fig = px.scatter(df, y="AverageSpeed", x="MaxSpeed", trendline="ols")
fig.show()

In [79]:
# Box plot of the BestQualiTime column over all rows.
fig = px.box(data_frame=df, y="BestQualiTime")
fig.show()

In [83]:
import plotly.express as px

# Rows of p1 counted per Engine, Position and 'after' flag, charted as grouped
# bars over Engine and coloured by the flag.
agg_data = (
    p1.groupby(['Engine', 'Position', 'after'])
    .size()
    .reset_index(name='Count')
)
fig = px.histogram(
    agg_data,
    x="Engine",
    y="Count",
    color='after',
    barmode='group',
    height=400,
)
fig.show()

In [84]:
# Position against MaxSpeed with an OLS trendline.
# No barmode is set: that layout option only applies to bar-type traces and
# has no effect on a scatter plot.
fig = px.scatter(df, y="Position", x="MaxSpeed", trendline="ols")
fig.show()

In [85]:
# Mean AgeAtGP for each Position within p1.
p1.groupby(by='Position')['AgeAtGP'].agg('mean')

Position
1.0    28.7375
2.0    28.7875
3.0    27.4875
Name: AgeAtGP, dtype: float64

In [87]:
# Mean AvgPitTime for each (Position, after) pair within p1.
p1.groupby(by=['Position', "after"])['AvgPitTime'].agg('mean')

Position  after
1.0       0         69.302641
          1        144.629938
2.0       0         66.788844
          1        144.638814
3.0       0         89.886420
          1        144.947355
Name: AvgPitTime, dtype: float64

In [88]:
# Show only the Position and AvgPitTime columns of p1.
p1.loc[:, ["Position", "AvgPitTime"]]

Unnamed: 0,Position,AvgPitTime
9,3.0,21.157000
11,2.0,21.515000
16,1.0,22.014000
22,3.0,24.438000
31,1.0,24.289500
...,...,...
1523,3.0,24.168333
1525,1.0,23.500500
1527,1.0,22.735000
1529,3.0,21.363500


In [90]:
# Mean AverageSpeed for each (Position, after) pair within p1.
p1.groupby(by=['Position', "after"])['AverageSpeed'].agg('mean')

Position  after
1.0       0        144.599508
          1        121.118728
2.0       0        144.594981
          1        120.972495
3.0       0        144.661264
          1        121.308442
Name: AverageSpeed, dtype: float64

In [91]:
# Mean BestQualiTime for each Position within p1.
p1.groupby(by=['Position'])['BestQualiTime'].agg('mean')

Position
1.0    82.888262
2.0    83.285213
3.0    83.189837
Name: BestQualiTime, dtype: float64

In [92]:
# Mean Brake value for each (Position, after) pair within p1.
p1.groupby(by=['Position', "after"])['Brake'].agg('mean')

Position  after
1.0       0        15.864615
          1        19.333753
2.0       0        16.394368
          1        21.263064
3.0       0        17.347023
          1        20.754745
Name: Brake, dtype: float64

In [94]:
# Mean MaxThrottlePct for each (Position, after) pair within p1.
p1.groupby(by=['Position', "after"])['MaxThrottlePct'].agg('mean')

Position  after
1.0       0         9.977427
          1        12.757761
2.0       0        16.521993
          1         9.044424
3.0       0         8.074361
          1        11.747676
Name: MaxThrottlePct, dtype: float64