In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

In [3]:
# Load the three source tables. Note that results comes from the
# 'original' folder while races and drivers come from 'cleaned'.
results, races, drivers = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets/original/results.csv',
        'datasets/cleaned/races.csv',
        'datasets/cleaned/drivers.csv',
    )
)

We remove 2022 from the races and results data because it isn't complete (points can be different).

In [4]:
# Keep every season except 2022, then keep only the results that belong
# to one of the remaining races.
races = races.loc[races['year'].ne(2022)]
results = results.loc[results['raceId'].isin(races['raceId'])]

In [5]:
# Attach driver details to every result row, then total the points that
# were actually awarded, per driver name, highest first.
master_stats = results.merge(drivers, on='driverId', how='left')
all_points = (
    master_stats
    .groupby('name')[['points']]
    .sum()
    .sort_values(by='points', ascending=False)
)
all_points.head(10)

Unnamed: 0_level_0,points
name,Unnamed: 1_level_1
Lewis Hamilton,4163.5
Sebastian Vettel,3061.0
Fernando Alonso,1980.0
Kimi Räikkönen,1873.0
Valtteri Bottas,1731.0
Nico Rosberg,1594.5
Michael Schumacher,1566.0
Max Verstappen,1550.5
Daniel Ricciardo,1273.0
Jenson Button,1235.0


In [6]:
# Largest single-result points value recorded in each season, in year order.
winpoints = (
    results
    .merge(races, on='raceId', how='left')
    .groupby('year')[['points']]
    .max()
    .sort_values('year')
)
px.bar(winpoints, x=winpoints.index, y='points')

You can see the significant step in maximum points available in 2010, when the points changed from 10 points for a win to 25. 2014 shows a strange spike of 50 points, which has never been available for a win in Formula One. This happened because the points available were doubled in the last round of the 2014 championship. Otherwise this would be 25 as well.

We have decided to apply today's scoring system to the results. We are not including the bonus point for fastest lap, which you could also include.

The points are awarded for the top 10 drivers in the following pattern: 1st gets 25, 2nd gets 18, 15, 12, 10, 8, 6, 4, 2, 1.

In [7]:
# Count how many times each driver was classified in each finishing position.
# Resulting columns: Driver, positionOrder, num.
curr_points = (
    results
    .merge(drivers, on='driverId', how='left')
    .groupby(['driverRef', 'positionOrder'])
    .size()
    .reset_index(name='num')
    .rename(columns={'driverRef': 'Driver'})
)

In [8]:
# Points for 1st..10th place; positions beyond 10th score nothing.
points_pattern = [25, 18, 15, 12, 10, 8, 6, 4, 2, 1]

# Points value of each finishing position, then the total those finishes
# are worth (points for the position multiplied by the number of finishes there).
curr_points['points'] = curr_points['positionOrder'].map(
    lambda pos: points_pattern[pos - 1] if pos <= 10 else 0
)
curr_points['points_pp'] = curr_points['num'] * curr_points['points']

# Career total per driver under this scoring, highest first.
new_points_rank = (
    curr_points
    .groupby('Driver')[['points_pp']]
    .sum()
    .sort_values(by='points_pp', ascending=False)
)
new_points_rank.head(10)

Unnamed: 0_level_0,points_pp
Driver,Unnamed: 1_level_1
hamilton,4506
michael_schumacher,3891
vettel,3250
alonso,2821
raikkonen,2795
prost,2486
barrichello,1901
senna,1885
button,1851
rosberg,1739


# More races per season

In [9]:
# Number of races per season, taken as the highest round number in each year.
# rename() returns a new frame, so its result must be kept; the original
# discarded it, which left the column (and the chart axis) named 'round'.
nraces = (
    races
    .groupby('year')
    .agg({'round': 'max'})
    .rename(columns={'round': 'races'})
)
px.bar(nraces, x=nraces.index, y='races')

As you can see, the number of races per year has been steadily increasing over the years, which biases the points totals towards recent years.
We can adjust this by calculating:
1. Points per Race
2. Points per Race per Season 
<br><br>
Points per Race might provide bias towards those who have had good F1 careers but had high scoring 1st years. 
Points per Race per Season will account for the number of seasons which a driver has been competitive which is some form of indicator of quality.

In [10]:
# One row per result, enriched with the season and the driver reference.
results_year = (
    results
    .merge(races[['raceId', 'year']], on='raceId', how='left')
    .merge(drivers[['driverId', 'driverRef']], on='driverId', how='left')
)

# Re-score every result with points_pattern (nothing beyond 10th place).
results_year['new_points'] = results_year['positionOrder'].map(
    lambda pos: points_pattern[pos - 1] if pos <= 10 else 0
)

# finished is 0 only when the driver is ordered outside the top 10 AND has
# no classified position ('\N'); every other row counts as a finish.
# NOTE(review): a '\N' result with positionOrder <= 10 is therefore counted
# as finished -- confirm that this is intended.
results_year['finished'] = 1 - (
    (results_year['positionOrder'] > 10)
    & (results_year['position'].astype(str) == '\\N')
).astype('int64')
results_year

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,...,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId,year,driverRef,new_points,finished
0,1,18,1,1,22,1,1,1,1,10.0,...,5690616,39,2,1:27.452,218.300,1,2008,hamilton,25,1
1,2,18,2,2,3,5,2,2,2,8.0,...,5696094,41,3,1:27.739,217.586,1,2008,heidfeld,18,1
2,3,18,3,3,7,7,3,3,3,6.0,...,5698779,41,5,1:28.090,216.719,1,2008,rosberg,15,1
3,4,18,4,4,5,11,4,4,4,5.0,...,5707797,58,7,1:28.603,215.464,1,2008,alonso,12,1
4,5,18,5,1,23,3,5,5,5,4.0,...,5708630,43,1,1:27.418,218.385,1,2008,kovalainen,10,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25395,25401,1073,849,3,6,16,\N,R,16,0.0,...,\N,30,15,1:29.293,212.912,3,2021,latifi,0,0
25396,25402,1073,841,51,99,14,\N,R,17,0.0,...,\N,33,16,1:29.442,212.557,6,2021,giovinazzi,0,0
25397,25403,1073,847,3,63,17,\N,R,18,0.0,...,\N,23,19,1:30.647,209.732,6,2021,russell,0,0
25398,25404,1073,8,51,7,18,\N,R,19,0.0,...,\N,23,18,1:29.698,211.951,23,2021,raikkonen,0,0


In [11]:
# Career totals per driver: re-scored points, race entries and finishes,
# plus the average re-scored points per race entered. Drivers with fewer
# than 8 entries are dropped after ranking.
points_per_race = (
    results_year
    .groupby('driverRef')
    .agg({'new_points': 'sum', 'raceId': 'count', 'finished': 'sum'})
    .rename(columns={'raceId': 'races'})
    .assign(new_points_pp=lambda totals: totals['new_points'] / totals['races'])
    .sort_values('new_points_pp', ascending=False)
    .loc[lambda totals: totals['races'] >= 8]
)
points_per_race.head(10)

Unnamed: 0_level_0,new_points,races,finished,new_points_pp
driverRef,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hamilton,4506,288,263,15.645833
fangio,876,58,46,15.103448
fagioli,112,8,7,14.0
michael_schumacher,3891,308,242,12.633117
prost,2486,202,144,12.306931
ascari,443,36,22,12.305556
farina,452,37,29,12.216216
senna,1885,162,111,11.635802
vettel,3250,280,244,11.607143
clark,847,73,54,11.60274


This still keeps Hamilton at the top, but now drivers like Fangio, Ascari and Clark are joining the list, so it seems a better mix of decades. We have required a minimum of 8 races for a driver to be included in the list.

Now we try to evaluate based on if all the seasons had 22 races like the 2021 season did.

In [12]:
# Per driver and season: re-scored points, race entries and finishes.
# season_points scales the per-race average to a 22-race season.
points_per_season = (
    results_year
    .groupby(['driverRef', 'year'])
    .agg({'new_points': 'sum', 'raceId': 'count', 'finished': 'sum'})
    .sort_values(by='new_points', ascending=False)
    .rename(columns={'raceId': 'races'})
    .assign(season_points=lambda season: season['new_points'] / season['races'] * 22)
)

# Career view: sum the normalised season scores and count seasons per driver.
points_per_season_total = (
    points_per_season
    .reset_index()
    .groupby('driverRef')
    .agg({'season_points': 'sum', 'year': 'count', 'races': 'sum', 'finished': 'sum'})
    .sort_values(by='season_points', ascending=False)
    .rename(columns={'year': 'seasons', 'finished': 'finishes'})
)
points_per_season_total.head(10)

Unnamed: 0_level_0,season_points,seasons,races,finishes
driverRef,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
michael_schumacher,5265.160419,19,308,242
hamilton,5135.500815,15,288,263
vettel,3700.05903,15,280,244
prost,3472.171154,13,202,144
alonso,3345.610923,18,336,274
raikkonen,3334.981678,19,352,286
fangio,2670.459524,8,58,46
senna,2598.291667,11,162,111
clark,2481.344444,9,73,54
barrichello,2449.051901,19,326,230


This approach brings an interesting new view. It puts Michael Schumacher at the top, but Fangio and Clark make the list despite having <100 starts: they had a limited number of races but competed in 8 and 9 seasons respectively.

This approach gives an interesting view and does reward drivers who have been in the sport for a long time. However, it appears to bring in some recency bias as well. Vettel and Hamilton are on the list with only 15 seasons but are still active and could have more seasons to come.

# Reliability

Cars are more reliable now than they were. We try to incorporate this into our previous results.

In [15]:
# Inspect the per-driver, per-season aggregates
# (new_points, races, finished, season_points).
points_per_season

Unnamed: 0_level_0,Unnamed: 1_level_0,new_points,races,finished,season_points
driverRef,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
hamilton,2018,408,21,20,427.428571
hamilton,2019,407,21,21,426.380952
vettel,2013,397,19,18,459.684211
max_verstappen,2021,396,22,19,396.000000
vettel,2011,392,19,18,453.894737
...,...,...,...,...,...
keizan,1975,0,1,1,0.000000
keizan,1974,0,1,1,0.000000
keizan,1973,0,1,0,0.000000
keegan,1982,0,5,1,0.000000


In [20]:
# Reliability-adjusted season score: average re-scored points per *finished*
# race, scaled to a 22-race season. A season with no finishes has
# new_points == 0 under the 'finished' definition, so 0/0 gives NaN, which
# the sum below skips.
points_per_season['finish_season_points'] = (
    points_per_season['new_points'] / points_per_season['finished'] * 22
)

# Flatten the (driverRef, year) index into columns. The guard keeps the cell
# safe to re-run: a second unconditional reset_index() would add a stray
# 'index' column.
if 'driverRef' not in points_per_season.columns:
    points_per_season = points_per_season.reset_index()

# Career totals per driver. rename() returns a new frame, so it is chained
# here; the original discarded its result, leaving the column named 'year'.
points_per_finish_season = (
    points_per_season
    .groupby('driverRef')
    .agg({'finish_season_points': 'sum', 'year': 'count', 'races': 'sum', 'finished': 'sum'})
    .sort_values(by='finish_season_points', ascending=False)
    .rename(columns={'year': 'seasons'})
)
points_per_finish_season.head(10)

Unnamed: 0_level_0,finish_season_points,year,races,finished
driverRef,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
michael_schumacher,6639.985589,19,308,242
hamilton,5576.713445,15,288,263
prost,4817.794505,13,202,144
mansell,4283.216667,15,192,99
raikkonen,4215.628346,19,352,286
vettel,4163.574098,15,280,244
alonso,3839.680261,18,336,274
piquet,3825.904884,14,207,125
senna,3664.257143,11,162,111
jack_brabham,3547.360317,16,129,79


This approach includes the benefit of the number of seasons you have raced in Formula One. However, it appears to be biased toward the more recent F1 drivers. There are a few interesting additions, with Mansell jumping up onto the list, but it returns to a battle between Michael Schumacher and Hamilton.

This suggests that drivers typically have longer careers now than they did historically.

If we Google some of the best drivers of all time, we typically find Fangio, Moss, Clark, Stewart, Lauda, Prost, Senna, Michael Schumacher and Hamilton. All except Lauda make that list, so that makes me feel comfortable. Hamilton, of course, is still in the most dominant car and winning races continually, so it might not be long before he tops that list too.