In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Load in the data for the drivers, results and races and remove unneeded columns.

In [2]:
# Read the raw drivers table and keep only the identifier, the two name
# parts and the nationality; the other columns are not used in this notebook.
drivers = pd.read_csv('./data/drivers.csv').loc[
    :, ['driverId', 'forename', 'surname', 'nationality']
]
drivers.head()

Unnamed: 0,driverId,forename,surname,nationality
0,1,Lewis,Hamilton,British
1,2,Nick,Heidfeld,German
2,3,Nico,Rosberg,German
3,4,Fernando,Alonso,Spanish
4,5,Heikki,Kovalainen,Finnish


In [3]:
# Show the (rows, columns) dimensions of the drivers frame.
drivers.shape

(850, 4)

In [4]:
# Read the race results and keep only the key columns that link a result
# to its race, driver and constructor.
results = pd.read_csv('./data/results.csv').loc[
    :, ['resultId', 'raceId', 'driverId', 'constructorId']
]
results.head()

Unnamed: 0,resultId,raceId,driverId,constructorId
0,1,18,1,1
1,2,18,2,2
2,3,18,3,3
3,4,18,4,4
4,5,18,5,1


In [5]:
# Read the races table; only the race identifier and its season year are kept.
races = pd.read_csv('./data/races.csv').loc[:, ['raceId', 'year']]
races.head()

Unnamed: 0,raceId,year
0,1,2009
1,2,2009
2,3,2009
3,4,2009
4,5,2009


Define a dictionary of continents that we can use later to map each driver to a continent, based on their nationality.

In [6]:
# Lookup table: continent name -> list of nationality strings that belong to it.
# Dual nationalities are grouped under 'Mixed'.
continents = dict(
    [
        ('Africa', ['Rhodesian', 'South African']),
        ('Asia', ['Indian', 'Indonesian', 'Malaysian', 'Japanese', 'Thai']),
        ('Europe', [
            'Austrian', 'Belgian', 'British', 'Czech', 'Danish', 'Dutch',
            'East German', 'Finnish', 'French', 'German', 'Hungarian',
            'Irish', 'Italian', 'Liechtensteiner', 'Monegasque', 'Polish',
            'Portuguese', 'Russian', 'Spanish', 'Swedish', 'Swiss',
        ]),
        ('Mixed', ['American-Italian', 'Argentine-Italian']),
        ('North America', ['American', 'Canadian', 'Mexican']),
        ('Oceania', ['Australian', 'New Zealander']),
        ('South America', [
            'Argentine', 'Brazilian', 'Chilean', 'Colombian', 'Uruguayan',
            'Venezuelan',
        ]),
    ]
)

# Echo the table, one continent per line, as a quick visual check.
for region in continents:
    print(region, continents[region])


Africa ['Rhodesian', 'South African']
Asia ['Indian', 'Indonesian', 'Malaysian', 'Japanese', 'Thai']
Europe ['Austrian', 'Belgian', 'British', 'Czech', 'Danish', 'Dutch', 'East German', 'Finnish', 'French', 'German', 'Hungarian', 'Irish', 'Italian', 'Liechtensteiner', 'Monegasque', 'Polish', 'Portuguese', 'Russian', 'Spanish', 'Swedish', 'Swiss']
Mixed ['American-Italian', 'Argentine-Italian']
North America ['American', 'Canadian', 'Mexican']
Oceania ['Australian', 'New Zealander']
South America ['Argentine', 'Brazilian', 'Chilean', 'Colombian', 'Uruguayan', 'Venezuelan']


Merge the drivers, results and races data together and create a name column from the forename and surname columns.

In [7]:
# Attach every result row to its driver first, then bring in the season
# year by joining on the race identifier.
driver_results = drivers.merge(results, on='driverId')
df = races.merge(driver_results, on='raceId')

# Full display name: forename and surname separated by a single space.
df['name'] = df['forename'] + " " + df['surname']
df.head()

Unnamed: 0,raceId,year,driverId,forename,surname,nationality,resultId,constructorId,name
0,1,2009,1,Lewis,Hamilton,British,7573,1,Lewis Hamilton
1,1,2009,2,Nick,Heidfeld,German,7563,2,Nick Heidfeld
2,1,2009,3,Nico,Rosberg,German,7559,3,Nico Rosberg
3,1,2009,4,Fernando,Alonso,Spanish,7558,4,Fernando Alonso
4,1,2009,5,Heikki,Kovalainen,Finnish,7572,1,Heikki Kovalainen


Reduce the dataframe down to just the columns that we need.

In [8]:
# Keep only the three columns used from here on.
df = df.loc[:, ['year', 'nationality', 'name']]
df.head()

Unnamed: 0,year,nationality,name
0,2009,British,Lewis Hamilton
1,2009,German,Nick Heidfeld
2,2009,German,Nico Rosberg
3,2009,Spanish,Fernando Alonso
4,2009,Finnish,Heikki Kovalainen


The dataframe currently contains the year, nationality and driver name for each race a driver has competed in, meaning that the same values are repeated for each race in a season. We only want one row for each season that the driver has raced in, and removing the duplicates gives us this result.

In [9]:
# Collapse the per-race rows into one row per distinct
# (year, nationality, name) combination.
df = df.drop_duplicates()

# Spot-check a single season.
df.loc[df['year'] == 2020]

Unnamed: 0,year,nationality,name
24620,2020,British,Lewis Hamilton
24621,2020,Finnish,Kimi Räikkönen
24622,2020,German,Sebastian Vettel
24623,2020,French,Romain Grosjean
24624,2020,French,Pierre Gasly
24625,2020,Mexican,Sergio Pérez
24626,2020,Australian,Daniel Ricciardo
24627,2020,Finnish,Valtteri Bottas
24628,2020,Danish,Kevin Magnussen
24629,2020,Russian,Daniil Kvyat


Now let's extract the driver's home continent based on their nationality.

In [10]:
# Invert the continent -> nationalities table into a flat
# nationality -> continent lookup so every row can be mapped in one step.
nationality_to_continent = {
    nationality: continent
    for continent, countries in continents.items()
    for nationality in countries
}

# Map each row by its own nationality. Unlike appending to a list inside a
# nested loop, this keeps the result aligned with the frame's index, so one
# unknown nationality cannot shift the values of the rows that follow it.
continent_by_row = df['nationality'].map(nationality_to_continent)

# Fail loudly, naming the offending values, when a nationality has no
# continent defined, instead of an opaque length-mismatch error.
unmapped_rows = continent_by_row.isna()
if unmapped_rows.any():
    unknown = df.loc[unmapped_rows, 'nationality'].unique().tolist()
    raise ValueError(f"No continent defined for nationalities: {unknown}")

# assign() returns a new frame, so re-running the cell is safe and no
# SettingWithCopyWarning is raised for a frame derived from a slice.
df = df.assign(continent=continent_by_row)

Let's also sort the data, first by year and then by continent. This will make the final plot easier to build and removes the need to do the sorting in JavaScript (and hence each time the page is loaded).

In [11]:
# Order rows by season, then by continent within each season.
sort_keys = ['year', 'continent']
df = df.sort_values(sort_keys)

# Spot-check a single season after sorting.
df.loc[df['year'] == 2020]

Unnamed: 0,year,nationality,name,continent
24638,2020,Thai,Alexander Albon,Asia
24620,2020,British,Lewis Hamilton,Europe
24621,2020,Finnish,Kimi Räikkönen,Europe
24622,2020,German,Sebastian Vettel,Europe
24623,2020,French,Romain Grosjean,Europe
24624,2020,French,Pierre Gasly,Europe
24627,2020,Finnish,Valtteri Bottas,Europe
24628,2020,Danish,Kevin Magnussen,Europe
24629,2020,Russian,Daniil Kvyat,Europe
24630,2020,Dutch,Max Verstappen,Europe


Finally, we save the data to a csv file.

In [12]:
# Write the frame to disk. With pandas' default index=True the DataFrame
# index is written as an unnamed first column.
# NOTE(review): the index here appears to be leftover row labels from the merge;
# confirm whether the consumer of this file needs it before passing index=False.
df.to_csv('./data/drivers_by_continent.csv')