In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import re

import altair as alt
import geopandas
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd
import pip

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Single place to change if the Formula 1 dataset is mounted somewhere else.
F1_DATA_DIR = '../input/formula-1-world-championship-1950-2020'

# Races table (year, round, circuitId, ...) and circuits table (country, lat, lng, ...).
df_races = pd.read_csv(f'{F1_DATA_DIR}/races.csv')
df_circuits = pd.read_csv(f'{F1_DATA_DIR}/circuits.csv')

# The expansion of Formula 1 around the world

This notebook was created to prove the expansion of Formula One around the world.

To prove that an expansion exists, we can ask ourselves some questions:
1. Is there an increase of races by year?
2. Which are the countries participating and where are they located?


# Has the number of races increased since 1950?
To answer that question, we can check the number of races per year and then calculate the percentage increase.

In [None]:
# The highest round number of a season is used as that season's race count.
rounds_per_year = df_races.groupby('year')['round'].max()
years = rounds_per_year.index.tolist()
rounds = rounds_per_year.tolist()

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(years, rounds)
ax.set(xlabel="Year", ylabel="Number of Races");

In [None]:
# Number of races in each of the two seasons being compared.
# Summing the boolean mask gives the count directly; this avoids the True/False
# value_counts Series, which had to be indexed with [True] and raised KeyError
# when a year had no rows at all.
races1950 = (df_races['year'] == 1950).sum()
races2019 = (df_races['year'] == 2019).sum()

# Relative growth from 1950 to 2019, in percent.
increase = (races2019 - races1950) / races1950 * 100
increase

From 1950 to 2019, the number of races per season increased by 200 percent.
This shows that Formula One is appreciated and has evolved.

# Which are the countries participating and where are they located?

## 1. Where are F1 circuits located?

In [None]:
# Number of circuits located in each country.
circuits_per_country = df_circuits['country'].value_counts()

fig, ax = plt.subplots(figsize=(15, 5))
circuits_per_country.plot.bar(ax=ax)

# Axis label fixed: the y axis counts circuits (plural).
ax.set(xlabel="Countries", ylabel="Number of circuits");

In [None]:
# NOTE(review): geopandas.datasets was deprecated in geopandas 0.14 and removed in 1.0;
# confirm the installed version still ships 'naturalearth_lowres'.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

# One point per circuit, built from its longitude (x) and latitude (y).
lng, lat = (df_circuits[axis].tolist() for axis in ('lng', 'lat'))
circuit_points = geopandas.points_from_xy(lng, lat)
gdf = geopandas.GeoDataFrame(df_circuits, geometry=circuit_points)

# Draw the world basemap first, then overlay the circuits as red dots.
fig, ax = plt.subplots(figsize=(20, 10))
world.plot(ax=ax, zorder=1)
gdf.plot(ax=ax, marker='o', color='red', markersize=15)

plt.show();


We can see that there are many circuits in North America and in Europe, but there are also circuits on each continent.

## 2. So, in which countries are races held?

In [None]:
# Both tables have a "name" column; give the circuits' one an unambiguous label
# so that the remaining "name" after the join is the one from the races table.
df_circuits = df_circuits.rename(columns={"name": "circuitName"})

# Attach every race to the circuit that hosted it. Joining directly on the shared
# circuitId key selects the same circuit/race pairs as building the full
# circuits x races cross product and comparing the ids row by row, but without the
# huge intermediate frame and without leaving helper columns on df_circuits / df_races.
# validate= fails loudly if a circuitId appears more than once in the circuits table.
df_combined = df_circuits.merge(df_races, on='circuitId', validate='one_to_many')

df = (
    df_combined[['circuitId', 'circuitName', 'location', 'country', 'lat', 'lng',
                 'raceId', 'name', 'year', 'round', 'date', 'time']]
    # Keep the index name (circuitId_x) and string dtype this cell has always produced.
    .assign(circuitId_x=lambda frame: frame['circuitId'].astype(str))
    .drop(columns='circuitId')
    .set_index('circuitId_x')
)

In [None]:
# Check that the merge between races and circuits worked
#df.sort_values("year", inplace=True) 
#df

In [None]:
# Keep one row per (year, country) so each host country is counted once per season.
df2 = df.drop_duplicates(["year", "country"])
countries_per_year = df2.groupby('year')['country'].size()

# The highest round number of a season is used as that season's race count
# (computed once instead of running the same groupby twice).
rounds_per_year = df_races.groupby('year')['round'].max()
years = rounds_per_year.index.tolist()
rounds = rounds_per_year.tolist()

# Two stacked panels: host countries on top, races per season below.
fig, ax = plt.subplots(2, figsize=(15, 15))
countries_ax, races_ax = ax

countries_ax.plot(countries_per_year)
# Label fixed: "participing" was a typo.
countries_ax.set(xlabel="Year", ylabel="Number of participating countries")

races_ax.plot(years, rounds)
races_ax.set(xlabel="Year", ylabel="Number of Races");

Before the Covid-19 pandemic, races were held in more than 20 countries, and that number had continued to increase over the years.


## 3. But are these countries on all continents?


To answer that question, I first need a new dataset listing all the countries and their continents.

In [None]:
# Replace the short country names used in the circuit dataset with the names
# they have in the country dataset, so the two can be matched later.
country_aliases = {
    "UK": "United Kingdom of Great Britain & Northern Ireland",
    "USA": "United States of America",
    "UAE": "United Arab Emirates",
}
for short_name, official_name in country_aliases.items():
    df.loc[df['country'] == short_name, 'country'] = official_name

In [None]:
# Create the country dataset, restricted to the countries found in the circuit dataset.
# keep_default_na=False: presumably the continent code for North America is the literal
# string "NA", which pandas would otherwise read as a missing value — TODO confirm.
df_country = pd.read_csv(
    '../input/covid19pluspopulations/country-and-continent-codes-list.csv',
    keep_default_na=False,
    na_values=[''],
)

# One regex alternative per *distinct* circuit country (df holds one row per race, so
# joining the raw column repeated each name many times). Names are escaped so
# they are matched literally even if they contain regex metacharacters.
country_pattern = '|'.join(re.escape(name) for name in df['country'].dropna().unique())

# Substring match on purpose: a short circuit country name may only be part of the
# longer name used in the country dataset. na=False drops rows without a country name.
df_country = (
    df_country[df_country['Country_Name'].str.contains(country_pattern, na=False)]
    [['Continent_Name', 'Continent_Code', 'Country_Name', 'Three_Letter_Country_Code']]
    .rename(columns={'Three_Letter_Country_Code': 'Country_Code'})
)
df_country.head()

In [None]:
# Merge the two datasets into one that carries the continent of every race.
# A constant helper key emulates a cross join (works on pandas versions without
# how='cross'); .assign() puts the key on temporary copies, so neither df nor
# df_country is left with a stray 'dummy' column.
df_combined = (
    df.assign(dummy=1)
    .merge(df_country.assign(dummy=1), on='dummy')
    .drop(columns='dummy')
)

# Keep the pairs where the circuit's country name occurs inside the country
# dataset's name. A plain comprehension replaces the slow row-wise DataFrame.apply.
matches_country = [
    short_name in official_name
    for short_name, official_name in zip(df_combined['country'], df_combined['Country_Name'])
]
df_circuit_continent = df_combined.loc[
    matches_country,
    ['circuitName', 'location', 'country', 'lat', 'lng',
     'raceId', 'name', 'year', 'round', 'date', 'time',
     'Continent_Name', 'Continent_Code', 'Country_Name', 'Country_Code'],
]

# Preview only. 'name' comes from the races table (the circuits' name column is
# 'circuitName'), so it is shown as Race_Name rather than Circuit_Name. As before,
# the rename is not stored: df_circuit_continent keeps the 'name' column.
df_circuit_continent.rename(columns={'name': 'Race_Name'}).head()

In [None]:
# Check the number of continents that hosted a Formula 1 Grand Prix in each year.
# Keep one row per (year, continent) so each continent is counted once per season.
df4 = df_circuit_continent.drop_duplicates(["year", "Continent_Code"])
continents_per_year = df4.groupby('year')['Continent_Code'].size()

fig, ax = plt.subplots(figsize=(10, 5))
continents_per_year.plot.bar(ax=ax)

# Label fixed: "participing" was a typo.
ax.set(xlabel="Year", ylabel="Number of participating continents");

In [None]:
# Number of races per year, stacked by continent.
# Keep a single row per (year, date) so that each race is counted once.
df_circuit_continent = df_circuit_continent.drop_duplicates(["year", "date"])

bar_style = dict(cornerRadiusTopLeft=3, cornerRadiusTopRight=3)
channels = dict(x='year:O', y='count():Q', color='Continent_Name:N')

alt.Chart(df_circuit_continent).mark_bar(**bar_style).encode(**channels)

We can see a huge expansion in circuit locations over the years. At first, races were held only in Europe and the USA, but then they started to be organised in South America and South Africa. Races were organised in South Africa for many years, but the last one was in 1993. However, there has been a huge increase in Asian races since 1999.

Lastly, the Covid-19 pandemic had an impact on the number and location of races: the season went from 22 to 17 races, held on only two continents.
So we hope next year will be better and, why not, that one day we will see the return of the South African race.