# Formula One Exploratory Analysis

I wanted to dive into Formula One data to find out what's in this particular dataset, and identify some questions I can follow up on for more focused analysis. 

Starting out, I have these questions in mind - mainly motivated by curiosity:

 - In Formula One, does the winner of qualifying often go on to win the race?
 - What is the distribution of nationalities for Formula One drivers? What is the distribution of nationalities for Formula One constructors?
 - Who are the all-time top-ranking drivers? Who are the all-time top-ranking constructors?
 - What's the longest pit stop recorded in this dataset? What's the shortest pit stop recorded? Has the average pit stop duration decreased over time?

## Initial Setup and Cleaning of Data 

In [None]:
import pandas as pd
from pandas import Series, DataFrame
import os

from IPython.display import display, Markdown
from matplotlib import pyplot as plt
import seaborn as sns

# Use seaborn's 'whitegrid' theme for the plots drawn in this notebook.
sns.set_theme(style='whitegrid')

In [None]:
input_dir = '../input/formula-1-race-data-19502017'

# Build a column reference (one row per file/column pair) since the dataset
# ships without one.
# DataFrame.append was removed in pandas 2.0, so the per-file frames are
# collected in a list and concatenated once at the end.
column_frames = []

# sorted() makes the row order of Metadata.csv deterministic
for filename in sorted(os.listdir(input_dir)):
  # only CSV files can be parsed by read_csv; this also skips '.git'
  if not filename.lower().endswith('.csv'):
    continue
  df_columns_tmp = DataFrame(
    {
      'filename': filename,
      # nrows=0 parses just the header row, which is all that is needed here
      'columns': pd.read_csv(f'{input_dir}/{filename}', nrows=0).columns
    }
  )
  column_frames.append(df_columns_tmp)

if column_frames:
  df_columns = pd.concat(column_frames)
else:
  # no CSV files found: still write a well-formed (empty) reference
  df_columns = DataFrame({'filename': [], 'columns': []})

df_columns.to_csv('Metadata.csv')


# initialize dataframes
df_circuits = pd.read_csv(f'{input_dir}/circuits.csv')
df_qualifying = pd.read_csv(f'{input_dir}/qualifying.csv')
df_races = pd.read_csv(f'{input_dir}/races.csv')
df_results = pd.read_csv(f'{input_dir}/results.csv')
df_drivers = pd.read_csv(f'{input_dir}/drivers.csv')
df_constructors = pd.read_csv(f'{input_dir}/constructors.csv')
df_seasons = pd.read_csv(f'{input_dir}/seasons.csv')
df_driverStandings = pd.read_csv(f'{input_dir}/driverStandings.csv')
df_constructorStandings = pd.read_csv(f'{input_dir}/constructorStandings.csv')

In [None]:
# helper function for data quality check of dataframes
def get_unique_value_counts(dataframe):
  '''Summarise a dataframe for a quick data quality check.

  Returns a Series whose first entry, ``count_records``, is the number of
  rows, followed by one ``count_<column>`` entry per column holding that
  column's number of distinct values (as reported by ``nunique``).
  '''
  summary = {'count_records': len(dataframe.index)}
  summary.update(
    (f'count_{name}', dataframe[name].nunique())
    for name in dataframe.columns
  )
  return Series(summary)

def display_results(dataframe,description,limit=10):
  '''Show a Markdown heading followed by the leading part of the data.

  dataframe:   object to show; it is sliced with ``[:limit]`` before display.
  description: text rendered as a level-3 Markdown heading.
  limit:       upper bound of the slice (10 unless overridden).
  '''
  heading = Markdown(f'### {description}')
  display(heading)
  preview = dataframe[:limit]
  display(preview)

In [None]:
display_results(get_unique_value_counts(df_results),'Results.csv unique value counts.')
display_results(get_unique_value_counts(df_qualifying),'Qualifying.csv unique value counts.')
display_results(get_unique_value_counts(df_drivers),'Drivers.csv unique value counts.')


# Flag every row whose (raceId, positionOrder) pair occurs more than once.
# keep=False marks all members of a duplicated group, not only the repeats.
# The mask is computed once and reused for both the report and the removal.
is_duplicate = df_results.duplicated(subset=['raceId','positionOrder'], keep=False)
df_dups = df_results[is_duplicate]

# '\\#' yields a literal backslash + '#' so Markdown prints a plain '#'.
# A bare '\#' in a non-raw string is an invalid escape sequence.
display_text = f'''
### Results.csv Summary Counts
 - \\# of duplicate rows: {df_dups.shape[0]}
 - \\# of total rows: {df_results.shape[0]}

There are duplicate rows - but since they are a relatively
small fraction of the overall results, we can drop without
significantly impacting the outcome

'''

display(Markdown(display_text))

# remove all rows that take part in a duplicated (raceId, positionOrder) pair
df_results = df_results[~is_duplicate]

### Removal of irrelevant datapoints

In [None]:
# Drop every result that belongs to a race named 'Indianapolis 500'.
indy_race_ids = df_races.loc[df_races['name'] == 'Indianapolis 500', 'raceId']
df_results = df_results[~df_results['raceId'].isin(indy_race_ids)]

# Keep only drivers with at least 15 remaining results; 15 stands in for the
# average number of races in a season.
results_per_driver = df_results['driverId'].value_counts()
regular_driver_ids = results_per_driver[results_per_driver >= 15].index
df_results = df_results[df_results['driverId'].isin(regular_driver_ids)]

# Restrict the driver table to drivers still present in the results.
df_drivers = df_drivers[df_drivers['driverId'].isin(df_results['driverId'])]

# Restrict the constructor table to constructors still present in the results.
df_constructors = df_constructors[
  df_constructors['constructorId'].isin(df_results['constructorId'])
]


## High Level Analysis

In [None]:
# local import: only this cell needs the percent tick formatter
from matplotlib.ticker import PercentFormatter

# pair every race result with the same driver's qualifying entry for that race
df_tmp = df_results.merge(df_qualifying,on=['raceId','driverId'],suffixes=['_result','_qual'])

# check for duplicates
# A bare expression in the middle of a cell is never rendered, so the check
# is displayed explicitly.
display_results(
  df_tmp[df_tmp.duplicated(subset=['raceId','driverId'])],
  'Duplicate race/driver rows after merging results with qualifying'
)


# For what percentage of races did the winner of qualifying go on to win the race?
# NOTE(review): groupby drops rows whose position_result is missing, so if that
# column is empty for non-finishers they are left out of the denominator - confirm.
df_tmp2 = df_tmp[df_tmp.position_qual == 1]\
  .groupby('position_result',as_index=False)\
  .agg(count=pd.NamedAgg(column='resultId',aggfunc='nunique'))

# share of pole starts per finishing position; only the count column is summed
df_tmp2['perc'] = df_tmp2['count']/df_tmp2['count'].sum()


plt.figure(figsize=(15,5))
# keep the Axes returned by barplot so it can be configured further
graph = sns.barplot(
    data=df_tmp2,
    y ='perc',
    x='position_result',
    color='#4e79a7'
     )
graph.set_title(
    'Race Result from Pole Position',
    size=14
    )
# perc holds fractions (0-1); show the ticks as percentages to match the label
graph.yaxis.set_major_formatter(PercentFormatter(xmax=1))

plt.xlabel('Final Race Result')
plt.ylabel('% Frequency')

In [None]:
sns.set_theme(style='whitegrid')

# What is the distribution of nationalities for Formula 1 drivers?
# one row per nationality with its number of distinct drivers, largest first
df_tmp = df_drivers.groupby('nationality',as_index=False)\
  .agg(count_drivers = pd.NamedAgg(column='driverId',aggfunc='nunique'))\
  .sort_values('count_drivers',ascending=False)

# Only one figure is created here; an extra plt.figure call before this one
# would leave a stray empty figure in the cell output.
plt.figure(figsize=(15,15))

# horizontal bars: one bar per nationality
graph = sns.barplot(
    data=df_tmp,
    y ='nationality',
    x='count_drivers',
    color='#4e79a7'
     ).set_title(
    'Formula 1 Driver Nationalities',
    size=14
    )

plt.xlabel('# of Drivers')
plt.ylabel('Nationalities')


In [None]:
# What is the distribution of nationalities for Formula 1 constructors?
# one row per nationality with its number of distinct constructors, largest first
df_tmp = df_constructors.groupby('nationality',as_index=False)\
  .agg(count_constructors = pd.NamedAgg(column='constructorId',aggfunc='nunique'))\
  .sort_values('count_constructors',ascending=False)

plt.figure(figsize=(15,10))

# horizontal bars: one bar per nationality
graph = sns.barplot(
    data=df_tmp,
    y ='nationality',
    x='count_constructors',
    color='#4e79a7'
     ).set_title(
    'Formula 1 Constructor Nationalities',
    size=14
    )
plt.xlabel('# of Constructors')
plt.ylabel('Nationalities')

## Formula One Drivers
### Summary Analysis

In [None]:
# get driver, season, total points
# Standings are joined with results, races, drivers and constructors so each
# row carries the season (year), driver names and both nationality columns.
df_tmp = df_driverStandings \
  .merge(df_results, on=['raceId','driverId'],suffixes=['_standing','_result']) \
  .merge(df_races, on='raceId', suffixes=['_standing','_race'])\
  .merge(df_drivers, on='driverId',suffixes=['_standing','_driver'])\
  .merge(df_constructors, on='constructorId',suffixes=['_driver','_constructor'])

# highest standing points reached in each season
df_tmp2 = df_tmp.groupby(['year'], as_index=False).agg({'points_standing':'max'})

# rows that match a season's maximum identify that season's top driver(s);
# keep a single row per (season, points, driver)
df_tmp3 = df_tmp.merge(df_tmp2, on=['year','points_standing'])\
  .drop_duplicates(subset=['year','points_standing','driverId'],keep='first')

# which drivers have the most titles?
df_tmp4 = df_tmp3.groupby(['surname','forename','nationality_driver','driverId'], as_index=False)\
  .agg(count_year = pd.NamedAgg(column='year',aggfunc='count'))\
  .sort_values(by='count_year', ascending=False)

display_results(df_tmp4,'Drivers with the most championships')

# which nations have the most driver's titles?
df_tmp5 = df_tmp4.groupby('nationality_driver',as_index=False)\
  .agg(count_titles = pd.NamedAgg(column='count_year',aggfunc='sum'))\
  .sort_values(by='count_titles',ascending=False)[:10]

display_results(df_tmp5,'Nations with the most driver\'s championships.')

# which nations have the most driver champions?
df_tmp5 = df_tmp4.groupby('nationality_driver',as_index=False)\
  .agg(count_drivers=pd.NamedAgg(column='driverId',aggfunc='nunique'))\
  .sort_values(by='count_drivers',ascending=False)[:10]

display_results(df_tmp5, 'Nations with the most driver\'s *champions*.')

# which drivers have won the most races in all Formula One history?
df_driver_wins = df_tmp[df_tmp.position_result==1]\
  .groupby(['driverId','surname','forename'],as_index=False)\
  .agg(count_wins=pd.NamedAgg(column='raceId',aggfunc='nunique'))\
  .sort_values(by='count_wins',ascending=False)

# shown once through the shared helper (heading + first 10 rows)
display_results(df_driver_wins, 'Formula 1 drivers with the most wins.')

# who are winning drivers that never won a driver's championship?
df_tmp5 = df_driver_wins[~df_driver_wins.driverId.isin(df_tmp3.driverId.values)]

# Double quotes keep the apostrophe: 'driver''s' is implicit string
# concatenation in Python and would render as "drivers".
display_results(df_tmp5, "Top drivers that never won a driver's championship.")


## Formula One Constructors
### Summary Analysis

In [None]:
# Constructor standings enriched with race details (for the season year) and
# constructor details (name, nationality).
standings_with_races = df_constructorStandings.merge(
  df_races, on='raceId', suffixes=('_standing','_race')
)
df_tmp = standings_with_races.merge(
  df_constructors, on='constructorId', suffixes=('_races','_constructor')
)

# Highest standing points reached in each season.
df_tmp2 = df_tmp.groupby('year', as_index=False).agg(max_points=('points','max'))

# Rows whose points equal the season maximum, reduced to one row per
# (season, constructor).
season_leaders = df_tmp.merge(
  df_tmp2,
  left_on=['year','points'],
  right_on=['year','max_points']
)
df_tmp3 = season_leaders.drop_duplicates(subset=['year','constructorId'])

# Which constructor has the most championships?
titles_by_constructor = df_tmp3.groupby(
  ['constructorId','name_constructor','nationality'], as_index=False
).agg(count_years=('year','nunique'))
df_tmp4 = titles_by_constructor.sort_values(by='count_years', ascending=False)

display_results(df_tmp4,'Constructors with the most championships.')

# Which nation has the most constructors championships?
titles_by_nation = df_tmp3.groupby('nationality', as_index=False)\
  .agg(count_years=('year','nunique'))
df_tmp4 = titles_by_nation.sort_values(by='count_years', ascending=False)

display_results(df_tmp4,'Nations with the most constructor\'s championships.')


## Next steps

Some potential next steps for analysis:
 - Time series analysis of average pit stop time, and how it's changed over the history of Formula One results
 - Geographic visualization that may reveal patterns related to race circuits, driver nationality, etc.
 - More general understanding of how qualifying placement relates to the final race result; I imagine a scatter plot could be effective for this
 - Segmentation of current analysis to see if there are any surprising differences when changing one or two variables