# Exploratory Analysis - 4

#### This analysis is to explore if there is a correlation between winning racers and circuits.

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np

In [2]:
#Load files for analysis
results_df = pd.read_csv('../Resources/Dataset/results.csv')
circuits_df = pd.read_csv('../Resources/Dataset/circuits.csv')
constructor_results_df = pd.read_csv('../Resources/Dataset/constructor_results.csv')
constructors_df = pd.read_csv('../Resources/Dataset/constructors.csv')
drivers_df = pd.read_csv('../Resources/Dataset/drivers.csv')
races_df = pd.read_csv('../Resources/Dataset/races.csv')
status_df = pd.read_csv('../Resources/Dataset/status.csv')

### Merging datasets

In [3]:
# Joining datasets

#join results with drivers datasets
res_dr_df = pd.merge(results_df,drivers_df,how='left',on='driverId', suffixes=('_res','_drv'))

#join status dataset
res_dr_df_st = pd.merge(res_dr_df, status_df, how='left', on='statusId', suffixes=('_mer1', 'st'))

#join races with results
res_dr_ra_df = pd.merge(res_dr_df_st,races_df,how='left',on='raceId', suffixes=('_mer2','_rac'))

#join circuit dataset
res_dr_ra_cir_df = pd.merge(res_dr_ra_df,circuits_df,how='left', on='circuitId', suffixes=('_mer3','_cir'))

#join the above with constructor data
merged_df = pd.merge(res_dr_ra_cir_df,constructors_df,how='left',on='constructorId', suffixes=('_mer4','_con'))

In [4]:
#Drop columns that are not necessary
merged_df = merged_df.drop(columns=['number_res','points','laps','fastestLap','rank','round',\
                                    'statusId','number_drv','code','url_mer2','url_rac','url_mer4',\
                                    'url_con','time_rac'])

In [5]:
#Replace '\N' with 0 and convert to numeric
merged_df['position'] = merged_df['position'].replace({'\\N':0})
merged_df['position'] = pd.to_numeric(merged_df['position'])

In [6]:
#Replace all values that have 'laps' at the end in the status column to 'Finished' 
#as these racers are finished the race anyway
merged_df['status'] = merged_df.status.str.replace("\+\d\s\w*|\+\d\d\s\w*", 'Finished')

  """Entry point for launching an IPython kernel.


## Racer vs Circuit

### 2000 to 2009

In [8]:
#filter data by years
circuits_1 = merged_df.loc[(merged_df['year'] >= 2000) & (merged_df['year'] < 2010)]
circuits_races_count = circuits_1.groupby(['surname','circuitRef'])['surname'].count().reset_index(name='races')

#filter data for winners
dr_cir_wins = merged_df.loc[(merged_df['year'] >= 2000) & (merged_df['year'] < 2010) & (merged_df['position'] == 1)]
dr_cir_wins_count = dr_cir_wins.groupby(['surname','circuitRef'])['surname'].count().reset_index(name='wins')

# transfrom data to create a column with winning percentage
merged_df_2 = pd.merge(circuits_races_count, dr_cir_wins_count, how='left', on=['surname','circuitRef'])
merged_df_2['wins']=merged_df_2['wins'].fillna(0)
merged_df_wins = merged_df_2.loc[merged_df_2['wins'] > 0].sort_values(by='wins',ascending=False)
merged_df_wins['win_per'] = (merged_df_2['wins'] / merged_df_2['races']) * 100
merged_df_wins = merged_df_wins.sort_values(by='win_per', ascending= False)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(merged_df_wins)

            surname      circuitRef  races  wins     win_per
405        Hamilton    indianapolis      1   1.0  100.000000
645           Massa        valencia      1   1.0  100.000000
1022         Vettel          suzuka      1   1.0  100.000000
1025         Vettel      yas_marina      1   1.0  100.000000
403        Hamilton  hockenheimring      1   1.0  100.000000
404        Hamilton     hungaroring      3   2.0   66.666667
633           Massa        istanbul      5   3.0   60.000000
859       Räikkönen             spa      7   4.0   57.142857
586          Kubica      villeneuve      2   1.0   50.000000
475        Häkkinen     silverstone      2   1.0   50.000000
47           Alonso            fuji      2   1.0   50.000000
55           Alonso      marina_bay      2   1.0   50.000000
419        Hamilton      villeneuve      2   1.0   50.000000
409        Hamilton      marina_bay      2   1.0   50.000000
402        Hamilton            fuji      2   1.0   50.000000
463        Häkkinen     

### 2010 to 2019

In [9]:
#filter data by years
circuits_1 = merged_df.loc[(merged_df['year'] >= 2000) & (merged_df['year'] < 2010)]
circuits_races_count = circuits_1.groupby(['surname','circuitRef'])['surname'].count().reset_index(name='races')

#filter data for winners
dr_cir_wins = merged_df.loc[(merged_df['year'] >= 2000) & (merged_df['year'] < 2010) & (merged_df['position'] == 1)]
dr_cir_wins_count = dr_cir_wins.groupby(['surname','circuitRef'])['surname'].count().reset_index(name='wins')

# transfrom data to create a column with winning percentage
merged_df_2 = pd.merge(circuits_races_count, dr_cir_wins_count, how='left', on=['surname','circuitRef'])
merged_df_2['wins']=merged_df_2['wins'].fillna(0)
merged_df_wins = merged_df_2.loc[merged_df_2['wins'] > 0].sort_values(by='wins',ascending=False)
merged_df_wins['win_per'] = (merged_df_2['wins'] / merged_df_2['races']) * 100
merged_df_wins = merged_df_wins.sort_values(by='win_per', ascending= False)

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(merged_df_wins)

            surname      circuitRef  races  wins     win_per
405        Hamilton    indianapolis      1   1.0  100.000000
645           Massa        valencia      1   1.0  100.000000
1022         Vettel          suzuka      1   1.0  100.000000
1025         Vettel      yas_marina      1   1.0  100.000000
403        Hamilton  hockenheimring      1   1.0  100.000000
404        Hamilton     hungaroring      3   2.0   66.666667
633           Massa        istanbul      5   3.0   60.000000
859       Räikkönen             spa      7   4.0   57.142857
586          Kubica      villeneuve      2   1.0   50.000000
475        Häkkinen     silverstone      2   1.0   50.000000
47           Alonso            fuji      2   1.0   50.000000
55           Alonso      marina_bay      2   1.0   50.000000
419        Hamilton      villeneuve      2   1.0   50.000000
409        Hamilton      marina_bay      2   1.0   50.000000
402        Hamilton            fuji      2   1.0   50.000000
463        Häkkinen     

#### Conclusion: There is no clear trend between races and circuit.