# Exploratory Analysis - 5

#### This analysis is to explore if there is a correlation between winning constructors and circuits.

In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import re
import numpy as np

In [2]:
#Load files for analysis
results_df = pd.read_csv('../Resources/Dataset/results.csv')
circuits_df = pd.read_csv('../Resources/Dataset/circuits.csv')
constructor_results_df = pd.read_csv('../Resources/Dataset/constructor_results.csv')
constructors_df = pd.read_csv('../Resources/Dataset/constructors.csv')
drivers_df = pd.read_csv('../Resources/Dataset/drivers.csv')
races_df = pd.read_csv('../Resources/Dataset/races.csv')
status_df = pd.read_csv('../Resources/Dataset/status.csv')

### Merging datasets

In [3]:
# Joining datasets

#join results with drivers datasets
res_dr_df = pd.merge(results_df,drivers_df,how='left',on='driverId', suffixes=('_res','_drv'))

#join status dataset
res_dr_df_st = pd.merge(res_dr_df, status_df, how='left', on='statusId', suffixes=('_mer1', 'st'))

#join races with results
res_dr_ra_df = pd.merge(res_dr_df_st,races_df,how='left',on='raceId', suffixes=('_mer2','_rac'))

#join circuit dataset
res_dr_ra_cir_df = pd.merge(res_dr_ra_df,circuits_df,how='left', on='circuitId', suffixes=('_mer3','_cir'))

#join the above with constructor data
merged_df = pd.merge(res_dr_ra_cir_df,constructors_df,how='left',on='constructorId', suffixes=('_mer4','_con'))

In [4]:
#Drop columns that are not necessary
merged_df = merged_df.drop(columns=['number_res','points','laps','fastestLap','rank','round',\
                                    'statusId','number_drv','code','url_mer2','url_rac','url_mer4',\
                                    'url_con','time_rac'])

In [5]:
#Replace '\N' with 0 and convert to numeric
merged_df['position'] = merged_df['position'].replace({'\\N':0})
merged_df['position'] = pd.to_numeric(merged_df['position'])

In [6]:
#Replace all values that have 'laps' at the end in the status column to 'Finished' 
#as these racers are finished the race anyway
merged_df['status'] = merged_df.status.str.replace("\+\d\s\w*|\+\d\d\s\w*", 'Finished')

  """Entry point for launching an IPython kernel.


## Racer vs Circuit

### 2000 to 2009

In [7]:
#filter data by years
circuits_1 = merged_df.loc[(merged_df['year'] >= 2000) & (merged_df['year'] < 2010)]
circuits_races_count = circuits_1.groupby(['name','circuitRef'])['surname'].count().reset_index(name='races')

In [8]:
#filter data for winners
dr_cir_wins = merged_df.loc[(merged_df['year'] >= 2000) & (merged_df['year'] < 2010) & (merged_df['position'] == 1)]
dr_cir_wins_count = dr_cir_wins.groupby(['name','circuitRef'])['surname'].count().reset_index(name='wins')

In [9]:
# transfrom data to create a column with winning percentage
merged_df_2 = pd.merge(circuits_races_count, dr_cir_wins_count, how='left', on=['name','circuitRef'])
merged_df_2['wins']=merged_df_2['wins'].fillna(0)
merged_df_wins = merged_df_2.loc[merged_df_2['wins'] > 0].sort_values(by='wins',ascending=False)
merged_df_wins['win_per'] = (merged_df_2['wins'] / merged_df_2['races']) * 100
merged_df_wins = merged_df_wins.sort_values(by='win_per', ascending= False)

In [10]:
#display all rows
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(merged_df_wins)

           name      circuitRef  races  wins    win_per
297    Red Bull      yas_marina      2   1.0  50.000000
78        Brawn         bahrain      2   1.0  50.000000
79        Brawn       catalunya      2   1.0  50.000000
82        Brawn        istanbul      2   1.0  50.000000
84        Brawn          monaco      2   1.0  50.000000
85        Brawn           monza      2   1.0  50.000000
87        Brawn          sepang      2   1.0  50.000000
92        Brawn        valencia      2   1.0  50.000000
77        Brawn     albert_park      2   1.0  50.000000
101     Ferrari    indianapolis     16   6.0  37.500000
100     Ferrari           imola     14   5.0  35.714286
104     Ferrari     magny_cours     18   6.0  33.333333
114     Ferrari          suzuka     16   5.0  31.250000
113     Ferrari             spa     16   5.0  31.250000
103     Ferrari        istanbul     10   3.0  30.000000
96      Ferrari       catalunya     20   6.0  30.000000
108     Ferrari     nurburgring     18   5.0  27

### 2010 to 2019

In [11]:
#filter data by years
circuits_1 = merged_df.loc[(merged_df['year'] >= 2010) & (merged_df['year'] < 2020)]
circuits_races_count = circuits_1.groupby(['name','circuitRef'])['surname'].count().reset_index(name='races')

In [12]:
#filter data for winners
dr_cir_wins = merged_df.loc[(merged_df['year'] >= 2010) & (merged_df['year'] < 2020) & (merged_df['position'] == 1)]
dr_cir_wins_count = dr_cir_wins.groupby(['name','circuitRef'])['surname'].count().reset_index(name='wins')

In [13]:
# transfrom data to create a column with winning percentage
merged_df_2 = pd.merge(circuits_races_count, dr_cir_wins_count, how='left', on=['name','circuitRef'])
merged_df_2['wins']=merged_df_2['wins'].fillna(0)
merged_df_wins = merged_df_2.loc[merged_df_2['wins'] > 0].sort_values(by='wins',ascending=False)
merged_df_wins['win_per'] = (merged_df_2['wins'] / merged_df_2['races']) * 100
merged_df_wins = merged_df_wins.sort_values(by='win_per', ascending= False)

In [14]:
#display all rows
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(merged_df_wins)

         name      circuitRef  races  wins    win_per
273  Mercedes          ricard      4   2.0  50.000000
278  Mercedes           sochi     12   6.0  50.000000
310  Red Bull           buddh      6   3.0  50.000000
332  Red Bull         yeongam      8   3.0  37.500000
258  Mercedes             BAK      8   3.0  37.500000
272  Mercedes   red_bull_ring     12   4.0  33.333333
329  Red Bull        valencia      6   2.0  33.333333
260  Mercedes        americas     16   5.0  31.250000
323  Red Bull          sepang     16   5.0  31.250000
283  Mercedes      yas_marina     20   6.0  30.000000
276  Mercedes        shanghai     20   6.0  30.000000
280  Mercedes          suzuka     20   6.0  30.000000
277  Mercedes     silverstone     20   6.0  30.000000
274  Mercedes       rodriguez     10   3.0  30.000000
263  Mercedes       catalunya     20   5.0  25.000000
264  Mercedes  hockenheimring     12   3.0  25.000000
319  Red Bull     nurburgring      4   1.0  25.000000
270  Mercedes           monz

#### Conclusion: There is no clear trend between contructors and circuit.