In [2]:
import sys
sys.path.append('../../')

import numpy as np
import pandas as pd

## 1. Load the Datasets

In [3]:
# Inputs produced by earlier pipeline steps: per-entry race results and the
# processed constructor table that this notebook enriches.
# NOTE(review): both files carry an 'Unnamed: 0' column (visible in the
# previews below) — presumably an index written by a previous to_csv call;
# confirm whether it should be dropped (e.g. index_col=0) before saving.
df_races_results = pd.read_csv('../../data/outputs/03_races_results.csv')
df_constructors = pd.read_csv('../../data/outputs/04_constructors_processed.csv')

In [4]:
# Quick look at the first rows of the constructor table as loaded.
df_constructors.head(n=5)

Unnamed: 0,constructorId,name,nationality,constructor_is_active
0,1,McLaren,British,1
1,2,BMW Sauber,German,0
2,3,Williams,British,1
3,4,Renault,French,0
4,5,Toro Rosso,Italian,0


In [5]:
# Quick look at the first rows of the race results (one row per result entry).
df_races_results.head(n=5)

Unnamed: 0,resultId,raceId,driverId,constructorId,grid,race_rank,points,laps,milliseconds,fastestLap,fastestLapTime,fastestLapSpeed,statusId,year,round,circuitId,name
0,1,18,1,1,1,1,10.0,58,5690616,39,87452,218.3,1,2008,1,1,Australian Grand Prix
1,2,18,2,2,5,2,8.0,58,5696094,41,87739,217.586,1,2008,1,1,Australian Grand Prix
2,3,18,3,3,7,3,6.0,58,5698779,41,88090,216.719,1,2008,1,1,Australian Grand Prix
3,4,18,4,4,11,4,5.0,58,5707797,58,88603,215.464,1,2008,1,1,Australian Grand Prix
4,5,18,5,1,3,5,4.0,58,5708630,43,87418,218.385,1,2008,1,1,Australian Grand Prix


## 2. Process constructor dataset

### 2.1. Calculate the number of races won by each constructor

In [6]:
# Collect every constructorId present in the constructor table as a plain
# Python list; the per-constructor loops below iterate over it.
ids = df_constructors['constructorId'].tolist()

In [8]:
# Number of race wins per constructor.
#
# Count the first-place rows (race_rank == 1) for every constructorId in a
# single pass over the results, instead of re-filtering the whole results
# frame once per constructor.
wins_per_constructor = (
    df_races_results
    .loc[df_races_results.race_rank == 1, 'constructorId']
    .value_counts()
)

# Look up each constructor's count; constructors that never won are absent
# from the counts and therefore map to NaN, which is replaced by 0 so the
# column stays an integer column.
df_constructors['constructor_races_won'] = (
    df_constructors.constructorId
    .map(wins_per_constructor)
    .fillna(0)
    .astype('int64')
)

In [9]:
# Check that constructor_races_won was added and looks plausible.
df_constructors.head(n=5)

Unnamed: 0,constructorId,name,nationality,constructor_is_active,constructor_races_won
0,1,McLaren,British,1,179
1,2,BMW Sauber,German,0,1
2,3,Williams,British,1,114
3,4,Renault,French,0,35
4,5,Toro Rosso,Italian,0,1


### 2.2. Calculate the constructor's average points

In [10]:
# Average points per result row for each constructor.
#
# A single groupby computes the mean of `points` for every constructorId at
# once, instead of filtering the results frame once per constructor.
avg_points_by_constructor = (
    df_races_results
    .groupby('constructorId')['points']
    .mean()
)

# Constructors without any result rows have no group and map to NaN — the
# same value the mean of an empty selection yields. The column is kept as
# float64.
df_constructors['constructor_avg_point'] = (
    df_constructors.constructorId
    .map(avg_points_by_constructor)
    .astype('float64')
)

In [11]:
# Check that constructor_avg_point was added and looks plausible.
df_constructors.head(n=5)

Unnamed: 0,constructorId,name,nationality,constructor_is_active,constructor_races_won,constructor_avg_point
0,1,McLaren,British,1,179,3.364151
1,2,BMW Sauber,German,0,1,2.2
2,3,Williams,British,1,114,2.243008
3,4,Renault,French,0,35,2.257942
4,5,Toro Rosso,Italian,0,1,0.932836


### 2.3. Number of times inside the points area (top 10)

In [12]:
# Number of result rows in which each constructor finished in the top 10.
#
# Count the rows with race_rank <= 10 for every constructorId in a single
# pass over the results, instead of re-filtering the whole results frame
# once per constructor.
top10_per_constructor = (
    df_races_results
    .loc[df_races_results.race_rank <= 10, 'constructorId']
    .value_counts()
)

# Constructors with no top-10 result are absent from the counts and map to
# NaN; replace that with 0 so the column stays an integer column.
df_constructors['constructor_times_in_top_10'] = (
    df_constructors.constructorId
    .map(top10_per_constructor)
    .fillna(0)
    .astype('int64')
)

In [13]:
# Check that constructor_times_in_top_10 was added and looks plausible.
df_constructors.head(n=5)

Unnamed: 0,constructorId,name,nationality,constructor_is_active,constructor_races_won,constructor_avg_point,constructor_times_in_top_10
0,1,McLaren,British,1,179,3.364151,1110
1,2,BMW Sauber,German,0,1,2.2,91
2,3,Williams,British,1,114,2.243008,764
3,4,Renault,French,0,35,2.257942,390
4,5,Toro Rosso,Italian,0,1,0.932836,158


### 2.4. Rename the columns

In [16]:
# Prefix the generic column names with `constructor_` so they match the
# other constructor_* columns. The rename is applied in place.
constructor_column_names = {
    'name': 'constructor_name',
    'nationality': 'constructor_country',
}
df_constructors.rename(columns=constructor_column_names, inplace=True)

In [17]:
# Confirm the renamed columns before saving.
df_constructors.head(n=5)

Unnamed: 0,constructorId,constructor_name,constructor_country,constructor_is_active,constructor_races_won,constructor_avg_point,constructor_times_in_top_10
0,1,McLaren,British,1,179,3.364151,1110
1,2,BMW Sauber,German,0,1,2.2,91
2,3,Williams,British,1,114,2.243008,764
3,4,Renault,French,0,35,2.257942,390
4,5,Toro Rosso,Italian,0,1,0.932836,158


## 3. Save the dataset

In [18]:
# Write the enriched constructor table to the db folder. index=False keeps
# the DataFrame's own index out of the file.
df_constructors.to_csv(
    path_or_buf='../../data/db/02_constructors.csv',
    index=False,
)