In [1]:
# Import dependencies
import os
from urllib.parse import quote

import pandas as pd
import pymysql
from sqlalchemy import create_engine
# Make PyMySQL importable under the MySQLdb name.
# NOTE(review): presumably this is what lets the plain `mysql://` engine URL used
# later in this notebook run on PyMySQL instead of the MySQLdb C driver — confirm.
pymysql.install_as_MySQLdb()

In [2]:
# Load the four F1 Grand Prix tables
# (source: https://www.kaggle.com/cjgdev/formula-1-race-data-19502017).
# The files are read in the order listed and unpacked into one frame each.
races_csv, results_csv, circuits_csv, drivers_csv = map(
    pd.read_csv,
    ("races.csv", "results.csv", "circuits.csv", "drivers.csv"),
)

In [3]:
# Index the circuits table by circuitId so rows can be addressed by circuit ID
circuits_csv = circuits_csv.set_index(keys='circuitId')

In [4]:
# Keep only the circuits whose country is 'USA'
circuits_csv = circuits_csv[circuits_csv['country'].eq('USA')]

In [5]:
# Discard uncommon or unknown US circuits, identified by their circuitId index labels
circuits_csv = circuits_csv.drop(index=[37, 42, 43, 44, 72])

In [6]:
# Turn circuitId back into a regular column so it can serve as a merge key later
circuits_csv = circuits_csv.reset_index(drop=False)

In [7]:
# Exclude races held at circuitId 43 (an uncommon or unknown US circuit)
races_csv = races_csv[races_csv['circuitId'].ne(43)]

In [8]:
# Keep only the races whose name is one of the two US Grand Prix titles
races_csv = races_csv[
    races_csv['name'].isin([
        'United States Grand Prix',
        'United States Grand Prix West',
    ])
]

In [9]:
# Join each remaining circuit to its races on the shared circuitId key
# (pandas' default inner join: rows without a match on either side are dropped)
cir_race = circuits_csv.merge(races_csv, on='circuitId')

In [10]:
# Join each driver to their race results on the shared driverId key
dri_res = drivers_csv.merge(results_csv, on='driverId')

In [11]:
# Build the final frame: join circuit/race rows to driver/result rows on raceId,
# then remove the columns that are not wanted in the output table.
# The _x/_y suffixes are the ones pandas adds when both merge inputs share a column name.
combined_df = (
    cir_race
    .merge(dri_res, on='raceId')
    .drop(columns=[
        'circuitRef', 'lat', 'lng', 'alt',
        'url_x', 'url_y', 'url',
        'name_y', 'time_x', 'round',
        'constructorId', 'number_y', 'grid',
        'position', 'positionText', 'milliseconds',
        'fastestLap', 'statusId', 'rank', 'resultId',
        'number_x', 'driverRef', 'code',
    ])
)

In [12]:
# Give the suffixed/terse columns descriptive names
combined_df = combined_df.rename(
    columns={
        'name_x': 'trackName',
        'forename': 'firstName',
        'time_y': 'driverRaceTime',
    }
)

In [13]:
# min_lap_time = combined_df\
#     .dropna(how='any')\
#     .groupby('date')['fastestLapTime']\
#     .min()\
#     .to_frame()\
#     .rename(columns={'fastestLapTime': 'overallFastestLapTime'})
# min_lap_time

In [14]:
#df2 = pd.merge(combined_df, min_lap_time, on='date')

In [15]:
#df2.loc[df2['fastestLapTime'] == df2['overallFastestLapTime']] 

In [16]:
# combined_df.columns

In [17]:
# Connect to the database.
# Connection settings are read from environment variables so that no password
# is stored in the notebook. User, host and database fall back to the values
# this notebook has always used; the password has no default, so a missing
# F1_DB_PASSWORD raises KeyError naming the variable.
db_user = os.environ.get("F1_DB_USER", "root")
db_host = os.environ.get("F1_DB_HOST", "127.0.0.1")
db_name = os.environ.get("F1_DB_NAME", "f1_weather_db")
db_password = os.environ["F1_DB_PASSWORD"]

# Percent-encode the credentials: a raw '@' (or ':' / '/') inside the password
# would otherwise be taken as a delimiter of the URL and corrupt the host part.
rds_connection_string = (
    f"{quote(db_user, safe='')}:{quote(db_password, safe='')}@{db_host}/{db_name}"
)
engine = create_engine(f'mysql://{rds_connection_string}')

In [18]:
# Write the combined F1 frame into the f1_data table.
# if_exists='append' adds rows to an existing table, so running this cell
# again inserts the same rows a second time.
combined_df.to_sql(
    name='f1_data',
    con=engine,
    index=False,
    if_exists='append',
)

In [19]:
# Check that the table is present and readable by showing its first rows
pd.read_sql_query(sql='select * from f1_data', con=engine).head()

Unnamed: 0,circuitId,trackName,location,country,raceId,year,date,driverId,firstName,surname,dob,nationality,positionOrder,points,laps,driverRaceTime,fastestLapTime,fastestLapSpeed
0,19,Indianapolis Motor Speedway,Indianapolis,USA,42,2007,6/17/2007,1,Lewis,Hamilton,7/1/1985,British,1,10.0,73,31:10.0,01:13.2,206.101
1,19,Indianapolis Motor Speedway,Indianapolis,USA,42,2007,6/17/2007,2,Nick,Heidfeld,10/5/1977,German,18,0.0,56,,01:13.4,205.562
2,19,Indianapolis Motor Speedway,Indianapolis,USA,42,2007,6/17/2007,3,Nico,Rosberg,27/06/1985,German,16,0.0,68,,01:14.1,203.753
3,19,Indianapolis Motor Speedway,Indianapolis,USA,42,2007,6/17/2007,4,Fernando,Alonso,29/07/1981,Spanish,2,8.0,73,1.5,01:13.3,206.003
4,19,Indianapolis Motor Speedway,Indianapolis,USA,42,2007,6/17/2007,5,Heikki,Kovalainen,19/10/1981,Finnish,5,4.0,73,41.4,01:14.0,203.94
