# Insert circuits

This script inserts the results from 2021 onwards within the results.csv into the results table.

In [2]:
import pandas as pd
from io import BytesIO
from minio import Minio
from sqlalchemy import create_engine, text
from fuzzywuzzy import fuzz

import warnings

In [3]:
# Initialize Minio client
minio_client = Minio(
    "localhost:9000",
    access_key="minioadmin",
    secret_key="minioadmin",
    secure=False
)

In [4]:
# Download the results CSV file from the Minio bucket into a pandas DataFrame
data_res = minio_client.get_object("track.data-raw", "results.csv")
data_res = BytesIO(data_res.read())
df_results = pd.read_csv(data_res)

In [5]:
df_results

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.300,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26075,26081,1110,817,213,3,19,16,16,16,0.0,44,+1:43.071,5053521,25,15,1:50.994,227.169,1
26076,26082,1110,858,3,2,18,17,17,17,0.0,44,+1:44.476,5054926,37,9,1:50.486,228.213,1
26077,26083,1110,807,210,27,0,18,18,18,0.0,44,+1:50.450,5060900,26,4,1:49.907,229.415,1
26078,26084,1110,832,6,55,4,\N,R,19,0.0,23,\N,\N,9,19,1:53.138,222.864,130


In [6]:
# Initialize connection to the PostgreSQL database using SQLAlchemy
engine = create_engine('postgresql://admin:admin@localhost/postgres')

We want to get the data from 2021 onwards that can be achieved as follows - <br> 
1. get seasonID from seasons table of year 2021 onwards <br>
2. filter out the data from the events table based on the seasonID from step 1 <br>
3. As from init.sql it is clear that raceId and event.eventID are same therefore to achieve the results we can match the eventID from events table and raceID from results.csv to load the data into results table. <br>
4. To achieve this we first need to load data in events table, driver table, constructor table and status tables.

In [7]:
#load the data from season table

df_seasons = pd.read_sql("SELECT * FROM race_data.season", engine)
df_seasons

Unnamed: 0,season_id,championship_id,year
0,1,1,1950
1,2,1,1951
2,3,1,1952
3,4,1,1953
4,5,1,1954
...,...,...,...
69,70,1,2019
70,71,1,2020
71,72,1,2021
72,73,1,2022


In [8]:
#load data from events table where season is greater than the year 2021
df_events = pd.read_sql ("select * FROM race_data.events where season_id in (select season_id from race_data.season where year >= 2021)", engine)
df_events

Unnamed: 0,event_id,season_id,race_round,circuit_id,official_name,date
0,18,72,1,12.0,FORMULA 1 GULF AIR BAHRAIN GRAND PRIX 2021,
1,19,72,2,13.0,FORMULA 1 PIRELLI GRAN PREMIO DEL MADE IN ITAL...,
2,20,72,3,10.0,FORMULA 1 HEINEKEN GRANDE PRÉMIO DE PORTUGAL 2021,
3,21,72,4,5.0,FORMULA 1 ARAMCO GRAN PREMIO DE ESPAÑA 2021,
4,22,72,5,14.0,FORMULA 1 GRAND PRIX DE MONACO 2021,
5,23,72,6,15.0,FORMULA 1 AZERBAIJAN GRAND PRIX 2021,
6,24,72,7,16.0,FORMULA 1 EMIRATES GRAND PRIX DE FRANCE 2021,
7,25,72,8,3.0,FORMULA 1 BWT GROSSER PREIS DER STEIERMARK 2021,
8,26,72,9,3.0,FORMULA 1 BWT GROSSER PREIS VON ÖSTERREICH 2021,
9,27,72,10,4.0,FORMULA 1 PIRELLI BRITISH GRAND PRIX 2021,


In [9]:
#create a new dataframe from df_results where df_results.raceId is in df_events.event_id

df_results_new = df_results[df_results['raceId'].isin(df_events['event_id'])]
df_results_new

Unnamed: 0,resultId,raceId,driverId,constructorId,number,grid,position,positionText,positionOrder,points,laps,time,milliseconds,fastestLap,rank,fastestLapTime,fastestLapSpeed,statusId
0,1,18,1,1,22,1,1,1,1,10.0,58,1:34:50.616,5690616,39,2,1:27.452,218.300,1
1,2,18,2,2,3,5,2,2,2,8.0,58,+5.478,5696094,41,3,1:27.739,217.586,1
2,3,18,3,3,7,7,3,3,3,6.0,58,+8.163,5698779,41,5,1:28.090,216.719,1
3,4,18,4,4,5,11,4,4,4,5.0,58,+17.181,5707797,58,7,1:28.603,215.464,1
4,5,18,5,1,23,3,5,5,5,4.0,58,+18.014,5708630,43,1,1:27.418,218.385,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1231,1232,75,35,15,11,12,\N,R,14,0.0,51,\N,\N,43,13,1:17.585,214.696,5
1232,1233,75,30,6,1,8,\N,R,15,0.0,46,\N,\N,31,2,1:15.648,220.193,29
1233,1234,75,27,18,21,14,\N,R,16,0.0,19,\N,\N,15,17,1:20.124,207.892,6
1234,1235,75,40,18,20,15,\N,R,17,0.0,11,\N,\N,10,18,1:20.865,205.987,20


In [10]:
#print all the unique values in df_results_new.raceId
print('df_results_new')
print(df_results_new['raceId'].unique())

#print all the unique values in df_events.event_id
print('df_events')
print(df_events['event_id'].unique())

df_results_new
[18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
 66 67 68 69 70 71 72 73 74 75]
df_events
[18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
 66 67 68 69 70 71 72 73 74 75]


as these are matching so we can say that the dataframe has the correct values required.

In [11]:
#Now loading the data into drivers table
# Download the drivers CSV file from the Minio bucket into a pandas DataFrame
driverdata = minio_client.get_object("track.data-raw", "drivers.csv")
driverdata = BytesIO(driverdata.read())
df_drivercsv = pd.read_csv(driverdata)

In [12]:
df_drivercsv

Unnamed: 0,driverId,driverRef,number,code,forename,surname,dob,nationality,url
0,1,hamilton,44,HAM,Lewis,Hamilton,1985-01-07,British,http://en.wikipedia.org/wiki/Lewis_Hamilton
1,2,heidfeld,\N,HEI,Nick,Heidfeld,1977-05-10,German,http://en.wikipedia.org/wiki/Nick_Heidfeld
2,3,rosberg,6,ROS,Nico,Rosberg,1985-06-27,German,http://en.wikipedia.org/wiki/Nico_Rosberg
3,4,alonso,14,ALO,Fernando,Alonso,1981-07-29,Spanish,http://en.wikipedia.org/wiki/Fernando_Alonso
4,5,kovalainen,\N,KOV,Heikki,Kovalainen,1981-10-19,Finnish,http://en.wikipedia.org/wiki/Heikki_Kovalainen
...,...,...,...,...,...,...,...,...,...
852,854,mick_schumacher,47,MSC,Mick,Schumacher,1999-03-22,German,http://en.wikipedia.org/wiki/Mick_Schumacher
853,855,zhou,24,ZHO,Guanyu,Zhou,1999-05-30,Chinese,http://en.wikipedia.org/wiki/Zhou_Guanyu
854,856,de_vries,21,DEV,Nyck,de Vries,1995-02-06,Dutch,http://en.wikipedia.org/wiki/Nyck_de_Vries
855,857,piastri,81,PIA,Oscar,Piastri,2001-04-06,Australian,http://en.wikipedia.org/wiki/Oscar_Piastri


In [13]:
for index, row in df_drivercsv.iterrows():
    csv_drivers = row[['driverRef', 'code', 'forename', 'surname', 'dob']]
    #inserting the values into the drivers table
    insert_query = text("""
        INSERT INTO race_data.drivers 
        (driver_ref, code, forename, surname, dob) 
        VALUES (:driverRef, :code, :forename, :surname, :dob)
        """)
    csv_drivers_dict = csv_drivers.to_dict()
    #print(csv_drivers)
    with engine.begin() as connection:
        connection.execute(insert_query, csv_drivers_dict)


In [14]:
pd.read_sql("SELECT * FROM race_data.drivers", engine)

Unnamed: 0,driver_id,driver_ref,number,code,forename,surname,dob,nationality
0,1,hamilton,44.0,HAM,Lewis,Hamilton,1985-01-07,
1,2,hamilton,,HAM,Lewis,Hamilton,1985-01-07,
2,3,heidfeld,,HEI,Nick,Heidfeld,1977-05-10,
3,4,rosberg,,ROS,Nico,Rosberg,1985-06-27,
4,5,alonso,,ALO,Fernando,Alonso,1981-07-29,
...,...,...,...,...,...,...,...,...
1710,1711,mick_schumacher,,MSC,Mick,Schumacher,1999-03-22,
1711,1712,zhou,,ZHO,Guanyu,Zhou,1999-05-30,
1712,1713,de_vries,,DEV,Nyck,de Vries,1995-02-06,
1713,1714,piastri,,PIA,Oscar,Piastri,2001-04-06,


In [15]:
#loading the data into constructors table
# Download the constructor CSV file from the Minio bucket into a pandas DataFrame

constructordata = minio_client.get_object("track.data-raw", "constructors.csv")
constructordata = BytesIO(constructordata.read())
df_constructorcsv = pd.read_csv(constructordata)
df_constructorcsv

Unnamed: 0,constructorId,constructorRef,name,nationality,url
0,1,mclaren,McLaren,British,http://en.wikipedia.org/wiki/McLaren
1,2,bmw_sauber,BMW Sauber,German,http://en.wikipedia.org/wiki/BMW_Sauber
2,3,williams,Williams,British,http://en.wikipedia.org/wiki/Williams_Grand_Pr...
3,4,renault,Renault,French,http://en.wikipedia.org/wiki/Renault_in_Formul...
4,5,toro_rosso,Toro Rosso,Italian,http://en.wikipedia.org/wiki/Scuderia_Toro_Rosso
...,...,...,...,...,...
206,209,manor,Manor Marussia,British,http://en.wikipedia.org/wiki/Manor_Motorsport
207,210,haas,Haas F1 Team,American,http://en.wikipedia.org/wiki/Haas_F1_Team
208,211,racing_point,Racing Point,British,http://en.wikipedia.org/wiki/Racing_Point_F1_Team
209,213,alphatauri,AlphaTauri,Italian,http://en.wikipedia.org/wiki/Scuderia_AlphaTauri


In [16]:
for index, row in df_constructorcsv.iterrows():
    csv_constructor = row[['name']]
    #inserting the values into the constructors table
    insert_query = text("""
        INSERT INTO race_data.constructors 
        (name) 
        VALUES (:name)
        """)
    csv_constructor_dict = csv_constructor.to_dict()
    with engine.begin() as connection:
        connection.execute(insert_query, csv_constructor_dict)

In [18]:
pd.read_sql("SELECT * FROM race_data.constructors", engine)

Unnamed: 0,constructor_id,name,country
0,1,McLaren,
1,2,BMW Sauber,
2,3,Williams,
3,4,Renault,
4,5,Toro Rosso,
...,...,...,...
628,629,Manor Marussia,
629,630,Haas F1 Team,
630,631,Racing Point,
631,632,AlphaTauri,


In [19]:
#loading the data into status table
# Download the status CSV file from the Minio bucket into a pandas DataFrame

statusdata = minio_client.get_object("track.data-raw", "status.csv")
statusdata = BytesIO(statusdata.read())
df_statuscsv = pd.read_csv(statusdata)
df_statuscsv

Unnamed: 0,statusId,status
0,1,Finished
1,2,Disqualified
2,3,Accident
3,4,Collision
4,5,Engine
...,...,...
134,137,Damage
135,138,Debris
136,139,Illness
137,140,Undertray


In [20]:
for index, row in df_statuscsv.iterrows():
   
    csv_status = row[['status']]
    #inserting the values into the status table
    insert_query = text("""
        INSERT INTO race_data.status 
        (status) 
        VALUES (:status)
        """)
    csv_status_dict = csv_status.to_dict()
    with engine.begin() as connection:
        connection.execute(insert_query, csv_status_dict)

In [21]:
pd.read_sql("SELECT * FROM race_data.status", engine)

Unnamed: 0,status_id,status
0,1,Finished
1,2,Disqualified
2,3,Accident
3,4,Collision
4,5,Engine
...,...,...
273,274,Damage
274,275,Debris
275,276,Illness
276,277,Undertray


In [22]:
#now lets load the data into the results table
for index, row in df_results_new.iterrows():
    csv_results = row[['raceId', 'driverId', 'constructorId', 'number', 'grid', 'position', 'points', 'laps', 'time', 'fastestLapTime', 'rank', 'fastestLapSpeed', 'statusId']]
    #inserting the values into the results table
    insert_query = text("""
        INSERT INTO race_data.results 
        (event_id, driver_id, constructor_id, number, grid, points, laps, status_id) 
        VALUES (:raceId, :driverId, :constructorId, :number, :grid, :points, :laps, :statusId)
        """)
    csv_results_dict = csv_results.to_dict()
    #print(csv_results)
    with engine.begin() as connection:
        connection.execute(insert_query, csv_results_dict)

In [23]:
pd.read_sql("select * from race_data.results", engine)

Unnamed: 0,result_id,event_id,driver_id,constructor_id,number,grid,position,points,laps,time,fastest_lap_time,rank,fastest_lap_speed,status_id
0,1288,18,1,1,22,1,,10.0,58,,,,,1
1,1289,18,2,2,3,5,,8.0,58,,,,,1
2,1290,18,3,3,7,7,,6.0,58,,,,,1
3,4,18,1,1,22,1,1.0,10.0,58,1:34:50.616,1:27.452,2.0,218.300,1
4,5,18,2,2,3,5,2.0,8.0,58,+5.478,1:27.739,3.0,217.586,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2515,2519,75,35,15,11,12,,0.0,51,,,,,5
2516,2520,75,30,6,1,8,,0.0,46,,,,,29
2517,2521,75,27,18,21,14,,0.0,19,,,,,6
2518,2522,75,40,18,20,15,,0.0,11,,,,,20
