# EDA on given Formula 1 dataset using SQL

Queries covered in this notebook:

1. Finding locations that hosted the maximum number of races
2. Finding circuits that hosted opening races
3. Number of races hosted by every country
4. All time constructor points and leaders
5. Number of races organized per season
6. Comparison of all time top 5 teams (based on points and races won)
7. Number of races won by top 10 drivers
8. Finding the fastest laps on each circuit
9. Number of constructors from each country
10. Finding the fastest quali times on every circuit for pole position
11. Drivers who have won at least one race

In [None]:
import pandas as pd
import sqlite3 
import plotly.express as px
import plotly.graph_objects as go
import os

In [None]:
# Open the SQLite database file "f1.db" (sqlite3 creates it if it does not
# exist yet); every query in this notebook runs through this connection.
conn = sqlite3.connect(database="f1.db")

In [None]:
# Reading CSV files
#
# Every CSV lives in the same input directory, so the directory is spelled
# out once and reused for each file.

_DATA_DIR = "../input/formula-1-world-championship-1950-2020/"

data_1 = pd.read_csv(_DATA_DIR + "circuits.csv")
data_2 = pd.read_csv(_DATA_DIR + "constructor_results.csv")
data_3 = pd.read_csv(_DATA_DIR + "constructor_standings.csv")
data_4 = pd.read_csv(_DATA_DIR + "constructors.csv")
data_5 = pd.read_csv(_DATA_DIR + "driver_standings.csv")
data_6 = pd.read_csv(_DATA_DIR + "drivers.csv")
data_7 = pd.read_csv(_DATA_DIR + "lap_times.csv")
data_8 = pd.read_csv(_DATA_DIR + "pit_stops.csv")
data_9 = pd.read_csv(_DATA_DIR + "qualifying.csv")
data_10 = pd.read_csv(_DATA_DIR + "races.csv")
data_11 = pd.read_csv(_DATA_DIR + "seasons.csv")
data_12 = pd.read_csv(_DATA_DIR + "status.csv")

# Inserting data into SQL tables
#
# "f1.db" is a file on disk, so its tables survive between runs. With the
# pandas default (if_exists="fail") re-running this cell raises ValueError
# because the tables already exist; if_exists="replace" drops and recreates
# each table instead, which makes the cell safe to re-run.
# data_7 (lap times) is only read here; it is written to the database in a
# later cell.

_TABLES = {
    "circuits": data_1,
    "constructor_results": data_2,
    "constructor_standings": data_3,
    "constructors": data_4,
    "driver_standings": data_5,
    "drivers": data_6,
    "pit_stops": data_8,
    "qualifying": data_9,
    "races": data_10,
    "seasons": data_11,
    "status": data_12,
}
for _table_name, _frame in _TABLES.items():
    _frame.to_sql(_table_name, conn, if_exists="replace")

In [None]:
# Load the race results CSV and store it as the "results" table.
data_13 = pd.read_csv("../input/formula-1-world-championship-1950-2020/results.csv")
# if_exists="replace" keeps this cell re-runnable: the default ("fail") raises
# ValueError once the table already exists in the on-disk database.
data_13.to_sql("results", conn, if_exists="replace")

In [None]:
# Store the lap-times frame as the "lap_times" table. if_exists="replace"
# keeps this cell re-runnable: the default ("fail") raises ValueError once the
# table already exists in the on-disk database.
data_7.to_sql("lap_times", conn, if_exists="replace")

In [None]:
# Cities ranked by the number of races they have hosted.
# Races are matched to their circuit, counted per circuit location, and the
# result is sorted from most to fewest races.

cir_max = pd.read_sql(
    sql=(
        'select c.location as "City", count(r.circuitid) as "Races Hosted" '
        'from circuits c, races r '
        'where r.circuitid = c.circuitid '
        'group by "City" '
        'order by "Races Hosted" desc;'
    ),
    con=conn,
)
cir_max

px.bar(
    data_frame=cir_max,
    x="City",
    y="Races Hosted",
    color="Races Hosted",
    title="Races hosted by cities",
)

In [None]:
# Cities whose circuits have hosted a season-opening race (round = 1),
# ranked by how many openers each has hosted.
# NOTE(review): rows are grouped by city only, so the Grand Prix name, circuit
# name and country shown for a city come from whichever grouped row SQLite
# picks for the non-aggregated columns.

opening_races_sql = (
    'select r.name as "Grand Prix Name", c.name as "Circuit Name", '
    'c.location as "City", c.country as "Country", '
    'count(*) as "Opening Races Hosted" '
    'from circuits c, races r '
    'where r.circuitid = c.circuitid and r.round = 1 '
    'group by "City" '
    'order by "Opening Races Hosted" desc;'
)
op_ci = pd.read_sql(sql=opening_races_sql, con=conn)
op_ci

In [None]:
# Bar chart of opening races hosted per city. The op_ci query counts opening
# races across every season in the races table (it has no year filter), so the
# title must not claim a specific decade.
px.bar(op_ci, x="City", y="Opening Races Hosted", color="Opening Races Hosted", title="Cities that hosted opening races")

In [None]:
# Race count per host country, sorted from most to fewest races.
# Each race is matched to its circuit and counted under that circuit's country.

races_per_country_sql = (
    'select c.country as "Country", count(*) as "Races Hosted" '
    'from circuits c, races r '
    'where r.circuitid = c.circuitid '
    'group by "Country" '
    'order by "Races Hosted" desc;'
)
cont_ra = pd.read_sql(sql=races_per_country_sql, con=conn)
cont_ra

In [None]:
# Bar chart: one bar per country, height and colour both driven by race count.
px.bar(
    data_frame=cont_ra,
    x="Country",
    y="Races Hosted",
    color="Races Hosted",
    title="Countries that have hosted race",
)

In [None]:
# All-time points per constructor, highest total first.
# Points from constructor_results are summed per constructor name.

constructor_points_sql = (
    'select c.name as "Constructor", c.nationality as "Nationality", '
    'sum(points) as "Total Points" '
    'from constructor_results r, constructors c '
    'where c.constructorId = r.constructorId '
    'group by "Constructor" '
    'order by "Total Points" desc;'
)
con_at = pd.read_sql(sql=constructor_points_sql, con=conn)
con_at

In [None]:
# Bar chart: one bar per constructor, height and colour driven by total points.
px.bar(
    data_frame=con_at,
    x="Constructor",
    y="Total Points",
    color="Total Points",
    title="All time points scored by constructors",
)

In [None]:
# Finding number of races organized per season
#
# The strftime format is written as a single-quoted SQL string literal.
# Double quotes denote identifiers in SQL; SQLite only treats "%Y" as a string
# through a legacy fallback that can be disabled at build time, in which case
# the double-quoted form fails with "no such column".

ra_season = pd.read_sql(
    """
    select strftime('%Y', r."date") as "Year", count(*) as "Races held"
    from races r
    group by "Year"
    order by "Year";
    """,
    conn,
)
ra_season

In [None]:
# Line chart of the number of races held in each season.
px.line(
    data_frame=ra_season,
    x="Year",
    y="Races held",
    title="Races held every season",
)

In [None]:
# The five constructors with the highest all-time points totals.
# Only result rows with points > 0 are summed; the list is cut to five rows.

top_constructors_sql = (
    'select c.name as "Constructor", c.nationality as "Nationality", '
    'sum(points) as "Total Points" '
    'from constructor_results r, constructors c '
    'where c.constructorId = r.constructorId and points > 0 '
    'group by "Constructor" '
    'order by "Total Points" desc limit 5;'
)
con_top = pd.read_sql(sql=top_constructors_sql, con=conn)
con_top

In [None]:
# Finding the number of races won by the top 5 teams
#
# The team names are single-quoted SQL string literals and the filter is on
# c.name directly. Double-quoted values are identifiers in SQL; SQLite accepts
# them as strings only through a legacy fallback that can be disabled at build
# time, and filtering on a select alias inside WHERE is likewise non-standard.
# A win is a result row whose finishing position is 1.

raw_top = pd.read_sql(
    """
    select c.name as "Constructor", count(r.constructorId) as "Races won"
    from results r
    join constructors c on c.constructorId = r.constructorId
    where c.name in ('Ferrari', 'Mercedes', 'McLaren', 'Red Bull', 'Williams')
      and r.position = 1
    group by c.name
    order by "Races won" desc;
    """,
    conn,
)
raw_top

In [None]:
# Finding the number of races won by top 10 drivers
#
# - The separator between forename and surname is a single-quoted SQL string
#   literal (' '); the double-quoted form only works through SQLite's legacy
#   double-quoted-string fallback.
# - Wins are grouped by driverId rather than by the displayed name, so two
#   different drivers who happen to share a full name are not merged.
# A win is a result row whose finishing position is 1.

rawd_top = pd.read_sql(
    """
    select (d.forename || ' ' || d.surname) as "Name",
           count(r.driverId) as "Races Won"
    from drivers d
    join results r on r.driverId = d.driverId
    where r.position = 1
    group by d.driverId
    order by "Races Won" desc
    limit 10;
    """,
    conn,
)
rawd_top

In [None]:
# Finding the fastest laps on each circuit
#
# The fastest lap is chosen with min(l.milliseconds) instead of min(l.time):
# l.time is text, and text comparison orders "10:00.000" before "1:20.000",
# so a very slow lap could be reported as the fastest one.
# NOTE(review): assumes lap_times has a numeric "milliseconds" column, as in
# the dataset's lap_times.csv - confirm against the loaded table.
#
# With exactly one min() aggregate, SQLite fills the non-aggregated columns
# (year, lap time text, driver) from the row that holds the minimum, so they
# all describe the same lap. The helper column "best_ms" exists only to
# trigger that behaviour and is dropped so the frame keeps its four columns.
# The name separator is a single-quoted SQL string literal (' ').

fas_lap = pd.read_sql(
    """
    select c.location as "City",
           r.year as "Year of race",
           l.time as "Lap Time",
           (d.forename || ' ' || d.surname) as "Driver",
           min(l.milliseconds) as "best_ms"
    from lap_times l
    join races r on l.raceId = r.raceId
    join circuits c on r.circuitId = c.circuitId
    join drivers d on l.driverId = d.driverId
    group by c.location
    order by c.location;
    """,
    conn,
).drop(columns="best_ms")
fas_lap

In [None]:
# Number of constructors per nationality, largest group first.

constructors_by_nation_sql = (
    'select nationality as "Nationality", '
    'count(*) as "Number of constructors" '
    'from constructors '
    'group by "Nationality" '
    'order by "Number of constructors" desc;'
)
con_na = pd.read_sql(sql=constructors_by_nation_sql, con=conn)
con_na

In [None]:
# Bar chart: one bar per nationality, height and colour driven by the count.
px.bar(
    data_frame=con_na,
    x="Nationality",
    y="Number of constructors",
    color="Number of constructors",
    title="Number of constructors from each country",
)

In [None]:
# Fastest qualifying times set by pole sitters, per Grand Prix name
#
# Only qualifying rows with position = 1 are considered, and rows are grouped
# by Grand Prix name, so each output row covers every season of that Grand
# Prix rather than a single race.
# Fixes relative to the earlier version of this query:
# - the Q2 column alias is spelled "Fastest Q2 time";
# - the name separator is a single-quoted SQL string literal (' ') instead of
#   relying on SQLite's legacy double-quoted-string fallback.
# NOTE(review): the three minima are computed independently, so "Driver" is
# taken from one of the grouped rows and is not guaranteed to be the driver
# who set any particular one of the three times.

fa_qpo = pd.read_sql(
    """
    select r.name as "Grand Prix Name",
           (d.forename || ' ' || d.surname) as "Driver",
           min(q.q1) as "Fastest Q1 time",
           min(q.q2) as "Fastest Q2 time",
           min(q.q3) as "Fastest Q3 time"
    from qualifying q
    join races r on q.raceId = r.raceId
    join drivers d on q.driverId = d.driverId
    where q.position = 1
    group by r.name
    order by r.name;
    """,
    conn,
)
fa_qpo

In [None]:
# Finding drivers who have won at least one race
#
# The WHERE clause already keeps only winning results (position = 1), so a
# plain count(*) per driver gives the number of wins; drivers without a win
# have no matching rows and therefore do not appear.
# - The name separator is a single-quoted SQL string literal (' ') instead of
#   relying on SQLite's legacy double-quoted-string fallback.
# - Wins are grouped by driverId rather than by the displayed name, so two
#   different drivers who happen to share a full name are not merged.

ra_won = pd.read_sql(
    """
    select (d.forename || ' ' || d.surname) as "Driver",
           count(*) as "Races Won"
    from drivers d
    join results r on r.driverId = d.driverId
    where r.position = 1
    group by d.driverId
    order by "Races Won" desc;
    """,
    conn,
)
ra_won

In [None]:
# Bar chart: one bar per race-winning driver, height and colour driven by wins.
px.bar(
    data_frame=ra_won,
    x="Driver",
    y="Races Won",
    color="Races Won",
    title="Drivers who have won the race",
)