# Creating intermediate table

# What we changed:
* Added descriptions in markdown for criteria 8, 9, and 10
  * Criteria 8: Laps
  * Criteria 9: Drivers
  * Criteria 10: Circuits
* We also added logical checks that checks every PK for each table under the PK/FK declarations
  * Each table has its own cell for the check


## Variables to use throughout

In [None]:
project_id = "saffatandsourik"
region = "us-central1"
model_name = "gemini-2.0-flash-001"
dataset = "formula1_int"
region = "us-central1"

## Create BQ Dataset for Intermediate

In [None]:
from google.cloud import bigquery

bq_client = bigquery.Client()

dataset_id = bigquery.Dataset(f"{project_id}.{dataset}")
dataset_id.location = region
resp = bq_client.create_dataset(dataset_id, exists_ok=True)
print("Created dataset {}.{}".format(bq_client.project, resp.dataset_id))

Created dataset saffatandsourik.formula1_int


## Criteria 9: Combine ergast_drivers and drivers_openf1 into one Drivers table

#####Goal: Come up with a universal driver identifier. We will use driver_id from drivers_openf1 and permanent_number from ergast_drivers. These have some overlap values which will make it easier to develop a universal key.

######drivers_openf1 has many records because it has driver data for every session, therefore we need to remove duplicates and take one instance of each driver and add it to universal Drivers table

######Once one instance of each driver is added we drop duplicates again since there are overlaps. Then we make new ids for repeated ids so that everything is unique

######DO NOT RUN AGAIN

In [None]:
%%bigquery
  select distinct permanent_number, given_name as first_name, family_name as last_name from formula1_stg.ergast_drivers order by permanent_number;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,permanent_number,first_name,last_name
0,2,Logan,Sargeant
1,3,Daniel,Ricciardo
2,4,Lando,Norris
3,10,Pierre,Gasly
4,11,Sergio,Pérez
5,14,Fernando,Alonso
6,16,Charles,Leclerc
7,18,Lance,Stroll
8,20,Kevin,Magnussen
9,21,Nyck,de Vries


In [None]:
%%bigquery
  select distinct driver_number, first_name, CONCAT(UPPER(SUBSTRING(last_name, 1, 1)), LOWER(SUBSTRING(last_name, 2))) from formula1_stg.drivers_openf1 order by driver_number;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,driver_number,first_name,f0_
0,1,Max,Verstappen
1,2,Logan,Sargeant
2,3,Daniel,Ricciardo
3,4,Lando,Norris
4,5,Gabriel,Bortoleto
5,6,Oliver,Goethe
6,7,Kaylen,Frederick
7,8,Gregoire,Saucy
8,9,Nikola,Tsolov
9,10,Pierre,Gasly


In [None]:
# creating drivers table, initially adding ergast_drivers data
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Drivers AS
    SELECT
      permanent_number AS driver_number,
      given_name AS first_name,
      family_name AS last_name,
      code AS name_acronym,
      _data_source,
      _load_time
    FROM formula1_stg.ergast_drivers;


Query is running:   0%|          |

In [None]:
# adding open_f1 driver data, but just one instance of each driver
%%bigquery
  INSERT INTO formula1_int.Drivers
    SELECT
      driver_number,
      first_name,
      last_name,
      ANY_VALUE(name_acronym) AS name_acronym,
      ANY_VALUE(_data_source) AS _data_source,
      ANY_VALUE(_load_time)
    FROM formula1_stg.drivers_openf1
    GROUP BY driver_number, first_name, last_name;

Query is running:   0%|          |

In [None]:
# drop duplicates that are only different by _load_time
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Drivers AS
    SELECT *
    FROM (
        SELECT *,
              ROW_NUMBER() OVER (PARTITION BY driver_number, first_name, last_name, name_acronym ORDER BY _load_time) AS rn
        FROM formula1_int.Drivers
    )
    WHERE rn = 1;

Query is running:   0%|          |

In [None]:
# dropping temporary rn column that allowed to get one instance of each driver from window
%%bigquery
  ALTER TABLE formula1_int.Drivers
  DROP COLUMN rn;

Query is running:   0%|          |

In [None]:
# format first name
%%bigquery
  UPDATE formula1_int.Drivers
  SET first_name = CONCAT(UPPER(SUBSTRING(first_name, 1, 1)), LOWER(SUBSTRING(first_name, 2)))
  WHERE first_name IS NOT NULL;

Query is running:   0%|          |

In [None]:
# format last name
%%bigquery
  UPDATE formula1_int.Drivers
  SET last_name = CONCAT(UPPER(SUBSTRING(last_name, 1, 1)), LOWER(SUBSTRING(last_name, 2)))
  WHERE last_name IS NOT NULL;

Query is running:   0%|          |

In [None]:
# seeing where name and acronym are duplicated
%%bigquery
  SELECT *
  FROM (
    SELECT *,
          COUNT(*) OVER (PARTITION BY first_name, last_name) AS cnt
    FROM formula1_int.Drivers
  )
  WHERE cnt > 1;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,driver_number,first_name,last_name,name_acronym,_data_source,_load_time,cnt
0,81,Oscar,Piastri,PIA,ergast,2025-01-31 21:52:56.883677+00:00,2
1,81,Oscar,Piastri,PIA,openf1,2025-02-07 04:42:38.516130+00:00,2
2,16,Charles,Leclerc,LEC,ergast,2025-01-31 21:52:56.883677+00:00,2
3,16,Charles,Leclerc,LEC,openf1,2025-02-06 21:49:17.505931+00:00,2
4,77,Valtteri,Bottas,BOT,openf1,2025-01-31 20:23:49.265190+00:00,2
5,77,Valtteri,Bottas,BOT,ergast,2025-01-31 21:52:56.883677+00:00,2
6,43,Franco,Colapinto,COL,ergast,2025-01-31 21:52:56.883677+00:00,3
7,45,Franco,Colapinto,COL,openf1,2025-02-05 20:22:06.457063+00:00,3
8,43,Franco,Colapinto,COL,openf1,2025-02-13 15:24:11.269654+00:00,3
9,20,Kevin,Magnussen,MAG,openf1,2025-01-31 20:23:49.265190+00:00,2


In [None]:
# removing records that are exactly the same (other than data source and load time)
# duplicate records with different driver numbers remain
%%bigquery
CREATE OR REPLACE TABLE formula1_int.Drivers AS
  SELECT *
      FROM (
          SELECT *,
                ROW_NUMBER() OVER (PARTITION BY driver_number, first_name, last_name, name_acronym ORDER BY _load_time) AS rn
          FROM formula1_int.Drivers
      )
      WHERE rn = 1;

Query is running:   0%|          |

In [None]:
# different driver id
%%bigquery
SELECT * FROM formula1_int.Drivers
WHERE first_name = "Max";

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,driver_number,first_name,last_name,name_acronym,_data_source,_load_time,rn
0,33,Max,Verstappen,VER,ergast,2025-01-31 21:52:56.883677+00:00,1
1,1,Max,Verstappen,VER,openf1,2025-01-31 20:23:49.265190+00:00,1


In [None]:
# had exact duplicate records
%%bigquery
SELECT * FROM formula1_int.Drivers
WHERE first_name = "Lewis";

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,driver_number,first_name,last_name,name_acronym,_data_source,_load_time,rn
0,44,Lewis,Hamilton,HAM,ergast,2025-01-31 21:52:56.883677+00:00,1


In [None]:
# dropping temporary rn column
%%bigquery
  ALTER TABLE formula1_int.Drivers
  DROP COLUMN rn;

Query is running:   0%|          |

In [None]:
# seeing what drivers share a driver id
%%bigquery
  SELECT *
  FROM (
    SELECT *, COUNT(*) OVER (PARTITION BY driver_number) AS cnt
    FROM formula1_int.Drivers
  )
  WHERE cnt > 1;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,driver_number,first_name,last_name,name_acronym,_data_source,_load_time,cnt
0,30,Liam,Lawson,LAW,ergast,2025-01-31 21:52:56.883677+00:00,2
1,30,Roberto,Faria,FAR,openf1,2025-02-05 20:22:06.457063+00:00,2
2,40,Liam,Lawson,LAW,openf1,2025-01-31 20:23:49.265190+00:00,2
3,40,Ayumu,Iwasa,IWA,openf1,2025-02-06 21:49:17.505931+00:00,2
4,45,Zak,O'sullivan,OSU,openf1,2025-01-31 20:23:49.265190+00:00,2
5,45,Franco,Colapinto,COL,openf1,2025-02-05 20:22:06.457063+00:00,2
6,24,Guanyu,Zhou,ZHO,ergast,2025-01-31 21:52:56.883677+00:00,2
7,24,Zhou,Guanyu,ZHO,openf1,2025-02-13 15:24:11.269654+00:00,2
8,11,Sergio,Pérez,PER,ergast,2025-01-31 21:52:56.883677+00:00,2
9,11,Sergio,Perez,PER,openf1,2025-02-13 15:24:11.269654+00:00,2


In [None]:
# adjusting numbers for drivers with same id
%%bigquery
CREATE OR REPLACE TABLE formula1_int.Drivers AS
WITH duplicates AS (
  SELECT *,
         ROW_NUMBER() OVER (PARTITION BY driver_number ORDER BY last_name, first_name) AS rn,
         MAX(driver_number) OVER () AS max_number
  FROM formula1_int.Drivers
),
resolved_duplicates AS (
  SELECT
    first_name,
    last_name,
    CASE
      WHEN rn = 1 THEN driver_number
      ELSE max_number + ROW_NUMBER() OVER ()
    END AS driver_number,
    name_acronym,
    _data_source,
    _load_time
  FROM duplicates
)
SELECT * FROM resolved_duplicates;

Query is running:   0%|          |

In [None]:
# checking to make sure driver_id is unique for all rows
%%bigquery
  SELECT *
  FROM formula1_int.Drivers
  WHERE driver_number IN (
    SELECT driver_number
    FROM formula1_int.Drivers
    GROUP BY driver_number
    HAVING COUNT(*) > 1
  )
  ORDER BY driver_number;

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,first_name,last_name,driver_number,name_acronym,_data_source,_load_time


In [None]:
# checking for duplicate names again and for names that are different slightly
%%bigquery
  SELECT *
  FROM (
    SELECT *,
          COUNT(*) OVER (PARTITION BY first_name) AS cnt
    FROM formula1_int.Drivers
  )
  WHERE cnt > 1;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,first_name,last_name,driver_number,name_acronym,_data_source,_load_time,cnt
0,Max,Verstappen,33,VER,ergast,2025-01-31 21:52:56.883677+00:00,2
1,Max,Verstappen,1,VER,openf1,2025-01-31 20:23:49.265190+00:00,2
2,Liam,Lawson,154,LAW,ergast,2025-01-31 21:52:56.883677+00:00,2
3,Liam,Lawson,144,LAW,openf1,2025-01-31 20:23:49.265190+00:00,2
4,Oliver,Bearman,38,BEA,ergast,2025-01-31 21:52:56.883677+00:00,3
5,Oliver,Goethe,6,GOE,openf1,2025-02-05 20:22:06.457063+00:00,3
6,Oliver,Bearman,50,BEA,openf1,2025-02-07 04:42:38.516130+00:00,3
7,Sergio,Pérez,103,PER,ergast,2025-01-31 21:52:56.883677+00:00,2
8,Sergio,Perez,11,PER,openf1,2025-02-13 15:24:11.269654+00:00,2
9,Franco,Colapinto,43,COL,ergast,2025-01-31 21:52:56.883677+00:00,2


In [None]:
# manually removing duplicate names
%%bigquery
DELETE FROM formula1_int.Drivers
WHERE driver_number in (113,45,41,33,132,151,50,127,153)

Query is running:   0%|          |

In [None]:
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Drivers AS
    SELECT
      driver_number as driver_id,
      first_name,
      last_name,
      name_acronym,
      _data_source,
      _load_time
    FROM formula1_int.Drivers

Query is running:   0%|          |

In [None]:
%%bigquery
  UPDATE formula1_int.Drivers
  SET last_name = 'de Vries'
  WHERE last_name = 'De vries'

Query is running:   0%|          |

In [None]:
# observing final drivers table
%%bigquery
  SELECT *
  FROM formula1_int.Drivers
  ORDER BY driver_id

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,driver_id,first_name,last_name,name_acronym,_data_source,_load_time
0,1,Max,Verstappen,VER,openf1,2025-01-31 20:23:49.265190+00:00
1,2,Logan,Sargeant,SAR,openf1,2025-01-30 20:52:05.891113+00:00
2,3,Daniel,Ricciardo,RIC,openf1,2025-01-31 20:23:49.265190+00:00
3,4,Lando,Norris,NOR,ergast,2025-01-31 21:52:56.883677+00:00
4,5,Gabriel,Bortoleto,BOR,openf1,2025-01-30 20:52:05.891113+00:00
5,6,Oliver,Goethe,GOE,openf1,2025-02-05 20:22:06.457063+00:00
6,7,Kaylen,Frederick,FRE,openf1,2025-02-05 20:22:06.457063+00:00
7,8,Gregoire,Saucy,SAU,openf1,2025-01-31 20:32:17.733838+00:00
8,9,Nikola,Tsolov,TSO,openf1,2025-01-30 20:52:05.891113+00:00
9,10,Pierre,Gasly,GAS,openf1,2025-01-31 20:32:17.733838+00:00


## Sessions table

In [None]:
# creating sessions table
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Sessions AS
    SELECT
      session_key,
      meeting_key,
      location,
      country_key,
      country_code,
      country_name,
      circuit_key,
      circuit_short_name,
      session_name,
      date_start,
      date_end,
      gmt_offset,
      year,
      _data_source,
      _load_time
    FROM formula1_stg.sessions

Query is running:   0%|          |

In [None]:
# removing duplicates that are only different by load time
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Sessions AS
    SELECT *
    FROM (
        SELECT *,
              ROW_NUMBER() OVER (PARTITION BY session_key ORDER BY _load_time) AS rn
        FROM formula1_int.Sessions
    )
    WHERE rn = 1

Query is running:   0%|          |

## Qualifying_Results

In [None]:
# creating Qualifying_Results table
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Qualifying_Results AS
    SELECT
      year,
      round,
      race_name,
      circuit_name,
      date,
      position,
      first_name,
      last_name,
      driver_nationality,
      constructor_name,
      constructor_nationality,
      q1_time,
      q2_time,
      q3_time,
      _data_source,
      _load_time
    FROM formula1_stg.qualifying_results

Query is running:   0%|          |

In [None]:
# removing duplicates that are only different by load time
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Qualifying_Results AS
    SELECT *
    FROM (
        SELECT *,
              ROW_NUMBER() OVER (PARTITION BY year, round, race_name, circuit_name,
              date, position, first_name, last_name, driver_nationality, constructor_name,
              constructor_nationality, q1_time, q2_time, q3_time ORDER BY _load_time) AS rn
        FROM formula1_int.Qualifying_Results
    )
    WHERE rn = 1

Query is running:   0%|          |

In [None]:
# fixing specific names that are different than Drivers table
%%bigquery
  UPDATE formula1_int.Qualifying_Results
  SET last_name = CASE
    WHEN last_name = 'Hülkenberg' THEN 'Hulkenberg'
    WHEN last_name = 'de' THEN 'de Vries'
    WHEN last_name = 'Pérez' THEN 'Perez'
    ELSE last_name
  END
  WHERE last_name IN ('Hülkenberg', 'de', 'Pérez')

Query is running:   0%|          |

In [None]:
# adding driver id from Drivers table
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Qualifying_Results AS
    SELECT d.driver_id, q.*
    FROM formula1_int.Qualifying_Results q
    LEFT JOIN formula1_int.Drivers d
      ON q.first_name = d.first_name AND q.last_name = d.last_name

Query is running:   0%|          |

In [None]:
# checking to see if all drivers were given driver id
%%bigquery
SELECT distinct last_name
FROM formula1_int.Qualifying_Results
WHERE driver_id IS NULL

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,last_name


In [164]:
# dropping temporary rn column
%%bigquery
  ALTER TABLE formula1_int.Qualifying_Results
  DROP COLUMN rn;

Executing query with job ID: 771c2671-dcb6-4826-ac4c-4fa7d85dcbfd
Query executing: 0.20s


ERROR:
 400 Column not found: rn at [2:15]; reason: invalidQuery, location: query, message: Column not found: rn at [2:15]

Location: us-central1
Job ID: 771c2671-dcb6-4826-ac4c-4fa7d85dcbfd



In [163]:
%%bigquery
  ALTER TABLE formula1_int.Qualifying_Results
  DROP COLUMN driver_nationality;

  ALTER TABLE formula1_int.Qualifying_Results
  DROP COLUMN first_name;

  ALTER TABLE formula1_int.Qualifying_Results
  DROP COLUMN last_name;

Query is running:   0%|          |

In [170]:
%%bigquery
create or replace table formula1_int.Qualifying_Results as
  select q.*, c.circuit_id
  from formula1_int.Qualifying_Results q left join formula1_int.Circuits c
    on q.race_name = c.circuit_name

Query is running:   0%|          |

In [171]:
%%bigquery
ALTER TABLE formula1_int.Qualifying_Results
  DROP COLUMN race_name;

ALTER TABLE formula1_int.Qualifying_Results
  DROP COLUMN circuit_name;

Query is running:   0%|          |

In [172]:
%%bigquery
create or replace table formula1_int.Qualifying_Results as
  select
    driver_id,
    year,
    round,
    date,
    position,
    circuit_id,
    q1_time,
    q2_time,
    q3_time,
    _data_source,
    _load_time
  from formula1_int.Qualifying_Results

Query is running:   0%|          |

## Race_Lap_Times table

In [None]:
# creating Lap_Times table
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Race_Lap_Times AS
    SELECT
      driver_id as name_id,
      year,
      round,
      race_name,
      circuit_name,
      location,
      country,
      race_date,
      lap_number,
      position,
      lap_time,
      _data_source,
      _load_time
    FROM formula1_stg.ergast_lap_times

Query is running:   0%|          |

In [None]:
# removing duplicate records
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Race_Lap_Times AS
    SELECT *
    FROM (
        SELECT *,
              ROW_NUMBER() OVER (PARTITION BY name_id, year, round, race_name,
              circuit_name, location, country, race_date, lap_number, position,
              lap_time ORDER BY _load_time) AS rn
        FROM formula1_int.Race_Lap_Times
    )
    WHERE rn = 1

Query is running:   0%|          |

In [None]:
# adding permanent number and name from stg ergast drivers
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Race_Lap_Times AS
    SELECT d.permanent_number, d.given_name, d.family_name, l.*
    FROM formula1_int.Race_Lap_Times as l
      JOIN formula1_stg.ergast_drivers as d
      ON l.name_id = d.driver_id

Query is running:   0%|          |

In [None]:
# changing names to fit Drivers table naming convention
%%bigquery
  UPDATE formula1_int.Race_Lap_Times
  SET family_name = CASE
    WHEN family_name = 'Hülkenberg' THEN 'Hulkenberg'
    WHEN family_name = 'de' THEN 'de Vries'
    WHEN family_name = 'Pérez' THEN 'Perez'
    ELSE family_name
  END
  WHERE family_name IN ('Hülkenberg', 'de', 'Pérez')

Query is running:   0%|          |

In [None]:
# adding driver id from Drivers table
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Race_Lap_Times AS
    SELECT d.driver_id as fixed_di, l.*
    FROM formula1_int.Race_Lap_Times l
    LEFT JOIN formula1_int.Drivers d
      ON l.given_name = d.first_name AND l.family_name = d.last_name

Query is running:   0%|          |

In [None]:
# creating final table without extra columns from stg ergast drivers and corrected driver id
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Race_Lap_Times AS
    SELECT
      fixed_di AS driver_id,
      year,
      round,
      race_name,
      circuit_name,
      location,
      country,
      race_date,
      lap_number,
      position,
      lap_time,
      _data_source,
      _load_time
    FROM formula1_int.Race_Lap_Times

Query is running:   0%|          |

In [None]:
# ensuring no repeated records
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Race_Lap_Times AS
    SELECT DISTINCT *
    FROM formula1_int.Race_Lap_Times

Query is running:   0%|          |

In [None]:
# making sure PK had no nulls
%%bigquery
  SELECT driver_id, year, round
    FROM formula1_int.Race_Lap_Times
    WHERE driver_id IS NULL
      OR year IS NULL
      OR round IS NULL

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,driver_id,year,round


In [None]:
# check to see if there are the same amt of matching records as rows in race_lapt_times

%%bigquery

SELECT COUNT(*) AS matching_records
FROM saffatandsourik.formula1_int.Race_Lap_Times rlt
JOIN saffatandsourik.formula1_int.Races r
ON rlt.year = r.year AND rlt.round = r.round;


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,matching_records
0,4311


In [None]:
%%bigquery

# merge the data

CREATE OR REPLACE TABLE formula1_int.Race_Lap_Times AS
SELECT rlt.*, r.race_id
FROM saffatandsourik.formula1_int.Race_Lap_Times rlt
JOIN saffatandsourik.formula1_int.Races r
ON rlt.year = r.year AND rlt.round = r.round;


Query is running:   0%|          |

In [None]:
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Race_Lap_Times AS
    SELECT
      driver_id,
      race_id,
      year,
      round,
      circuit_name,
      location,
      country,
      race_date,
      lap_number,
      position,
      lap_time,
      _data_source,
      _load_time
    FROM formula1_int.Race_Lap_Times

Query is running:   0%|          |

## Races table

In [None]:
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Races AS
    SELECT
      race_id,
      circuit_id,
      year,
      round,
      name,
      url,
      PARSE_DATE('%m/%d/%Y', date) as date,
      time,
      fp1_date_time,
      fp2_date_time,
      fp3_date_time,
      quali_date_time,
      sprint_date_time,
      _data_source,
      _load_time
    FROM formula1_stg.ergast_races

Query is running:   0%|          |

In [None]:
%%bigquery
  SELECT *
  FROM formula1_int.Races
  WHERE race_id IS NULL

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,race_id,circuit_id,year,round,name,url,date,time,fp1_date_time,fp2_date_time,fp3_date_time,quali_date_time,sprint_date_time,_data_source,_load_time


In [None]:
%%bigquery
  SELECT race_id, COUNT(*)
  FROM formula1_int.Races
  GROUP BY race_id
  HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,race_id,f0_


## Criteria 8: Laps table

#####Converted sector_segments_1, sector_segments_2, and sector_segments_3 to array type.

In [None]:
# creating Laps table
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Laps AS
    SELECT
      session_key,
      driver_number,
      lap_number,
      i1_speed,
      i2_speed,
      date_start,
      lap_duration,
      is_pit_out_lap,
      duration_sector_1,
      duration_sector_2,
      duration_sector_3,
      segments_sector_1,
      segments_sector_2,
      segments_sector_3,
      _data_source,
      _load_time
    FROM formula1_stg.laps

Query is running:   0%|          |

In [None]:
# removing duplicates that are only different by load time
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Laps AS
    SELECT *
    FROM (
        SELECT *,
              ROW_NUMBER() OVER (PARTITION BY session_key, driver_number, lap_number ORDER BY _load_time) AS rn
        FROM formula1_int.Laps
    )
    WHERE rn = 1

Query is running:   0%|          |

In [None]:
# creating temp table of drivers_openf1 with just one instance of each driver
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.temp_openf1_drivers AS
    select distinct driver_number,
      CONCAT(UPPER(SUBSTRING(first_name, 1, 1)), LOWER(SUBSTRING(first_name, 2))) as first_name,
      CONCAT(UPPER(SUBSTRING(last_name, 1, 1)), LOWER(SUBSTRING(last_name, 2))) as last_name
    from formula1_stg.drivers_openf1
    order by driver_number;

Query is running:   0%|          |

In [None]:
# adding formatted full names from temp drivers
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Laps AS
    SELECT
      CONCAT(UPPER(SUBSTRING(d.first_name, 1, 1)), LOWER(SUBSTRING(d.first_name, 2))) as first_name,
      CONCAT(UPPER(SUBSTRING(d.last_name, 1, 1)), LOWER(SUBSTRING(d.last_name, 2))) as last_name,
      l.*
    FROM formula1_int.Laps l
    LEFT JOIN formula1_int.temp_openf1_drivers d
      ON l.driver_number = d.driver_number

Query is running:   0%|          |

In [None]:
# adding correct driver_id from Drivers based on first and last name
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Laps AS
    SELECT d.driver_id, l.*
    FROM formula1_int.Laps l
    JOIN formula1_int.Drivers d
      ON l.first_name = d.first_name AND l.last_name = d.last_name

Query is running:   0%|          |

In [None]:
# final table schema
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Laps AS
    SELECT
      driver_id,
      session_key,
      lap_number,
      i1_speed,
      i2_speed,
      date_start,
      lap_duration,
      is_pit_out_lap,
      duration_sector_1,
      duration_sector_2,
      duration_sector_3,
      segments_sector_1,
      segments_sector_2,
      segments_sector_3,
      _data_source,
      _load_time
    FROM formula1_int.Laps

Query is running:   0%|          |

In [None]:
# checking if PK has any nulls
%%bigquery
  SELECT driver_id, session_key, lap_number
  FROM formula1_int.Laps
  WHERE driver_id IS NULL OR session_key IS NULL OR lap_number IS NULL

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,driver_id,session_key,lap_number


In [None]:
%%bigquery
  DROP TABLE formula1_int.temp_openf1_drivers

Query is running:   0%|          |

In [None]:
%%bigquery
  SELECT
  ARRAY(
    SELECT CAST(x AS INT )
    FROM UNNEST(SPLIT(REPLACE(REPLACE(segments_sector_1, "[", ""), "]", ""), ", ")) AS x) AS segments_sector_1
  FROM formula1_int.Laps
  LIMIT 30

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,segments_sector_1
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
6,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
8,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
9,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"


In [None]:
# converting to array
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Laps AS
    SELECT
      driver_id,
      session_key,
      lap_number,
      date_start,
      i1_speed,
      i2_speed,
      lap_duration,
      is_pit_out_lap,
      duration_sector_1,
      duration_sector_2,
      duration_sector_3,
      SPLIT(REPLACE(REPLACE(segments_sector_1, "[", ""), "]", ""), ", ") AS segments_sector_1,
      SPLIT(REPLACE(REPLACE(segments_sector_2, "[", ""), "]", ""), ", ") AS segments_sector_2,
      SPLIT(REPLACE(REPLACE(segments_sector_3, "[", ""), "]", ""), ", ") AS segments_sector_3,
      _data_source,
      _load_time
    FROM formula1_int.Laps

Executing query with job ID: 6e6e1c79-4e5c-42bc-9226-99ea5eadc658
Query executing: 0.18s

In [114]:
# Certain records are essentially duplicate but are different slightly
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Laps AS
    WITH deduplicated AS (
      SELECT *,
            ROW_NUMBER() OVER (PARTITION BY session_key, lap_number, driver_id ORDER BY date_start ASC) AS row_num
      FROM formula1_int.Laps
    )
    SELECT *
    FROM deduplicated
    WHERE row_num = 1;

Query is running:   0%|          |

## Meetings table

In [None]:
# creating Meetings table
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Meetings AS
    SELECT
      meeting_key,
      meeting_name,
      meeting_official_name,
      location,
      country_key,
      country_code,
      country_name,
      circuit_key,
      circuit_short_name,
      start_datetime_utc,
      year,
      _data_source,
      _load_time
    FROM formula1_stg.meetings

In [None]:
# removing duplicates that are only different by load time
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Meetings AS
    SELECT *
    FROM (
        SELECT *,
              ROW_NUMBER() OVER (PARTITION BY meeting_key ORDER BY _load_time) AS rn
        FROM formula1_int.Meetings
    )
  WHERE rn = 1

In [None]:
# dropping temporary column rn
%%bigquery
  ALTER TABLE formula1_int.Meetings
  DROP COLUMN rn

## Pit table

In [None]:
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Pit AS
    SELECT
      driver_number,
      CAST(lap_number AS INT) AS lap_number,
      session_key,
      pit_duration,
      date,
      _data_source,
      _load_time
    FROM formula1_stg.pit

In [None]:
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Pit AS
    SELECT *
    FROM (
        SELECT *,
              ROW_NUMBER() OVER (PARTITION BY driver_number, session_key, lap_number ORDER BY _load_time) AS rn
        FROM formula1_int.Pit
    )
  WHERE rn = 1

In [None]:
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Pit AS
    SELECT
      CONCAT(UPPER(SUBSTRING(d.first_name, 1, 1)), LOWER(SUBSTRING(d.first_name, 2))) as first_name,
      CONCAT(UPPER(SUBSTRING(d.last_name, 1, 1)), LOWER(SUBSTRING(d.last_name, 2))) as last_name,
      p.*
    FROM formula1_int.Pit p
    LEFT JOIN formula1_stg.drivers_openf1 d
      ON p.driver_number = d.driver_number

In [None]:
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Pit AS
    SELECT DISTINCT *
    FROM formula1_int.Pit

In [None]:
# changing names to fit Drivers table naming convention
%%bigquery
  UPDATE formula1_int.Pit
  SET last_name = "de Vries"
  WHERE last_name = "Vries"

In [None]:
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Pit AS
    SELECT d.driver_id, p.*
    FROM formula1_int.Pit p
    LEFT JOIN formula1_int.Drivers d
      ON p.first_name = d.first_name AND p.last_name = d.last_name

In [None]:
%%bigquery
  ALTER TABLE formula1_int.Pit
  DROP COLUMN driver_number;

  ALTER TABLE formula1_int.Pit
  DROP COLUMN rn;

In [67]:
  %%bigquery
    ALTER TABLE formula1_int.Pit
    DROP COLUMN first_name;

    ALTER TABLE formula1_int.Pit
    DROP COLUMN last_name;

Executing query with job ID: 63d3475e-5f86-41c8-bc66-9f034131ba97
Query executing: 0.07s


ERROR:
 400 GET https://bigquery.googleapis.com/bigquery/v2/projects/saffatandsourik/queries/63d3475e-5f86-41c8-bc66-9f034131ba97?maxResults=0&location=us-central1&prettyPrint=false: Exceeded rate limits: too many table update operations for this table. For more information, see https://cloud.google.com/bigquery/troubleshooting-errors at [4:3]

Location: us-central1
Job ID: 63d3475e-5f86-41c8-bc66-9f034131ba97



In [118]:
# Certain records are essentially duplicate but are different slightly
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Pit AS
    WITH deduplicated AS (
      SELECT *,
            ROW_NUMBER() OVER (PARTITION BY session_key, lap_number, driver_id ORDER BY date ASC) AS row_num
      FROM formula1_int.Pit
    )
    SELECT *
    FROM deduplicated
    WHERE row_num = 1;

Query is running:   0%|          |

## Race_Results table

In [None]:
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Race_Results AS
    SELECT
      season,
      round,
      race_name,
      date,
      circuit_id,
      location,
      country,
      number,
      position,
      position_text,
      points,
      laps,
      status,
      permanent_number,
      code,
      _data_source,
      _load_time
    FROM formula1_stg.race_results;

In [None]:
# removing duplicates that are only different by load time
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Race_Results AS
    SELECT *
    FROM (
        SELECT *,
              ROW_NUMBER() OVER (PARTITION BY  season,
                                  round,
                                  race_name,
                                  date,
                                  circuit_id,
                                  location,
                                  country,
                                  number,
                                  position,
                                  position_text,
                                  points,
                                  laps,
                                  status,
                                  permanent_number,
                                  code ORDER BY _load_time) AS rn
        FROM formula1_int.Race_Results
    )
  WHERE rn = 1;

In [None]:
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Race_Results AS
    SELECT
      CONCAT(UPPER(SUBSTRING(d.given_name, 1, 1)), LOWER(SUBSTRING(d.given_name, 2))) as first_name,
      CONCAT(UPPER(SUBSTRING(d.family_name, 1, 1)), LOWER(SUBSTRING(d.family_name, 2))) as last_name,
      r.*
    FROM formula1_int.Race_Results r
    LEFT JOIN formula1_stg.ergast_drivers d
      ON r.permanent_number = d.permanent_number;

In [None]:
# fixing specific names that are different than Drivers table
%%bigquery
  UPDATE formula1_int.Race_Results
  SET last_name = CASE
    WHEN last_name = 'Hülkenberg' THEN 'Hulkenberg'
    WHEN last_name = 'De vries' THEN 'de Vries'
    WHEN last_name = 'Pérez' THEN 'Perez'
    ELSE last_name
  END
  WHERE last_name IN ('Hülkenberg', 'De vries', 'Pérez');

In [None]:
# checking specific name
%%bigquery
  SELECT * FROM formula1_int.Race_Results
  WHERE first_name = "Nyck"
  LIMIT 5;

In [None]:
# adding driver_id from Drivers
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Race_Results AS
    SELECT d.driver_id, r.*
    FROM formula1_int.Race_Results r
    LEFT JOIN formula1_int.Drivers d
      ON r.first_name = d.first_name AND r.last_name = d.last_name;

In [None]:
# remove repeated rows (including load time)
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Race_Results AS
    SELECT DISTINCT *
    FROM formula1_int.Race_Results;

In [None]:
%%bigquery
  ALTER TABLE formula1_int.Race_Results
  DROP COLUMN first_name;

  ALTER TABLE formula1_int.Race_Results
  DROP COLUMN last_name;

  ALTER TABLE formula1_int.Race_Results
  DROP COLUMN location;

  ALTER TABLE formula1_int.Race_Results
  DROP COLUMN country;

  ALTER TABLE formula1_int.Race_Results
  DROP COLUMN code;

In [None]:
%%bigquery
  ALTER TABLE formula1_int.Race_Results
  DROP COLUMN rn;

  ALTER TABLE formula1_int.Race_Results
  DROP COLUMN permanent_number;

In [None]:
%%bigquery

# verify if tables match rows

SELECT COUNT(*) AS matching_records
FROM saffatandsourik.formula1_int.Race_Results rr
JOIN saffatandsourik.formula1_int.Races r
ON rr.season = r.year AND rr.round = r.round;


In [None]:
%%bigquery

#bring tables together
CREATE OR REPLACE TABLE formula1_int.Race_Results AS
  SELECT rr.*, r.race_id
    FROM saffatandsourik.formula1_int.Race_Results rr
    LEFT JOIN saffatandsourik.formula1_int.Races r
    ON rr.season = r.year AND rr.round = r.round



In [None]:
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Race_Results AS
    SELECT
      driver_id,
      race_id,
      season,
      round,
      date,
      circuit_id,
      number,
      position,
      position_text,
      points,
      laps,
      status,
      _data_source,
      _load_time
    FROM formula1_int.Race_Results;

## Car_Info table

In [None]:
# creating initial table
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Car_Info AS
    SELECT
      pos,
      SPLIT(TRIM(REGEXP_REPLACE(driver, r'\s+', ' ')), ' ')[SAFE_OFFSET(0)] AS first_name,
      SPLIT(TRIM(REGEXP_REPLACE(driver, r'\s+', ' ')), ' ')[SAFE_OFFSET(1)] AS last_name,
      car,
      pts,
      year,
      _data_source,
      _load_time
    FROM formula1_stg.historical_cars
    WHERE year IN (2023,2024)

In [None]:
# eliminating duplicates (other than load time)
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Car_Info AS
    SELECT *
    FROM (
        SELECT *,
              ROW_NUMBER() OVER (PARTITION BY pos, first_name, last_name, year, car ORDER BY _load_time) AS rn
        FROM formula1_int.Car_Info
    )
  WHERE rn = 1

In [None]:
# fixing name
%%bigquery
  UPDATE formula1_int.Car_Info
  SET last_name = 'de Vries'
  WHERE last_name = 'De'

In [None]:
# adding driver id from Drivers
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Car_Info AS
    SELECT
      d.driver_id,
      c.*
    FROM formula1_int.Car_Info c
    LEFT JOIN formula1_int.Drivers d
      ON d.first_name = c.first_name AND d.last_name = c.last_name

In [None]:
# final table before enrichment
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Car_Info AS
    SELECT
      driver_id,
      CAST(pos AS INT) AS pos,
      car,
      pts,
      year,
      _data_source,
      _load_time
    FROM formula1_int.Car_Info

## Car_Info Enrichment

In [None]:
%%bigquery
SELECT
    c.driver_id,
    CONCAT(d.first_name, ' ', d.last_name) AS driver_name,
    c.pos,
    c.car,
    c.pts,
    c.year,
    c._data_source,
    c._load_time
FROM saffatandsourik.formula1_int.Car_Info c
JOIN saffatandsourik.formula1_int.Drivers d
ON c.driver_id = d.driver_id;




In [86]:
import json
import pandas as pd
import vertexai
from vertexai.generative_models import GenerativeModel
from google.cloud import bigquery
from pandas_gbq import to_gbq

# Initialize BigQuery Client
bq_client = bigquery.Client()

# Query Car_info table
query = """
SELECT
    c.driver_id,
    CONCAT(d.first_name, ' ', d.last_name) AS driver_name,
    c.pos,
    c.car,
    c.pts,
    c.year
FROM saffatandsourik.formula1_int.Car_Info c
JOIN saffatandsourik.formula1_int.Drivers d
ON c.driver_id = d.driver_id
ORDER BY c.year DESC;
"""

rows = bq_client.query(query).result()

# Convert to df
df = pd.DataFrame([{k: v for k, v in row.items()} for row in rows])

# LLM Prompt
prompt = """You will receive a list of F1 driver performances, including driver name, car, points scored, and finishing positions.
For each driver and car combination, generate:
1. A Car Competitiveness Score (0-100) based on points, constructor history, and driver performance.
2. A Car Performance Category (Top Performer, Midfield, Backmarker) based on competitiveness.
3. A Constructor Reliability Rating (Highly Reliable, Moderately Reliable, Unreliable) based on historical performance trends.

Return the results as JSON:
[{"driver_id": int, "driver_name": string, "car_competitiveness": float, "car_performance_category": string, "constructor_reliability": string}]
"""


# LLM Function
def generate_car_analysis(df):
    vertexai.init(project="saffatandsourik", location="us-central1")
    model = GenerativeModel("gemini-2.0-flash")

    batch_size = 50
    results = []

    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i+batch_size]
        input_str = batch.to_json(orient="records")

        response = model.generate_content([input_str, prompt])
        response_text = response.text.replace("```json", "").replace("```", "").strip()

        batch_results = json.loads(response_text)
        results.extend(batch_results)

    return results

# Generate New Columns
new_data = generate_car_analysis(df)

# Convert to DataFrame
df_enriched = pd.DataFrame(new_data)

# Merge with Original Data
df_final = df.merge(df_enriched, on="driver_id", how="left")

# Save to BigQuery
table_id = "saffatandsourik.formula1_int.Car_Info"
to_gbq(df_final, table_id, project_id="saffatandsourik", if_exists="replace")

100%|██████████| 1/1 [00:00<00:00, 6533.18it/s]


In [87]:
%%bigquery
CREATE OR REPLACE TABLE saffatandsourik.formula1_int.Car_Info AS
SELECT driver_id, pos, car, pts, year, car_competitiveness,car_performance_category, constructor_reliability
from saffatandsourik.formula1_int.Car_Info


Query is running:   0%|          |

## Criteria 10: Circuits table

#####Qualifying_Results, Race_Lap_Times, and Races tables all had detailed circuit information including circuit name, location, and country. This was redundant data for these tables so we consolidated them into Circuits which has all the data needed and connects to the other tables using circuit_id

In [141]:
%%bigquery
  CREATE OR REPLACE TABLE formula1_int.Circuits AS
    SELECT *
    FROM formula1_stg.circuits

Query is running:   0%|          |

In [161]:
%%bigquery
CREATE OR REPLACE TABLE formula1_int.Circuits AS
SELECT
    CASE
        WHEN circuit_name = 'Albert Park Circuit' THEN 'Australian Grand Prix'
        WHEN circuit_name = 'Las Vegas Strip Circuit' THEN 'Las Vegas Grand Prix'
        WHEN circuit_name = 'Lusail International Circuit' THEN 'Qatar Grand Prix'
        WHEN circuit_name = 'CM.com Circuit Zandvoort' THEN 'Dutch Grand Prix'
        WHEN circuit_name = 'Circuit de Barcelona-Catalunya' THEN 'Spanish Grand Prix'
        WHEN circuit_name = 'Hungaroring' THEN 'Hungarian Grand Prix'
        WHEN circuit_name = 'Autódromo Hermanos Rodríguez' THEN 'Mexico City Grand Prix'
        WHEN circuit_name = 'Monza Circuit' THEN 'Italian Grand Prix'
        WHEN circuit_name = 'Autódromo José Carlos Pace' THEN 'São Paulo Grand Prix'
        WHEN circuit_name = 'Circuit of the Americas' THEN 'United States Grand Prix'
        WHEN circuit_name = 'Jeddah Corniche Circuit' THEN 'Saudi Arabian Grand Prix'
        WHEN circuit_name = 'Marina Bay Street Circuit' THEN 'Singapore Grand Prix'
        WHEN circuit_name = 'Suzuka International Racing Course' THEN 'Japanese Grand Prix'
        WHEN circuit_name = 'Miami International Autodrome' THEN 'Miami Grand Prix'
        WHEN circuit_name = 'Bahrain International Circuit' THEN 'Bahrain Grand Prix'
        WHEN circuit_name = 'Shanghai International Circuit' THEN 'Chinese Grand Prix'
        WHEN circuit_name = 'Yas Marina Circuit' THEN 'Abu Dhabi Grand Prix'
        WHEN circuit_name = 'Circuit Gilles Villeneuve' THEN 'Canadian Grand Prix'
        WHEN circuit_name = 'Imola Circuit' THEN 'Emilia Romagna Grand Prix'
        WHEN circuit_name = 'Baku City Circuit' THEN 'Azerbaijan Grand Prix'
        WHEN circuit_name = 'Circuit de Spa-Francorchamps' THEN 'Belgian Grand Prix'
        WHEN circuit_name = 'Circuit de Monaco' THEN 'Monaco Grand Prix'
        WHEN circuit_name = 'Silverstone Circuit' THEN 'British Grand Prix'
        WHEN circuit_name = 'Red Bull Ring' THEN 'Austrian Grand Prix'
        ELSE circuit_name
    END AS circuit_name,
    * EXCEPT (circuit_name)
FROM formula1_int.Circuits;


Query is running:   0%|          |

%%bigquery
select distinct(circuit_name)

In [None]:
%%bigquery
select distinct(name), circuit_id
from saffatandsourik.formula1_int.Races

In [None]:
%%bigquery
select distinct(circuit_name)
from saffatandsourik.formula1_int.Circuits

In [143]:
%%bigquery
CREATE OR REPLACE TABLE saffatandsourik.formula1_int.Circuits_view AS
SELECT
    r.circuit_id,
    c.*
FROM saffatandsourik.formula1_int.Circuits_view c
LEFT JOIN saffatandsourik.formula1_int.Races r
ON c.circuit_name = r.name;

Query is running:   0%|          |

In [144]:
%%bigquery
CREATE OR REPLACE TABLE saffatandsourik.formula1_int.Circuits_view AS
SELECT
    CASE
        WHEN c.circuit_name = 'Algarve International Circuit' THEN 75
        WHEN c.circuit_name = 'Circuit Paul Ricard' THEN 34
    END AS circuit_id,
    c.*
EXCEPT (circuit_id)
FROM saffatandsourik.formula1_int.Circuits_view c;

Query is running:   0%|          |

In [146]:
%%bigquery

create or replace table saffatandsourik.formula1_int.Circuits AS
select distinct(circuit_id), circuit_name,
 city,
 country,
 latitude,
 longitude,
 capacity,
 fia_grade,
 circuit_status,
 _data_source,
 _load_time

from saffatandsourik.formula1_int.Circuits_view
order by circuit_id

Query is running:   0%|          |

In [147]:
%%bigquery

UPDATE saffatandsourik.formula1_int.Circuits
SET latitude = 36.112778,
    longitude = -115.168611,
    capacity = 100000,
    fia_grade = '1'
WHERE circuit_name = 'Las Vegas Grand Prix'
AND city = 'Paradise'
AND country = 'United States';

Query is running:   0%|          |

In [148]:
%%bigquery

DELETE FROM saffatandsourik.formula1_int.Circuits
WHERE circuit_id = 1
AND latitude IS NULL
AND longitude IS NULL;

Query is running:   0%|          |

In [162]:
%%bigquery
CREATE OR REPLACE TABLE formula1_int.Circuits AS
  SELECT
       ROW_NUMBER() OVER () AS circuit_id,
       circuit_name,
       city,
       country,
       latitude,
       longitude,
       capacity,
       fia_grade,
       circuit_status,
       _data_source,
       _load_time
FROM formula1_int.Circuits;

Query is running:   0%|          |

## Set PKs and FKs and Check Logical Constraints

In [102]:
%%bigquery

-- Sessions Table
ALTER TABLE saffatandsourik.formula1_int.Sessions
ADD PRIMARY KEY (session_key) NOT ENFORCED;

-- Meetings Table
ALTER TABLE saffatandsourik.formula1_int.Meetings
ADD PRIMARY KEY (meeting_key) NOT ENFORCED;

-- Car_Info Table
ALTER TABLE saffatandsourik.formula1_int.Car_Info
ADD PRIMARY KEY (car, driver_id) NOT ENFORCED;

-- Circuits Table
ALTER TABLE saffatandsourik.formula1_int.Circuits
ADD PRIMARY KEY (circuit_id) NOT ENFORCED;

-- Drivers Table
ALTER TABLE saffatandsourik.formula1_int.Drivers
ADD PRIMARY KEY (driver_id) NOT ENFORCED;

-- Laps Table
ALTER TABLE saffatandsourik.formula1_int.Laps
ADD PRIMARY KEY (session_key, lap_number, driver_id) NOT ENFORCED;

-- Pit Table
ALTER TABLE saffatandsourik.formula1_int.Pit
ADD PRIMARY KEY (session_key, lap_number, driver_id) NOT ENFORCED;

-- Qualifying_Results Table
ALTER TABLE saffatandsourik.formula1_int.Qualifying_Results
ADD PRIMARY KEY (driver_id, year, round) NOT ENFORCED;

-- Race_Lap_Times Table
ALTER TABLE `saffatandsourik.formula1_int.Race_Lap_Times`
ADD PRIMARY KEY (driver_id, race_id, round, lap_number) NOT ENFORCED;

-- Race_Results Table
ALTER TABLE saffatandsourik.formula1_int.Race_Results
ADD PRIMARY KEY (driver_id, race_id) NOT ENFORCED;

-- Races Table
ALTER TABLE saffatandsourik.formula1_int.Races
ADD PRIMARY KEY (race_id) NOT ENFORCED;


Executing query with job ID: 0fb30c97-d239-47c6-a204-08bb797420c5
Query executing: 0.10s


ERROR:
 400 GET https://bigquery.googleapis.com/bigquery/v2/projects/saffatandsourik/queries/0fb30c97-d239-47c6-a204-08bb797420c5?maxResults=0&location=us-central1&prettyPrint=false: Already Exists: Constraint primary key at [2:1]

Location: us-central1
Job ID: 0fb30c97-d239-47c6-a204-08bb797420c5



In [103]:
%%bigquery

-- Sessions Table
ALTER TABLE saffatandsourik.formula1_int.Sessions
ADD CONSTRAINT sessions_fk_meeting_key FOREIGN KEY (meeting_key)
    REFERENCES saffatandsourik.formula1_int.Meetings (meeting_key) NOT ENFORCED;

-- Laps Table
ALTER TABLE saffatandsourik.formula1_int.Laps
ADD CONSTRAINT laps_fk_session_key FOREIGN KEY (session_key)
    REFERENCES saffatandsourik.formula1_int.Sessions (session_key) NOT ENFORCED;
ALTER TABLE saffatandsourik.formula1_int.Laps
ADD CONSTRAINT laps_fk_driver_id FOREIGN KEY (driver_id)
    REFERENCES saffatandsourik.formula1_int.Drivers (driver_id) NOT ENFORCED;

-- Pit Table
ALTER TABLE saffatandsourik.formula1_int.Pit
ADD CONSTRAINT pit_fk_session_key FOREIGN KEY (session_key)
    REFERENCES saffatandsourik.formula1_int.Sessions (session_key) NOT ENFORCED;
ALTER TABLE saffatandsourik.formula1_int.Pit
ADD CONSTRAINT pit_fk_driver_id FOREIGN KEY (driver_id)
    REFERENCES saffatandsourik.formula1_int.Drivers (driver_id) NOT ENFORCED;

-- Race_Results Table
ALTER TABLE saffatandsourik.formula1_int.Race_Results
ADD CONSTRAINT race_results_fk_race_id FOREIGN KEY (race_id)
    REFERENCES saffatandsourik.formula1_int.Races (race_id) NOT ENFORCED;
ALTER TABLE saffatandsourik.formula1_int.Race_Results
ADD CONSTRAINT race_results_fk_driver_id FOREIGN KEY (driver_id)
    REFERENCES saffatandsourik.formula1_int.Drivers (driver_id) NOT ENFORCED;

-- Race_Lap_Times Table
ALTER TABLE saffatandsourik.formula1_int.Race_Lap_Times
ADD CONSTRAINT race_lap_times_fk_race_id FOREIGN KEY (race_id)
    REFERENCES saffatandsourik.formula1_int.Races (race_id) NOT ENFORCED;
ALTER TABLE saffatandsourik.formula1_int.Race_Lap_Times
ADD CONSTRAINT race_lap_times_fk_driver_id FOREIGN KEY (driver_id)
    REFERENCES saffatandsourik.formula1_int.Drivers (driver_id) NOT ENFORCED;

-- Qualifying_Results Table
ALTER TABLE saffatandsourik.formula1_int.Qualifying_Results
ADD CONSTRAINT qualifying_results_fk_driver_id FOREIGN KEY (driver_id)
    REFERENCES saffatandsourik.formula1_int.Drivers (driver_id) NOT ENFORCED;


-- Races Table
ALTER TABLE saffatandsourik.formula1_int.Races
ADD CONSTRAINT races_fk_circuit_id FOREIGN KEY (circuit_id)
    REFERENCES saffatandsourik.formula1_int.Circuits (circuit_id) NOT ENFORCED;

-- Car_Info Table
ALTER TABLE saffatandsourik.formula1_int.Car_Info
ADD CONSTRAINT car_info_fk_driver_id FOREIGN KEY (driver_id)
    REFERENCES saffatandsourik.formula1_int.Drivers (driver_id) NOT ENFORCED;

Executing query with job ID: efa650a6-1340-4ee8-8fad-02cb07b26b00
Query executing: 0.12s


ERROR:
 400 GET https://bigquery.googleapis.com/bigquery/v2/projects/saffatandsourik/queries/efa650a6-1340-4ee8-8fad-02cb07b26b00?maxResults=0&location=us-central1&prettyPrint=false: Already Exists: Constraint sessions_fk_meeting_key at [2:1]

Location: us-central1
Job ID: efa650a6-1340-4ee8-8fad-02cb07b26b00



In [101]:
# Sessions
%%bigquery
  SELECT session_key, COUNT(*) AS repeats
  FROM formula1_int.Sessions
  GROUP BY session_key
  HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,session_key,repeats


In [104]:
# Meetings
%%bigquery
  SELECT meeting_key, COUNT(*) AS repeats
  FROM formula1_int.Meetings
  GROUP BY meeting_key
  HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,meeting_key,repeats


In [115]:
# Laps
%%bigquery
  SELECT session_key, lap_number, driver_id, COUNT(*) AS repeats
  FROM formula1_int.Laps
  GROUP BY session_key, lap_number, driver_id
  HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,session_key,lap_number,driver_id,repeats


In [119]:
# Pit
%%bigquery
  SELECT session_key, lap_number, driver_id, COUNT(*) AS repeats
  FROM formula1_int.Pit
  GROUP BY session_key, lap_number, driver_id
  HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,session_key,lap_number,driver_id,repeats


In [123]:
# Car_Info
%%bigquery
  SELECT car, driver_id, year, COUNT(*) AS repeats
  FROM formula1_int.Car_Info
  GROUP BY car, driver_id, year
  HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,car,driver_id,year,repeats


In [126]:
# Drivers
%%bigquery
  SELECT driver_id, COUNT(*) AS repeats
  FROM formula1_int.Drivers
  GROUP BY driver_id
  HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,driver_id,repeats


In [127]:
# Race_Results
%%bigquery
  SELECT driver_id, race_id, COUNT(*) AS repeats
  FROM formula1_int.Race_Results
  GROUP BY driver_id, race_id
  HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,driver_id,race_id,repeats


In [128]:
# Races
%%bigquery
  SELECT circuit_id, race_id, COUNT(*) AS repeats
  FROM formula1_int.Races
  GROUP BY circuit_id, race_id
  HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,circuit_id,race_id,repeats


In [130]:
# Qualifying_Results
%%bigquery
  SELECT driver_id, year, round, COUNT(*) AS repeats
  FROM formula1_int.Qualifying_Results
  GROUP BY driver_id, year, round
  HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,driver_id,year,round,repeats


In [133]:
# Race_Lap_Times
%%bigquery
  SELECT driver_id, year, race_id, lap_number, COUNT(*) AS repeats
  FROM formula1_int.Race_Lap_Times
  GROUP BY driver_id, year, race_id, lap_number
  HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,driver_id,year,race_id,lap_number,repeats


In [156]:
# Circuits
%%bigquery
  SELECT circuit_id, COUNT(*) AS repeats
  FROM formula1_int.Circuits
  GROUP BY circuit_id
  HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,circuit_id,repeats
