# In [None]:
# Notebook: 07_Bulk_Ingestion.ipynb

import os
import time

import fastf1
import numpy as np
import pandas as pd
import s3fs

# --- 1. Configure S3 (MinIO) Connection ---
# `s3` is the filesystem handle used for every existence check and write
# further down; it targets the MinIO endpoint below rather than a default one.
# NOTE(review): the access key and secret are hard-coded here - consider
# supplying them from the environment before sharing this notebook.
endpoint_url = 'http://minIO:9000'
s3 = s3fs.S3FileSystem(
    key='minioadmin',
    secret='minioadmin',
    client_kwargs=dict(endpoint_url=endpoint_url),
)

# --- 2. Define the 2024 Calendar ---
# Simplified (round number, race label) pairs for the season, listed in
# calendar order. The round number is derived from the position in the list,
# counting from 1.
# The ingestion loop wraps each race in try/except, so a bad entry is reported
# instead of crashing the whole run.
races_2024 = list(enumerate(
    [
        "Bahrain",
        "Saudi Arabia",
        "Australia",
        "Japan",
        "China",
        "Miami",
        "Emilia Romagna",
        "Monaco",
        "Canada",
        "Spain",
        "Austria",
        "Great Britain",
        "Hungary",
        "Belgium",
        "Netherlands",
        "Italy",
        "Azerbaijan",
        "Singapore",
        "United States",
        "Mexico",
        "Brazil",
        "Las Vegas",
        "Qatar",
        "Abu Dhabi",
    ],
    start=1,
))

# Ensure the output directory exists
s3.mkdirs('raw-data/laps', exist_ok=True)
print("S3 'raw-data/laps' directory is ready.")

# --- 3. Enable FastF1 Cache ---
# FastF1 refuses to enable its cache when the target directory is missing,
# which breaks the first run in a fresh environment. Create the directory up
# front; with exist_ok=True this is a no-op when it is already there.
fastf1_cache_dir = '/home/jovyan/.cache/fastf1'
os.makedirs(fastf1_cache_dir, exist_ok=True)
fastf1.Cache.enable_cache(fastf1_cache_dir)

# --- 4. Loop, Ingest, and Save ---
# For every race in the calendar: skip it when its Parquet file is already in
# the bucket, otherwise load the race-session laps, make the frame
# Parquet-friendly and upload it. Each race runs inside its own try/except so
# a single failure is reported and the remaining races are still processed.
print("Starting bulk ingestion for 2024 season...")

for round_num, race_name in races_2024:
    file_path = f's3://raw-data/laps/2024_{round_num:02d}_{race_name}.parquet'

    # Check if file already exists in our data lake
    if s3.exists(file_path):
        print(f"Skipping {race_name} (Round {round_num}): Already exists.")
        continue

    try:
        print(f"Processing {race_name} (Round {round_num})...")

        # Load the RACE session.
        # The event is selected by round number rather than by label: a name
        # is fuzzy-matched by FastF1, and labels such as "Italy" or
        # "United States" can be ambiguous between several events.
        session = fastf1.get_session(2024, round_num, 'R')
        session.load(telemetry=False, laps=True, weather=False)  # Laps only

        # Work on a copy so the session's own laps table is left untouched.
        laps_df = session.laps.copy()

        if laps_df.empty:
            print(f"No lap data found for {race_name}. Skipping.")
            continue

        # === THE PARQUET FIX (Apply what we learned) ===
        # Columns are replaced with plain `df[col] = ...` assignment.
        # `df.loc[:, col] = ...` tries to write into the existing column in
        # place on recent pandas versions, so the dtype change may silently
        # not take effect (strings can be parsed straight back to datetimes).

        # 1. Convert timedelta columns to float seconds
        timedelta_cols = laps_df.select_dtypes(include=['timedelta64[ns]']).columns
        for col in timedelta_cols:
            laps_df[col] = laps_df[col].dt.total_seconds()

        # 2. Convert datetime columns to strings
        datetime_cols = laps_df.select_dtypes(include=['datetime64[ns]']).columns
        for col in datetime_cols:
            laps_df[col] = laps_df[col].astype(str)
        # === END FIX ===

        # Serialize completely in memory before touching the bucket. If the
        # conversion to Parquet fails, no object is created, so the
        # "already exists" check above cannot later skip a broken file.
        parquet_bytes = laps_df.to_parquet(
            index=False,
            version='2.4',
            coerce_timestamps='us'
        )

        # Save to MinIO Data Lake
        print(f"  -> Saving to {file_path}")
        with s3.open(file_path, 'wb') as f:
            f.write(parquet_bytes)

        # Be nice to the API
        time.sleep(2)  # 2-second delay

    except Exception as e:
        # Deliberate best-effort boundary: report the failure and move on to
        # the next race instead of aborting the whole season.
        print(f"!!! FAILED to process {race_name}: {e}")

print("\n--- Bulk Ingestion Complete! ---")