In [None]:
# Notebook: 04_Telemetry_Ingestion.ipynb

import os

import fastf1
import numpy as np
import pandas as pd
import s3fs

# --- 1. Configure S3 (MinIO) Connection ---
# Endpoint and credentials can be overridden through environment variables so
# real secrets do not have to live in the notebook. The fallbacks are the
# values this notebook has always used.
endpoint_url = os.environ.get('MINIO_ENDPOINT_URL', 'http://minio:9000')
s3 = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': endpoint_url},
    key=os.environ.get('MINIO_ACCESS_KEY', 'minioadmin'),
    secret=os.environ.get('MINIO_SECRET_KEY', 'minioadmin'),
)

# --- 2. Load Session Data ---
# Enable the FastF1 on-disk cache so repeated runs do not re-download data.
fastf1.Cache.enable_cache('/home/jovyan/.cache/fastf1')

print("Loading session data for Qualifying...")
# Using 2024 Bahrain Qualifying
session = fastf1.get_session(2024, 'Bahrain', 'Q')
session.load(telemetry=True, laps=True, weather=False)

# --- 3. Get Fastest Laps for Two Drivers ---
print("Getting fastest laps for VER and LEC...")
# NOTE(review): newer FastF1 releases appear to prefer `pick_drivers` over
# `pick_driver` -- confirm against the installed version before switching.
ver_lap = session.laps.pick_driver('VER').pick_fastest()
lec_lap = session.laps.pick_driver('LEC').pick_fastest()

# Fail with a clear message instead of an AttributeError further down if a
# driver has no fastest lap (presumably signalled by None -- TODO confirm for
# the installed FastF1 version).
for driver_code, fastest_lap in (('VER', ver_lap), ('LEC', lec_lap)):
    if fastest_lap is None:
        raise ValueError(f"No fastest lap found for driver {driver_code}")

# --- 4. Get Telemetry Data for These Laps ---
print("Fetching telemetry data...")
ver_tel = ver_lap.get_telemetry().add_distance()
lec_tel = lec_lap.get_telemetry().add_distance()

# Add a 'Driver' column so we can group them later
ver_tel['Driver'] = 'VER'
lec_tel['Driver'] = 'LEC'

# --- 5. Combine and Clean Data ---
print("Combining and cleaning data...")
# Stack both laps into one frame. ignore_index=True gives every row a unique
# label; without it both laps keep their own 0..n labels, and duplicate labels
# make label-aligned assignments fragile. The index is not written to the
# Parquet file anyway (index=False below).
telemetry_df = pd.concat([ver_tel, lec_tel], ignore_index=True)

# === THE PARQUET FIX (Apply what we learned) ===
# Columns are replaced with `telemetry_df[col] = ...` rather than
# `telemetry_df.loc[:, col] = ...`: on current pandas the `.loc` form writes
# into the existing column and tries to keep its dtype, so the timedelta /
# datetime columns would not cleanly become float / string columns.

# 1. Convert timedelta columns to float seconds
timedelta_cols = telemetry_df.select_dtypes(include=['timedelta']).columns
if not timedelta_cols.empty:
    print(f"Converting {len(timedelta_cols)} timedelta columns to seconds...")
    for col in timedelta_cols:
        telemetry_df[col] = telemetry_df[col].dt.total_seconds()

# 2. Convert datetime columns (naive and timezone-aware) to strings
datetime_cols = telemetry_df.select_dtypes(
    include=['datetime', 'datetimetz']
).columns
if not datetime_cols.empty:
    print(f"Converting {len(datetime_cols)} datetime columns to string...")
    for col in datetime_cols:
        telemetry_df[col] = telemetry_df[col].astype(str)

# 3. Fill NaNs (important for Spark)
# Telemetry has NaNs for Brake, RPM etc. Let's fill them. This runs after the
# timedelta conversion so the new float-second columns are covered as well.
numeric_cols = telemetry_df.select_dtypes(include=[np.number]).columns
telemetry_df[numeric_cols] = telemetry_df[numeric_cols].fillna(0)
# === END FIX ===

print(f"Total telemetry rows: {len(telemetry_df)}")
print(telemetry_df.head())

# --- 6. Save to MinIO Data Lake ---
file_path = 's3://raw-data/2024_bahrain_Q_telemetry.parquet'
print(f"Saving telemetry data to {file_path}...")

with s3.open(file_path, 'wb') as f:
    # `version` and `coerce_timestamps` are pyarrow writer options, so pin the
    # engine instead of relying on pandas' automatic engine selection.
    telemetry_df.to_parquet(
        f,
        engine='pyarrow',
        index=False,
        version='2.4',
        coerce_timestamps='us',
    )

print("Telemetry ingestion complete!")