In [1]:
import os

import fastf1
import pandas as pd
import s3fs

# --- 1. Configure S3 (MinIO) Connection ---
# The defaults are the values this notebook originally hard-coded: the 'minio'
# service name from docker-compose and the 'minioadmin' key/secret. Each one
# can be overridden through an environment variable so that real credentials
# do not have to be written into the notebook.
# NOTE(review): the environment variable names are a suggestion - confirm they
# match what docker-compose exports into this container.
endpoint_url = os.environ.get('MINIO_ENDPOINT_URL', 'http://minio:9000')
s3 = s3fs.S3FileSystem(
    client_kwargs={'endpoint_url': endpoint_url},
    key=os.environ.get('MINIO_ROOT_USER', 'minioadmin'),
    secret=os.environ.get('MINIO_ROOT_PASSWORD', 'minioadmin'),
)

# --- 2. Create Buckets (safe to re-run) ---
# Every bucket is attempted on its own. With a single try block around both
# calls, a FileExistsError for 'raw-data' would skip the creation of
# 'processed-data' entirely.
created_buckets = []
for bucket_name in ('raw-data', 'processed-data'):
    try:
        s3.mkdir(bucket_name)
    except FileExistsError:
        # Already present from an earlier run; nothing to do for this one.
        continue
    created_buckets.append(bucket_name)

if created_buckets:
    print("Buckets created!")
else:
    print("Buckets already exist.")

# --- 3. Ingest Data ---
# Point FastF1 at its on-disk cache; this directory is mounted as a Docker
# volume.
fastf1_cache_dir = '/home/jovyan/.cache/fastf1'
fastf1.Cache.enable_cache(fastf1_cache_dir)

print("Loading session data...")
# 'R' is the race session of the 2024 Bahrain event.
session = fastf1.get_session(2024, 'Bahrain', 'R')
# Request laps and telemetry; weather data is not loaded.
load_options = {'telemetry': True, 'laps': True, 'weather': False}
session.load(**load_options)

# --- 4. Get Laps and Save to MinIO as Parquet ---
# Work on a copy so the conversions below do not rewrite the columns of the
# laps table that `session` itself holds.
laps_df = session.laps.copy()

print("Original dtypes (sample):\n", laps_df.dtypes.head(15))

# Convert timedelta columns to float seconds.
# Plain column assignment (df[col] = ...) replaces the column together with
# its dtype. `df.loc[:, col] = ...` writes into the existing column instead,
# so whether the dtype really changes depends on the pandas version in use.
timedelta_cols = laps_df.select_dtypes(include=['timedelta64[ns]']).columns
if not timedelta_cols.empty:
    print(f"\nConverting {len(timedelta_cols)} timedelta columns to seconds (float):")
    print(list(timedelta_cols))
    for col in timedelta_cols:
        laps_df[col] = laps_df[col].dt.total_seconds()

# Convert datetime columns to their string form.
datetime_cols = laps_df.select_dtypes(include=['datetime64[ns]']).columns
if not datetime_cols.empty:
    print(f"\nConverting {len(datetime_cols)} datetime columns to string:")
    print(list(datetime_cols))
    for col in datetime_cols:
        # NaT (Not a Time) becomes the literal string 'NaT', not a null.
        laps_df[col] = laps_df[col].astype(str)

print("\nNew dtypes (sample):\n", laps_df.dtypes.head(15))

# Destination object inside the 'raw-data' bucket.
file_path = 's3://raw-data/2024_bahrain_laps.parquet'

print(f"Saving laps to {file_path}...")
with s3.open(file_path, 'wb') as f:
    laps_df.to_parquet(
        f,
        # `version` and `coerce_timestamps` are pyarrow writer options, so the
        # engine is pinned rather than left to pandas' automatic selection.
        engine='pyarrow',
        index=False,
        version='2.4',  # Parquet format version to write
        coerce_timestamps='us',  # store any timestamp column at microsecond resolution
    )

# Reported once; the original cell printed this line twice.
print("Ingestion Complete!")

# You can repeat this for telemetry
# Note: Telemetry is huge! This is your "Big Data"
# car_data = session.car_data
# ... save to parquet

Buckets already exist.
Loading session data...


core           INFO 	Loading data for Bahrain Grand Prix - Race [v3.6.1]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for lap_count
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for car_data
req            INFO 	Using cached data for position_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '11', '55', '16', '63', '4', '44', '81', '14', '18', '24', '20', '3', '22', '23', '27', '31', '10', '77', '2']


Original dtypes (sample):
 Time                  timedelta64[ns]
Driver                         object
DriverNumber                   object
LapTime               timedelta64[ns]
LapNumber                     float64
Stint                         float64
PitOutTime            timedelta64[ns]
PitInTime             timedelta64[ns]
Sector1Time           timedelta64[ns]
Sector2Time           timedelta64[ns]
Sector3Time           timedelta64[ns]
Sector1SessionTime    timedelta64[ns]
Sector2SessionTime    timedelta64[ns]
Sector3SessionTime    timedelta64[ns]
SpeedI1                       float64
dtype: object

Converting 11 timedelta columns to seconds (float):
['Time', 'LapTime', 'PitOutTime', 'PitInTime', 'Sector1Time', 'Sector2Time', 'Sector3Time', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime', 'LapStartTime']

Converting 1 datetime columns to string:
['LapStartDate']

New dtypes (sample):
 Time                  float64
Driver                 object
DriverNumber        