In [None]:
import os

import fastf1 as ff1
import pandas as pd

# Directory where FastF1 stores downloaded API responses between runs.
CACHE_DIR = 'Data/.cache'

# Cache.enable_cache() expects the directory to already exist, so create it
# first; exist_ok=True makes re-running this cell harmless.
os.makedirs(CACHE_DIR, exist_ok=True)
ff1.Cache.enable_cache(CACHE_DIR)

In [None]:
# Collect one row per (season, event, driver) describing the race outcome.
# Rows are accumulated as plain dicts in `all_race_data`.

# ErgastError has lived in different modules across FastF1 releases; try the
# dedicated exceptions module first, then the original location.
try:
    from fastf1.exceptions import ErgastError
except ImportError:
    from fastf1.ergast.interface import ErgastError

# Seasons to collect; range() excludes its end value, so this is 2019-2023.
YEARS_TO_COLLECT = range(2019, 2024)

# Weekend formats that contain both a 'Q' and an 'R' session. 'sprint' is the
# format name FastF1 uses for the 2021/2022 sprint weekends; without it those
# races would be silently skipped.
RACE_EVENT_FORMATS = frozenset(
    {'conventional', 'sprint', 'sprint_shootout', 'sprint_qualifying'}
)

# Placeholder used when no usable grid slot is known (missing data or a
# pit-lane start, reported as 0). This is a modelling choice - revisit it.
DEFAULT_GRID_POSITION = 20

all_race_data = []

for year in YEARS_TO_COLLECT:
    print(f"Fetching data for year: {year}")
    # Get the event schedule for the year
    try:
        schedule = ff1.get_event_schedule(year, include_testing=False)
    except Exception as e:
        print(f"Could not get schedule for {year}: {e}")
        continue

    # Loop through each event in the schedule
    for _, event in schedule.iterrows():
        if event['EventFormat'] not in RACE_EVENT_FORMATS:
            continue

        event_name = event['EventName']
        print(f"  Processing Event: {event_name}")

        try:
            # --- Load Qualifying Data ---
            qualify_session = ff1.get_session(year, event_name, 'Q')
            # Only the classification is needed from qualifying.
            qualify_session.load(laps=False, telemetry=False, weather=False,
                                 messages=False, results=True)
            qualify_results = qualify_session.results
            if qualify_results is None or qualify_results.empty:
                print(f"    No Qualifying results found for {event_name}, {year}. Skipping.")
                continue
            # Qualifying sessions populate 'Position' (the qualifying
            # classification); 'GridPosition' is only filled for race and
            # sprint sessions, so it cannot be read from here.
            qualifying_positions = (
                qualify_results.set_index('Abbreviation')['Position'].to_dict()
            )

            # --- Load Race Data ---
            race_session = ff1.get_session(year, event_name, 'R')  # 'R' for Race
            # Laps and weather are loaded for later feature engineering.
            # Be mindful of memory: telemetry=True can be very heavy.
            race_session.load(laps=True, telemetry=False, weather=True,
                              messages=False, results=True)
            race_results = race_session.results
            if race_results is None or race_results.empty:
                print(f"    No Race results found for {event_name}, {year}. Skipping.")
                continue

            for drv_abbr, result in race_results.set_index('Abbreviation').iterrows():
                # Prefer the actual starting slot from the race results (it
                # reflects penalties and sprint outcomes); fall back to the
                # qualifying classification when it is missing.
                grid_pos = result.get('GridPosition')
                if grid_pos is None or pd.isna(grid_pos):
                    grid_pos = qualifying_positions.get(drv_abbr)

                # 0 means a pit-lane start; missing values mean no data.
                if grid_pos is None or pd.isna(grid_pos) or grid_pos == 0:
                    grid_pos = DEFAULT_GRID_POSITION

                # Basic features for this driver *for this race*
                driver_data = {
                    'Year': year,
                    'EventName': event_name,
                    'DriverAbbreviation': drv_abbr,
                    'TeamName': result['TeamName'],
                    'GridPosition': grid_pos,
                    'FinishPosition': result['Position'],
                    'Status': result['Status'],
                    'Points': result['Points'],
                    # TARGET VARIABLE: Did the driver win? (Position == 1)
                    'WonRace': 1 if result['Position'] == 1 else 0,
                    # Add more features here! E.g., from race_session.laps, race_session.weather_data
                    # You'll also need to engineer features based on *previous* races (lagged features)
                }
                all_race_data.append(driver_data)
        except ErgastError as e:
            print(f"    Ergast error loading data for {event_name}, {year}: {e}. Skipping.")
        except ff1.RateLimitExceededError as e:
            print(f"    Rate limit exceeded for {event_name}, {year}: {e}. Consider adding delays.")
            # Potentially add a time.sleep(60) here and retry or break
        except Exception as e:
            # Notebook-level boundary: report and move on so one bad event
            # does not abort the whole collection run.
            print(f"    An error occurred processing {event_name}, {year}: {e}. Skipping.")
