#### Data Processing and Dataset Creation

This section sets up the necessary libraries and functions to process raw basketball game data and generate a structured dataset. The dataset will be created by merging two CSV files:

1. **Schedule File** (the `schedule_file` argument; `./csv/results.csv` in the call below): Contains information about games, including dates, home and away teams, and the winning team.
2. **Game Logs File** (the `gamelogs_file` argument; `./csv/averages.csv` in the call below): Stores detailed statistical data for each game, broken down by team.

##### Key Components:
- **Library Imports**: We import `csv` for reading and writing CSV files, `os` for file-system operations, `defaultdict` (from `collections`) for grouping game logs by team, and `datetime` for parsing and comparing dates.
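- **File Path Handling**: File paths are passed to `create_dataset` as arguments. The example call uses paths relative to the current working directory (`./csv/...`), so the notebook must be run from a directory that contains the `csv` folder.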
- **Data Processing**: The `create_dataset` function:
  - Reads the schedule and game log data.
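  - Groups the game-log rows by team in a dictionary (`defaultdict`) and sorts each team's rows by date for efficient lookups of team statistics.
  - Iterates through scheduled games, retrieving for each team the most recent game-log row dated strictly before the game; games where either team has no such row are skipped.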
  - Merges relevant data into a structured format and writes it to `dataset.csv`.
  - Prints a progress message every 50 games (and after the last game) to track processing.

By running this section, you will generate a cleaned and structured dataset ready for analysis or machine learning applications.

In [None]:
# Importing libraries
import csv
import os
from collections import defaultdict
from datetime import datetime


# Merge the schedule and game-log CSV files to create the dataset
def create_dataset(
    schedule_file: str,
    gamelogs_file: str,
    final_dataset_file: str,
    start_date: str = None,
    final_date: str = None,
):
    """Merge the schedule and the per-team game logs into one dataset CSV.

    For every scheduled game inside the optional date window, the most recent
    game-log row dated strictly before the game is looked up for the home team
    and for the away team (via ``find_closest_stats``). When both rows exist,
    one output row is written: the game's date, teams and winner, followed by
    the home team's log values and then the away team's log values. Games for
    which either team has no earlier log row are skipped and counted.

    Args:
        schedule_file: Path to a CSV with ``date`` (``YYYY-MM-DD``),
            ``home_team``, ``away_team`` and ``winning_team`` columns.
        gamelogs_file: Path to a CSV with ``date`` (``YYYY-MM-DD``) and
            ``team`` columns plus the statistic columns.
        final_dataset_file: Path of the CSV to write. An existing file is
            overwritten.
        start_date: Optional inclusive lower bound (``YYYY-MM-DD``); games
            before it are skipped.
        final_date: Optional inclusive upper bound (``YYYY-MM-DD``); games
            after it are skipped.

    Raises:
        KeyError: If a required column is missing from an input row.
        ValueError: If a date string does not match ``YYYY-MM-DD``.
    """
    date_format = "%Y-%m-%d"

    # Convert the optional window bounds to datetime objects (None = unbounded)
    start_date_obj = datetime.strptime(start_date, date_format) if start_date else None
    final_date_obj = datetime.strptime(final_date, date_format) if final_date else None

    # Read the schedule, keeping the parsed date next to each row so it does
    # not have to be parsed a second time when the rows are merged.
    schedule: list[tuple[datetime, dict[str, str]]] = []
    with open(schedule_file, mode="r", newline="") as file:
        for row in csv.DictReader(file):
            game_date = datetime.strptime(row["date"], date_format)
            if start_date_obj is not None and game_date < start_date_obj:
                continue  # Skip games before the start_date
            if final_date_obj is not None and game_date > final_date_obj:
                continue  # Skip games after the final_date
            schedule.append((game_date, row))

    # Group the game logs by team as (date, row) pairs for fast lookups
    gamelogs_dict = defaultdict(list)
    with open(gamelogs_file, mode="r", newline="") as file:
        for row in csv.DictReader(file):
            log_date_str: str = row["date"]
            team: str = row["team"]
            gamelogs_dict[team].append(
                (datetime.strptime(log_date_str, date_format), row)
            )

    # find_closest_stats scans each list from its end, so sort ascending by date
    for team_logs in gamelogs_dict.values():
        team_logs.sort(key=lambda entry: entry[0])

    # Statistic columns, emitted once with a "home_" and once with an "away_"
    # prefix. NOTE(review): values are written positionally in the game-log
    # file's own column order, so this presumably mirrors that file's stat
    # columns -- confirm against the game-log schema.
    stat_names = (
        "pts", "fg", "fga", "fg_pct", "fg3", "fg3a", "fg3_pct",
        "fg2", "fg2a", "fg2_pct", "ft", "fta", "ft_pct",
        "orb", "drb", "trb", "ast", "stl", "blk", "tov", "pf",
        "ortg", "drtg", "pace", "ftr", "3ptar", "ts",
        "trb_pct", "ast_pct", "stl_pct", "blk_pct", "efg_pct",
        "tov_pct", "orb_pct", "ft_rate", "nrtg", "ast_tov", "ast_ratio",
    )
    headers: list[str] = (
        ["date", "home_team", "away_team", "winning_team"]
        + [f"home_{name}" for name in stat_names]
        + [f"away_{name}" for name in stat_names]
    )

    print("Starting the dataset creation process...\n")

    identifying_columns = {"date", "team"}  # already present in the game columns
    skipped_games = 0

    # Mode "w" truncates an existing file, so no explicit removal is needed
    with open(final_dataset_file, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(headers)

        total_games = len(schedule)
        for i, (game_date, game) in enumerate(schedule, start=1):
            date: str = game["date"]
            home_team: str = game["home_team"]
            away_team: str = game["away_team"]
            winner: str = game["winning_team"]

            home_stats = find_closest_stats(home_team, game_date, gamelogs_dict)
            away_stats = find_closest_stats(away_team, game_date, gamelogs_dict)

            if home_stats and away_stats:
                row: list[str] = (
                    [date, home_team, away_team, winner]
                    + [v for k, v in home_stats.items() if k not in identifying_columns]
                    + [v for k, v in away_stats.items() if k not in identifying_columns]
                )
                writer.writerow(row)
            else:
                # At least one team has no game log before this game
                skipped_games += 1

            # Simple progress output
            if i % 50 == 0 or i == total_games:
                print(f"Processed {i}/{total_games} games...")

    if skipped_games:
        print(f"Skipped {skipped_games} games without earlier game logs for both teams.")

    print(f"\nFinal dataset successfully saved to {final_dataset_file} ✅")
    print("Dataset creation process completed!")


def find_closest_stats(
    team: str, target_date: datetime, gamelogs_dict: defaultdict
) -> dict:
    """Return the latest game-log row for ``team`` dated before ``target_date``.

    ``gamelogs_dict`` maps a team to a list of ``(date, row)`` pairs. The list
    is scanned from its end and the first row whose date is strictly earlier
    than ``target_date`` is returned, so callers are expected to pass lists
    sorted by ascending date (``create_dataset`` sorts them before calling).

    Returns ``None`` when the team has no entries or none of them is earlier
    than ``target_date``.
    """
    # .get() avoids inserting an empty list for unknown teams into the defaultdict
    team_logs = gamelogs_dict.get(team, [])
    earlier_rows = (
        row for logged_on, row in reversed(team_logs) if logged_on < target_date
    )
    # The first match from the end is the most recent earlier entry
    return next(earlier_rows, None)


# Build the dataset from the results and averages CSVs for the chosen date window
create_dataset(
    schedule_file="./csv/results.csv",
    gamelogs_file="./csv/averages.csv",
    final_dataset_file="./csv/dataset.csv",
    start_date="2000-10-31",
    final_date="2025-04-13",
)