"""
# Merge Parking Occupancy and Traffic Data

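This notebook combines the cleaned historical traffic data with the
parking occupancy data. No separate mapping file is loaded: each parking
zone ID is currently assumed to correspond directly to a traffic segment
ID (ID_TRAM). The datasets are joined on timestamp and segment ID, and
static segment attributes from the processed TRAMS GeoPackage are then
added to each row.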
"""

In [1]:
import pandas as pd
import geopandas as gpd # Optional: if using geospatial mapping
import numpy as np
import os
import time


In [2]:
# --- Configuration ---

# Input locations.
# NOTE: PARKING_DATA_FILE is an example/placeholder path - replace it with the
# location of the real processed parking occupancy dataset.
PARKING_DATA_FILE = "../data/processed/parking_occupancy_processed.parquet"
PROCESSED_TRAMS_GPKG = "../data/processed/trams_processed.gpkg"  # processed TRAMS GeoPackage
CLEANED_TRAFFIC_FILE = "../data/cleaned/traffic_history_2022_2023_cleaned.parquet"

# Output location of the final merged feature set.
OUTPUT_DIR = "../data/features"
OUTPUT_FILENAME = "parking_traffic_merged_features.parquet"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, OUTPUT_FILENAME)

# Create the output directory up front (no error if it already exists).
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Column names (adjust to match the actual data) ---

# Parking data columns.
PARKING_TIMESTAMP_COL = "Timestamp"
# IMPORTANT: for now this ID is assumed to correspond to ID_TRAM.
PARKING_ZONE_ID_COL = "ParkingZoneID"
# Target variable.
PARKING_TARGET_COL = "OccupancyPercentage"

# Traffic data columns.
TRAFFIC_TIMESTAMP_COL = "Timestamp"
TRAFFIC_ID_COL = "ID_TRAM"
TRAFFIC_FEATURES = [
    "EstatActual",
    "PrevisioActual",
]

# TRAMS GeoPackage columns (assumed to hold ID_TRAM plus static features).
TRAMS_ID_COL = "ID_TRAM"
# Static features from trams_processed.gpkg to keep; extend this list as needed.
TRAMS_STATIC_FEATURES = [
    "TARIFA_TYPE",
    "APPLIES_ON_HOLIDAYS",
    "PARKING_ONLY_IN_SCHEDULE",
    "PLACES",
]


In [None]:
# --- 1. Load Data ---
# Each input is loaded independently. A variable is left as None when its
# file could not be loaded, so that later steps can detect the failure.
print("Loading datasets...")
start_load_time = time.time()

gdf_trams = None # Initialize
df_parking = None
df_traffic = None

try:
    df_traffic = pd.read_parquet(CLEANED_TRAFFIC_FILE)
    # Keep only necessary columns. The explicit .copy() turns the column
    # subset into an independent frame, so assigning to its columns does not
    # trigger pandas' SettingWithCopyWarning.
    df_traffic = df_traffic[[TRAFFIC_TIMESTAMP_COL, TRAFFIC_ID_COL] + TRAFFIC_FEATURES].copy()
    # Ensure timestamp type
    df_traffic[TRAFFIC_TIMESTAMP_COL] = pd.to_datetime(df_traffic[TRAFFIC_TIMESTAMP_COL])
    print(f"Loaded Traffic Data: {df_traffic.shape}")
except FileNotFoundError:
    print(f"ERROR: Cleaned traffic file not found at {CLEANED_TRAFFIC_FILE}")

try:
    df_parking = pd.read_parquet(PARKING_DATA_FILE)
    # Keep only necessary columns (as an independent copy, see above)
    df_parking = df_parking[[PARKING_TIMESTAMP_COL, PARKING_ZONE_ID_COL, PARKING_TARGET_COL]].copy()
    # Ensure timestamp type
    df_parking[PARKING_TIMESTAMP_COL] = pd.to_datetime(df_parking[PARKING_TIMESTAMP_COL])
    print(f"Loaded Parking Data: {df_parking.shape}")
except FileNotFoundError:
    print(f"ERROR: Parking data file not found at {PARKING_DATA_FILE}. Please update the path.")

try:
    # Load the processed TRAMS GeoPackage
    gdf_trams = gpd.read_file(PROCESSED_TRAMS_GPKG, layer='trams')
    # Keep only necessary columns: ID and static features (+ geometry if needed later)
    requested_cols = [TRAMS_ID_COL] + TRAMS_STATIC_FEATURES + ['geometry']
    # Report requested columns that are absent instead of dropping them silently
    missing_cols = [col for col in requested_cols if col not in gdf_trams.columns]
    if missing_cols:
        print(f"Warning: columns not found in TRAMS GeoPackage and skipped: {missing_cols}")
    # Filter out columns not present in the GeoPackage
    cols_to_keep = [col for col in requested_cols if col in gdf_trams.columns]
    gdf_trams = gdf_trams[cols_to_keep]
    print(f"Loaded TRAMS GeoPackage: {gdf_trams.shape}")
except Exception as e:
    # Broad catch is deliberate: any read failure is reported and leaves
    # gdf_trams unusable for the merge rather than stopping the notebook here.
    gdf_trams = None
    print(f"ERROR: Failed to load TRAMS GeoPackage from {PROCESSED_TRAMS_GPKG}: {e}")

end_load_time = time.time()
print(f"Data loading completed in {end_load_time - start_load_time:.2f} seconds.")


In [None]:
# --- 2. Prepare Data for Merge ---
# This cell aligns the ID column types, left-joins traffic features and static
# TRAM features onto the parking records, fills missing feature values and
# writes the result to OUTPUT_FILE.
# Proceed only if all dataframes loaded successfully
if df_parking is not None and df_traffic is not None and gdf_trams is not None:
    print("\nPreparing data for merge...")

    # Ensure ID columns have compatible types for merging
    try:
        # Convert parking zone ID to the same type as TRAMS ID from GeoPackage
        parking_id_dtype = gdf_trams[TRAMS_ID_COL].dtype
        df_parking[PARKING_ZONE_ID_COL] = df_parking[PARKING_ZONE_ID_COL].astype(parking_id_dtype)
        # Convert traffic ID to the same type
        df_traffic[TRAFFIC_ID_COL] = df_traffic[TRAFFIC_ID_COL].astype(parking_id_dtype)
        print(f"Ensured {PARKING_ZONE_ID_COL}, {TRAFFIC_ID_COL}, and {TRAMS_ID_COL} have compatible type {parking_id_dtype}.")
    except Exception as e:
        print(f"Warning: Could not align ID column types - {e}. Merge might fail or be inefficient.")

    # --- 3. Perform Merge ---
    # We will merge parking data with traffic data first, then add static TRAM features.
    print("\nPerforming merge operations...")
    start_merge_time = time.time()

    # Step 3a: Merge parking data with traffic data
    # Assuming PARKING_ZONE_ID_COL directly maps to TRAFFIC_ID_COL (ID_TRAM)
    print("Merging parking data with traffic data...")
    parking_keys = [PARKING_TIMESTAMP_COL, PARKING_ZONE_ID_COL]
    traffic_keys = [TRAFFIC_TIMESTAMP_COL, TRAFFIC_ID_COL]
    rows_before_traffic_merge = len(df_parking)
    df_merged = pd.merge(
        df_parking,
        df_traffic,
        left_on=parking_keys,
        right_on=traffic_keys,
        how='left' # Keep all parking records, add traffic features where match found
    )
    print(f"Shape after merging parking with traffic: {df_merged.shape}")
    # A left join only grows when the right side has duplicate keys.
    if len(df_merged) > rows_before_traffic_merge:
        print(
            f"Warning: traffic merge added {len(df_merged) - rows_before_traffic_merge} rows; "
            f"traffic data has duplicate ({TRAFFIC_TIMESTAMP_COL}, {TRAFFIC_ID_COL}) keys."
        )

    # Check for missing traffic data after merge
    missing_traffic_count = df_merged[TRAFFIC_FEATURES[0]].isnull().sum() # Check first traffic feature
    print(f"Rows with missing traffic data after merge: {missing_traffic_count}")

    # Clean up redundant key columns from the traffic merge.
    # When a left/right key pair shares one name, pandas keeps a single column
    # under that name; dropping it would remove the parking key itself (e.g.
    # the timestamp). Only right-hand keys with a different name are redundant.
    redundant_traffic_cols = [
        right_key
        for left_key, right_key in zip(parking_keys, traffic_keys)
        if right_key != left_key
    ]
    df_merged.drop(columns=redundant_traffic_cols, inplace=True, errors='ignore')

    # Step 3b: Merge static TRAM features from GeoPackage
    print("\nMerging static TRAM features...")
    # Use only the static features actually present in gdf_trams; selecting an
    # absent column would raise KeyError.
    available_static_features = [col for col in TRAMS_STATIC_FEATURES if col in gdf_trams.columns]
    # Prepare gdf_trams for merge (geometry is left out, keep only ID and static features)
    trams_features_to_merge = gdf_trams[[TRAMS_ID_COL] + available_static_features].copy()
    # Ensure ID column type matches for this merge as well
    trams_features_to_merge[TRAMS_ID_COL] = trams_features_to_merge[TRAMS_ID_COL].astype(df_merged[PARKING_ZONE_ID_COL].dtype)

    rows_before_static_merge = len(df_merged)
    df_merged = pd.merge(
        df_merged,
        trams_features_to_merge,
        left_on=PARKING_ZONE_ID_COL, # Use the parking zone ID
        right_on=TRAMS_ID_COL,       # Match with TRAM ID in GeoPackage features
        how='left'                   # Keep all merged parking/traffic rows
    )
    print(f"Shape after merging static TRAM features: {df_merged.shape}")
    if len(df_merged) > rows_before_static_merge:
        print(
            f"Warning: static feature merge added {len(df_merged) - rows_before_static_merge} rows; "
            f"TRAMS data has duplicate {TRAMS_ID_COL} values."
        )

    # Check for missing static features after merge
    # Check if TRAMS_STATIC_FEATURES exist before checking isnull()
    cols_exist = [col for col in TRAMS_STATIC_FEATURES if col in df_merged.columns]
    missing_static_count = df_merged[cols_exist].isnull().sum().sum() if cols_exist else 0
    print(f"Total missing static TRAM features after merge: {missing_static_count}")

    # Clean up redundant ID column from TRAMS merge (skip when it shares its
    # name with the parking zone ID, because then it IS the zone ID column)
    if TRAMS_ID_COL != PARKING_ZONE_ID_COL:
        df_merged.drop(columns=[TRAMS_ID_COL], inplace=True, errors='ignore')

    end_merge_time = time.time()
    print(f"Merge operations completed in {end_merge_time - start_merge_time:.2f} seconds.")

    # --- 4. Post-Merge Handling for Missing Features ---
    print("\n--- Post-Merge Handling ---")

    # Handle missing Traffic Features (EstatActual, PrevisioActual)
    # These are likely missing due to the Oct 2023 gap or mismatches.
    # Strategy: Fill with 0 (implies 'No Data' / 'Very Fluid' baseline)
    print(f"Filling {missing_traffic_count} instances of missing traffic features with 0...")
    df_merged[TRAFFIC_FEATURES] = df_merged[TRAFFIC_FEATURES].fillna(0).astype('Int8') # Ensure Int8 type

    # Handle missing Static TRAM Features
    # These would be missing if a PARKING_ZONE_ID_COL value didn't exist in trams_processed.gpkg
    # Results are assigned back to the column: `df[col].fillna(..., inplace=True)`
    # acts on a temporary object and is not guaranteed to update df_merged.
    print(f"Handling {missing_static_count} instances of missing static TRAM features...")
    if 'TARIFA_TYPE' in df_merged.columns:
        df_merged['TARIFA_TYPE'] = df_merged['TARIFA_TYPE'].fillna('Unknown')
    if 'APPLIES_ON_HOLIDAYS' in df_merged.columns:
        # Fill NA then convert to bool
        df_merged['APPLIES_ON_HOLIDAYS'] = df_merged['APPLIES_ON_HOLIDAYS'].fillna(False).astype(bool)
    if 'PARKING_ONLY_IN_SCHEDULE' in df_merged.columns:
        # Fill NA then convert to bool
        df_merged['PARKING_ONLY_IN_SCHEDULE'] = df_merged['PARKING_ONLY_IN_SCHEDULE'].fillna(False).astype(bool)
    if 'PLACES' in df_merged.columns:
        # Calculate median *after* potential merge, on the non-NA values
        places_median = df_merged['PLACES'].median()
        if pd.isna(places_median):
            # No row has a PLACES value, so there is nothing to impute from and
            # an integer conversion would fail; leave the column unfilled.
            print("Warning: PLACES has no non-missing values; leaving it unfilled.")
        else:
            # Convert to int after filling NAs
            df_merged['PLACES'] = df_merged['PLACES'].fillna(places_median).astype(int)

    print("\nFinal check for missing values:")
    print(df_merged.isnull().sum())

    # --- 5. Save Merged Dataset ---
    print("\n--- Saving Final Merged Dataset ---")
    print(f"Final dataset shape: {df_merged.shape}")
    print(f"Saving to: {OUTPUT_FILE}")

    start_save_time = time.time()
    try:
        df_merged.to_parquet(OUTPUT_FILE, index=False, engine='pyarrow')
        end_save_time = time.time()
        print(f"Merged data saved successfully in {end_save_time - start_save_time:.2f} seconds.")
    except Exception as e:
        print(f"ERROR saving merged data: {e}")

    print("\n--- Merging Process Complete ---")

else:
    print("\n--- Merging Process Aborted: One or more input dataframes failed to load. ---")
