"""
# Traffic Data Cleaning Pipeline

This notebook loads the processed traffic data, applies cleaning steps 
identified during the quality audit (primarily handling missing values),
and saves the cleaned dataset.
"""

In [1]:
import pandas as pd
import numpy as np
import os
import time

In [2]:
# --- Configuration ---
# Input: the processed traffic history that this notebook cleans.
INPUT_FILE = "../data/processed/traffic_history_2022_2023_processed.parquet"

# Output: cleaned data is written to a directory separate from the input.
OUTPUT_DIR = "../data/cleaned"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # no-op when the directory already exists

OUTPUT_FILENAME = "traffic_history_2022_2023_cleaned.parquet"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, OUTPUT_FILENAME)


In [3]:
# --- 1. Load Data ---
# Read the input parquet into `df`, timing the read, then report the frame's
# shape and per-column null counts as the pre-cleaning baseline.
print(f"Loading data from: {INPUT_FILE}")

start_load_time = time.time()
df = pd.read_parquet(INPUT_FILE)
end_load_time = time.time()

print(f"Data loaded in {end_load_time - start_load_time:.2f} seconds.")
print(f"Original shape: {df.shape}")

print("\nMissing values before cleaning:")
missing_before = df.isna().sum()  # one count per column
print(missing_before)

Loading data from: ../data/processed/traffic_history_2022_2023_processed.parquet
Data loaded in 1.78 seconds.
Original shape: (106964597, 4)

Missing values before cleaning:
ID_TRAM               0
Timestamp             0
EstatActual       12221
PrevisioActual    12221
dtype: int64


In [4]:
# --- 2. Handle Missing Values ---
# Strategy: forward fill within each segment (ID_TRAM) group, assuming a
# status persists until a new reading is available. Only a small share of
# rows (0.011%) is missing, so no more complex imputation is used.
#
# A forward fill cannot fill NaNs that come before a segment's first valid
# reading, so any NaNs left over are back-filled from the next valid reading
# in the same segment. Rows still NaN after both passes are dropped
# (presumably segments with no valid reading at all).
# NOTE(review): back-filling copies a later reading onto earlier timestamps;
# confirm that is acceptable for downstream consumers of this dataset.

group_col = 'ID_TRAM'
fill_cols = ['EstatActual', 'PrevisioActual']

print("\nHandling missing values using forward fill within each ID_TRAM group...")
start_ffill_time = time.time()

# Sort so each segment's rows are in chronological order before filling.
# Rebinding instead of inplace=True keeps the step free of in-place mutation.
df = df.sort_values(by=[group_col, 'Timestamp'])

# Fill both columns in a single grouped pass rather than one groupby per column.
df[fill_cols] = df.groupby(group_col)[fill_cols].ffill()

# NaNs can remain when the first record(s) of a segment are NaN.
remaining_na = df[fill_cols].isnull().sum()
print("\nMissing values after forward fill:")
print(remaining_na)

if remaining_na.sum() > 0:
    print("\nHandling remaining NaNs (likely first entries for some segments)...")
    # Backward fill: leading NaNs take the next valid reading in their group.
    df[fill_cols] = df.groupby(group_col)[fill_cols].bfill()

    print("\nMissing values after second pass (backward fill):")
    print(df[fill_cols].isnull().sum())

    # Last resort: drop rows neither pass could fill, and say how many were
    # removed so the loss is visible in the notebook output.
    rows_before_drop = len(df)
    df = df.dropna(subset=fill_cols)
    rows_dropped = rows_before_drop - len(df)
    if rows_dropped:
        print(f"Dropped {rows_dropped} rows that could not be filled.")


end_ffill_time = time.time()
print(f"Missing value handling done in {end_ffill_time - start_ffill_time:.2f} seconds.")


Handling missing values using forward fill within each ID_TRAM group...

Missing values after forward fill:
EstatActual       532
PrevisioActual    532
dtype: int64

Handling remaining NaNs (likely first entries for some segments)...

Missing values after second pass (backward fill):
EstatActual       0
PrevisioActual    0
dtype: int64
Missing value handling done in 13.68 seconds.


In [5]:
# --- 3. Final Data Check ---
# Report the cleaned frame's shape, per-column null counts and dtypes, then
# assert that the cleaned columns contain no missing values so that a failed
# cleaning stops a Run All before the save step.
print("\n--- Final Data Check ---")
print(f"Cleaned shape: {df.shape}")

print("\nMissing values after all cleaning:")
final_na = df.isnull().sum()
print(final_na)

print("\nData types:")
print(df.dtypes)  # Check that the fills did not change the column dtypes

# Only the columns targeted by the cleaning step are asserted on.
cleaned_cols = ['EstatActual', 'PrevisioActual']
leftover_na = final_na[cleaned_cols]
assert (leftover_na == 0).all(), (
    f"Cleaning left missing values:\n{leftover_na[leftover_na > 0]}"
)


--- Final Data Check ---
Cleaned shape: (106964597, 4)

Missing values after all cleaning:
ID_TRAM           0
Timestamp         0
EstatActual       0
PrevisioActual    0
dtype: int64

Data types:
ID_TRAM                    int64
Timestamp         datetime64[ns]
EstatActual                 Int8
PrevisioActual              Int8
dtype: object


In [6]:

# --- 4. Save Cleaned Data ---
# Write the cleaned frame to OUTPUT_FILE as parquet via the pyarrow engine,
# without the DataFrame index.
print(f"\nSaving cleaned data to: {OUTPUT_FILE}")
start_save_time = time.time()
try:
    df.to_parquet(OUTPUT_FILE, index=False, engine='pyarrow')
except Exception as e:
    # Report the failure, then re-raise so the cell fails visibly instead of
    # going on to announce that cleaning completed.
    print(f"ERROR saving cleaned data: {e}")
    raise
else:
    end_save_time = time.time()
    print(f"Cleaned data saved successfully in {end_save_time - start_save_time:.2f} seconds.")
    # Completion is only announced once the file has actually been written.
    print("\n--- Data Cleaning Complete ---")


Saving cleaned data to: ../data/cleaned/traffic_history_2022_2023_cleaned.parquet
Cleaned data saved successfully in 3.19 seconds.

--- Data Cleaning Complete ---
