# -*- coding: utf-8 -*-
"""
Combine and Process Historical Traffic Status Data

This notebook loads the monthly historical traffic CSV files downloaded previously,
processes them (datetime conversion, resampling to 5-min intervals with 
forward-fill), combines them into a single DataFrame, and saves the result.
"""

In [1]:
import pandas as pd
import numpy as np
import glob
import os
import re
import time 


In [2]:

# --- Configuration ---
# Input and output locations, relative to the notebook's working directory.
# Edit these two paths if your project layout differs.
RAW_TRAFFIC_DIR = "../data/raw/traffic_history"
PROCESSED_DIR = "../data/processed"

# The combined result is written as a single Parquet file (chosen for efficiency).
OUTPUT_FILENAME = "traffic_history_2022_2023_processed.parquet"
OUTPUT_FILE = os.path.join(PROCESSED_DIR, OUTPUT_FILENAME)

# First and last month of the period to process; both ends are inclusive.
START_YEAR = 2022
START_MONTH = 1
END_YEAR = 2023
END_MONTH = 12

# Create the output directory up front; this is a no-op when it already exists.
os.makedirs(PROCESSED_DIR, exist_ok=True)


In [3]:
# --- 1. Find Relevant CSV Files ---
# Monthly files are expected to be named like "2022_01_Gener_TRAMS_TRAMS.csv":
# a 4-digit year, a 2-digit month, a free-form label and a fixed suffix.
print(f"Scanning for relevant CSV files in: {RAW_TRAFFIC_DIR}")
all_csv_files = glob.glob(os.path.join(RAW_TRAFFIC_DIR, "*.csv"))

hist_pattern = re.compile(r"^(\d{4})_(\d{2})_.*?_TRAMS_TRAMS\.csv$", re.IGNORECASE)
period_start = (START_YEAR, START_MONTH)
period_end = (END_YEAR, END_MONTH)

# Collect (year, month, path) for every file whose name matches the pattern
# and whose month falls inside the configured period (both ends inclusive).
dated_files = []
for file_path in all_csv_files:
    match = hist_pattern.match(os.path.basename(file_path))
    if match is None:
        continue
    year, month = int(match.group(1)), int(match.group(2))
    # Tuple comparison is lexicographic: year first, then month.
    if period_start <= (year, month) <= period_end:
        dated_files.append((year, month, file_path))

# Sort on the parsed (year, month) so the processing order is chronological
# regardless of how the directory part of the path happens to be spelled.
dated_files.sort()
files_to_process = [path for _, _, path in dated_files]

print(f"Found {len(files_to_process)} files to process for the period {START_YEAR}-{START_MONTH:02d} to {END_YEAR}-{END_MONTH:02d}.")
if not files_to_process:
    # Execution is deliberately not aborted here: with an empty list the
    # processing loop has nothing to iterate over.
    print("Error: No files found for the specified date range. Please check the directory and date range.")
else:
    # Report months of the requested period that have no file, so a gap in the
    # combined dataset does not go unnoticed.
    found_months = {(y, m) for y, m, _ in dated_files}
    missing_months = [
        f"{y}-{m:02d}"
        for y in range(START_YEAR, END_YEAR + 1)
        for m in range(1, 13)
        if period_start <= (y, m) <= period_end and (y, m) not in found_months
    ]
    if missing_months:
        print(f"Warning: No file found for month(s): {', '.join(missing_months)}. "
              "The combined dataset will have no data for these months.")


Scanning for relevant CSV files in: ../data/raw/traffic_history
Found 23 files to process for the period 2022-01 to 2023-12.


In [4]:
# --- 2. Process Files and Combine ---
# For each monthly CSV: load, normalise column names, parse timestamps, drop
# duplicate readings, then resample every tram to a 5-minute grid with
# forward-fill. One resampled DataFrame per file is collected in processed_dfs.
processed_dfs = []
total_rows_processed = 0
expected_columns = ['ID_TRAM', 'DataHoraLectura', 'EstatActual', 'PrevisioActual']
status_columns = ['EstatActual', 'PrevisioActual']

# Raw CSV header -> column name used in the rest of the notebook.
# Defined once here because it does not change between files.
rename_map = {
    'idTram': 'ID_TRAM',
    'data': 'DataHoraLectura',
    'estatActual': 'EstatActual',
    'estatPrevist': 'PrevisioActual',
}

print("\nStarting processing loop...")
start_time_loop = time.time()

for i, file_path in enumerate(files_to_process):
    filename = os.path.basename(file_path)
    print(f"Processing file {i+1}/{len(files_to_process)}: {filename}...")
    try:
        # Load CSV; malformed lines are reported as warnings and skipped.
        df_month = pd.read_csv(file_path, sep=',', header=0, on_bad_lines='warn')

        # Skip the file if any of the raw columns we rely on is absent.
        missing_cols = [col for col in rename_map if col not in df_month.columns]
        if missing_cols:
            print(f"  Warning: Missing expected columns {missing_cols} in {filename}. Skipping file.")
            continue

        # Rename, keep only the needed columns early to save memory, and take an
        # explicit copy so the column assignment below targets an independent
        # frame rather than a slice of the loaded data.
        df_month = df_month.rename(columns=rename_map)[expected_columns].copy()

        # Parse DataHoraLectura; values that do not match the format become NaT
        # and the corresponding rows are dropped.
        df_month['Timestamp'] = pd.to_datetime(
            df_month['DataHoraLectura'], format='%Y%m%d%H%M%S', errors='coerce'
        )
        df_month = df_month.dropna(subset=['Timestamp'])

        if df_month.empty:
            print(f"  Warning: No valid data after datetime conversion in {filename}. Skipping.")
            continue

        # --- Handle duplicate timestamps before setting the index ---
        # Sort by tram and time (also required so each group's time index is
        # monotonic for resampling), then keep the last entry of every
        # ID_TRAM/Timestamp duplicate.
        df_month = df_month.sort_values(by=['ID_TRAM', 'Timestamp'])
        rows_before_dedup = len(df_month)
        df_month = df_month.drop_duplicates(subset=['ID_TRAM', 'Timestamp'], keep='last')
        rows_after_dedup = len(df_month)
        if rows_before_dedup > rows_after_dedup:
            print(f"  Dropped {rows_before_dedup - rows_after_dedup} duplicate timestamp entries for specific TRAMs.")

        # --- Resampling and forward filling ---
        # This is memory intensive!
        print(f"  Resampling {filename} to 5-minute intervals (this may take time)...")
        start_resample_time = time.time()

        # Selecting the status columns on the groupby keeps the grouping key out
        # of the frames passed to the lambda, so the version-specific
        # `include_groups` keyword of `apply` is not needed. With
        # group_keys=True the tram id is prepended as an index level.
        df_resampled = (
            df_month.set_index('Timestamp')
            .groupby('ID_TRAM', group_keys=True)[status_columns]
            .apply(lambda g: g.resample('5min').ffill())
        )

        # Fail with a clear message instead of a later KeyError if the tram id
        # was not preserved in the index.
        if 'ID_TRAM' not in df_resampled.index.names:
            raise RuntimeError("ID_TRAM was not preserved as an index level after resampling")

        # Bring 'ID_TRAM' and 'Timestamp' back as ordinary columns.
        df_resampled = df_resampled.reset_index()

        end_resample_time = time.time()
        print(f"  Resampling done in {end_resample_time - start_resample_time:.2f} seconds.")

        # Keep only the essential columns, in a fixed order.
        df_resampled = df_resampled[['ID_TRAM', 'Timestamp', 'EstatActual', 'PrevisioActual']]

        # Nullable Int8 keeps memory low and still allows missing values where
        # the forward-fill had nothing earlier to fill from.
        df_resampled = df_resampled.astype({'EstatActual': 'Int8', 'PrevisioActual': 'Int8'})

        processed_dfs.append(df_resampled)
        total_rows_processed += len(df_resampled)
        print(f"  Finished processing {filename}. Rows added: {len(df_resampled)}")

    except Exception as e:
        # Best effort: report the failure and carry on with the next file.
        print(f"  ERROR processing file {filename}: {e}")


Starting processing loop...
Processing file 1/23: 2022_01_Gener_TRAMS_TRAMS.csv...
  Resampling 2022_01_Gener_TRAMS_TRAMS.csv to 5-minute intervals (this may take time)...
  Resampling done in 0.27 seconds.
  Finished processing 2022_01_Gener_TRAMS_TRAMS.csv. Rows added: 4705056
Processing file 2/23: 2022_02_Febrer_TRAMS_TRAMS.csv...
  Resampling 2022_02_Febrer_TRAMS_TRAMS.csv to 5-minute intervals (this may take time)...
  Resampling done in 0.24 seconds.
  Finished processing 2022_02_Febrer_TRAMS_TRAMS.csv. Rows added: 4249728
Processing file 3/23: 2022_03_Marc_TRAMS_TRAMS.csv...
  Resampling 2022_03_Marc_TRAMS_TRAMS.csv to 5-minute intervals (this may take time)...
  Resampling done in 0.27 seconds.
  Finished processing 2022_03_Marc_TRAMS_TRAMS.csv. Rows added: 4705056
Processing file 4/23: 2022_04_Abril_TRAMS_TRAMS.csv...
  Resampling 2022_04_Abril_TRAMS_TRAMS.csv to 5-minute intervals (this may take time)...
  Resampling done in 0.31 seconds.
  Finished processing 2022_04_Abril_

In [5]:
# --- 3. Concatenate All Processed DataFrames ---
if not processed_dfs:
    # Nothing was produced by the processing loop, so there is nothing to write.
    print("\nNo dataframes were processed. Skipping concatenation and saving.")
else:
    print("\nConcatenating all processed monthly DataFrames...")
    start_concat_time = time.time()
    # Stack the monthly frames and replace their indexes with one fresh RangeIndex.
    combined_df = pd.concat(processed_dfs, ignore_index=True)
    end_concat_time = time.time()
    print(f"Concatenation complete in {end_concat_time - start_concat_time:.2f} seconds.")
    print(f"Total rows in combined dataset: {len(combined_df)}")

    # --- 4. Save Combined Data ---
    print(f"\nSaving combined data to: {OUTPUT_FILE}")
    try:
        start_save_time = time.time()
        combined_df.to_parquet(OUTPUT_FILE, index=False, engine='pyarrow')  # 'fastparquet' is an alternative engine
        end_save_time = time.time()
    except Exception as e:
        # A failed write is reported but does not stop the summary below.
        print(f"ERROR saving combined data: {e}")
    else:
        print(f"Saved successfully in {end_save_time - start_save_time:.2f} seconds.")

    # Summary of the final DataFrame, including a deep memory measurement.
    print("\n--- Info of Final Combined DataFrame ---")
    combined_df.info(memory_usage='deep')

# Elapsed time is measured from the start of the processing loop.
end_time_loop = time.time()
print(f"\nTotal processing time: {(end_time_loop - start_time_loop) / 60:.2f} minutes")
print("\n--- End of Notebook ---")


Concatenating all processed monthly DataFrames...
Concatenation complete in 0.67 seconds.
Total rows in combined dataset: 106964597

Saving combined data to: ../data/processed/traffic_history_2022_2023_processed.parquet
Saved successfully in 4.42 seconds.

--- Info of Final Combined DataFrame ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106964597 entries, 0 to 106964596
Data columns (total 4 columns):
 #   Column          Dtype         
---  ------          -----         
 0   ID_TRAM         int64         
 1   Timestamp       datetime64[ns]
 2   EstatActual     Int8          
 3   PrevisioActual  Int8          
dtypes: Int8(2), datetime64[ns](1), int64(1)
memory usage: 2.0 GB

Total processing time: 0.56 minutes

--- End of Notebook ---
