# -*- coding: utf-8 -*-
"""
Exploration and Refinement of TRAMS.csv Data

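This notebook loads TRAMS.csv, builds a LineString geometry for each segment
from its start/end coordinates, enriches the segments with tariff and schedule
attributes looked up from TARIFES.csv and HORARIS.csv, and saves the result as
a GeoPackage.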
"""

In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import os
import numpy as np
from shapely.geometry import Point, LineString # To construct geometry
from shapely.errors import GEOSException # For error handling

In [2]:
# --- Configuration ---
# Folder holding the raw CSV exports, relative to the notebook's working directory.
DATA_DIR = "../data/raw" # Adjust if your notebook structure is different

# Full paths to the three input tables, all derived from DATA_DIR.
TRAMS_FILE, TARIFES_FILE, HORARIS_FILE = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("TRAMS.csv", "TARIFES.csv", "HORARIS.csv")
)


In [3]:
# --- Load TRAMS Data ---
# Read the raw TRAMS table. When the file is missing, df_trams becomes an empty
# DataFrame so that later cells can test `df_trams.empty` instead of failing on
# an undefined name.
print(f"Loading TRAMS data from: {TRAMS_FILE}")
try:
    # Dtypes are inferred by pandas; pass `dtype=` here if a column turns out mixed.
    df_trams_initial = pd.read_csv(TRAMS_FILE)
except FileNotFoundError:
    print(f"Error: File not found at {TRAMS_FILE}")
    df_trams = pd.DataFrame()
else:
    print("TRAMS data loaded successfully.")
    # Work on a copy so the frame as loaded stays available unchanged.
    df_trams = df_trams_initial.copy()

Loading TRAMS data from: ../data/raw/TRAMS.csv
TRAMS data loaded successfully.


In [4]:
# --- Basic Info Recap ---
# Print the shape and per-column dtype / non-null summary of the TRAMS table.
if df_trams.empty:
    # Only a notice: nothing is actually halted here, so later cells still run
    # and have to perform their own emptiness checks.
    print("TRAMS DataFrame is empty. Stopping execution.")
else:
    print(
        "--- Basic Info Recap ---",
        f"Shape (Rows, Columns): {df_trams.shape}",
        "\nColumns and Data Types:",
        sep="\n",
    )
    df_trams.info()

# === Data Quality Checks (Run these first if desired) ===
# (You can uncomment and run these blocks from previous steps if needed)
# print("\n--- Running Data Quality Checks ---")
# ... (Include blocks for Missing Value, Uniqueness, Foreign Key, Numeric, Text analysis here if wanted) ...
# print("\n--- Finished Data Quality Checks ---")


--- Basic Info Recap ---
Shape (Rows, Columns): (17275, 17)

Columns and Data Types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17275 entries, 0 to 17274
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   ID_TRAM      17275 non-null  int64  
 1   UTM_IX       17275 non-null  float64
 2   UTM_FX       17275 non-null  float64
 3   UTM_IY       17275 non-null  float64
 4   UTM_FY       17275 non-null  float64
 5   LATITUD_I    17275 non-null  float64
 6   LONGITUD_I   17275 non-null  float64
 7   LATITUD_F    17275 non-null  float64
 8   LONGITUD_F   17275 non-null  float64
 9   ID_TARIFA    17275 non-null  int64  
 10  ID_HORARIO   17275 non-null  int64  
 11  ID_TARIFA1   11643 non-null  float64
 12  ID_HORARIO1  17275 non-null  int64  
 13  TIPUS_TRAM   17275 non-null  object 
 14  ADREÇA       17275 non-null  object 
 15  COLOR_RGB    17275 non-null  object 
 16  PLACES       17275 non-null  int64  
dtypes: 

In [5]:
# --- Load and Process TARIFES Data for Lookup ---
# Builds `tariff_type_lookup`: ID_TARIFA -> human-readable tariff category.
print(f"\nLoading TARIFES data from: {TARIFES_FILE}")
tariff_type_lookup = {} # Stays empty when the file cannot be read
try:
    df_tarifes = pd.read_csv(TARIFES_FILE)
    print("TARIFES data loaded.")

    def map_tariff_type(code):
        """Translate a CODI_TARIFA value into a readable tariff category.

        Args:
            code: Raw tariff code; it is converted with ``str()`` first, so
                non-string values (including NaN) are accepted.

        Returns:
            str: The category label, or ``"Unknown"`` when the code matches
            none of the known exact codes or prefixes.
        """
        label = str(code)

        # Codes that must match exactly.
        if label in ('A', 'B', 'C', 'D'):
            return f"Blue Zone {label}"
        exact_categories = {
            'RES': "Resident",
            'CYD': "Loading/Unloading (DUM)",
        }
        if label in exact_categories:
            return exact_categories[label]

        # Codes recognised by their prefix (no exact code starts with any of these).
        prefix_categories = (
            (('GR',), "Green Zone"),
            (('BU',), "Bus Zone"),
            (('MA', 'MB'), "Motorcycle"),
            (('ED',), "School/Special Zone"),
        )
        for prefixes, category in prefix_categories:
            if label.startswith(prefixes):
                return category
        return "Unknown"

    df_tarifes['TARIFA_TYPE'] = df_tarifes['CODI_TARIFA'].apply(map_tariff_type)

    # Lookup dict: ID_TARIFA -> TARIFA_TYPE.
    # NOTE(review): the keys keep whatever dtype read_csv inferred for ID_TARIFA
    # (presumably integer); nothing here casts them - confirm against the CSV.
    tariff_types_by_id = df_tarifes.set_index('ID_TARIFA')['TARIFA_TYPE']
    tariff_type_lookup = tariff_types_by_id.to_dict()
    print("Created Tariff Type lookup dictionary (using integer ID_TARIFA as key).")

except FileNotFoundError:
    print(f"Error: File not found at {TARIFES_FILE}. Tariff lookup will be empty.")


Loading TARIFES data from: ../data/raw/TARIFES.csv
TARIFES data loaded.
Created Tariff Type lookup dictionary (using integer ID_TARIFA as key).


In [6]:
# --- Load and Process HORARIS Data for Lookup ---
# Builds `schedule_flags_lookup`:
#   ID_HORARI -> [APPLIES_ON_HOLIDAYS, PARKING_ONLY_IN_SCHEDULE]
# Each flag is a bool, or None when the source value is missing.
print(f"\nLoading HORARIS data from: {HORARIS_FILE}")
schedule_flags_lookup = {} # Stays empty when the file cannot be read
try:
    df_horaris = pd.read_csv(HORARIS_FILE)
    print("HORARIS data loaded.")

    df_horaris_indexed = df_horaris.set_index('ID_HORARI')

    # Convert each flag explicitly instead of using `.astype(bool)`: a blanket
    # cast turns a missing value (NaN) into True, which would silently report
    # an unknown flag as "applies". Missing values become None here; every
    # other value is converted with bool() exactly as before.
    # The values are kept as 2-element lists because the mapping cell
    # extracts them by position with an isinstance(..., list) check.
    schedule_flags_lookup = {
        schedule_id: [
            None if pd.isna(raw_flag) else bool(raw_flag)
            for raw_flag in (holiday_raw, parking_raw)
        ]
        for schedule_id, holiday_raw, parking_raw in zip(
            df_horaris_indexed.index,
            df_horaris_indexed['INCLUS_FESTIUS'],
            df_horaris_indexed['PARQUING_SOLS_DINS_HORARI'],
        )
    }

    # ID 0 is treated as an invalid/placeholder schedule: unless HORARIS itself
    # defines it, give it explicit "unknown" flags so lookups on 0 resolve.
    if 0 not in schedule_flags_lookup:
        schedule_flags_lookup[0] = [None, None]

    print("Created Schedule Flags lookup dictionary (using integer ID_HORARI as key).")

except FileNotFoundError:
    print(f"Error: File not found at {HORARIS_FILE}. Schedule lookup will be empty.")


Loading HORARIS data from: ../data/raw/HORARIS.csv
HORARIS data loaded.
Created Schedule Flags lookup dictionary (using integer ID_HORARI as key).


In [7]:
# --- Geometry Construction (from Coordinates) ---

def _segment_from_endpoints(lon_start, lat_start, lon_end, lat_end):
    """Build the LineString joining a segment's start and end coordinates.

    Args:
        lon_start, lat_start: Longitude / latitude of the start point.
        lon_end, lat_end: Longitude / latitude of the end point.

    Returns:
        tuple: ``(geometry, is_zero_length)``. ``geometry`` is None when any
        coordinate is missing, when both end points coincide, or when the
        resulting line is not valid. ``is_zero_length`` is True only in the
        coinciding-end-points case.
    """
    if any(pd.isna(value) for value in (lon_start, lat_start, lon_end, lat_end)):
        return None, False

    point_start = Point(lon_start, lat_start)
    point_end = Point(lon_end, lat_end)
    if point_start.equals(point_end):
        return None, True

    line = LineString([point_start, point_end])
    return (line if line.is_valid else None), False


gdf_trams = None # Stays None unless the GeoDataFrame is built successfully below
coord_cols = ['LONGITUD_I', 'LATITUD_I', 'LONGITUD_F', 'LATITUD_F']

if df_trams.empty or not set(coord_cols).issubset(df_trams.columns):
    print("\nSkipping Geometry Construction: DataFrame empty or coordinate columns missing.")
else:
    print("\n--- Geometry Construction (from Coordinates) ---")

    invalid_geom_count = 0   # rows that end up without a geometry (any reason)
    zero_length_count = 0    # subset of the above: start point equals end point
    geometries = []          # one entry per row of df_trams, in row order
    try:
        print("Constructing LineString geometries...")

        # ID_TRAM is only used in the error message; fall back to 'N/A' if absent.
        if 'ID_TRAM' in df_trams.columns:
            tram_ids = df_trams['ID_TRAM']
        else:
            tram_ids = ['N/A'] * len(df_trams)

        rows = zip(df_trams.index, tram_ids, *(df_trams[col] for col in coord_cols))
        for index, tram_id, lon_i, lat_i, lon_f, lat_f in rows:
            try:
                segment, is_zero_length = _segment_from_endpoints(lon_i, lat_i, lon_f, lat_f)
            except (GEOSException, ValueError) as exc:
                print(f"Error creating geometry for row index {index}, ID_TRAM {tram_id}: {exc}")
                segment, is_zero_length = None, False

            if is_zero_length:
                zero_length_count += 1
            if segment is None:
                invalid_geom_count += 1
            geometries.append(segment)

        print(f"Invalid/problematic geometries found: {invalid_geom_count} (includes {zero_length_count} zero-length)")

        # --- Create GeoDataFrame ---
        print("Creating GeoDataFrame...")
        # The coordinates are longitude/latitude, hence WGS84 (EPSG:4326).
        gdf_trams = gpd.GeoDataFrame(df_trams, geometry=geometries, crs="EPSG:4326")
        # Keep only the rows for which a geometry could be built.
        has_geometry = gdf_trams.geometry.notna()
        gdf_trams = gdf_trams[has_geometry].copy()
        print(f"GeoDataFrame created with {len(gdf_trams)} valid geometries.")

    except ImportError:
        print("\nError: GeoPandas or Shapely not installed. Cannot construct geometries.")
    except Exception as exc:
        print(f"\nAn error occurred during geometry processing: {exc}")




--- Geometry Construction (from Coordinates) ---
Constructing LineString geometries...
Invalid/problematic geometries found: 1 (includes 1 zero-length)
Creating GeoDataFrame...
GeoDataFrame created with 17274 valid geometries.


In [10]:
# --- Merge/Map Tariff and Schedule Information ---
# Adds TARIFA_TYPE, APPLIES_ON_HOLIDAYS and PARKING_ONLY_IN_SCHEDULE to gdf_trams
# and prints a short report. Writing the GeoPackage is left to the dedicated
# "Save Processed Data" cell that follows, so the file is written only once.

def _flag_at(entry, position):
    """Return ``entry[position]`` if ``entry`` is a list long enough, else None.

    Unmapped schedule IDs come back from ``Series.map`` as NaN rather than a
    list, so anything that is not a list is reported as None.
    """
    if isinstance(entry, list) and len(entry) > position:
        return entry[position]
    return None


def _report_mapped_column(frame, column, heading, missing_label):
    """Print the value distribution and the missing count of one mapped column.

    Args:
        frame: DataFrame expected to hold ``column``.
        column: Name of the mapped column to summarise.
        heading: Text used in the "Final Distribution of ..." line.
        missing_label: Text used in the "missing/unmapped ..." line.
    """
    if column not in frame.columns:
        print(f"\n{column} column not created.")
        return
    print(f"\nFinal Distribution of {heading}:")
    print(frame[column].value_counts(dropna=False))
    print(f"Total Trams with missing/unmapped {missing_label}: {frame[column].isnull().sum()}")


# Every input produced by the earlier cells must be available before mapping.
if gdf_trams is None or gdf_trams.empty:
    print("\nError: GeoDataFrame 'gdf_trams' is not available. Cannot perform mapping.")
elif 'tariff_type_lookup' not in locals() or not tariff_type_lookup:
    print("\nError: Tariff lookup dictionary not available. Cannot perform mapping.")
elif 'schedule_flags_lookup' not in locals() or not schedule_flags_lookup:
    print("\nError: Schedule flags lookup dictionary not available. Cannot perform mapping.")
else:
    print("\n--- Mapping Tariff and Schedule Info onto gdf_trams ---")

    # --- Map Tariff Type using ID_TARIFA ---
    try:
        # Coerce to a nullable integer so the keys line up with the lookup's keys
        # and unparseable IDs become <NA> instead of raising.
        tariff_keys_int = pd.to_numeric(gdf_trams['ID_TARIFA'], errors='coerce').astype('Int64')
        gdf_trams['TARIFA_TYPE'] = tariff_keys_int.map(tariff_type_lookup)
        print("Mapped TARIFA_TYPE using ID_TARIFA (as integer key).")
    except Exception as e:
        print(f"Error mapping TARIFA_TYPE: {e}. Check key types.")
        gdf_trams['TARIFA_TYPE'] = None # Column still exists, just empty

    # --- Map Schedule Flags using ID_HORARIO1 ---
    try:
        # A plain int cast: raises (and falls through to the except below) if the
        # column contains missing values.
        schedule_keys_int = gdf_trams['ID_HORARIO1'].astype(int)
        schedule_map_result = schedule_keys_int.map(schedule_flags_lookup)

        # Each mapped value is a [holiday_flag, parking_only_flag] list.
        gdf_trams['APPLIES_ON_HOLIDAYS'] = schedule_map_result.apply(_flag_at, args=(0,))
        gdf_trams['PARKING_ONLY_IN_SCHEDULE'] = schedule_map_result.apply(_flag_at, args=(1,))
        print("Mapped APPLIES_ON_HOLIDAYS and PARKING_ONLY_IN_SCHEDULE using ID_HORARIO1.")
    except Exception as e:
        print(f"Error mapping schedule flags: {e}. Check key types and lookup.")
        gdf_trams['APPLIES_ON_HOLIDAYS'] = None
        gdf_trams['PARKING_ONLY_IN_SCHEDULE'] = None

    # --- Analyze Final Mapping Results ---
    print("\n--- Analysis of Mapped Data ---")
    report_specs = (
        ('TARIFA_TYPE', "Tariff Types", "Tariff Type"),
        ('APPLIES_ON_HOLIDAYS', "APPLIES_ON_HOLIDAYS", "Holiday Flag"),
        ('PARKING_ONLY_IN_SCHEDULE', "PARKING_ONLY_IN_SCHEDULE", "Parking Only Flag"),
    )
    for column, heading, missing_label in report_specs:
        _report_mapped_column(gdf_trams, column, heading, missing_label)

    print("\n--- Enriched GeoDataFrame Head ---")
    cols_to_show = ['ID_TRAM', 'ID_TARIFA', 'TARIFA_TYPE', 'ID_HORARIO1', 'APPLIES_ON_HOLIDAYS', 'PARKING_ONLY_IN_SCHEDULE', 'ADREÇA', 'PLACES', 'geometry']
    # Only select the columns that are actually present.
    cols_to_show = [col for col in cols_to_show if col in gdf_trams.columns]
    print(gdf_trams[cols_to_show].head())

print("\n--- End of Notebook ---") # Added end marker


--- Mapping Tariff and Schedule Info onto gdf_trams ---
Mapped TARIFA_TYPE using ID_TARIFA (as integer key).
Mapped APPLIES_ON_HOLIDAYS and PARKING_ONLY_IN_SCHEDULE using ID_HORARIO1.

--- Analysis of Mapped Data ---

Final Distribution of Tariff Types:
TARIFA_TYPE
Motorcycle                 11643
Loading/Unloading (DUM)     3095
Blue Zone B                 1046
Resident                     850
Blue Zone A                  539
Bus Zone                     100
NaN                            1
Name: count, dtype: int64
Total Trams with missing/unmapped Tariff Type: 1

Final Distribution of APPLIES_ON_HOLIDAYS:
APPLIES_ON_HOLIDAYS
False    17067
True       206
None         1
Name: count, dtype: int64
Total Trams with missing/unmapped Holiday Flag: 1

Final Distribution of PARKING_ONLY_IN_SCHEDULE:
PARKING_ONLY_IN_SCHEDULE
False    16882
True       391
None         1
Name: count, dtype: int64
Total Trams with missing/unmapped Parking Only Flag: 1

--- Enriched GeoDataFrame Head ---
   ID_

In [11]:
# --- Save Processed Data ---
import os  # Already imported in the first cell; kept so this cell also runs on its own

OUTPUT_DIR = "../data/processed"
OUTPUT_FILE = os.path.join(OUTPUT_DIR, "trams_processed.gpkg") # GeoPackage format


def save_trams_geopackage(gdf, output_file=OUTPUT_FILE, layer="trams"):
    """Write the processed GeoDataFrame to a GeoPackage file.

    Args:
        gdf: GeoDataFrame to save. May be None (the geometry cell leaves
            ``gdf_trams`` as None when construction fails) or empty; in both
            cases nothing is written.
        output_file: Destination path; its parent folder is created if needed.
        layer: Layer name inside the GeoPackage.

    Returns:
        bool: True if the file was written, False if the save was skipped or
        the write raised an error (the error is printed, not re-raised).
    """
    # `gdf is None` must be tested before `.empty`: None has no such attribute.
    if gdf is None or gdf.empty:
        print("\nSkipping save: gdf_trams GeoDataFrame not found or is empty.")
        return False

    os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
    try:
        print(f"\nSaving processed GeoDataFrame to: {output_file}")
        # Object-dtype columns can sometimes trip the GeoPackage driver; if the
        # write fails, converting them first may help, e.g.
        #   gdf['some_object_column'] = gdf['some_object_column'].astype(str)
        gdf.to_file(output_file, driver="GPKG", layer=layer)
        print("Saved successfully.")
        return True
    except Exception as e:
        print(f"Error saving GeoDataFrame: {e}")
        print("You might need to check column data types for GeoPackage compatibility (e.g., convert object columns to string).")
        return False


# globals().get() also covers the case where the geometry cell was never run.
trams_saved = save_trams_geopackage(globals().get('gdf_trams'))



Saving processed GeoDataFrame to: ../data/processed/trams_processed.gpkg
Saved successfully.
