# -*- coding: utf-8 -*-
"""
# Traffic Data Exploration (2022-2023)

This notebook analyzes the combined historical traffic dataset to identify patterns and trends
across time-of-day, weekdays, seasons, and geographic segments.

The dataset covers January 2022 through December 2023 (except October 2023)
with 5-minute interval traffic status for Barcelona road segments.
"""


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
import gc
from datetime import datetime
import calendar


In [2]:
# Set plot styles
plt.style.use('ggplot')
sns.set(style="whitegrid")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12

In [3]:
# --- Configuration ---
PARQUET_FILE = "../data/processed/traffic_history_2022_2023_processed.parquet"
SAMPLE_SIZE = 5_000_000  # For memory-intensive operations, use sample for initial exploration


In [4]:
# --- Helper Functions ---
def create_time_features(df):
    """Extract time-based features from timestamp column."""
    # Create time-based features
    df['year'] = df['Timestamp'].dt.year
    df['month'] = df['Timestamp'].dt.month
    df['day'] = df['Timestamp'].dt.day
    df['dayofweek'] = df['Timestamp'].dt.dayofweek  # 0=Monday, 6=Sunday
    df['hour'] = df['Timestamp'].dt.hour
    df['minute'] = df['Timestamp'].dt.minute
    
    # Create categorical period features
    df['weekday'] = df['dayofweek'].apply(lambda x: calendar.day_name[x])
    df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)
    df['month_name'] = df['month'].apply(lambda x: calendar.month_name[x])
    
    # Time of day categories (can customize these)
    time_categories = [
        (0, 6, 'Night'),
        (6, 10, 'Morning Rush'),
        (10, 16, 'Daytime'),
        (16, 20, 'Evening Rush'),
        (20, 24, 'Evening')
    ]
    
    def categorize_hour(h):
        for start, end, label in time_categories:
            if start <= h < end:
                return label
        return 'Unknown'
    
    df['time_of_day'] = df['hour'].apply(categorize_hour)
    
    # Create season column
    def get_season(month):
        if month in [12, 1, 2]:
            return 'Winter'
        elif month in [3, 4, 5]:
            return 'Spring'
        elif month in [6, 7, 8]:
            return 'Summer'
        else:
            return 'Fall'
    
    df['season'] = df['month'].apply(get_season)
    
    return df

def decode_traffic_status(status_value):
    """Decode numeric traffic status values to descriptive labels."""
    status_map = {
        0: 'No Data',
        1: 'Very Fluid',
        2: 'Fluid',
        3: 'Dense',
        4: 'Very Dense',
        5: 'Congested',
        6: 'Blocked'
    }
    return status_map.get(status_value, f'Unknown ({status_value})')

# ---------- START OF ANALYSIS ----------


In [5]:
# --- 1. Load the data ---
print(f"Loading traffic data from {PARQUET_FILE}...")
df_traffic = pd.read_parquet(PARQUET_FILE)

# Display basic info
print("\n--- Basic Dataset Information ---")
print(f"Dataset shape: {df_traffic.shape}")
print(f"Memory usage: {df_traffic.memory_usage().sum() / 1024**2:.2f} MB")
print("\nColumn information:")
df_traffic.info()

print("\n--- Summary Statistics ---")
print(df_traffic.describe())

# Check for missing values
print("\n--- Missing Values ---")
print(df_traffic.isna().sum())


Loading traffic data from ../data/processed/traffic_history_2022_2023_processed.parquet...

--- Basic Dataset Information ---
Dataset shape: (106964597, 4)
Memory usage: 2040.19 MB

Column information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 106964597 entries, 0 to 106964596
Data columns (total 4 columns):
 #   Column          Dtype         
---  ------          -----         
 0   ID_TRAM         int64         
 1   Timestamp       datetime64[ns]
 2   EstatActual     Int8          
 3   PrevisioActual  Int8          
dtypes: Int8(2), datetime64[ns](1), int64(1)
memory usage: 2.0 GB

--- Summary Statistics ---
            ID_TRAM                      Timestamp  EstatActual  \
count  1.069646e+08                      106964597  106952376.0   
mean   2.668614e+02  2022-12-19 13:40:13.519811328     1.097657   
min    1.000000e+00            2022-01-01 00:00:00          0.0   
25%    1.330000e+02            2022-06-25 09:10:00          0.0   
50%    2.660000e+02            2022-1

In [6]:
# --- 2. Explore traffic status distributions ---
print("\n--- Traffic Status Distribution ---")
print("EstatActual (Current Status):")
estat_counts = df_traffic['EstatActual'].value_counts().sort_index()
for status, count in estat_counts.items():
    percent = 100 * count / len(df_traffic)
    print(f"  {status} ({decode_traffic_status(status)}): {count:,} ({percent:.2f}%)")

print("\nPrevisioActual (Forecast Status):")
previsio_counts = df_traffic['PrevisioActual'].value_counts().sort_index()
for status, count in previsio_counts.items():
    percent = 100 * count / len(df_traffic)
    print(f"  {status} ({decode_traffic_status(status)}): {count:,} ({percent:.2f}%)")


--- Traffic Status Distribution ---
EstatActual (Current Status):
  0 (No Data): 45,327,003 (42.38%)
  1 (Very Fluid): 22,201,398 (20.76%)
  2 (Fluid): 30,331,548 (28.36%)
  3 (Dense): 5,153,886 (4.82%)
  4 (Very Dense): 1,868,649 (1.75%)
  5 (Congested): 823,123 (0.77%)
  6 (Blocked): 1,246,769 (1.17%)

PrevisioActual (Forecast Status):
  0 (No Data): 50,538,664 (47.25%)
  1 (Very Fluid): 19,969,789 (18.67%)
  2 (Fluid): 25,870,938 (24.19%)
  3 (Dense): 6,164,167 (5.76%)
  4 (Very Dense): 2,177,776 (2.04%)
  5 (Congested): 984,273 (0.92%)
  6 (Blocked): 1,246,769 (1.17%)


In [7]:
# --- 3. Create time-based features for a sample ---
print(f"\nTaking a sample of {SAMPLE_SIZE:,} rows for detailed time analysis...")
df_sample = df_traffic.sample(n=min(SAMPLE_SIZE, len(df_traffic)), random_state=42)
df_sample = create_time_features(df_sample)

# Free memory
del df_traffic
gc.collect()

print("\nSample with time features:")
print(df_sample.head())


Taking a sample of 5,000,000 rows for detailed time analysis...

Sample with time features:
          ID_TRAM           Timestamp  EstatActual  PrevisioActual  year  \
82444374      408 2023-06-29 01:55:00            1               1  2023   
60990859       56 2023-02-14 21:10:00            0               0  2023   
3409026       382 2022-01-26 21:30:00            0               0  2022   
77531139      376 2023-05-13 09:50:00            2               2  2023   
8949472       534 2022-02-10 13:20:00            2               2  2022   

          month  day  dayofweek  hour  minute    weekday  is_weekend  \
82444374      6   29          3     1      55   Thursday           0   
60990859      2   14          1    21      10    Tuesday           0   
3409026       1   26          2    21      30  Wednesday           0   
77531139      5   13          5     9      50   Saturday           1   
8949472       2   10          3    13      20   Thursday           0   

         month_na

In [8]:
# --- 4. Time of Day Analysis ---
print("\n--- Traffic by Time of Day ---")
# Calculate average traffic status by hour
hourly_traffic = df_sample.groupby('hour')['EstatActual'].mean().reset_index()
hourly_traffic.columns = ['Hour', 'Average Traffic Status']

# Create a bar plot
plt.figure(figsize=(14, 7))
sns.barplot(x='Hour', y='Average Traffic Status', data=hourly_traffic)
plt.title('Average Traffic Status by Hour (0 = No Data, 5 = Congested)')
plt.xlabel('Hour of Day')
plt.ylabel('Average Traffic Status')
plt.xticks(range(0, 24))
plt.tight_layout()
plt.savefig('traffic_by_hour.png')
plt.close()
print("Plot saved: traffic_by_hour.png")



--- Traffic by Time of Day ---
Plot saved: traffic_by_hour.png


In [9]:
# --- 5. Day of Week Analysis ---
print("\n--- Traffic by Day of Week ---")
# Calculate average traffic by day of week and hour (heatmap)
day_hour_traffic = df_sample.groupby(['dayofweek', 'hour'])['EstatActual'].mean().reset_index()

# Ensure numeric data types before pivoting
day_hour_traffic['EstatActual'] = pd.to_numeric(day_hour_traffic['EstatActual'], errors='coerce')
day_hour_traffic['dayofweek'] = pd.to_numeric(day_hour_traffic['dayofweek'], errors='coerce')
day_hour_traffic['hour'] = pd.to_numeric(day_hour_traffic['hour'], errors='coerce')

# Create pivot table with explicit float values
day_hour_pivot = day_hour_traffic.pivot(index='dayofweek', columns='hour', values='EstatActual')

# Fill any NaN values that might cause dtype issues
day_hour_pivot = day_hour_pivot.fillna(0).astype(float)

# Create a heatmap
plt.figure(figsize=(16, 8))
sns.heatmap(day_hour_pivot, cmap='YlOrRd', annot=False, fmt=".2f", 
            xticklabels=range(24),
            yticklabels=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])
plt.title('Average Traffic Status by Day of Week and Hour')
plt.xlabel('Hour of Day')
plt.ylabel('Day of Week')
plt.tight_layout()
plt.savefig('traffic_day_hour_heatmap.png')
plt.close()
print("Plot saved: traffic_day_hour_heatmap.png")


--- Traffic by Day of Week ---
Plot saved: traffic_day_hour_heatmap.png


In [10]:
# --- 6. Monthly Analysis ---
print("\n--- Traffic by Month ---")
# Calculate average traffic by month
monthly_traffic = df_sample.groupby(['year', 'month'])['EstatActual'].mean().reset_index()
monthly_traffic['year_month'] = monthly_traffic['year'].astype(str) + '-' + monthly_traffic['month'].astype(str).str.zfill(2)

# Create a bar plot
plt.figure(figsize=(14, 7))
sns.barplot(x='year_month', y='EstatActual', data=monthly_traffic)
plt.title('Average Traffic Status by Month')
plt.xlabel('Year-Month')
plt.ylabel('Average Traffic Status')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig('traffic_by_month.png')
plt.close()
print("Plot saved: traffic_by_month.png")


--- Traffic by Month ---
Plot saved: traffic_by_month.png


In [11]:
# --- 7. Yearly Comparison ---
print("\n--- Yearly Comparison (2022 vs 2023) ---")
yearly_traffic = df_sample.groupby(['year', 'month'])['EstatActual'].mean().reset_index()

# Create line plots for yearly comparison
plt.figure(figsize=(14, 7))
for year in yearly_traffic['year'].unique():
    year_data = yearly_traffic[yearly_traffic['year'] == year]
    plt.plot(year_data['month'], year_data['EstatActual'], marker='o', linewidth=2, label=str(year))

plt.title('Monthly Average Traffic Status: 2022 vs 2023')
plt.xlabel('Month')
plt.ylabel('Average Traffic Status')
plt.xticks(range(1, 13), calendar.month_abbr[1:13])
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.savefig('yearly_comparison.png')
plt.close()
print("Plot saved: yearly_comparison.png")


--- Yearly Comparison (2022 vs 2023) ---
Plot saved: yearly_comparison.png


In [12]:
# --- 8. Top Congested Segments Analysis ---
print("\n--- Top Congested Road Segments ---")
segment_stats = df_sample.groupby('ID_TRAM')['EstatActual'].agg(['mean', 'median', 'count']).reset_index()
segment_stats = segment_stats.sort_values('mean', ascending=False)

print("\nTop 10 Most Congested Segments:")
print(segment_stats.head(10))

# Plot top 5 congested segments
top_segments = segment_stats.head(5)['ID_TRAM'].tolist()
top_segments_data = df_sample[df_sample['ID_TRAM'].isin(top_segments)]

plt.figure(figsize=(14, 8))
sns.boxplot(x='ID_TRAM', y='EstatActual', data=top_segments_data)
plt.title('Traffic Status Distribution for Top 5 Congested Segments')
plt.xlabel('Segment ID')
plt.ylabel('Traffic Status')
plt.tight_layout()
plt.savefig('top_congested_segments.png')
plt.close()
print("Plot saved: top_congested_segments.png")


--- Top Congested Road Segments ---

Top 10 Most Congested Segments:
     ID_TRAM      mean  median  count
258      259  5.999369     6.0   9511
256      257  5.974387     6.0   9136
257      258  5.972547     6.0   9325
255      256  5.970348     6.0   9443
184      185  4.295428     6.0   9383
186      187  3.294168     2.0   9362
131      132  2.762374     3.0   9334
441      443  2.654571     3.0   9342
443      445  2.558704     3.0   9352
150      151  2.546475     2.0   9532
Plot saved: top_congested_segments.png


In [13]:
# --- 9. Time of Day Patterns Across Segments ---
print("\n--- Time of Day Patterns Across Different Segments ---")
# Get 5 random segments for comparison
random_segments = df_sample['ID_TRAM'].drop_duplicates().sample(5).tolist()
random_segments_data = df_sample[df_sample['ID_TRAM'].isin(random_segments)]

# Plot hour patterns by segment using Plotly
fig = px.line(random_segments_data.groupby(['ID_TRAM', 'hour'])['EstatActual'].mean().reset_index(),
              x='hour', y='EstatActual', color='ID_TRAM', 
              title='Hourly Traffic Patterns for 5 Random Segments',
              labels={'hour': 'Hour of Day', 'EstatActual': 'Average Traffic Status', 'ID_TRAM': 'Segment ID'})
fig.write_html('segment_hourly_patterns.html')
print("Interactive plot saved: segment_hourly_patterns.html")



--- Time of Day Patterns Across Different Segments ---
Interactive plot saved: segment_hourly_patterns.html


In [14]:
# --- 10. Show Summary Findings ---
print("\n--- Summary of Key Findings ---")
print("1. Time of Day Patterns: Peak traffic hours are visible in the hourly plots")
print("2. Day of Week Patterns: Weekday vs weekend patterns in the heatmap")
print("3. Seasonal Variations: Monthly trends show seasonal traffic patterns")
print("4. Year-over-Year: Differences between 2022 and 2023 traffic conditions")
print("5. Road Segment Analysis: Identified top congested road segments")

print("\nExploration completed! Check the generated plots for visual insights.")


--- Summary of Key Findings ---
1. Time of Day Patterns: Peak traffic hours are visible in the hourly plots
2. Day of Week Patterns: Weekday vs weekend patterns in the heatmap
3. Seasonal Variations: Monthly trends show seasonal traffic patterns
4. Year-over-Year: Differences between 2022 and 2023 traffic conditions
5. Road Segment Analysis: Identified top congested road segments

Exploration completed! Check the generated plots for visual insights.
