
"""
Traffic Data Quality Audit (2022-2023)

This notebook performs quality checks on the processed traffic dataset to ensure:
1. Temporal completeness and consistency
2. Value distributions and anomalies
3. Missing data patterns
4. Spatial coverage
5. Data integrity across joins
"""

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import calendar
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Configuration
plt.style.use('ggplot')
TRAFFIC_FILE = "../data/processed/traffic_history_2022_2023_processed.parquet"
TRAMS_FILE = "../data/raw/traffic_history/trams_info.csv"  # If you have segment metadata

In [3]:
# 1. Load and Initial Check
print("Loading traffic data...")
df = pd.read_parquet(TRAFFIC_FILE)
print(f"\nDataset Shape: {df.shape}")
print(f"Memory Usage: {df.memory_usage().sum() / 1024**2:.2f} MB")


Loading traffic data...

Dataset Shape: (106964597, 4)
Memory Usage: 2040.19 MB


In [4]:
# 2. Temporal Completeness Check
print("\n=== Temporal Completeness Audit ===")

# Check date range
date_range = df['Timestamp'].agg(['min', 'max'])
print("\nDate Range:")
print(f"Start: {date_range['min']}")
print(f"End: {date_range['max']}")

# Check for missing dates
all_dates = pd.date_range(start=date_range['min'].date(), 
                         end=date_range['max'].date(), 
                         freq='D')
dates_present = df['Timestamp'].dt.date.unique()
missing_dates = set(all_dates.date) - set(dates_present)
if missing_dates:
    print("\nMissing Dates:")
    for date in sorted(missing_dates):
        print(f"- {date}")

# Check 5-minute interval consistency
print("\nChecking 5-minute interval consistency...")
sample_segment = df['ID_TRAM'].iloc[0]
segment_data = df[df['ID_TRAM'] == sample_segment].copy()
segment_data['time_diff'] = segment_data['Timestamp'].diff()
irregular_intervals = segment_data[segment_data['time_diff'] != timedelta(minutes=5)]
if not irregular_intervals.empty:
    print(f"\nFound {len(irregular_intervals)} irregular intervals in sample segment {sample_segment}")
    print("Sample of irregular intervals:")
    print(irregular_intervals.head())



=== Temporal Completeness Audit ===

Date Range:
Start: 2022-01-01 00:00:00
End: 2023-12-31 23:50:00

Missing Dates:
- 2023-10-01
- 2023-10-02
- 2023-10-03
- 2023-10-04
- 2023-10-05
- 2023-10-06
- 2023-10-07
- 2023-10-08
- 2023-10-09
- 2023-10-10
- 2023-10-11
- 2023-10-12
- 2023-10-13
- 2023-10-14
- 2023-10-15
- 2023-10-16
- 2023-10-17
- 2023-10-18
- 2023-10-19
- 2023-10-20
- 2023-10-21
- 2023-10-22
- 2023-10-23
- 2023-10-24
- 2023-10-25
- 2023-10-26
- 2023-10-27
- 2023-10-28
- 2023-10-29
- 2023-10-30
- 2023-10-31

Checking 5-minute interval consistency...

Found 4 irregular intervals in sample segment 1
Sample of irregular intervals:
          ID_TRAM           Timestamp  EstatActual  PrevisioActual  \
0               1 2022-01-01 00:00:00         <NA>            <NA>   
78929261        1 2023-06-01 00:10:00         <NA>            <NA>   
93024069        1 2023-09-01 00:10:00         <NA>            <NA>   
97619485        1 2023-11-01 00:05:00         <NA>            <NA>   

     

In [5]:
# 3. Value Distribution Analysis
print("\n=== Value Distribution Audit ===")

# Traffic status distribution
status_dist = df['EstatActual'].value_counts().sort_index()
print("\nTraffic Status Distribution:")
for status, count in status_dist.items():
    percentage = 100 * count / len(df)
    print(f"Status {status}: {count:,} ({percentage:.2f}%)")

# Check for invalid values
valid_status_values = {0, 1, 2, 3, 4, 5, 6}  # Expected valid values
invalid_status = df[~df['EstatActual'].isin(valid_status_values)]
if not invalid_status.empty:
    print("\nFound invalid status values:")
    print(invalid_status['EstatActual'].value_counts())


=== Value Distribution Audit ===

Traffic Status Distribution:
Status 0: 45,327,003 (42.38%)
Status 1: 22,201,398 (20.76%)
Status 2: 30,331,548 (28.36%)
Status 3: 5,153,886 (4.82%)
Status 4: 1,868,649 (1.75%)
Status 5: 823,123 (0.77%)
Status 6: 1,246,769 (1.17%)

Found invalid status values:
Series([], Name: count, dtype: Int64)


In [6]:
# 4. Spatial Coverage Analysis
print("\n=== Spatial Coverage Audit ===")

# Count unique segments
n_segments = df['ID_TRAM'].nunique()
print(f"\nTotal unique segments: {n_segments}")

# Segment coverage over time
segment_dates = df.groupby('ID_TRAM')['Timestamp'].agg(['min', 'max'])
print("\nSegment temporal coverage summary:")
print(segment_dates.describe())

# Check for segments with gaps
print("\nChecking for segments with significant gaps...")
for segment in df['ID_TRAM'].unique():
    segment_data = df[df['ID_TRAM'] == segment].copy()
    segment_data = segment_data.sort_values('Timestamp')
    segment_data['time_diff'] = segment_data['Timestamp'].diff()
    large_gaps = segment_data[segment_data['time_diff'] > timedelta(hours=1)]
    if not large_gaps.empty:
        print(f"\nSegment {segment} has {len(large_gaps)} gaps > 1 hour")
        print("Largest gap:", segment_data['time_diff'].max())



=== Spatial Coverage Audit ===

Total unique segments: 532

Segment temporal coverage summary:
                                 min                            max
count                            532                            532
mean   2022-01-01 20:23:38.233082624  2023-12-31 23:49:59.999999744
min              2022-01-01 00:00:00            2023-12-31 23:50:00
25%              2022-01-01 00:00:00            2023-12-31 23:50:00
50%              2022-01-01 00:00:00            2023-12-31 23:50:00
75%              2022-01-01 00:00:00            2023-12-31 23:50:00
max              2022-04-01 09:55:00            2023-12-31 23:50:00

Checking for segments with significant gaps...

Segment 1 has 1 gaps > 1 hour
Largest gap: 31 days 00:10:00

Segment 2 has 1 gaps > 1 hour
Largest gap: 31 days 00:10:00

Segment 3 has 1 gaps > 1 hour
Largest gap: 31 days 00:10:00

Segment 4 has 1 gaps > 1 hour
Largest gap: 31 days 00:10:00

Segment 5 has 1 gaps > 1 hour
Largest gap: 31 days 00:10:00

Segmen

In [7]:
# 5. Data Consistency Checks
print("\n=== Data Consistency Audit ===")

# Check relationship between EstatActual and PrevisioActual
correlation = df['EstatActual'].corr(df['PrevisioActual'])
print(f"\nCorrelation between actual and forecast: {correlation:.3f}")

# Create confusion matrix between actual and forecast
conf_matrix = pd.crosstab(df['EstatActual'], df['PrevisioActual'], 
                         normalize='index')
print("\nNormalized confusion matrix (Actual vs Forecast):")
print(conf_matrix)


=== Data Consistency Audit ===

Correlation between actual and forecast: 0.908

Normalized confusion matrix (Actual vs Forecast):
PrevisioActual         0         1         2         3         4         5  \
EstatActual                                                                  
0               0.999888  0.000037  0.000040  0.000011  0.000005  0.000019   
1               0.120078  0.759182  0.106444  0.011368  0.001433  0.001495   
2               0.079978  0.095850  0.732040  0.082507  0.008115  0.001510   
3               0.016417  0.033377  0.223972  0.565681  0.142013  0.018538   
4               0.013081  0.013949  0.065444  0.230522  0.520584  0.156420   
5               0.019298  0.009550  0.030886  0.075924  0.236736  0.627605   
6               0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   

PrevisioActual    6  
EstatActual          
0               0.0  
1               0.0  
2               0.0  
3               0.0  
4               0.0  
5           

In [8]:
# 6. Visualize Quality Metrics
print("\n=== Creating Quality Visualization ===")

# Data completeness heatmap
plt.figure(figsize=(15, 8))
daily_counts = df.groupby([df['Timestamp'].dt.date, 'ID_TRAM']).size().unstack()
sns.heatmap(daily_counts.notna(), cmap='RdYlGn', cbar_kws={'label': 'Has Data'})
plt.title('Data Completeness by Segment and Date')
plt.xlabel('Segment ID')
plt.ylabel('Date')
plt.tight_layout()
plt.savefig('data_completeness_heatmap.png')
plt.close()


=== Creating Quality Visualization ===


In [11]:
# 6.1: Distribution of Missing Values Over Time
print("\\nVisualizing missing values distribution over time...")
missing_data = df[df['EstatActual'].isna()].copy()
if not missing_data.empty:
    missing_data['year_month'] = missing_data['Timestamp'].dt.to_period('M')
    monthly_missing = missing_data.groupby('year_month').size()
    
    plt.figure(figsize=(14, 7))
    monthly_missing.plot(kind='bar')
    plt.title('Number of Missing Traffic Status Records per Month')
    plt.xlabel('Year-Month')
    plt.ylabel('Number of Missing Records')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig('missing_values_by_month.png')
    plt.close()
    print("Plot saved: missing_values_by_month.png")
else:
    print("No missing values found to visualize.")


\nVisualizing missing values distribution over time...
Plot saved: missing_values_by_month.png


In [12]:
# 6.2: Forecast Error Distribution
print("\\nAnalyzing forecast error distribution...")
df_no_na = df.dropna(subset=['EstatActual', 'PrevisioActual']).copy()
df_no_na['ForecastError'] = (df_no_na['EstatActual'].astype(int) - df_no_na['PrevisioActual'].astype(int))

plt.figure(figsize=(12, 6))
sns.histplot(df_no_na['ForecastError'], bins=range(-6, 7), kde=False)
plt.title('Distribution of Forecast Error (Actual - Forecast)')
plt.xlabel('Error Value')
plt.ylabel('Frequency')
plt.grid(axis='y')
plt.tight_layout()
plt.savefig('forecast_error_distribution.png')
plt.close()
print("Plot saved: forecast_error_distribution.png")
del df_no_na # Free memory

\nAnalyzing forecast error distribution...
Plot saved: forecast_error_distribution.png


In [13]:
# 6.3: Interval Consistency Check (Sample Segments)
print("\\nChecking interval consistency for sample segments...")
sample_segment_ids = np.random.choice(df['ID_TRAM'].unique(), 3, replace=False)
plt.figure(figsize=(15, 5 * len(sample_segment_ids)))
for i, segment_id in enumerate(sample_segment_ids):
    segment_data = df[df['ID_TRAM'] == segment_id].copy()
    segment_data['time_diff_minutes'] = segment_data['Timestamp'].diff().dt.total_seconds() / 60
    segment_data = segment_data.dropna(subset=['time_diff_minutes']) # Drop first NaN
    
    ax = plt.subplot(len(sample_segment_ids), 1, i + 1)
    sns.histplot(segment_data['time_diff_minutes'], bins=np.arange(0, 65, 5), kde=False, ax=ax)
    ax.set_title(f'Time Interval Distribution for Segment {segment_id}')
    ax.set_xlabel('Time Difference (Minutes)')
    ax.set_ylabel('Frequency')
    ax.set_xlim(0, 60) # Focus on intervals up to 1 hour
    ax.grid(axis='y')

plt.tight_layout()
plt.savefig('interval_consistency_samples.png')
plt.close()
print("Plot saved: interval_consistency_samples.png")



\nChecking interval consistency for sample segments...
Plot saved: interval_consistency_samples.png


In [9]:
# 7. Summary Statistics
print("\n=== Summary Statistics ===")
print("\nNumerical columns summary:")
print(df.describe())

print("\nMissing values:")
print(df.isnull().sum())



=== Summary Statistics ===

Numerical columns summary:
            ID_TRAM                      Timestamp  EstatActual  \
count  1.069646e+08                      106964597  106952376.0   
mean   2.668614e+02  2022-12-19 13:40:13.519811328     1.097657   
min    1.000000e+00            2022-01-01 00:00:00          0.0   
25%    1.330000e+02            2022-06-25 09:10:00          0.0   
50%    2.660000e+02            2022-12-16 21:55:00          1.0   
75%    3.990000e+02            2023-06-09 10:50:00          2.0   
max    5.390000e+02            2023-12-31 23:50:00          6.0   
std    1.544199e+02                            NaN     1.209313   

       PrevisioActual  
count     106952376.0  
mean         1.040811  
min               0.0  
25%               0.0  
50%               1.0  
75%               2.0  
max               6.0  
std          1.251094  

Missing values:
ID_TRAM               0
Timestamp             0
EstatActual       12221
PrevisioActual    12221
dtype: int6

In [10]:
# 8. Generate Quality Report
report = f"""
Traffic Data Quality Report
==========================
Generated on: {datetime.now()}

Dataset Overview:
----------------
- Total Records: {len(df):,}
- Date Range: {date_range['min']} to {date_range['max']}
- Number of Segments: {n_segments}
- Memory Usage: {df.memory_usage().sum() / 1024**2:.2f} MB

Temporal Completeness:
---------------------
- Missing Dates: {len(missing_dates)}
- 5-min Interval Consistency: {len(irregular_intervals)} irregularities found

Value Distribution:
-----------------
- Valid Status Values: {len(status_dist)} unique values
- Invalid Status Values: {len(invalid_status)} records

Spatial Coverage:
---------------
- Complete Segment Coverage: {(segment_dates['max'] - segment_dates['min']).describe()}

Data Consistency:
---------------
- Actual-Forecast Correlation: {correlation:.3f}

Quality Metrics:
--------------
- Data Completeness: {100 * (1 - df.isnull().sum().sum() / (len(df) * len(df.columns))):.2f}%
- Temporal Coverage: {100 * (1 - len(missing_dates) / len(all_dates)):.2f}%
"""

# Save report
with open('traffic_data_quality_report.txt', 'w') as f:
    f.write(report)

print("\nQuality audit complete. Report saved to 'traffic_data_quality_report.txt'")


Quality audit complete. Report saved to 'traffic_data_quality_report.txt'


In [14]:
# --- 7. Detailed Segment/Time Analysis ---
print("\\n=== Detailed Segment/Time Analysis ===")

# 7.1: Analysis of Segment with Most Missing Values
print("\\nAnalyzing segment with most missing values...")
missing_by_segment = df[df['EstatActual'].isna()]['ID_TRAM'].value_counts()
if not missing_by_segment.empty:
    top_missing_segment = missing_by_segment.index[0]
    print(f"Segment with most missing values: {top_missing_segment} ({missing_by_segment.iloc[0]} missing)")
    
    segment_missing_data = df[(df['ID_TRAM'] == top_missing_segment) & (df['EstatActual'].isna())].copy()
    segment_missing_data['date'] = segment_missing_data['Timestamp'].dt.date
    missing_trend = segment_missing_data.groupby('date').size()
    
    plt.figure(figsize=(14, 7))
    missing_trend.plot()
    plt.title(f'Daily Missing Records for Segment {top_missing_segment}')
    plt.xlabel('Date')
    plt.ylabel('Number of Missing Records')
    plt.tight_layout()
    plt.savefig(f'segment_{top_missing_segment}_missing_trend.png')
    plt.close()
    print(f"Plot saved: segment_{top_missing_segment}_missing_trend.png")
else:
    print("No missing values to analyze by segment.")

\n=== Detailed Segment/Time Analysis ===
\nAnalyzing segment with most missing values...
Segment with most missing values: 1 (23 missing)
Plot saved: segment_1_missing_trend.png


In [15]:
# 7.2: Analysis of Time Period with Most Missing Values
# (Assuming monthly analysis from 6.1 is sufficient, otherwise group by day/hour)
print("\\nRefer to 'missing_values_by_month.png' for time periods with most missing values.")


\nRefer to 'missing_values_by_month.png' for time periods with most missing values.


In [16]:
# 7.3: Quality Check for Top Congested Segment (ID 185)
print("\\nAnalyzing data quality for Segment 185 (Top Congested)...")
segment_185_data = df[df['ID_TRAM'] == 185].copy()
missing_185 = segment_185_data['EstatActual'].isna().sum()
print(f"Missing values for Segment 185: {missing_185}")

segment_185_data['time_diff_minutes'] = segment_185_data['Timestamp'].diff().dt.total_seconds() / 60
gaps_185 = segment_185_data[segment_185_data['time_diff_minutes'] > 60] # Gaps > 1 hour
print(f"Number of gaps > 1 hour for Segment 185: {len(gaps_185)}")
if not gaps_185.empty:
    print(f"Largest gap for Segment 185: {segment_185_data['time_diff_minutes'].max():.2f} minutes")


\nAnalyzing data quality for Segment 185 (Top Congested)...
Missing values for Segment 185: 23
Number of gaps > 1 hour for Segment 185: 1
Largest gap for Segment 185: 44650.00 minutes


In [17]:
# 7.4: Quality Check for August (Lowest Traffic Month)
print("\\nAnalyzing data quality for August months...")
august_data = df[df['Timestamp'].dt.month == 8].copy()
missing_august = august_data['EstatActual'].isna().sum()
total_august = len(august_data)
print(f"Missing values in August: {missing_august} ({100 * missing_august / total_august:.3f}%)")

# Check interval consistency for a random segment during August
random_segment_aug = np.random.choice(august_data['ID_TRAM'].unique())
august_segment_data = august_data[august_data['ID_TRAM'] == random_segment_aug].copy()
august_segment_data['time_diff_minutes'] = august_segment_data['Timestamp'].diff().dt.total_seconds() / 60
gaps_august = august_segment_data[august_segment_data['time_diff_minutes'] > 60]
print(f"Gaps > 1 hour for Segment {random_segment_aug} in August: {len(gaps_august)}")


\nAnalyzing data quality for August months...
Missing values in August: 1064 (0.011%)
Gaps > 1 hour for Segment 215 in August: 1
