# Data Validation Report

This notebook validates the NYC Yellow Taxi trip data and provides:
- **Completeness Check**: Identifies missing months in the dataset
- **Schema Validation**: Ensures consistency across all Parquet files
- **Data Quality Metrics**: Row counts, null values, and data ranges
- **Schema Documentation**: Detailed field descriptions for use in other notebooks

Run this notebook before performing analysis to ensure data integrity.

In [1]:
import duckdb
import polars as pl
from pathlib import Path
from datetime import datetime, timedelta
import json

## 1. Data Inventory

Scan the data directory and catalog all available files.

In [2]:
# Configuration
data_dir = Path("data")
data_type = "yellow"
pattern = f"{data_type}_tripdata_*.parquet"

# Find all data files
data_files = sorted(data_dir.glob(pattern))

print(f"Data Directory: {data_dir.absolute()}")
print(f"Pattern: {pattern}")
print(f"Files Found: {len(data_files)}")
print("\n" + "="*80)

Data Directory: C:\Users\Rob\Desktop\ai-for-the-rest\examples\nyc-congestion-pricing\data
Pattern: yellow_tripdata_*.parquet
Files Found: 59



In [3]:
# Extract file metadata
file_info = []
for file_path in data_files:
    # Parse year-month from filename
    # Format: yellow_tripdata_YYYY-MM.parquet
    parts = file_path.stem.split("_")
    if len(parts) >= 3:
        year_month = parts[2]  # e.g., "2021-01"
        year, month = map(int, year_month.split("-"))
        
        file_info.append({
            "filename": file_path.name,
            "year": year,
            "month": month,
            "year_month": year_month,
            "size_mb": file_path.stat().st_size / (1024 * 1024),
            "path": str(file_path)
        })

# Convert to Polars DataFrame for easy analysis
if len(file_info) > 0:
    files_df = pl.DataFrame(file_info).sort(["year", "month"])
    print(f"\nFile Inventory ({len(files_df)} files):")
    print(files_df.select(["year_month", "size_mb"]))
else:
    files_df = pl.DataFrame()
    print("\nNo valid data files found")


File Inventory (59 files):
shape: (59, 2)
┌────────────┬───────────┐
│ year_month ┆ size_mb   │
│ ---        ┆ ---       │
│ str        ┆ f64       │
╞════════════╪═══════════╡
│ 2021-01    ┆ 20.681445 │
│ 2021-02    ┆ 20.768412 │
│ 2021-03    ┆ 28.617718 │
│ 2021-04    ┆ 32.442627 │
│ 2021-05    ┆ 36.948854 │
│ …          ┆ …         │
│ 2025-07    ┆ 63.842514 │
│ 2025-08    ┆ 59.407943 │
│ 2025-09    ┆ 69.077439 │
│ 2025-10    ┆ 71.780766 │
│ 2025-11    ┆ 67.838912 │
└────────────┴───────────┘


## 2. Completeness Check

Identify missing months in the expected date range.

In [4]:
if len(files_df) > 0:
    # Determine expected date range
    min_year = files_df["year"].min()
    min_month = files_df.filter(pl.col("year") == min_year)["month"].min()
    max_year = files_df["year"].max()
    max_month = files_df.filter(pl.col("year") == max_year)["month"].max()
    
    print(f"Date Range: {min_year}-{min_month:02d} to {max_year}-{max_month:02d}")
    
    # Generate expected months
    start_date = datetime(min_year, min_month, 1)
    end_date = datetime(max_year, max_month, 1)
    
    expected_months = []
    current = start_date
    while current <= end_date:
        expected_months.append(f"{current.year}-{current.month:02d}")
        # Move to next month
        if current.month == 12:
            current = current.replace(year=current.year + 1, month=1)
        else:
            current = current.replace(month=current.month + 1)
    
    # Find missing months
    available_months = set(files_df["year_month"].to_list())
    missing_months = [m for m in expected_months if m not in available_months]
    
    print(f"\nExpected Months: {len(expected_months)}")
    print(f"Available Months: {len(available_months)}")
    print(f"Missing Months: {len(missing_months)}")
    
    if missing_months:
        print(f"\n⚠️  ALERT: Missing data for the following months:")
        for month in missing_months:
            print(f"   - {month}")
    else:
        print("\n✓ All expected months are present")
else:
    print("⚠️  No data files found!")

Date Range: 2021-01 to 2025-11

Expected Months: 59
Available Months: 59
Missing Months: 0

✓ All expected months are present


## 3. Schema Validation

Verify that all files have consistent schemas.

In [5]:
# Get schema from the first file as reference
if len(data_files) > 0:
    reference_file = data_files[0]
    
    # Use DuckDB to get schema information
    reference_schema = duckdb.sql(f"""
        SELECT * FROM '{reference_file}' LIMIT 0
    """).description
    
    schema_columns = [col[0] for col in reference_schema]
    schema_types = [col[1] for col in reference_schema]
    
    print(f"Reference Schema (from {reference_file.name}):")
    print(f"Total Columns: {len(schema_columns)}\n")
    
    # Display schema
    schema_df = pl.DataFrame({
        "column_name": schema_columns,
        "data_type": [str(t) for t in schema_types]
    })
    print(schema_df)
else:
    print("No files to analyze")
    schema_columns = []
    schema_types = []

Reference Schema (from yellow_tripdata_2021-01.parquet):
Total Columns: 19

shape: (19, 2)
┌───────────────────────┬───────────┐
│ column_name           ┆ data_type │
│ ---                   ┆ ---       │
│ str                   ┆ str       │
╞═══════════════════════╪═══════════╡
│ VendorID              ┆ BIGINT    │
│ tpep_pickup_datetime  ┆ TIMESTAMP │
│ tpep_dropoff_datetime ┆ TIMESTAMP │
│ passenger_count       ┆ DOUBLE    │
│ trip_distance         ┆ DOUBLE    │
│ …                     ┆ …         │
│ tolls_amount          ┆ DOUBLE    │
│ improvement_surcharge ┆ DOUBLE    │
│ total_amount          ┆ DOUBLE    │
│ congestion_surcharge  ┆ DOUBLE    │
│ airport_fee           ┆ DOUBLE    │
└───────────────────────┴───────────┘


In [6]:
# Check schema consistency across all files
if len(data_files) > 1:
    print("\nChecking schema consistency across all files...\n")
    
    inconsistent_files = []
    
    for file_path in data_files[1:]:
        file_schema = duckdb.sql(f"""
            SELECT * FROM '{file_path}' LIMIT 0
        """).description
        
        file_columns = [col[0] for col in file_schema]
        file_types = [str(col[1]) for col in file_schema]
        
        # Check for differences
        if file_columns != schema_columns:
            inconsistent_files.append({
                "file": file_path.name,
                "issue": "Column names mismatch",
                "details": f"Expected {len(schema_columns)} columns, got {len(file_columns)}"
            })
        elif [str(t) for t in schema_types] != file_types:
            inconsistent_files.append({
                "file": file_path.name,
                "issue": "Data types mismatch",
                "details": "Column types differ from reference"
            })
    
    if inconsistent_files:
        print(f"⚠️  ALERT: Found {len(inconsistent_files)} files with schema inconsistencies:\n")
        for issue in inconsistent_files:
            print(f"   - {issue['file']}")
            print(f"     {issue['issue']}: {issue['details']}")
    else:
        print("✓ All files have consistent schemas")
elif len(data_files) == 1:
    print("Only one file available - cannot check consistency")


Checking schema consistency across all files...

⚠️  ALERT: Found 34 files with schema inconsistencies:

   - yellow_tripdata_2023-02.parquet
     Column names mismatch: Expected 19 columns, got 19
   - yellow_tripdata_2023-03.parquet
     Column names mismatch: Expected 19 columns, got 19
   - yellow_tripdata_2023-04.parquet
     Column names mismatch: Expected 19 columns, got 19
   - yellow_tripdata_2023-05.parquet
     Column names mismatch: Expected 19 columns, got 19
   - yellow_tripdata_2023-06.parquet
     Column names mismatch: Expected 19 columns, got 19
   - yellow_tripdata_2023-07.parquet
     Column names mismatch: Expected 19 columns, got 19
   - yellow_tripdata_2023-08.parquet
     Column names mismatch: Expected 19 columns, got 19
   - yellow_tripdata_2023-09.parquet
     Column names mismatch: Expected 19 columns, got 19
   - yellow_tripdata_2023-10.parquet
     Column names mismatch: Expected 19 columns, got 19
   - yellow_tripdata_2023-11.parquet
     Column names mi

## 4. Data Quality Metrics

Analyze row counts, null values, and data ranges for each file.

In [7]:
# Get row counts and basic statistics for each file
if len(data_files) > 0:
    print("Analyzing data quality metrics...\n")
    
    file_stats = []
    
    for file_path in data_files:
        # Get row count
        row_count = duckdb.sql(f"""
            SELECT COUNT(*) as cnt FROM '{file_path}'
        """).fetchone()[0]
        
        # Extract year-month
        year_month = file_path.stem.split("_")[2]
        
        file_stats.append({
            "year_month": year_month,
            "filename": file_path.name,
            "row_count": row_count,
            "size_mb": file_path.stat().st_size / (1024 * 1024)
        })
    
    stats_df = pl.DataFrame(file_stats).sort("year_month")
    
    print("File Statistics:")
    print(stats_df.select(["year_month", "row_count", "size_mb"]))
    
    # Summary statistics
    print(f"\n{'='*80}")
    print("Summary:")
    print(f"  Total Files: {len(stats_df)}")
    print(f"  Total Rows: {stats_df['row_count'].sum():,}")
    print(f"  Total Size: {stats_df['size_mb'].sum():.2f} MB")
    print(f"  Avg Rows/File: {stats_df['row_count'].mean():,.0f}")
    print(f"  Min Rows: {stats_df['row_count'].min():,} ({stats_df.filter(pl.col('row_count') == pl.col('row_count').min())['year_month'][0]})")
    print(f"  Max Rows: {stats_df['row_count'].max():,} ({stats_df.filter(pl.col('row_count') == pl.col('row_count').max())['year_month'][0]})")
else:
    print("No files to analyze")
    stats_df = pl.DataFrame()

Analyzing data quality metrics...



File Statistics:
shape: (59, 3)
┌────────────┬───────────┬───────────┐
│ year_month ┆ row_count ┆ size_mb   │
│ ---        ┆ ---       ┆ ---       │
│ str        ┆ i64       ┆ f64       │
╞════════════╪═══════════╪═══════════╡
│ 2021-01    ┆ 1369769   ┆ 20.681445 │
│ 2021-02    ┆ 1371709   ┆ 20.768412 │
│ 2021-03    ┆ 1925152   ┆ 28.617718 │
│ 2021-04    ┆ 2171187   ┆ 32.442627 │
│ 2021-05    ┆ 2507109   ┆ 36.948854 │
│ …          ┆ …         ┆ …         │
│ 2025-07    ┆ 3898963   ┆ 63.842514 │
│ 2025-08    ┆ 3574091   ┆ 59.407943 │
│ 2025-09    ┆ 4251015   ┆ 69.077439 │
│ 2025-10    ┆ 4428699   ┆ 71.780766 │
│ 2025-11    ┆ 4181444   ┆ 67.838912 │
└────────────┴───────────┴───────────┘

Summary:
  Total Files: 59
  Total Rows: 194,457,948
  Total Size: 3032.40 MB
  Avg Rows/File: 3,295,897
  Min Rows: 1,369,769 (2021-01)
  Max Rows: 4,591,845 (2025-05)


In [8]:
# Check for null values in key columns
if len(data_files) > 0:
    print("\nChecking for null values in key columns...\n")
    
    # Query all files at once using glob pattern
    pattern_path = str(data_dir / pattern)
    
    # Key columns to check
    key_columns = [
        "VendorID",
        "tpep_pickup_datetime",
        "tpep_dropoff_datetime",
        "passenger_count",
        "trip_distance",
        "PULocationID",
        "DOLocationID",
        "payment_type",
        "fare_amount",
        "total_amount"
    ]
    
    # Build dynamic query to count nulls
    null_checks = []
    for col in key_columns:
        null_checks.append(f"SUM(CASE WHEN {col} IS NULL THEN 1 ELSE 0 END) as {col}_nulls")
    
    null_query = f"""
    SELECT 
        COUNT(*) as total_rows,
        {', '.join(null_checks)}
    FROM '{pattern_path}'
    """
    
    null_results = duckdb.sql(null_query).pl()
    
    # Calculate null percentages
    total_rows = null_results["total_rows"][0]
    print(f"Total rows across all files: {total_rows:,}\n")
    
    null_summary = []
    for col in key_columns:
        null_count = null_results[f"{col}_nulls"][0]
        null_pct = (null_count / total_rows * 100) if total_rows > 0 else 0
        null_summary.append({
            "column": col,
            "null_count": null_count,
            "null_percentage": null_pct
        })
    
    null_df = pl.DataFrame(null_summary).sort("null_percentage", descending=True)
    print("Null Value Analysis:")
    print(null_df)
    
    # Alert on high null percentages
    high_nulls = null_df.filter(pl.col("null_percentage") > 10)
    if len(high_nulls) > 0:
        print(f"\n⚠️  ALERT: Columns with >10% null values:")
        for row in high_nulls.iter_rows(named=True):
            print(f"   - {row['column']}: {row['null_percentage']:.2f}%")
    else:
        print("\n✓ No columns have excessive null values (>10%)")


Checking for null values in key columns...



FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Total rows across all files: 194,457,948

Null Value Analysis:
shape: (10, 3)
┌───────────────────────┬───────────────┬───────────────────────────────┐
│ column                ┆ null_count    ┆ null_percentage               │
│ ---                   ┆ ---           ┆ ---                           │
│ str                   ┆ decimal[38,0] ┆ decimal[38,27]                │
╞═══════════════════════╪═══════════════╪═══════════════════════════════╡
│ passenger_count       ┆ 18663998      ┆ 9.597960994631085997060917253 │
│ VendorID              ┆ 0             ┆ 0.000000000000000000000000000 │
│ tpep_pickup_datetime  ┆ 0             ┆ 0.000000000000000000000000000 │
│ tpep_dropoff_datetime ┆ 0             ┆ 0.000000000000000000000000000 │
│ trip_distance         ┆ 0             ┆ 0.000000000000000000000000000 │
│ PULocationID          ┆ 0             ┆ 0.000000000000000000000000000 │
│ DOLocationID          ┆ 0             ┆ 0.000000000000000000000000000 │
│ payment_type          ┆ 0       

## 5. Data Range Validation

Check for anomalies in key numeric fields.

In [9]:
# Analyze data ranges and potential anomalies
if len(data_files) > 0:
    print("Analyzing data ranges...\n")
    
    pattern_path = str(data_dir / pattern)
    
    range_query = f"""
    SELECT
        MIN(tpep_pickup_datetime) as earliest_pickup,
        MAX(tpep_pickup_datetime) as latest_pickup,
        MIN(trip_distance) as min_distance,
        MAX(trip_distance) as max_distance,
        AVG(trip_distance) as avg_distance,
        MIN(fare_amount) as min_fare,
        MAX(fare_amount) as max_fare,
        AVG(fare_amount) as avg_fare,
        MIN(total_amount) as min_total,
        MAX(total_amount) as max_total,
        AVG(total_amount) as avg_total,
        MIN(passenger_count) as min_passengers,
        MAX(passenger_count) as max_passengers,
        AVG(passenger_count) as avg_passengers
    FROM '{pattern_path}'
    """
    
    ranges = duckdb.sql(range_query).pl()
    
    print("Data Range Summary:")
    print(ranges)
    
    # Check for potential anomalies
    anomalies = []
    
    row = ranges.row(0, named=True)
    
    if row['min_distance'] < 0:
        anomalies.append("Negative trip distances detected")
    if row['max_distance'] > 1000:
        anomalies.append(f"Extremely long trip distance: {row['max_distance']:.2f} miles")
    if row['min_fare'] < 0:
        anomalies.append("Negative fare amounts detected")
    if row['max_fare'] > 10000:
        anomalies.append(f"Extremely high fare: ${row['max_fare']:.2f}")
    if row['min_total'] < 0:
        anomalies.append("Negative total amounts detected")
    if row['min_passengers'] < 0:
        anomalies.append("Negative passenger counts detected")
    if row['max_passengers'] > 9:
        anomalies.append(f"Unusually high passenger count: {row['max_passengers']}")
    
    if anomalies:
        print(f"\n⚠️  ALERT: Potential data anomalies detected:")
        for anomaly in anomalies:
            print(f"   - {anomaly}")
    else:
        print("\n✓ No obvious data anomalies detected")

Analyzing data ranges...



FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Data Range Summary:
shape: (1, 14)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ earliest_ ┆ latest_pi ┆ min_dista ┆ max_dista ┆ … ┆ avg_total ┆ min_passe ┆ max_passe ┆ avg_pass │
│ pickup    ┆ ckup      ┆ nce       ┆ nce       ┆   ┆ ---       ┆ ngers     ┆ ngers     ┆ engers   │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ f64       ┆ ---       ┆ ---       ┆ ---      │
│ datetime[ ┆ datetime[ ┆ f64       ┆ f64       ┆   ┆           ┆ f64       ┆ f64       ┆ f64      │
│ μs]       ┆ μs]       ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 2001-01-0 ┆ 2098-09-1 ┆ 0.0       ┆ 398608.62 ┆ … ┆ 25.108223 ┆ 0.0       ┆ 112.0     ┆ 1.364151 │
│ 1         ┆ 1         ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ 00:03:14  ┆ 02:23:31  ┆           ┆           ┆   ┆   

## 6. Schema Documentation

Detailed field descriptions based on NYC TLC data dictionary.

In [10]:
# NYC Yellow Taxi Trip Record Schema Documentation
schema_documentation = {
    "VendorID": {
        "description": "TPEP provider (1=Creative Mobile Technologies, 2=VeriFone Inc.)",
        "type": "Integer"
    },
    "tpep_pickup_datetime": {
        "description": "Date and time when the meter was engaged",
        "type": "Timestamp"
    },
    "tpep_dropoff_datetime": {
        "description": "Date and time when the meter was disengaged",
        "type": "Timestamp"
    },
    "passenger_count": {
        "description": "Number of passengers in the vehicle (driver entered value)",
        "type": "Integer"
    },
    "trip_distance": {
        "description": "Trip distance in miles reported by the taximeter",
        "type": "Float"
    },
    "RatecodeID": {
        "description": "Rate code (1=Standard, 2=JFK, 3=Newark, 4=Nassau/Westchester, 5=Negotiated, 6=Group ride)",
        "type": "Integer"
    },
    "store_and_fwd_flag": {
        "description": "Trip record held in vehicle memory before sending (Y=store and forward, N=not)",
        "type": "String"
    },
    "PULocationID": {
        "description": "TLC Taxi Zone where the meter was engaged",
        "type": "Integer",
        "note": "Critical for congestion pricing analysis"
    },
    "DOLocationID": {
        "description": "TLC Taxi Zone where the meter was disengaged",
        "type": "Integer",
        "note": "Critical for congestion pricing analysis"
    },
    "payment_type": {
        "description": "Payment method (1=Credit card, 2=Cash, 3=No charge, 4=Dispute, 5=Unknown, 6=Voided)",
        "type": "Integer"
    },
    "fare_amount": {
        "description": "Time-and-distance fare calculated by the meter",
        "type": "Float"
    },
    "extra": {
        "description": "Miscellaneous extras and surcharges (rush hour, overnight)",
        "type": "Float"
    },
    "mta_tax": {
        "description": "$0.50 MTA tax automatically triggered based on metered rate",
        "type": "Float"
    },
    "tip_amount": {
        "description": "Tip amount (automatically populated for credit card, cash tips not included)",
        "type": "Float"
    },
    "tolls_amount": {
        "description": "Total amount of all tolls paid in trip",
        "type": "Float"
    },
    "improvement_surcharge": {
        "description": "$0.30 improvement surcharge assessed on hailed trips",
        "type": "Float"
    },
    "total_amount": {
        "description": "Total amount charged to passengers (does not include cash tips)",
        "type": "Float"
    },
    "congestion_surcharge": {
        "description": "Congestion surcharge for trips in Manhattan south of 96th Street",
        "type": "Float",
        "note": "Introduced in 2019 - may not be present in older files"
    },
    "airport_fee": {
        "description": "$1.25 fee for pickups at LaGuardia and JFK airports",
        "type": "Float",
        "note": "Added in more recent data - may not be present in older files"
    }
}

print("Schema Documentation for NYC Yellow Taxi Trip Records")
print("="*80)
print("\nKey Fields for Congestion Pricing Analysis:")
print("  - PULocationID: Pickup location (taxi zone)")
print("  - DOLocationID: Dropoff location (taxi zone)")
print("  - tpep_pickup_datetime: Timestamp for time-of-day analysis")
print("  - congestion_surcharge: Existing congestion fee (2019+)")
print("\nNote: The congestion pricing zone generally covers Manhattan south of 60th Street.")
print("      Taxi zones 1-263 map to specific geographic areas.")
print("\nFull schema saved to outputs/schema_documentation.json")

Schema Documentation for NYC Yellow Taxi Trip Records

Key Fields for Congestion Pricing Analysis:
  - PULocationID: Pickup location (taxi zone)
  - DOLocationID: Dropoff location (taxi zone)
  - tpep_pickup_datetime: Timestamp for time-of-day analysis
  - congestion_surcharge: Existing congestion fee (2019+)

Note: The congestion pricing zone generally covers Manhattan south of 60th Street.
      Taxi zones 1-263 map to specific geographic areas.

Full schema saved to outputs/schema_documentation.json


In [11]:
# Save schema documentation to outputs
output_dir = Path("outputs")
output_dir.mkdir(parents=True, exist_ok=True)

schema_output = {
    "schema": schema_documentation,
    "columns": schema_columns if 'schema_columns' in dir() else [],
    "data_types": [str(t) for t in schema_types] if 'schema_types' in dir() else [],
    "generated_at": datetime.now().isoformat()
}

with open(output_dir / "schema_documentation.json", "w") as f:
    json.dump(schema_output, f, indent=2)

print("✓ Schema documentation saved")

✓ Schema documentation saved


## 7. Validation Summary

Overall data quality assessment.

In [12]:
print("\n" + "="*80)
print("VALIDATION SUMMARY")
print("="*80)

if len(data_files) > 0:
    print(f"\n✓ Dataset Status: READY FOR ANALYSIS")
    print(f"\nDataset Overview:")
    print(f"  - Files: {len(data_files)}")
    if 'stats_df' in dir() and len(stats_df) > 0:
        print(f"  - Total Records: {stats_df['row_count'].sum():,}")
        print(f"  - Date Range: {files_df['year_month'].min()} to {files_df['year_month'].max()}")
        print(f"  - Total Size: {stats_df['size_mb'].sum():.2f} MB")
    
    print(f"\nData Quality:")
    issues = []
    if 'missing_months' in dir() and len(missing_months) > 0:
        issues.append(f"{len(missing_months)} missing month(s)")
    if 'inconsistent_files' in dir() and len(inconsistent_files) > 0:
        issues.append(f"{len(inconsistent_files)} file(s) with schema issues")
    if 'anomalies' in dir() and len(anomalies) > 0:
        issues.append(f"{len(anomalies)} data anomaly/anomalies")
    
    if issues:
        print(f"  ⚠️  Issues Found: {', '.join(issues)}")
        print(f"  ℹ️  Review alerts above for details")
    else:
        print(f"  ✓ No critical issues detected")
    
    print(f"\nNext Steps:")
    print(f"  1. Review any alerts or warnings above")
    print(f"  2. Use schema documentation in outputs/schema_documentation.json")
    print(f"  3. Proceed with analysis in other notebooks")
    print(f"  4. Reference taxi zones at: https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc")
else:
    print(f"\n⚠️  Dataset Status: NO DATA FOUND")
    print(f"\nPlease run the data download notebook first to fetch NYC taxi data.")

print("\n" + "="*80)


VALIDATION SUMMARY

✓ Dataset Status: READY FOR ANALYSIS

Dataset Overview:
  - Files: 59
  - Total Records: 194,457,948
  - Date Range: 2021-01 to 2025-11
  - Total Size: 3032.40 MB

Data Quality:
  ⚠️  Issues Found: 34 file(s) with schema issues, 5 data anomaly/anomalies
  ℹ️  Review alerts above for details

Next Steps:
  2. Use schema documentation in outputs/schema_documentation.json
  3. Proceed with analysis in other notebooks
  4. Reference taxi zones at: https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc

