# DGXB Data Product & Benchmark Exploration

This notebook demonstrates the DGXB pipeline:
- Bronze layer data ingestion (traffic & weather)
- Silver layer consolidation
- Data exploration and validation

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import plotly.express as px
from datetime import datetime

# DGXB imports
from dgxb.etl.silver_processor import (
    process_bronze_to_silver_traffic,
    process_bronze_to_silver_weather
)

# Banner printed after the imports above, confirming the setup cell ran.
print("DGXB Exploration Notebook")

## 1. Load Silver Layer Data

In [None]:
# Silver-layer traffic table: read it, then report its size, schema and time span.
traffic_df = pd.read_parquet("silver-cpu-traffic/data_silver.parquet")

print(
    f"Traffic data: {len(traffic_df)} records",
    f"Columns: {list(traffic_df.columns)}",
    sep="\n",
)
print(f"Date range: {traffic_df['timestamp'].min()} to {traffic_df['timestamp'].max()}")

# Trailing expression: the notebook displays the first rows of the table.
traffic_df.head()

In [None]:
# Silver-layer weather table: read it, then report its size, schema and time span.
weather_df = pd.read_parquet("silver-cpu-weather/data_silver.parquet")

print(
    f"Weather data: {len(weather_df)} records",
    f"Columns: {list(weather_df.columns)}",
    sep="\n",
)
print(f"Date range: {weather_df['timestamp'].min()} to {weather_df['timestamp'].max()}")

# Trailing expression: the notebook displays the first rows of the table.
weather_df.head()

## 2. Data Exploration

In [None]:
# Ten most frequent incident descriptions, printed and drawn as horizontal bars.
# Nothing happens when the traffic table has no "description" column.
if "description" in traffic_df.columns:
    incident_counts = traffic_df["description"].value_counts().head(10)
    # Heading and counts on consecutive lines, same text as two separate prints.
    print("Top 10 Incident Types:", incident_counts, sep="\n")

    fig = px.bar(
        title="Top 10 Traffic Incident Types",
        orientation="h",
        y=incident_counts.index,
        x=incident_counts.values,
    )
    fig.show()

In [None]:
# Incidents per calendar day: derive a "date" column from the timestamps
# (this adds the column to traffic_df), count rows per date, and plot the series.
parsed_timestamps = pd.to_datetime(traffic_df["timestamp"])
traffic_df["date"] = parsed_timestamps.dt.date
daily_counts = traffic_df.groupby("date").size()

fig = px.line(
    title="Daily Traffic Incidents",
    labels={"x": "Date", "y": "Incident Count"},
    x=daily_counts.index,
    y=daily_counts.values,
)
fig.show()

## 3. Data Quality Checks

In [None]:
# Traffic data quality: total row count, missing values per key column, and
# the number of distinct incident descriptions and (lat, lon) pairs.
print("Traffic Data Quality:")
print(f"Total records: {len(traffic_df)}")
# Single quotes inside the f-strings keep them valid on Python < 3.12 and
# match the quoting used by the loading cells.
print(f"Missing lat: {traffic_df['lat'].isna().sum()}")
print(f"Missing lon: {traffic_df['lon'].isna().sum()}")
print(f"Missing timestamp: {traffic_df['timestamp'].isna().sum()}")
print(f"Missing description: {traffic_df['description'].isna().sum()}")
# The escaped "\n" prints a blank line before the uniqueness figures; a raw
# line break inside a single-line f-string is a syntax error.
print(f"\nUnique incident types: {traffic_df['description'].nunique()}")
print(f"Unique locations: {traffic_df[['lat', 'lon']].drop_duplicates().shape[0]}")