# In [None]:
import pandas as pd
import gzip

# Load the combined ingestion data exactly once.
# The plain CSV is preferred; when it is absent, fall back to the gzip
# archive. Only one of the two files has to exist, and the data is never
# parsed twice just to have the first result overwritten.
try:
    combined_df = pd.read_csv("combined_data.csv")
except FileNotFoundError:
    # Text mode ("rt") so pandas receives decoded lines from the archive.
    with gzip.open("combined_data.csv.gz", "rt") as f:
        combined_df = pd.read_csv(f)
import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display, clear_output
import plotly.io as pio

# Render Plotly figures through the notebook-compatible renderer
pio.renderers.default = 'notebook_connected'

# Monday-first ordering used for the weekday categorical and the charts
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Build the working frame from a copy of the loaded data:
#   - ingestion_time parsed as UTC timestamps (unparseable values become NaT)
#   - ingestion_hour taken from the parsed timestamp
#   - ingestion_day_of_week turned into an ordered categorical
df = combined_df.assign(
    ingestion_time=lambda frame: pd.to_datetime(
        frame['ingestion_time'], errors='coerce', utc=True
    ),
    ingestion_hour=lambda frame: frame['ingestion_time'].dt.hour,
    ingestion_day_of_week=lambda frame: pd.Categorical(
        frame['ingestion_day_of_week'], categories=day_order, ordered=True
    ),
)

# Single control: pick one workflow, or 'All' for no filtering
workflow_dropdown = widgets.Dropdown(
    description='Workflow ID:',
    value='All',
    options=['All', *sorted(df['workflow_id'].dropna().unique().tolist())],
    style=dict(description_width='initial'),
)

# Container the charts are rendered into
output = widgets.Output()

def update_dashboard(change=None):
    """Redraw the ingestion charts for the currently selected workflow.

    Reads the module-level ``df``, ``workflow_dropdown``, ``day_order`` and
    ``output``. Clears ``output`` and renders inside it, in order:

    1. a bar chart of ingestion counts per hour of day,
    2. a bar chart of ingestion counts per day of week,
    3. a heatmap of counts by day of week and hour.

    If the selected workflow has no rows, a message is printed instead.

    Args:
        change: Change payload supplied when the function is used as an
            ``observe`` callback. It is not read; the default of ``None``
            lets the function be called directly for the initial render.
    """
    output.clear_output()

    # Nothing below mutates the data, and boolean indexing already returns a
    # new frame, so no defensive copy of the full dataset is needed per redraw.
    selected = workflow_dropdown.value
    if selected == 'All':
        filtered = df
    else:
        filtered = df[df['workflow_id'] == selected]

    with output:
        if filtered.empty:
            print("No data for selected workflow.")
            return

        # --- Bar Chart: Ingestions by UTC Hour ---
        hourly_counts = (
            filtered['ingestion_hour']
            .value_counts()
            .sort_index()
            .reset_index()
        )
        # Name the columns explicitly instead of relying on reset_index naming.
        hourly_counts.columns = ['ingestion_hour', 'count']

        fig_hour = px.bar(
            hourly_counts,
            x='ingestion_hour',
            y='count',
            labels={'ingestion_hour': 'Hour of Day (UTC)', 'count': 'Ingestions'},
            title='Ingestions by Hour of Day (UTC)'
        )
        fig_hour.update_xaxes(dtick=1)
        fig_hour.show()

        # --- Bar Chart: Ingestions by Day of Week ---
        dow_counts = (
            filtered['ingestion_day_of_week']
            .value_counts()
            .reindex(day_order)   # put the rows in Monday-first order
            .dropna()             # discard any day left without a count
            .reset_index()
        )
        dow_counts.columns = ['ingestion_day_of_week', 'count']

        fig_dow = px.bar(
            dow_counts,
            x='ingestion_day_of_week',
            y='count',
            labels={'ingestion_day_of_week': 'Day of Week', 'count': 'Ingestions'},
            title='Ingestions by Day of Week'
        )
        fig_dow.show()

        # --- Heatmap: Day of Week × Hour (UTC) ---
        # observed=False is passed explicitly: it pins the behaviour this code
        # has always had with the categorical weekday key, avoids the pandas
        # FutureWarning about the changing default being printed into the
        # output widget, and keeps results stable once the default flips.
        heatmap_df = (
            filtered.groupby(
                ['ingestion_day_of_week', 'ingestion_hour'], observed=False
            )
            .size()
            .reset_index(name='count')
        )

        if not heatmap_df.empty:
            # NOTE(review): density_heatmap histograms its inputs; with a
            # numeric x column it may auto-bin several hours into one cell.
            # Confirm the rendered cells are hourly (a pivot + px.imshow is
            # the usual alternative for pre-aggregated counts).
            fig_heat = px.density_heatmap(
                heatmap_df,
                x='ingestion_hour',
                y='ingestion_day_of_week',
                z='count',
                color_continuous_scale='Blues',
                labels={
                    'ingestion_hour': 'Hour (UTC)',
                    # Same axis label as the day-of-week bar chart.
                    'ingestion_day_of_week': 'Day of Week',
                    'count': 'Ingestions',
                },
                title='Ingestion Heatmap: Day of Week vs Hour (UTC)'
            )
            fig_heat.update_xaxes(dtick=1)
            fig_heat.show()
        else:
            print("No heatmap data to display.")

# Draw the dashboard once for the default selection
update_dashboard()

# Redraw whenever the dropdown's selected value changes
workflow_dropdown.observe(update_dashboard, names='value')

# Show the selector followed by the chart area
display(workflow_dropdown, output)
