In [4]:
# Click “Runtime → Run all” once the notebook opens.
# Wait 10–15 seconds for the dashboard and charts to load.
# Use the dropdown at the top to filter by workflow.
# You don’t need to install anything — it works entirely in the browser!




import pandas as pd
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display, clear_output
import plotly.io as pio





import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ipywidgets as widgets
from IPython.display import display


# Read CSV directly from zip file
import zipfile

# Open zip and manually select the real CSV
import requests
import zipfile
import io

# Download the zipped CSV and load it into a DataFrame.
url = 'https://github.com/santanisumeet/my_analysis_1/raw/main/combined_df.csv.zip'

# A timeout keeps "Run all" from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (404, rate limiting, ...) directly
# instead of letting an error page fail later as a confusing BadZipFile.
r = requests.get(url, timeout=30)
r.raise_for_status()

# Context managers release the archive and the member handle even if parsing fails.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    with z.open('combined_df.csv') as f:
        combined_df = pd.read_csv(f)



# === Setup Data ===
# Drop rows without a workflow ID *before* casting to str: astype(str) turns a
# missing value into the literal string 'nan', after which dropna() finds
# nothing to remove and 'nan' would show up as a selectable workflow.
combined_df = (
    combined_df
    .dropna(subset=['workflow_id'])
    .astype({'workflow_id': str})
)

# Parse timestamps; values that cannot be parsed become NaT (errors='coerce').
combined_df['ingestion_dt'] = pd.to_datetime(combined_df['ingestion_dt'], format='ISO8601', errors='coerce')
combined_df['hour'] = combined_df['ingestion_dt'].dt.hour
combined_df['day_of_week'] = combined_df['ingestion_dt'].dt.day_name()

# Fixed display order for the categorical axes of the charts.
days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
hour_order = list(range(24))

# === Widget Setup ===
# Dropdown offering every distinct workflow ID, sorted for easy scanning.
workflow_dropdown = widgets.Dropdown(
    description='Workflow:',
    options=sorted(combined_df['workflow_id'].unique().tolist()),
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='50%'),
)

# Dedicated output area that the dashboard callback prints and plots into.
output = widgets.Output()

# === Callback Function ===
def _heatmap_counts(frame, days, hours):
    """Count rows per (day_of_week, hour) as a dense day x hour table.

    Args:
        frame: DataFrame with 'day_of_week' and 'hour' columns.
        days: Row labels, in display order.
        hours: Column labels, in display order.

    Returns:
        DataFrame indexed by ``days`` with columns ``hours``. Combinations
        that never occur hold 0 rather than NaN.
    """
    # unstack(fill_value=0) zero-fills combinations that are missing *inside*
    # the observed days/hours; reindex(fill_value=0) only covers labels that
    # are absent altogether. Both are needed for a gap-free grid.
    counts = frame.groupby(['day_of_week', 'hour']).size().unstack(fill_value=0)
    return counts.reindex(index=days, columns=hours, fill_value=0)


def update_dashboard(workflow_id):
    """Redraw the dashboard for one workflow inside the ``output`` widget.

    Clears ``output``, prints the distinct dataset IDs of the selected
    workflow, then draws three charts: ingestion count by hour, ingestion
    count by day of week, and a day-of-week x hour heatmap.

    Args:
        workflow_id: Value matched against ``combined_df['workflow_id']``.

    Returns:
        None. All results are written to ``output``.
    """
    output.clear_output()
    with output:
        filtered = combined_df[combined_df['workflow_id'] == workflow_id].copy()

        if filtered.empty:
            print("No data available for this workflow.")
            return

        # Re-derive the time columns on the filtered copy so the function does
        # not depend on how the global frame was prepared.
        filtered['ingestion_dt'] = pd.to_datetime(filtered['ingestion_dt'], errors='coerce')
        filtered['hour'] = filtered['ingestion_dt'].dt.hour
        filtered['day_of_week'] = filtered['ingestion_dt'].dt.day_name()

        # Rows with an unparseable timestamp have a NaN hour; drop them. Their
        # presence makes the hour column float, so cast back to int to keep
        # the values directly comparable with the integer hour_order labels.
        filtered = filtered.dropna(subset=['hour']).astype({'hour': int})

        if filtered.empty:
            print("No rows with a valid ingestion timestamp for this workflow.")
            return

        # Display dataset IDs
        print(f"Dataset ID for workflow '{workflow_id}':\n")
        print(filtered['dataset_id'].unique())
        print("\n")

        # === Plot 1: Bar Chart by Hour ===
        plt.figure(figsize=(10, 4))
        sns.countplot(data=filtered, x='hour', order=hour_order, color='steelblue')
        plt.title('Ingestion Frequency by UTC Hour of Day')
        plt.xlabel('Hour')
        plt.ylabel('Count')
        plt.xticks(ticks=range(24))
        plt.tight_layout()
        plt.show()

        # === Plot 2: Bar Chart by Day of Week ===
        plt.figure(figsize=(10, 4))
        sns.countplot(data=filtered, x='day_of_week', order=days_order, color='seagreen')
        plt.title('Ingestion Frequency by Day of Week')
        plt.xlabel('Day')
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

        # === Plot 3: Heatmap ===
        # Every cell is a real count (0 where nothing was ingested), so the
        # heatmap annotates the full grid instead of leaving NaN cells blank.
        pivot_table = _heatmap_counts(filtered, days_order, hour_order)

        plt.figure(figsize=(14, 6))
        sns.heatmap(pivot_table, cmap='Blues', linewidths=0.5, annot=True, fmt='g')
        plt.title('Ingestion Frequency Heatmap')
        plt.xlabel('UTC Hour of Day')
        plt.ylabel('Day of Week')
        plt.tight_layout()
        plt.show()


# === Bind widget and display ===
# Wire the dropdown to the callback: the dropdown's value is passed as the
# `workflow_id` argument of update_dashboard.
widgets.interact(update_dashboard, workflow_id=workflow_dropdown)
# update_dashboard writes into `output` rather than returning anything, so the
# Output widget must itself be displayed for the text and charts to be visible.
display(output)



interactive(children=(Dropdown(description='Workflow:', layout=Layout(width='50%'), options=('cxo40003_crux_wr…

Output()