In [None]:
# === Data loading from Google Drive using gdown ===
from pathlib import Path

import pandas as pd
import gdown

# Google Drive file ID
file_id = "1-VixmGEbt1_eknWPCuTvj9Q9KhjYUaMQ"
url = f"https://drive.google.com/uc?id={file_id}"

# Local copy of the CSV, relative to the notebook's working directory
csv_path = Path("combined_data.csv")

# The recorded download is ~4 GB, so only fetch the file when no local copy
# exists; re-running this cell then goes straight to the read step.
# NOTE(review): assumes a file at csv_path is a complete download — delete it
# to force a fresh fetch if an earlier run was interrupted.
if not csv_path.exists():
    gdown.download(url, str(csv_path), quiet=False)

# Surface a failed download here instead of as a confusing read_csv error
if not csv_path.exists():
    raise FileNotFoundError(
        f"Download from {url} did not produce {csv_path}; "
        "check the file ID and sharing permissions."
    )

# Read the downloaded CSV, assuming first column is index
combined_df = pd.read_csv(csv_path, index_col=0)

# === Interactive Dashboard ===
import plotly.express as px
import ipywidgets as widgets
from IPython.display import display, clear_output
import plotly.io as pio

# Render Plotly figures through the notebook-connected renderer
pio.renderers.default = 'notebook_connected'

# Weekday names in calendar order, Monday first
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Build the dashboard frame as a new DataFrame (combined_df is left untouched):
#   - ingestion_time: parsed to UTC datetimes, unparseable values become NaT
#   - ingestion_hour: hour component of the parsed timestamp
#   - ingestion_day_of_week: ordered categorical following day_order
df = combined_df.assign(
    ingestion_time=lambda frame: pd.to_datetime(
        frame['ingestion_time'], errors='coerce', utc=True
    ),
    ingestion_hour=lambda frame: frame['ingestion_time'].dt.hour,
    ingestion_day_of_week=lambda frame: pd.Categorical(
        frame['ingestion_day_of_week'], categories=day_order, ordered=True
    ),
)

# === Workflow ID Filter Widget ===
# Choices: the 'All' sentinel first, then every distinct non-null workflow ID, sorted
workflow_ids = df['workflow_id'].dropna().unique().tolist()
workflow_ids.sort()

workflow_dropdown = widgets.Dropdown(
    description='Workflow ID:',
    options=['All', *workflow_ids],
    value='All',
    style={'description_width': 'initial'},
)

# Output area the dashboard charts are rendered into
output = widgets.Output()

# === Dashboard Update Function ===
def update_dashboard(change=None):
    """Redraw the ingestion charts for the workflow selected in the dropdown.

    Reads the module-level ``df``, ``workflow_dropdown``, ``day_order`` and
    ``output``. Clears ``output`` and then renders into it, in order: a bar
    chart of ingestions per hour, a bar chart of ingestions per day of week,
    and a day-of-week by hour heatmap. If the selection matches no rows, a
    message is printed instead of the charts.

    Parameters
    ----------
    change : optional
        Change notification supplied when this function is invoked as a
        widget observer. It is not used; the current selection is read from
        ``workflow_dropdown.value``. Defaults to ``None`` so the function can
        also be called directly.

    Returns
    -------
    None
    """
    output.clear_output()

    # Nothing below mutates the data, and boolean-mask selection already
    # returns a new frame, so the full DataFrame is never copied here.
    if workflow_dropdown.value == 'All':
        filtered = df
    else:
        filtered = df[df['workflow_id'] == workflow_dropdown.value]

    with output:
        if filtered.empty:
            print("No data for selected workflow.")
            return

        # --- Bar Chart: Ingestions by Hour (UTC) ---
        hourly_counts = (
            filtered['ingestion_hour']
            .value_counts()
            .sort_index()
            .reset_index()
        )
        # Name the columns explicitly rather than relying on reset_index defaults
        hourly_counts.columns = ['ingestion_hour', 'count']
        fig_hour = px.bar(
            hourly_counts,
            x='ingestion_hour',
            y='count',
            labels={'ingestion_hour': 'Hour of Day (UTC)', 'count': 'Ingestions'},
            title='Ingestions by Hour of Day (UTC)'
        )
        fig_hour.update_xaxes(dtick=1)
        fig_hour.show()

        # --- Bar Chart: Ingestions by Day of Week ---
        dow_counts = (
            filtered['ingestion_day_of_week']
            .value_counts()
            .reindex(day_order)  # put the bars in Monday..Sunday order
            .dropna()
            .reset_index()
        )
        dow_counts.columns = ['ingestion_day_of_week', 'count']
        fig_dow = px.bar(
            dow_counts,
            x='ingestion_day_of_week',
            y='count',
            labels={'ingestion_day_of_week': 'Day of Week', 'count': 'Ingestions'},
            title='Ingestions by Day of Week'
        )
        fig_dow.show()

        # --- Heatmap: Day of Week × Hour ---
        # ingestion_day_of_week is categorical, so state `observed` explicitly:
        # observed=False keeps every day/hour combination (with a zero count)
        # and avoids depending on the library default for categorical groupers.
        heatmap_df = (
            filtered
            .groupby(['ingestion_day_of_week', 'ingestion_hour'], observed=False)
            .size()
            .reset_index(name='count')
        )

        if not heatmap_df.empty:
            fig_heat = px.density_heatmap(
                heatmap_df,
                x='ingestion_hour',
                y='ingestion_day_of_week',
                z='count',
                color_continuous_scale='Blues',
                labels={'ingestion_hour': 'Hour (UTC)', 'count': 'Ingestions'},
                title='Ingestion Heatmap: Day of Week vs Hour (UTC)'
            )
            fig_heat.update_xaxes(dtick=1)
            fig_heat.show()
        else:
            print("No heatmap data to display.")

# Re-run the dashboard whenever the dropdown's selected value changes
workflow_dropdown.observe(update_dashboard, names='value')

# Render once up front so the charts exist before any interaction
update_dashboard()

# Show the filter control followed by the chart area
display(workflow_dropdown, output)


Downloading...
From (original): https://drive.google.com/uc?id=1-VixmGEbt1_eknWPCuTvj9Q9KhjYUaMQ
From (redirected): https://drive.google.com/uc?id=1-VixmGEbt1_eknWPCuTvj9Q9KhjYUaMQ&confirm=t&uuid=d92eccf3-b4bc-4429-bcea-5adb424f2758
To: /Users/sumeet.santani/Downloads/combined_data.csv

  0%|                                                                                                                                                                                         | 0.00/4.09G [00:00<?, ?B/s][A
  0%|                                                                                                                                                                                 | 524k/4.09G [00:00<13:28, 5.06MB/s][A
  0%|                                                                                                                                                                                | 1.05M/4.09G [00:00<14:00, 4.86MB/s][A
  0%|                                         

In [31]:
pip install gdown


Collecting gdown
  Downloading gdown-5.2.0-py3-none-any.whl (18 kB)
Collecting filelock
  Downloading filelock-3.18.0-py3-none-any.whl (16 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 KB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting PySocks!=1.5.7,>=1.5.6
  Using cached PySocks-1.7.1-py3-none-any.whl (16 kB)
Installing collected packages: tqdm, PySocks, filelock, gdown
Successfully installed PySocks-1.7.1 filelock-3.18.0 gdown-5.2.0 tqdm-4.67.1
You should consider upgrading via the '/Users/sumeet.santani/.pyenv/versions/3.9.16/bin/python3.9 -m pip install --upgrade pip' command.[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


Downloading...
From (original): https://drive.google.com/uc?id=1-VixmGEbt1_eknWPCuTvj9Q9KhjYUaMQ
From (redirected): https://drive.google.com/uc?id=1-VixmGEbt1_eknWPCuTvj9Q9KhjYUaMQ&confirm=t&uuid=4676be76-eca3-4a1a-8fbf-e3a56c13947c
To: /Users/sumeet.santani/Downloads/combined_data.csv
 19%|██████████████████████████████████▏                                                                                                                                              | 789M/4.09G [00:22<01:37, 33.7MB/s]

KeyboardInterrupt: 

 19%|██████████████████████████████████▎                                                                                                                                              | 794M/4.09G [00:40<01:37, 33.7MB/s]

In [20]:
# Keep only rows whose delivery_type contains 'delta'; rows with a missing
# delivery_type are treated as non-matching and dropped.
is_delta = combined_df['delivery_type'].str.contains('delta', na=False)
combined_df = combined_df[is_delta]


In [22]:
# Write the current combined_df to CSV without its index column
combined_df.to_csv(path_or_buf="combined_df.csv", index=False)

In [25]:
# Read the exported 'combined_df.csv' back into a DataFrame (presumably a
# round-trip check of the export).
# NOTE(review): the recorded run of this cell raised
# "ParserError: ... Calling read(nbytes) on source failed". The cause is not
# visible from the notebook — confirm the file was fully written and is
# readable before relying on `test`.
test = pd.read_csv('combined_df.csv')

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.