In [20]:
# ---- image export setup ----
import sys, subprocess, importlib

def pip_i(spec: str):
    """Install the requirement `spec` with pip, using the interpreter that runs this kernel.

    pip is run quietly (`-q`); a non-zero exit status raises
    `subprocess.CalledProcessError`.
    """
    pip_cmd = [sys.executable, "-m", "pip", "install", "-q"]
    pip_cmd.append(spec)
    subprocess.check_call(pip_cmd)

# Make sure plotly >= 6.1.1 is installed.
# Only the version probe lives inside the try block: a failing `pip install`
# should surface once instead of being caught and retried. The unused
# `pkg_resources` import was dropped because its absence alone forced a
# reinstall on every run.
try:
    import plotly
    from packaging.version import Version
    _plotly_ok = Version(plotly.__version__) >= Version("6.1.1")
except Exception:
    # plotly/packaging missing or version string unparsable -> install.
    _plotly_ok = False

if not _plotly_ok:
    pip_i("plotly>=6.1.1")

# Pin Kaleido to the compatible build (exactly 0.2.1).
# NOTE(review): plotly reports Kaleido < 1.0.0 as deprecated (see the warnings
# in the cell outputs); the pin is kept as-is here - confirm before upgrading.
# Only the version probe lives inside the try block so that a failing
# `pip install` surfaces once instead of being caught and retried.
try:
    import kaleido
    from packaging.version import Version
    _kaleido_ok = Version(kaleido.__version__) == Version("0.2.1")
except Exception:
    # kaleido/packaging missing, no __version__, or unparsable version -> install.
    _kaleido_ok = False

if not _kaleido_ok:
    pip_i("kaleido==0.2.1")

import plotly.io as pio

# Default export size for static images.
# Prefer `plotly.io.defaults`, which the deprecation warning names as the
# replacement for `plotly.io.kaleido.scope.default_*`. Fall back to the legacy
# scope only when this plotly build has no `defaults` object (e.g. an older
# plotly is still loaded in the running kernel).
_export_defaults = getattr(pio, "defaults", None)
_legacy_scope = getattr(getattr(pio, "kaleido", None), "scope", None)

if _export_defaults is not None:
    _export_defaults.default_width = 1600
    _export_defaults.default_height = 900
elif _legacy_scope:
    _legacy_scope.default_width = 1600
    _legacy_scope.default_height = 900
else:
    print("Kaleido scope not available yet")

# Survey images are written to <parent of working dir>/figures/survey;
# the folder (and any missing parents) is created on demand.
from pathlib import Path
SURVEY = Path().resolve().parent.joinpath("figures", "survey")
SURVEY.mkdir(exist_ok=True, parents=True)
print("Export folder:", SURVEY)


Export folder: /Users/macbookair/Documents/resource_availability_tool/figures/survey




Use of plotly.io.kaleido.scope.default_width is deprecated and support will be removed after September 2025.
Please use plotly.io.defaults.default_width instead.




Use of plotly.io.kaleido.scope.default_height is deprecated and support will be removed after September 2025.
Please use plotly.io.defaults.default_height instead.




In [21]:
# Setup 

from pathlib import Path
import sys, pandas as pd, numpy as np, plotly.express as px

# Put the sibling "src" folder at the front of sys.path (once) so the
# project modules imported below can be found.
SRC = Path().resolve().parent / "src"
_src_entry = str(SRC)
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

from availability import get_active_periods, compute_daily_spans, extract_breaks, weekly_availability_matrix
from multitask   import overlap_summary

# Folder holding the cached parquet logs: <parent of working dir>/data/_cache
CACHE = Path().resolve().parent.joinpath("data", "_cache").resolve()

# Dataset key -> parquet file name inside CACHE
CACHE_FILES = {
    dataset: f"{dataset}_clean.parquet"
    for dataset in ("bpi2017", "sepsis", "bpi2019")
}

# Lifecycle values kept when loading an event log
VALID = set("start complete schedule resume suspend abort withdraw".split())

def load_cached_events(name: str) -> pd.DataFrame:
    """Load a cached event log and keep only rows with a known lifecycle.

    Parameters
    ----------
    name : str
        File name of the parquet file inside ``CACHE``.

    Returns
    -------
    pd.DataFrame
        Copy of the log whose ``lifecycle`` column is lower-cased text and
        restricted to the values in ``VALID``.
    """
    fp = CACHE / name
    df = pd.read_parquet(fp)

    # Replace missing values *before* casting to str: `astype(str)` turns NaN
    # into the literal "nan", so a later fillna("unknown") could never fire.
    # The object cast lets "unknown" be written even into a categorical column.
    lifecycle = df["lifecycle"].astype(object)
    lifecycle = lifecycle.where(lifecycle.notna(), "unknown")
    df["lifecycle"] = lifecycle.astype(str).str.lower()

    return df[df["lifecycle"].isin(VALID)].copy()

# Dataset key into CACHE_FILES; change it to analyse a different cached log.
DS = "sepsis"   # ← switch dataset
# Load the selected log; load_cached_events lower-cases `lifecycle` and keeps only VALID values.
events = load_cached_events(CACHE_FILES[DS])
# Print (rows, columns) as a quick check that the load worked.
print(DS, "events:", events.shape)


sepsis events: (15214, 5)


In [22]:
# Helpers, Overlap via task periods

def compute_task_periods(events: pd.DataFrame) -> pd.DataFrame:
    """Pair start/complete for each case, resource, activity"""
    df = events.copy()
    start = df[df["lifecycle"].str.startswith("start")][["case_id","resource","activity","timestamp"]].rename(columns={"timestamp":"start"})
    end   = df[df["lifecycle"].str.startswith("complete")][["case_id","resource","activity","timestamp"]].rename(columns={"timestamp":"end"})
    m = pd.merge(start, end, on=["case_id","resource","activity"], how="inner")
    m["start"] = pd.to_datetime(m["start"]); m["end"] = pd.to_datetime(m["end"])
    m = m[m["end"] >= m["start"]].reset_index(drop=True)
    return m

def build_periods(events: pd.DataFrame, tolerance="5min") -> pd.DataFrame:
    """Return a copy of get_active_periods(events, tolerance) with a duration column.

    `start` and `end` are coerced to datetimes and `duration_min`
    (end - start, in minutes) is added.
    """
    periods = get_active_periods(events, tolerance=tolerance).copy()
    for bound in ("start", "end"):
        periods[bound] = pd.to_datetime(periods[bound])
    elapsed = periods["end"] - periods["start"]
    periods["duration_min"] = elapsed.dt.total_seconds() / 60
    return periods

def org_kpis(events: pd.DataFrame, tolerance="5min") -> tuple:
    """Org-level KPIs for a given tolerance, plus the tables they were derived from.

    Parameters
    ----------
    events : pd.DataFrame
        Event log passed to build_periods() and compute_task_periods().
    tolerance : str
        Tolerance forwarded to build_periods().

    Returns
    -------
    tuple
        ``(kpis, p, d, b)`` - note this is a 4-tuple, not a bare dict:

        * ``kpis``: dict with ``n_periods``, ``days_active``, ``avg_span_min``,
          ``avg_pct_busy`` and ``overlap_share``;
        * ``p``: period table from build_periods();
        * ``d``: result of compute_daily_spans(p);
        * ``b``: result of extract_breaks(p).
    """
    p = build_periods(events, tolerance=tolerance)
    d = compute_daily_spans(p)
    b = extract_breaks(p)

    # Overlap is measured on task periods (start/complete pairs), not on the
    # tolerance-merged periods, so it does not depend on `tolerance`.
    tasks = compute_task_periods(events)
    if tasks.empty:
        overlap_share = 0.0
    else:
        ol = overlap_summary(tasks)
        # weighted share = sum(overlap_min) / sum(busy_min)
        num = ol["overlap_min"].sum()
        den = ol["busy_min"].sum()
        overlap_share = float(num / den) if den > 0 else 0.0

    kpis = dict(
        n_periods=len(p),
        days_active=d["day"].nunique(),
        avg_span_min=float(d["span_min"].mean()),
        avg_pct_busy=float(d["pct_busy"].mean()),
        overlap_share=overlap_share,
    )
    return kpis, p, d, b


In [23]:
# --- Q3: export side-by-side candidates for tolerance identification ---

import pandas as pd
import plotly.express as px
from pathlib import Path

# make sure SURVEY folder exists (also created in the setup cell; safe to re-run)
SURVEY = (Path().resolve().parent / "figures" / "survey")
SURVEY.mkdir(parents=True, exist_ok=True)

# ensure we have a loaded & filtered event log as `df`
# NOTE(review): no earlier cell in view defines `df`, so on a fresh kernel this
# falls back to the BPI 2017 sample rather than the `events` log selected via
# DS - confirm that is intended.
try:
    df  # noqa: F821
except NameError:
    from io_utils import load_event_log_csv
    DATA = Path().resolve().parent / "data" / "BPI_2017_tail10k.csv"
    df = load_event_log_csv(DATA)
    valid = ["start","complete","schedule","resume","suspend","abort","withdraw"]
    df["lifecycle"] = df["lifecycle"].str.lower().fillna("unknown")
    df = df[df["lifecycle"].isin(valid)].copy()

# pick a resource with lots of activity (helps the tolerance differences show up)
# Cast to str: the period table's resource column is cast to str below, so a
# non-string id (e.g. an int) would otherwise never match.
RESOURCE = str(df["resource"].value_counts().idxmax())  # or set e.g. "User_100"

from availability import get_active_periods

TOLS = ["0min", "5min", "15min"]  # render 3 clear choices

# The day is fixed once (busiest day at the first tolerance that has data) so
# that all exported images show the SAME day and differ only in tolerance.
day = None
saved = []  # names of the files actually written

for tol in TOLS:
    # build periods under this tolerance
    p = get_active_periods(df, tolerance=tol)
    p["start"] = pd.to_datetime(p["start"]); p["end"] = pd.to_datetime(p["end"])
    p["resource"] = p["resource"].astype(str)

    # focus on the chosen resource & its busiest day
    sub = p[p["resource"] == RESOURCE].copy()
    if sub.empty:
        print(f"No periods for {RESOURCE} at tol={tol}")
        continue

    if day is None:
        day = sub["start"].dt.date.value_counts().idxmax()
    one = sub[sub["start"].dt.date == day].copy()
    if one.empty:
        print(f"No periods for {RESOURCE} on {day} at tol={tol}")
        continue

    title = f"{RESOURCE} — {day}  (tolerance={tol})"
    fig = px.timeline(one, x_start="start", x_end="end", y="resource", title=title)
    fig.update_yaxes(autorange="reversed", visible=False, showticklabels=False)
    out_path = SURVEY / f"Q3_{RESOURCE}_{day}_tol_{tol}.png"
    fig.write_image(out_path, scale=2)
    saved.append(out_path.name)

# Report only what was written (skipped tolerances are not listed).
print("Saved:", saved)




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Saved: ['Q3_User_1_2017-01-10_tol_0min.png', 'Q3_User_1_2017-01-10_tol_5min.png', 'Q3_User_1_2017-01-10_tol_15min.png']




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




In [24]:
#  Tolerance sweep (RQ2 validation)

# Compute the org-level KPIs once per tolerance and collect them in one table.
tols = ["0min","2min","5min","10min","15min"]
rows = []
for t in tols:
    org, p, d, b = org_kpis(events, tolerance=t)
    rows.append(dict(tolerance=t, **org))

tol_summary = pd.DataFrame(rows)
display(tol_summary)

# One facet row per metric: the KPIs live on very different scales (thousands
# of periods vs. shares in [0, 1]), so a single shared y-axis flattens most
# lines. `matches=None` only has an effect when there are several y-axes,
# i.e. with facets.
fig = px.line(
    tol_summary.melt("tolerance", var_name="metric", value_name="value"),
    x="tolerance", y="value", color="metric", facet_row="metric", markers=True,
    height=900,
    title=f"Tolerance sweep — organization-level KPIs ({DS})"
)
fig.update_yaxes(matches=None)
fig.show()


Unnamed: 0,tolerance,n_periods,days_active,avg_span_min,avg_pct_busy,overlap_share
0,0min,8494,504,292.217244,0.523968,0.0
1,2min,8311,504,292.217244,0.524365,0.0
2,5min,8084,504,292.029426,0.526658,0.0
3,10min,7756,504,291.847808,0.53091,0.0
4,15min,7513,504,291.847808,0.535728,0.0


In [25]:
# === Q3 candidates: mine tolerance-sensitive (resource, day) pairs ===
# For each (resource, day), we measure how the # of work periods changes
# when small idle gaps are merged (tolerance). We also compute:
#  - small_gap_share: share of 1–15 min gaps within the day (0–1)
#  - midday_flag: whether a 30–120 min gap starts within 11:00–14:00
# High-scoring days are ideal stimuli for Q3.

import pandas as pd, numpy as np

# Helper: normalize org_kpis() result (works whether it returns 3 or 4 items)
def _unpack_org_kpis(result):
    """
    Returns: (org_dict, periods_df, daily_df, breaks_df)
    """
    if isinstance(result, tuple):
        if len(result) == 4:
            org, p, d, b = result
            return org, p, d, b
        if len(result) == 3:
            p, d, b = result
            return {}, p, d, b
    raise ValueError("Unexpected return shape from org_kpis().")

# Tolerances to compare in Q3 (add "30min" if needed for your data)
TOLS_Q3 = ["0min", "5min", "15min"]

# Precompute period tables per tolerance and add a 'day' column.
# The break tables are kept as well so the 0min gaps used further down do not
# need a second (identical) org_kpis() pass.
periods_by_tol = {}
breaks_by_tol = {}
for t in TOLS_Q3:
    _, p, _, b = _unpack_org_kpis(org_kpis(events, tolerance=t))
    periods_by_tol[t] = p.assign(day=pd.to_datetime(p["start"]).dt.date)
    breaks_by_tol[t] = b

# Count periods per (resource, day) for each tolerance
counts = []
for t, p in periods_by_tol.items():
    cnt = p.groupby(["resource", "day"], observed=True).size().rename(f"n_{t}").reset_index()
    counts.append(cnt)

# Merge counts (outer: a (resource, day) may be missing at some tolerance)
sens = counts[0]
for c in counts[1:]:
    sens = sens.merge(c, on=["resource", "day"], how="outer")

for col in [f"n_{t}" for t in TOLS_Q3]:
    sens[col] = sens[col].fillna(0).astype(int)

# Primary sensitivity signal: drop from 0min to the largest tolerance
last = f"n_{TOLS_Q3[-1]}"
sens["delta_periods"] = sens["n_0min"] - sens[last]

# Compute small_gap_share and midday_flag from 0min breaks (no groupby.apply)
if "0min" in breaks_by_tol:
    b0 = breaks_by_tol["0min"].copy()
else:
    # Only reached if "0min" was removed from TOLS_Q3.
    _, _, _, b0 = _unpack_org_kpis(org_kpis(events, tolerance="0min"))
    b0 = b0.copy()

gap_start = pd.to_datetime(b0["gap_start"])
b0["day"] = gap_start.dt.date
b0["start_hr"] = gap_start.dt.hour + gap_start.dt.minute / 60.0

# Share of a day's gaps that last 1-15 minutes
small_share = (
    b0.assign(is_small=b0["gap_min"].between(1, 15))
      .groupby(["resource", "day"], observed=True)["is_small"]
      .mean()
      .rename("small_gap_share")
      .reset_index()
)

# True if any 30-120 minute gap starts between 11:00 and 14:00
midday_flag = (
    b0.assign(is_mid=b0["gap_min"].between(30, 120) & b0["start_hr"].between(11, 14))
      .groupby(["resource", "day"], observed=True)["is_mid"]
      .any()
      .rename("midday_flag")
      .reset_index()
)

sens = (sens
        .merge(small_share, on=["resource","day"], how="left")
        .merge(midday_flag, on=["resource","day"], how="left"))

# Be explicit about dtypes
sens["small_gap_share"] = sens["small_gap_share"].fillna(0.0).astype("float64")
# After the left merge the flag column is object dtype (True/False/NaN).
# Going through the nullable "boolean" dtype fills the gaps without the
# pandas "Downcasting object dtype arrays on .fillna" FutureWarning.
sens["midday_flag"] = sens["midday_flag"].astype("boolean").fillna(False).astype(bool)

# Composite score (tune weights if needed): 60% normalised period drop,
# 30% small-gap share, 10% midday-gap flag.
max_delta = max(1, sens["delta_periods"].max())
delta_norm = sens["delta_periods"] / max_delta
sens["score"] = (
    0.6 * delta_norm
    + 0.3 * sens["small_gap_share"]
    + 0.1 * sens["midday_flag"].astype(int)
)

# Recommended candidate filter (adjust thresholds to control volume)
drops_enough = sens["delta_periods"] >= 3
has_small_gaps = sens["small_gap_share"] >= 0.20
candidates = (
    sens[drops_enough & has_small_gaps]
    .sort_values("score", ascending=False)
    .reset_index(drop=True)
)

print(f"Candidates found: {len(candidates)}")
display(candidates.head(12))


Candidates found: 113



Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



Unnamed: 0,resource,day,n_0min,n_5min,n_15min,delta_periods,small_gap_share,midday_flag,score
0,A,2014-08-27,27,24,17,10,0.384615,True,0.815385
1,A,2014-05-25,18,16,9,9,0.529412,True,0.798824
2,A,2014-05-03,19,16,9,10,0.555556,False,0.766667
3,A,2014-08-01,17,13,10,7,0.4375,True,0.65125
4,A,2014-08-20,9,7,3,6,0.625,True,0.6475
5,A,2014-10-11,15,10,7,8,0.5,False,0.63
6,A,2014-04-17,15,11,8,7,0.357143,True,0.627143
7,A,2014-10-07,18,15,11,7,0.352941,True,0.625882
8,A,2014-03-06,14,9,7,7,0.538462,False,0.581538
9,A,2014-06-29,14,12,8,6,0.384615,True,0.575385


In [26]:
# === Q3 panels: side-by-side timelines for different tolerances ===
# Draws A/B/C panels for the SAME (resource, day) with different tolerances
# (e.g., 0 / 5 / 15 min). A shaded band highlights the 11:00–14:00 window.
# Exports PNGs to figures/survey (requires kaleido).

import plotly.express as px
from plotly.subplots import make_subplots
from pathlib import Path
import pandas as pd
import random

# Output folder: <parent of working dir>/figures/survey, created if missing
SURVEY = Path().resolve().parent.joinpath("figures", "survey")
SURVEY.mkdir(exist_ok=True, parents=True)

def render_q3_panels(resource, day, tols=TOLS_Q3, lunch_window=(11.0, 14.0),
                     save=True, prefix="Q3", shuffle_panels=False,
                     width=1200, height=330, x_hours=(6.0, 20.0), seed=None):
    """
    Render side-by-side panels for different gap-merging tolerances.

    Every panel (including "No activity" placeholders) uses the same x-range
    so the panels can be compared directly.

    Parameters
    ----------
    resource : str|int      Resource id to plot.
    day : str|date          Calendar day to plot.
    tols : list[str]        Tolerances to show (e.g., ["0min","5min","15min"]);
                            each must be a key of `periods_by_tol`.
    lunch_window : (float,float)|None  Shaded band (start_hour, end_hour);
                            None disables the band.
    save : bool             If True, export PNG via kaleido into SURVEY.
    prefix : str            Filename prefix for export.
    shuffle_panels : bool   If True, randomize panel order (for survey A/B/C).
    width, height : int     Figure size.
    x_hours : (float,float) Visible time range (start_hour, end_hour) shared by
                            all panels.
    seed : int|None         Seed for the panel shuffle. None keeps the previous
                            behaviour (global `random` state); an int makes the
                            order reproducible.

    Returns
    -------
    The plotly figure.
    """
    day = pd.to_datetime(day).date()
    tol_list = list(tols)
    if shuffle_panels:
        if seed is None:
            random.shuffle(tol_list)
        else:
            random.Random(seed).shuffle(tol_list)

    # One x-range for all panels. Empty panels used to get 07-19h while panels
    # with data got 06-20h, which made them not comparable.
    day_start = pd.Timestamp(day)
    x_range = [
        day_start + pd.Timedelta(hours=float(x_hours[0])),
        day_start + pd.Timedelta(hours=float(x_hours[1])),
    ]

    fig = make_subplots(
        rows=1, cols=len(tol_list), shared_yaxes=True, horizontal_spacing=0.03,
        subplot_titles=[f"tol={t}" for t in tol_list]
    )

    for i, t in enumerate(tol_list, start=1):
        p = periods_by_tol[t]
        sub = p[(p["resource"] == resource) & (p["day"] == day)].copy()

        if sub.empty:
            fig.add_annotation(text="No activity", row=1, col=i, showarrow=False)
        else:
            sub["resource"] = str(resource)  # single-row timeline
            tl = px.timeline(sub, x_start="start", x_end="end", y="resource")
            for tr in tl.data:
                fig.add_trace(tr, row=1, col=i)

        # The axis type is set explicitly because the traces are copied out of
        # the px.timeline figure rather than using its layout.
        fig.update_xaxes(range=x_range, row=1, col=i, type="date")

        # Highlight 11:00–14:00 (helps human raters detect lunch/merging)
        if lunch_window is not None:
            zx0 = day_start + pd.Timedelta(hours=float(lunch_window[0]))
            zx1 = day_start + pd.Timedelta(hours=float(lunch_window[1]))
            fig.add_vrect(x0=zx0, x1=zx1, row=1, col=i,
                          fillcolor="LightGrey", opacity=0.25, line_width=0)

    fig.update_yaxes(autorange="reversed", title_text="resource")
    fig.update_layout(title=f"{resource} — {day}", height=height, width=width)

    if save:
        out = SURVEY / f"{prefix}_{resource}_{day}.png"
        fig.write_image(str(out), scale=2)  # requires kaleido
        print("Saved:", out)

    return fig

# Quick preview: render (and save) panels for the three best-scoring candidates
for cand in candidates.head(3).itertuples(index=False):
    render_q3_panels(cand.resource, cand.day)

# Batch export (e.g., top-N for the survey) — uncomment as needed:
# for _, r in candidates.head(20).iterrows():
#     render_q3_panels(r["resource"], r["day"], shuffle_panels=True, save=True)




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Saved: /Users/macbookair/Documents/resource_availability_tool/figures/survey/Q3_A_2014-08-27.png




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




Saved: /Users/macbookair/Documents/resource_availability_tool/figures/survey/Q3_A_2014-05-25.png
Saved: /Users/macbookair/Documents/resource_availability_tool/figures/survey/Q3_A_2014-05-03.png




Support for Kaleido versions less than 1.0.0 is deprecated and will be removed after September 2025.
Please upgrade Kaleido to version 1.0.0 or greater (`pip install 'kaleido>=1.0.0'` or `pip install 'plotly[kaleido]'`).




In [27]:
#  Lunch window sensitivity (window × min_gap grid)

# Fix tolerance to the mid value (5min) for the lunch-window analysis.
# org_kpis returns (kpis, periods, daily spans, breaks); the KPI dict is not needed here.
_, p5, d5, b5 = org_kpis(events, tolerance="5min")
def calc_lunch_rate(breaks: pd.DataFrame, window=(11.5,13.5), min_gap=30) -> float:
    """Share of resources with at least one qualifying lunch break.

    A break qualifies when it starts inside `window` (fractional hours, both
    ends inclusive; seconds are ignored) and lasts at least `min_gap` minutes.
    Only resources present in `breaks` are counted. Returns 0.0 for an empty
    table.
    """
    if breaks.empty:
        return 0.0

    gap_start = pd.to_datetime(breaks["gap_start"])
    start_hr = gap_start.dt.hour + gap_start.dt.minute / 60

    in_window = start_hr.between(window[0], window[1])
    long_enough = breaks["gap_min"] >= min_gap

    flagged = breaks.copy()
    flagged["start_hr"] = start_hr
    flagged = flagged.assign(ok=in_window & long_enough)

    # max over booleans: True if any break of the resource qualifies
    per_resource = flagged.groupby("resource")["ok"].max()
    return float(per_resource.mean())

# Candidate lunch windows (fractional hours) and minimum break lengths (minutes)
windows = [(11.0,13.0), (11.5,13.5), (12.0,14.0)]
min_gaps = [20, 30, 45, 60, 90]

# One row per (window, min_gap) combination, windows in the outer loop
grid = [
    {
        "window": f"{w[0]}–{w[1]}",
        "min_gap": g,
        "lunch_rate": calc_lunch_rate(b5, window=w, min_gap=g),
    }
    for w in windows
    for g in min_gaps
]

lunch_sens = pd.DataFrame(grid)
display(lunch_sens)

fig = px.line(
    lunch_sens,
    x="min_gap",
    y="lunch_rate",
    color="window",
    markers=True,
    title=f"Lunch presence sensitivity ({DS})",
)
fig.update_yaxes(tickformat=".0%")
fig.show()


Unnamed: 0,window,min_gap,lunch_rate
0,11.0–13.0,20,0.818182
1,11.0–13.0,30,0.818182
2,11.0–13.0,45,0.818182
3,11.0–13.0,60,0.818182
4,11.0–13.0,90,0.818182
5,11.5–13.5,20,0.681818
6,11.5–13.5,30,0.681818
7,11.5–13.5,45,0.681818
8,11.5–13.5,60,0.636364
9,11.5–13.5,90,0.636364


In [28]:
# Quick sanity + reference hourly coverage (tol=5min)

# Sanity counters on the 5min tables; all three are expected to be zero.
bad_order = int(p5["end"].le(p5["start"]).sum())          # periods ending at/before their start
too_long  = int(p5["duration_min"].gt(24*60).sum())       # periods longer than 24 hours
d5_nonpos = int(d5["span_min"].le(0).sum())               # daily spans that are not positive
print(f"Sanity → bad_order: {bad_order}, >24h periods: {too_long}, non_positive_daily_spans: {d5_nonpos}")

# Mean availability_share per hour from the weekly availability matrix
wk = weekly_availability_matrix(p5)
org_hour = wk.groupby("hour", as_index=False)["availability_share"].mean()
fig = px.line(
    org_hour,
    x="hour",
    y="availability_share",
    title=f"Organization-wide hourly availability (tol=5min, {DS})",
)
fig.update_yaxes(tickformat=".0%")
fig.show()


Sanity → bad_order: 0, >24h periods: 0, non_positive_daily_spans: 0
