In [5]:
# ---- image export setup ----
import sys, subprocess, importlib

def pip_i(spec: str):
    """Install ``spec`` with pip into the interpreter that runs this kernel.

    Args:
        spec: A pip requirement specifier, e.g. ``"plotly>=6.1.1"``.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # `sys.executable -m pip` targets the environment of the running interpreter.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", spec])
    # Import finders cache directory listings; flush them so a package that was
    # installed a moment ago can be imported without restarting the kernel.
    importlib.invalidate_caches()

# Ensure Plotly is recent enough. `pkg_resources` is deliberately not imported:
# nothing here uses it, and in environments that do not ship it the resulting
# ImportError would trigger a needless reinstall on every run of this cell.
try:
    import plotly
    from packaging.version import Version
    if Version(plotly.__version__) < Version("6.1.1"):
        pip_i("plotly>=6.1.1")
except Exception:
    # Best effort: a missing package or an unparsable version falls back to installing.
    pip_i("plotly>=6.1.1")

# Pin Kaleido to the compatible build
# NOTE(review): Plotly's static-export docs describe Kaleido v1 as the engine for
# Plotly >= 6.1 — confirm that the 0.2.1 pin is still what this project needs.
try:
    import kaleido
    from packaging.version import Version
    if Version(kaleido.__version__) != Version("0.2.1"):
        pip_i("kaleido==0.2.1")
except Exception:
    pip_i("kaleido==0.2.1")

import plotly.io as pio

# Only set scope if kaleido is actually loaded
# (a missing `pio.kaleido` or a falsy `scope` both end up in the else branch).
if getattr(getattr(pio, "kaleido", None), "scope", None):
    pio.kaleido.scope.default_width  = 1600
    pio.kaleido.scope.default_height = 900
else:
    print("Kaleido scope not available yet")

# where to store survey images
# The folder is resolved relative to the parent of the current working directory.
from pathlib import Path
SURVEY = (Path().resolve().parent / "figures" / "survey")
SURVEY.mkdir(parents=True, exist_ok=True)
print("Export folder:", SURVEY)


Kaleido scope not available yet
Export folder: /Users/macbookair/Documents/resource_availability_tool/figures/survey


In [6]:

from pathlib import Path
import sys, pandas as pd

# Make the project's src/ folder (sibling of the working directory) importable,
# placing it first on sys.path unless it is already listed.
SRC = Path().resolve().parent / "src"
if not any(entry == str(SRC) for entry in sys.path):
    sys.path.insert(0, str(SRC))

from availability import (
    get_active_periods, compute_daily_spans, extract_breaks, weekly_availability_matrix
)
from multitask import add_overlap_info, overlap_summary
from transform import extract_start_end  

DATA  = Path().resolve().parent / "data"
CACHE = DATA / "_cache"          # parquet caches from 01_import
DERIV = DATA / "derived"         # outputs of this notebook
# parents=True: on a fresh checkout data/ may not exist yet, and a plain
# mkdir(exist_ok=True) would raise FileNotFoundError in that case.
DERIV.mkdir(parents=True, exist_ok=True)

print("Paths\n- DATA :", DATA, "\n- CACHE:", CACHE, "\n- DERIV:", DERIV)

# Dataset key -> file name of its cleaned parquet cache inside CACHE.
DATASETS = {
    "bpi2017":  "bpi2017_clean.parquet",
    "sepsis":   "sepsis_clean.parquet",
    "bpi2019":  "bpi2019_clean.parquet",
}


# NOTE(review): presumably RUN_ALL=True processes every entry of DATASETS and
# SELECTED is used otherwise — the consuming cell is not part of this section.
RUN_ALL  = True
SELECTED = "sepsis"


# Tolerance strings; presumably the values handed to process_one(..., tolerances=TOLS).
TOLS = ["5min"]



Paths
- DATA : /Users/macbookair/Documents/resource_availability_tool/data 
- CACHE: /Users/macbookair/Documents/resource_availability_tool/data/_cache 
- DERIV: /Users/macbookair/Documents/resource_availability_tool/data/derived


In [7]:
def summarize_events(df: pd.DataFrame, name: str):
    """Print a one-line overview of an event log and display its first three rows.

    The input frame is left untouched; a copy with ``timestamp`` coerced to
    datetimes (unparsable values become NaT) is used for the statistics.

    Args:
        df: Event log with ``case_id``, ``resource``, ``activity`` and ``timestamp`` columns.
        name: Label printed in front of the summary line.
    """
    events = df.assign(timestamp=pd.to_datetime(df["timestamp"], errors="coerce"))
    stamps = events["timestamp"]
    pieces = [
        f"✅ {name}: {len(events):,} events",
        f"cases={events['case_id'].nunique():,}",
        f"resources={events['resource'].nunique():,}",
        f"activities={events['activity'].nunique():,}",
        f"range=[{stamps.min()} .. {stamps.max()}]",
    ]
    print(" | ".join(pieces))
    # `display` is the rich-output helper provided by the IPython/Jupyter session.
    display(events.head(3))


def load_cached_events(stem_or_file: str) -> pd.DataFrame:
    """Load one parquet cache (produced by 01_import) from the CACHE folder.

    Args:
        stem_or_file: Cache name with or without the ``.parquet`` suffix. Only the
            final path component is used; any directory part is ignored.

    Returns:
        The cached event log.

    Raises:
        FileNotFoundError: If the file is not in CACHE (the message lists what is).
        KeyError: If one of the expected event-log columns is missing.
    """
    filename = Path(stem_or_file).name
    filename = filename if filename.endswith(".parquet") else f"{filename}.parquet"
    cache_path = (CACHE / filename).resolve()

    if not cache_path.exists():
        visible = [hit.name for hit in CACHE.glob("*.parquet")]
        visible.sort()
        raise FileNotFoundError(f"Cache not found: {cache_path.name}\nIn _cache I can see: {visible}")

    events = pd.read_parquet(cache_path)

    # Columns the cells in this notebook expect an event log to provide.
    required = ("case_id", "activity", "resource", "timestamp", "lifecycle")
    absent = [col for col in required if col not in events.columns]
    if absent:
        raise KeyError(f"Missing columns in cached file {cache_path.name}: {absent}")
    return events


def build_union_periods(events: pd.DataFrame, tolerance="5min") -> pd.DataFrame:
    """Build resource-level union periods (availability) and add their length in minutes.

    Args:
        events: Event log handed to ``get_active_periods``.
        tolerance: Tolerance forwarded unchanged to ``get_active_periods``.

    Returns:
        A copy of the periods with ``start``/``end`` as datetimes and a
        ``duration_min`` column (``end - start`` in minutes).
    """
    periods = get_active_periods(events, tolerance=tolerance).copy()
    for edge in ("start", "end"):
        periods[edge] = pd.to_datetime(periods[edge])
    elapsed = periods["end"] - periods["start"]
    periods["duration_min"] = elapsed.dt.total_seconds() / 60
    return periods


def compute_task_periods(events: pd.DataFrame) -> pd.DataFrame:
    """Pair 'start' and 'complete' lifecycle events into task intervals.

    These per-task intervals are the input for the overlap analysis.

    Args:
        events: Event log handed to ``extract_start_end``.

    Returns:
        The frame from ``extract_start_end`` with ``start_time``/``end_time``
        renamed to ``start``/``end`` and both converted to datetimes.
    """
    pairs = extract_start_end(events, start_label="start", end_label="complete")
    tasks = pairs.rename(columns={"start_time": "start", "end_time": "end"})
    for edge in ("start", "end"):
        tasks[edge] = pd.to_datetime(tasks[edge])
    return tasks


def save_frames(stem: str, **frames) -> dict:
    """Write each named DataFrame to ``DERIV / "<stem>_<name>.parquet"``.

    Args:
        stem: Common file-name prefix shared by all outputs.
        **frames: Name -> DataFrame; the name becomes the file-name suffix.

    Returns:
        Dict mapping each name to the path it was written to, in call order.
    """
    targets = {key: DERIV / f"{stem}_{key}.parquet" for key in frames}
    for key, target in targets.items():
        # NOTE(review): index=False drops the index — confirm that no saved frame
        # keeps meaningful labels (e.g. resource names) in its index.
        frames[key].to_parquet(target, index=False)
    return targets


In [8]:
def process_one(ds_key: str, cache_file: str, tolerances: list[str]):
    """Run the availability + overlap pipeline for one dataset and save the results.

    For every tolerance the union periods, daily spans, breaks and weekly matrix
    are rebuilt and written to DERIV together with the task periods and overlap
    KPIs, under the stem ``"<ds_key>__tol_<tolerance without spaces>"``.

    Args:
        ds_key: Dataset label used in log lines and output file names.
        cache_file: Cache file name (or stem) passed to ``load_cached_events``.
        tolerances: Tolerance strings, one pipeline pass per entry.

    Returns:
        None. Progress is printed and the frames are written via ``save_frames``.
    """
    print("\n", "="*12, ds_key, "="*12)
    events = load_cached_events(cache_file)
    summarize_events(events, ds_key)

    # Task periods and overlap KPIs depend only on `events`, never on the
    # tolerance, so they are computed once (during the first pass, at the same
    # point as before) and reused for every further tolerance.
    tasks = olap = None

    for tol in tolerances:
        print(f"→ Building UNION periods (tolerance={tol}) ...")
        periods = build_union_periods(events, tolerance=tol)
        print(f"   periods: {len(periods):,} rows | >24h={(periods['duration_min'] > 24*60).sum()}")

        # availability
        daily  = compute_daily_spans(periods)
        breaks = extract_breaks(periods)
        weekly = weekly_availability_matrix(periods)
        print("   daily:", daily.shape, "| breaks:", breaks.shape, "| weekly:", weekly.shape)

        # task-based overlap
        if tasks is None:
            tasks = compute_task_periods(events)
            if tasks.empty:
                # Empty KPI frame with the usual columns so the output file still exists.
                olap = pd.DataFrame(columns=["resource","n_periods","busy_min","overlap_min","overlap_share"])
            else:
                olap = overlap_summary(tasks)
        if tasks.empty:
            print("   tasks: 0 rows → no start/complete pairs; skipping overlap KPIs.")
        else:
            print("   tasks:", tasks.shape, "| overlap_kpi:", olap.shape)

        # Spaces are stripped so a tolerance such as "10 min" yields a clean file name.
        stem = f"{ds_key}__tol_{tol.replace(' ','')}"
        out = save_frames(
            stem,
            periods=periods, daily_spans=daily, breaks=breaks, weekly=weekly,
            task_periods=tasks, overlap_kpi=olap
        )
        print("   saved:")
        for k, p in out.items():
            print(f"     - {k:13s} -> {p.name}")


In [9]:
# example test
import pandas as pd
# Two tasks of the same resource R1 that overlap in time:
# case 1 runs 10:00-10:10 and case 2 runs 10:05-10:20, i.e. 5 shared minutes.
df_dummy = pd.DataFrame({
    "case_id":   ["1"] * 2 + ["2"] * 2,
    "activity":  ["A"] * 2 + ["B"] * 2,
    "resource":  ["R1"] * 4,
    "timestamp": pd.to_datetime([
        "2024-01-01 10:00", "2024-01-01 10:10",
        "2024-01-01 10:05", "2024-01-01 10:20",
    ]),
    "lifecycle": ["start", "complete"] * 2,
})
tasks = compute_task_periods(df_dummy)
display(tasks.loc[:, ["resource", "start", "end"]])
display(overlap_summary(tasks))


Unnamed: 0,resource,start,end
0,R1,2024-01-01 10:00:00,2024-01-01 10:10:00
1,R1,2024-01-01 10:05:00,2024-01-01 10:20:00


Unnamed: 0,resource,n_periods,busy_min,overlap_min,overlap_share
0,R1,2,20.0,5.0,0.25
