In [1]:
import sys, subprocess, pkgutil
print("Python exe:", sys.executable)

if pkgutil.find_loader("pm4py") is None:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "pm4py", "lxml"])

import pm4py
print("pm4py:", pm4py.__version__)


Python exe: /opt/anaconda3/bin/python


  if pkgutil.find_loader("pm4py") is None:


pm4py: 2.7.17


In [2]:
from pathlib import Path
import pandas as pd
import pm4py, logging

DATA = Path().resolve().parent / "data"
CACHE = DATA / "_cache"
CACHE.mkdir(exist_ok=True)

USE_CASE_SAMPLE = {
    "BPI_2017_tail10k_csv": None,     
    "Sepsis_xes":            None,     
    "BPI_2019_xes":          1000,     
}

XES_COLMAP = {
    "case:concept:name": "case_id",
    "concept:name":      "activity",
    "org:resource":      "resource",
    "time:timestamp":    "timestamp",
    "lifecycle:transition": "lifecycle",
}

EXPECTED = ["case_id", "activity", "resource", "timestamp", "lifecycle"]
VALID_TAGS = {"start","complete","schedule","resume","suspend","abort","withdraw"}


In [3]:


def _summarize(df: pd.DataFrame, name: str):
    df = df.copy()
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
    print(
        f"✅ {name}: {len(df):,} events | "
        f"cases={df['case_id'].nunique():,} | "
        f"resources={df['resource'].nunique():,} | "
        f"activities={df['activity'].nunique():,} | "
        f"range=[{df['timestamp'].min()} .. {df['timestamp'].max()}]"
    )
    display(df.head())


def _clean_common(df: pd.DataFrame) -> pd.DataFrame:
    df = df.rename(columns={c: c.strip() for c in df.columns})
    
    if "resource" not in df.columns:
        df["resource"] = "Unknown"
    if "lifecycle" not in df.columns:
        df["lifecycle"] = "complete"
    if "activity" not in df.columns:
        raise KeyError("Missing required column: 'activity'")
    if "case_id" not in df.columns:
        raise KeyError("Missing required column: 'case_id'")
    if "timestamp" not in df.columns:
        raise KeyError("Missing required column: 'timestamp'")

    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce", utc=True).dt.tz_convert(None)
    bad = df["timestamp"].isna().sum()
    if bad:
        logging.warning(f"Dropping {bad} rows with invalid timestamps.")
        df = df.dropna(subset=["timestamp"])

    df["lifecycle"] = df["lifecycle"].astype(str).str.lower()
    df["resource"]  = df["resource"].fillna("Unknown").astype(str)

    df = df[df["lifecycle"].isin(VALID_TAGS) | ~df["lifecycle"].notna()]

    cols = [c for c in EXPECTED if c in df.columns]
    return df[cols].sort_values("timestamp").reset_index(drop=True)


def load_xes(path: Path, case_sample: int | None, name: str) -> pd.DataFrame:
    raw = pm4py.read_xes(str(path))

    df = raw.rename(columns=XES_COLMAP).copy()

    if "case_id" not in df.columns:
        if "case:concept:name" in raw.columns:
            df["case_id"] = raw["case:concept:name"]
        else:
            raise KeyError("No case_id (case:concept:name) found in XES.")

    if "activity" not in df.columns:
        df["activity"] = raw["concept:name"] if "concept:name" in raw.columns else "Activity"

    if "resource" not in df.columns:
        alts = ["org:resource", "Resource", "resource", "org:role", "role", "org:group"]
        alt = next((c for c in alts if c in raw.columns), None)
        df["resource"] = raw[alt] if alt else "Unknown"

    if "lifecycle" not in df.columns:
        df["lifecycle"] = "complete"

    if "timestamp" not in df.columns:
        ts_col = next((c for c in raw.columns if c.lower().startswith("time:timestamp")), None)
        if ts_col is None:
            raise KeyError("No timestamp column found in XES.")
        df["timestamp"] = raw[ts_col]

    if case_sample:
        keep_cases = (
            df["case_id"].drop_duplicates()
              .sample(min(case_sample, df["case_id"].nunique()), random_state=42)
        )
        df = df[df["case_id"].isin(keep_cases)].copy()

    df = _clean_common(df)
    _summarize(df, name)
    return df


def load_csv(path: Path, name: str) -> pd.DataFrame:
    df = pd.read_csv(path)

    ts_cols = [c for c in df.columns if "timestamp" in c.lower()]
    if ts_cols:
        if ts_cols[0] != "timestamp":
            df = df.rename(columns={ts_cols[0]: "timestamp"})
    else:
        raise KeyError("No timestamp-like column found in CSV.")

    if "resource" not in df.columns:
        df["resource"] = "Unknown"
    if "lifecycle" not in df.columns:
        df["lifecycle"] = "complete"

    df = _clean_common(df)
    _summarize(df, name)
    return df


def save_cache(df: pd.DataFrame, stem: str) -> Path:
    out = CACHE / f"{stem}.parquet"
    df.to_parquet(out, index=False)
    print(f" Cached -> {out.as_posix()}")
    return out


In [4]:
results = {}

#  BPI_2017_tail10k.csv
df_bpi2017 = load_csv(DATA / "BPI_2017_tail10k.csv", "BPI_2017_tail10k")
_summarize(df_bpi2017, "BPI_2017_tail10k")
results["BPI_2017_tail10k"] = save_cache(df_bpi2017, "bpi2017_clean")

#Sepsis (XES)
df_sepsis = load_xes(DATA / "sepsis_cases.xes", USE_CASE_SAMPLE["Sepsis_xes"], "Sepsis_xes")
_summarize(df_sepsis, "Sepsis_xes")
results["Sepsis_xes"] = save_cache(df_sepsis, "sepsis_clean")

# BPI Challenge 2019 
df_bpi2019 = load_xes(DATA / "BPI_Challenge_2019.xes", USE_CASE_SAMPLE["BPI_2019_xes"], "BPI_2019_xes")
_summarize(df_bpi2019, "BPI_2019_xes")
results["BPI_2019_xes"] = save_cache(df_bpi2019, "bpi2019_clean")

print("Done.")




✅ BPI_2017_tail10k: 9,105 events | cases=908 | resources=73 | activities=20 | range=[2017-01-09 16:27:15.229000 .. 2017-02-01 14:11:03.499000]


Unnamed: 0,case_id,activity,resource,timestamp,lifecycle
0,Application_171367276,W_Call incomplete files,User_61,2017-01-09 16:27:15.229,suspend
1,Application_2089121798,W_Call incomplete files,User_61,2017-01-09 16:27:52.747,resume
2,Application_600788947,O_Sent (online only),User_40,2017-01-09 16:29:15.912,complete
3,Application_63353190,W_Call incomplete files,User_93,2017-01-09 16:30:18.530,schedule
4,Application_63353190,W_Call incomplete files,User_93,2017-01-09 16:30:18.533,start


✅ BPI_2017_tail10k: 9,105 events | cases=908 | resources=73 | activities=20 | range=[2017-01-09 16:27:15.229000 .. 2017-02-01 14:11:03.499000]


Unnamed: 0,case_id,activity,resource,timestamp,lifecycle
0,Application_171367276,W_Call incomplete files,User_61,2017-01-09 16:27:15.229,suspend
1,Application_2089121798,W_Call incomplete files,User_61,2017-01-09 16:27:52.747,resume
2,Application_600788947,O_Sent (online only),User_40,2017-01-09 16:29:15.912,complete
3,Application_63353190,W_Call incomplete files,User_93,2017-01-09 16:30:18.530,schedule
4,Application_63353190,W_Call incomplete files,User_93,2017-01-09 16:30:18.533,start


 Cached -> /Users/macbookair/Documents/resource_availability_tool/data/_cache/bpi2017_clean.parquet




parsing log, completed traces ::   0%|          | 0/1050 [00:00<?, ?it/s]

✅ Sepsis_xes: 15,214 events | cases=1,050 | resources=26 | activities=16 | range=[2013-11-07 08:18:29 .. 2015-06-05 12:25:11]


Unnamed: 0,case_id,activity,resource,timestamp,lifecycle
0,XJ,ER Registration,A,2013-11-07 08:18:29,complete
1,XJ,ER Triage,C,2013-11-07 08:29:18,complete
2,XJ,ER Sepsis Triage,A,2013-11-07 08:37:32,complete
3,XJ,LacticAcid,B,2013-11-07 08:51:00,complete
4,XJ,Leucocytes,B,2013-11-07 08:51:00,complete


✅ Sepsis_xes: 15,214 events | cases=1,050 | resources=26 | activities=16 | range=[2013-11-07 08:18:29 .. 2015-06-05 12:25:11]


Unnamed: 0,case_id,activity,resource,timestamp,lifecycle
0,XJ,ER Registration,A,2013-11-07 08:18:29,complete
1,XJ,ER Triage,C,2013-11-07 08:29:18,complete
2,XJ,ER Sepsis Triage,A,2013-11-07 08:37:32,complete
3,XJ,LacticAcid,B,2013-11-07 08:51:00,complete
4,XJ,Leucocytes,B,2013-11-07 08:51:00,complete


 Cached -> /Users/macbookair/Documents/resource_availability_tool/data/_cache/sepsis_clean.parquet


parsing log, completed traces ::   0%|          | 0/251734 [00:00<?, ?it/s]

✅ BPI_2019_xes: 6,764 events | cases=1,000 | resources=274 | activities=30 | range=[2018-01-02 08:10:00 .. 2019-01-17 14:23:00]


Unnamed: 0,case_id,activity,resource,timestamp,lifecycle
0,4507000258_00001,Receive Order Confirmation,batch_00,2018-01-02 08:10:00,complete
1,4507000258_00001,Create Purchase Order Item,batch_00,2018-01-02 08:10:00,complete
2,4507000335_00010,Create Purchase Order Item,user_052,2018-01-02 10:59:00,complete
3,4507000369_00020,Create Purchase Order Item,user_085,2018-01-02 11:51:00,complete
4,4507004024_00100,Vendor creates invoice,NONE,2018-01-02 22:59:00,complete


✅ BPI_2019_xes: 6,764 events | cases=1,000 | resources=274 | activities=30 | range=[2018-01-02 08:10:00 .. 2019-01-17 14:23:00]


Unnamed: 0,case_id,activity,resource,timestamp,lifecycle
0,4507000258_00001,Receive Order Confirmation,batch_00,2018-01-02 08:10:00,complete
1,4507000258_00001,Create Purchase Order Item,batch_00,2018-01-02 08:10:00,complete
2,4507000335_00010,Create Purchase Order Item,user_052,2018-01-02 10:59:00,complete
3,4507000369_00020,Create Purchase Order Item,user_085,2018-01-02 11:51:00,complete
4,4507004024_00100,Vendor creates invoice,NONE,2018-01-02 22:59:00,complete


 Cached -> /Users/macbookair/Documents/resource_availability_tool/data/_cache/bpi2019_clean.parquet
Done.
