Requirements

In [None]:
# Install the packages listed in requirements.txt (the %pip magic installs into the running kernel's environment).
%pip install -r requirements.txt

In [None]:
Step 1: Generate Labels

Sample controls (this step reads output/sepsis_labels.csv, so run the label-extraction cell below first)

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Load ICU stays and sepsis cases.
# NOTE: output/sepsis_labels.csv is written by the label-extraction cell further
# down in this notebook, so that cell has to be run before this one.
icustays = pd.read_csv("input/ICUSTAYS.csv.gz", usecols=["ICUSTAY_ID", "INTIME", "OUTTIME"])
labels = pd.read_csv("output/sepsis_labels.csv")

# Get non-sepsis ICU stays (rows missing the ID or either timestamp are dropped)
sepsis_ids = set(labels["ICUSTAY_ID"])
non_sepsis = icustays[~icustays["ICUSTAY_ID"].isin(sepsis_ids)].dropna().copy()
print(f"[INFO] Found {len(non_sepsis)} non-sepsis ICU stays")

non_sepsis["INTIME"] = pd.to_datetime(non_sepsis["INTIME"])
non_sepsis["OUTTIME"] = pd.to_datetime(non_sepsis["OUTTIME"])

# A control timestamp must lie at least 12 h after INTIME and 12 h before OUTTIME,
# which is only possible for stays strictly longer than 24 h. Filtering BEFORE
# sampling keeps the control cohort the same size as the case cohort whenever
# enough eligible stays exist, instead of losing short stays after sampling.
min_stay = pd.Timedelta(hours=24)
stay_length = non_sepsis["OUTTIME"] - non_sepsis["INTIME"]
eligible = non_sepsis[stay_length > min_stay]

# Sample same number of controls (or as many as available)
n_controls = min(len(eligible), len(labels))
controls = eligible.sample(n=n_controls, replace=False, random_state=42).copy()

# Dedicated seeded generator so the pseudo-onset times are reproducible after
# Restart Kernel -> Run All (the global NumPy RNG was never seeded).
_CONTROL_TIME_RNG = np.random.default_rng(42)


# Pick a random timestamp 12+ hours after INTIME and 12h before OUTTIME
def random_control_time(row, rng=None):
    """Draw a uniformly random pseudo-onset time inside a control stay.

    Parameters
    ----------
    row : mapping with "INTIME" and "OUTTIME" timestamps (e.g. a DataFrame row).
    rng : optional numpy.random.Generator; defaults to the seeded module-level
        generator so repeated fresh runs yield the same timestamps.

    Returns
    -------
    pandas.Timestamp in [INTIME + 12h, OUTTIME - 12h), or pandas.NaT when the
    stay is too short for that window to exist (24 h or less).
    """
    generator = _CONTROL_TIME_RNG if rng is None else rng
    start = row["INTIME"] + pd.Timedelta(hours=12)
    end = row["OUTTIME"] - pd.Timedelta(hours=12)
    if start >= end:
        # NaT (not float NaN) keeps the resulting column datetime-typed.
        return pd.NaT
    fraction = float(generator.random())  # uniform in [0, 1)
    return start + (end - start) * fraction

# Give every sampled stay a pseudo-onset time; rows for which no timestamp
# could be drawn come back missing and are removed.
pseudo_onsets = controls.apply(random_control_time, axis=1)
controls["SEPSIS_ONSET"] = pseudo_onsets
controls = controls.dropna(subset=["SEPSIS_ONSET"])

# Save as pseudo-sepsis labels (same two columns as the sepsis label file, for reuse)
control_labels = controls[["ICUSTAY_ID", "SEPSIS_ONSET"]]
control_labels.to_csv("output/control_labels.csv", index=False)
print(f"[DONE] Sampled {len(controls)} control patients → output/control_labels.csv")


Extract labels

In [None]:
import pandas as pd
from tqdm import tqdm

# Input tables used to derive the sepsis labels
ICUSTAYS_PATH = "input/ICUSTAYS.csv.gz"
PRESCRIPTIONS_PATH = "input/PRESCRIPTIONS.csv.gz"
MICROBIO_PATH = "input/MICROBIOLOGYEVENTS.csv.gz"

# ICU stays: one row per stay with its admission ID and ICU admission time
icustays = pd.read_csv(ICUSTAYS_PATH, usecols=["ICUSTAY_ID", "HADM_ID", "INTIME"])
print(f"[INFO] Loaded {len(icustays)} ICU stays")

# Prescriptions, restricted to antibiotics. The match is a name-fragment proxy:
# a drug counts as an antibiotic if its lower-cased name contains one of the
# fragments below.
antibiotic_pattern = "cillin|cef|micin|penem|floxacin|mycin"
prescriptions = pd.read_csv(PRESCRIPTIONS_PATH, usecols=["HADM_ID", "STARTDATE", "DRUG"])
prescriptions = prescriptions.dropna(subset=["STARTDATE", "DRUG"])
is_antibiotic = prescriptions["DRUG"].str.lower().str.contains(antibiotic_pattern, na=False)
prescriptions = prescriptions[is_antibiotic]

# Microbiology events that carry a chart date
microbio = pd.read_csv(MICROBIO_PATH, usecols=["HADM_ID", "CHARTDATE"])
microbio = microbio.dropna(subset=["CHARTDATE"])

# Work on slim, de-duplicated (admission, timestamp) tables with the dates parsed
# explicitly. Parsing first means the row-wise max and the comparison against
# INTIME operate on real timestamps rather than on raw CSV strings. Dropping
# duplicate rows does not change the result (only the distinct pair values feed
# the final min) but keeps the many-to-many join below much smaller.
abx_times = prescriptions[["HADM_ID", "STARTDATE"]].copy()
abx_times["STARTDATE"] = pd.to_datetime(abx_times["STARTDATE"])
abx_times = abx_times.drop_duplicates()

culture_times = microbio[["HADM_ID", "CHARTDATE"]].copy()
culture_times["CHARTDATE"] = pd.to_datetime(culture_times["CHARTDATE"])
culture_times = culture_times.drop_duplicates()

stay_times = icustays[["ICUSTAY_ID", "HADM_ID", "INTIME"]].copy()
stay_times["INTIME"] = pd.to_datetime(stay_times["INTIME"])

# Merge to find suspected infection window: for every antibiotic/culture pair
# within an admission, suspicion time is the later of the two events.
merged = pd.merge(abx_times, culture_times, on="HADM_ID", how="inner")
merged["SUSPECTED_TIME"] = merged[["STARTDATE", "CHARTDATE"]].max(axis=1)

# Map to ICU stay, keep suspicion times after ICU admission, and take the
# earliest one per stay.
merged = pd.merge(merged, stay_times, on="HADM_ID", how="inner")
merged = merged[merged["SUSPECTED_TIME"] > merged["INTIME"]]
suspected_df = merged.groupby("ICUSTAY_ID")["SUSPECTED_TIME"].min().reset_index()

# Simulated SOFA threshold logic (you can refine later)
# Placeholder: label everyone with suspicion as "sepsis"
suspected_df["SEPSIS_ONSET"] = suspected_df["SUSPECTED_TIME"]

# Save label file
suspected_df[["ICUSTAY_ID", "SEPSIS_ONSET"]].to_csv("output/sepsis_labels.csv", index=False)
print(f"[INFO] Labeled {len(suspected_df)} sepsis cases")


Step 2: Filter CHARTEVENTS

Extract only relevant vitals from the full CHARTEVENTS dataset for faster processing.

In [None]:
import pandas as pd
from tqdm import tqdm

# Source (full CHARTEVENTS dump) and destination (vitals-only subset)
INPUT_FILE = "input/CHARTEVENTS.csv.gz"
OUTPUT_FILE = "output/filtered_chartevents.csv.gz"

# Vital sign -> list of CHARTEVENTS ITEMIDs to keep (MIMIC-III specific).
# NOTE(review): each vital maps to a single ITEMID here; if the dataset records
# the same vital under more than one ITEMID, rows for the others are dropped —
# confirm this coverage is intended.
ITEM_IDS = {
    "Heart Rate": [211],
    "Systolic BP": [51],
    "Diastolic BP": [8368],
    "Mean BP": [456],
    "Respiratory Rate": [618],
    "SpO2": [646],
    "Temperature": [223761],
}

# Flat list of every ITEMID above, in dictionary order
ITEMID_LIST = [item_id for id_group in ITEM_IDS.values() for item_id in id_group]

print("[INFO] Filtering CHARTEVENTS.csv.gz — this may take ~5–15 min...")

# Stream the file in 1M-row chunks, reading only the four columns needed.
reader = pd.read_csv(
    INPUT_FILE,
    usecols=["ICUSTAY_ID", "ITEMID", "CHARTTIME", "VALUENUM"],
    chunksize=1_000_000,
    low_memory=False,
)

# Keep only the rows whose ITEMID is one of the selected vitals.
chunks = []
for chunk in tqdm(reader):
    is_vital = chunk["ITEMID"].isin(ITEMID_LIST)
    chunks.append(chunk[is_vital])

# Stitch the surviving rows together and write them out gzip-compressed.
filtered_df = pd.concat(chunks)
filtered_df.to_csv(OUTPUT_FILE, index=False, compression="gzip")

n_rows_kept = len(filtered_df)
print(f"[DONE] Saved filtered CHARTEVENTS to: {OUTPUT_FILE}")
print(f"[INFO] Rows after filtering: {n_rows_kept}")


Step 3: Extract Vitals Features