# MIMIC-IV data - dataset construction: inputevents

In [None]:
import gzip
from pathlib import Path
from zipfile import ZipFile

import numpy as np
import pandas as pd
import pyarrow
import pyarrow.csv
import pyarrow.parquet

# Load tables

## Table Schema

In [None]:
# Locations, all relative to the directory the notebook is started from:
# the MIMIC-IV zip archive, the folder with processed tables, and the folder
# that receives the raw tables converted to parquet.
rawdata_file = Path.cwd().joinpath("mimic-iv-1.0.zip")
dataset_path = Path.cwd().joinpath("processed")
rawdata_path = Path.cwd().joinpath("raw")


# Table key -> member path of the gzip-compressed CSV inside the archive.
files = dict(
    inputevents="mimic-iv-1.0/icu/inputevents.csv.gz",
    inputitems="mimic-iv-1.0/icu/d_items.csv.gz",
)

# Dictionary-encoded strings (int32 codes); these arrive in pandas as categoricals.
CATEGORY = pyarrow.dictionary("int32", "string")
# Identifier columns are plain integers.
# Alternative kept for reference: pyarrow.dictionary("int32", "int32", ordered=True)
ID_TYPE = "int32"

# Explicit Arrow type for every CSV column, grouped by type instead of listed
# one by one. The types are looked up by column name when the CSV is parsed.
column_types = {
    "inputevents": {
        **dict.fromkeys(
            [
                "amount",
                "originalamount",
                "originalrate",
                "patientweight",
                "rate",
                "totalamount",
            ],
            "float32",
        ),
        **dict.fromkeys(
            [
                "amountuom",
                "ordercategorydescription",
                "ordercategoryname",
                "ordercomponenttypedescription",
                "rateuom",
                "secondaryordercategoryname",
                "statusdescription",
                "totalamountuom",
            ],
            CATEGORY,
        ),
        **dict.fromkeys(
            ["hadm_id", "itemid", "linkorderid", "orderid", "stay_id", "subject_id"],
            ID_TYPE,
        ),
        **dict.fromkeys(["endtime", "starttime", "storetime"], "timestamp[s]"),
        **dict.fromkeys(["continueinnextdept", "isopenbag"], "bool"),
        "cancelreason": "int32",
    },
    "inputitems": {
        **dict.fromkeys(
            ["abbreviation", "category", "label", "linksto", "param_type", "unitname"],
            CATEGORY,
        ),
        **dict.fromkeys(["highnormalvalue", "lownormalvalue"], "float32"),
        "itemid": ID_TYPE,
    },
}

# Strings that are parsed as missing values when reading the CSV files.
# Each spelling is listed exactly once (a repeated "NULL" entry was removed).
null_values = [
    "-",
    "-1.#IND",
    "-1.#QNAN",
    "-NaN",
    "-nan",
    "?",
    "",
    "#N/A N/A",
    "#N/A",
    "#NA",
    "#na",
    "<N/A>",
    "<n/a>",
    "<NA>",
    "<na>",
    "1.#IND",
    "1.#QNAN",
    "INFORMATION NOT AVAILABLE",
    "N/A",
    "n/a",
    "NA",
    "na",
    "NAN",
    "NaN",
    "nan",
    "NONE",
    "None",
    "none",
    "NULL",
    "Null",
    "null",
    "UNABLE TO OBTAIN",
    "UNKNOWN",
    "unknown",
]

# Type name -> pandas nullable extension dtype. Only string, boolean and
# integer types are listed; a lookup for any other type name yields None.
types_map = dict(
    string=pd.StringDtype(),
    bool=pd.BooleanDtype(),
    int8=pd.Int8Dtype(),
    int16=pd.Int16Dtype(),
    int32=pd.Int32Dtype(),
    int64=pd.Int64Dtype(),
    uint8=pd.UInt8Dtype(),
    uint16=pd.UInt16Dtype(),
    uint32=pd.UInt32Dtype(),
    uint64=pd.UInt64Dtype(),
)

## Load `Inputevents`

In [None]:
# shape: (10M, 26) ⇝ 0.3B values.
key = "inputevents"

# Parse with the fixed column types; the custom null markers are applied to
# string columns as well (strings_can_be_null).
convert_options = pyarrow.csv.ConvertOptions(
    column_types=column_types[key],
    strings_can_be_null=True,
    null_values=null_values,
)

# The table is a gzip-compressed CSV stored inside the zip archive, so it is
# unwrapped layer by layer and handed to the Arrow CSV reader as a file object.
with ZipFile(rawdata_file) as archive:
    with archive.open(files[key]) as compressed_file:
        with gzip.open(compressed_file) as file:
            inputevents = pyarrow.csv.read_csv(file, convert_options=convert_options)
inputevents.shape, inputevents.schema

### Store and reload as pandas

In [None]:
# Make sure the output folder exists; nothing else in this notebook creates it,
# and writing into a missing directory fails.
rawdata_path.mkdir(parents=True, exist_ok=True)
pyarrow.parquet.write_table(inputevents, rawdata_path / f"{key}.parquet")

# Convert the Arrow table to pandas. Arrow types with an entry in `types_map`
# are mapped to the listed pandas nullable dtype; the mapper returns None for
# all other types. `self_destruct=True` lets pyarrow release the table's
# buffers during conversion, which is safe here because the name is rebound to
# the resulting DataFrame and the Arrow table is not used again.
inputevents = inputevents.to_pandas(self_destruct=True, types_mapper=types_map.get)

# Overview: dtype and number of distinct values per column.
pd.DataFrame({"type": inputevents.dtypes, "uniques": inputevents.nunique()})

## Load `inputitems`

In [None]:
key = "inputitems"

# Parse with the fixed column types; the custom null markers are applied to
# string columns as well (strings_can_be_null).
convert_options = pyarrow.csv.ConvertOptions(
    column_types=column_types[key],
    strings_can_be_null=True,
    null_values=null_values,
)

# Unwrap zip archive -> gzip member -> CSV stream and parse it with Arrow.
with ZipFile(rawdata_file) as archive:
    with archive.open(files[key]) as compressed_file:
        with gzip.open(compressed_file) as file:
            inputitems = pyarrow.csv.read_csv(file, convert_options=convert_options)
inputitems.shape, inputitems.schema

### Store and reload as pandas

In [None]:
# Make sure the output folder exists; nothing else in this notebook creates it,
# and writing into a missing directory fails.
rawdata_path.mkdir(parents=True, exist_ok=True)
pyarrow.parquet.write_table(inputitems, rawdata_path / f"{key}.parquet")

# Convert to pandas, mapping Arrow types listed in `types_map` to pandas
# nullable dtypes. The Arrow table is consumed (`self_destruct=True`) and the
# name is rebound to the DataFrame, so the table is never touched afterwards.
inputitems = inputitems.to_pandas(self_destruct=True, types_mapper=types_map.get)
inputitems

# Select relevant Subset of Data

## Only keep selected Patients

In [None]:
# Admissions that survived the earlier preprocessing step.
admissions = pd.read_parquet(dataset_path / "admissions_processed.parquet")

# Restrict the events first to known admissions, then to known patients,
# reporting what is left after each pass.
for key in ("hadm_id", "subject_id"):
    mask = inputevents[key].isin(admissions[key])
    n_removed = (~mask).sum()
    inputevents = inputevents[mask]
    n_patients = inputevents["subject_id"].nunique()
    n_admissions = inputevents["hadm_id"].nunique()
    print(f"Removing {n_removed} {key}")
    print(f"Number of patients remaining: {n_patients}")
    print(f"Number of admissions remaining: {n_admissions}")
    print(f"Number of events remaining: {inputevents.shape}")

## Only keep columns of interest

In [None]:
# Columns kept for the rest of the pipeline, written by theme and then sorted
# so that the resulting column order is alphabetical.
selected_columns = sorted(
    [
        # identifiers
        "subject_id",
        "hadm_id",
        "itemid",
        # timing
        "starttime",
        "endtime",
        # dosage
        "amount",
        "amountuom",
        "rate",
        "rateuom",
        "patientweight",
        # kind of order
        "ordercategorydescription",
    ]
)

inputevents = inputevents[selected_columns]

## Only keep common items

In [None]:
# Attach the item dictionary (label, category, ...) to every event. This uses
# the default inner join, so events without a matching itemid are dropped.
inputevents = inputevents.merge(inputitems, on="itemid")
n_patients = inputevents["subject_id"].nunique()
print(f"Number of events remaining: {inputevents.shape}")
print(f"Number of patients remaining: {n_patients}")

In [None]:
N_TOP = 50

# Number of distinct patients that received each item, by label.
item_freq = inputevents.groupby("label")["subject_id"].nunique()

# Keep only the N_TOP labels given to the most patients.
common_items = item_freq.sort_values(ascending=False).head(N_TOP).index
is_common = inputevents["label"].isin(common_items)
inputevents = inputevents[is_common]

print(f"Number of events remaining: {inputevents.shape}")
print(f"Number of patients remaining: {inputevents['subject_id'].nunique()}")

# Overview of the amount units used per label.
inputevents.groupby("label")["amountuom"].value_counts()

# Clean up bad data entries

## Split Continuous medication

In [None]:
# Pantoprazole is recorded both as a single shot and as a continuous
# medication; the continuous entries are moved to a label and itemid of
# their own.
key = "Pantoprazole (Protonix)"
unit = "Continuous Med"
continuous_label = "Pantoprazole (Protonix) Continuous"

is_drug = inputevents["label"] == key
is_continuous = inputevents["ordercategorydescription"] == unit
mask = is_drug & is_continuous
print(
    f"Modifying {mask.sum():5d} entries - bad data in {key} (divide in two)\n\tdrug"
    " shot or continuous treatment and create a new item id for the continuous version"
)

# The label column is categorical, so the new label has to be registered
# as a category before it can be assigned.
inputevents["label"] = inputevents["label"].cat.add_categories(continuous_label)
inputevents.loc[mask, "label"] = continuous_label
inputevents.loc[mask, "itemid"] = 2217441

## Only keep drugs with correct dosage

In [None]:
# Expected amount unit per drug label. Entries of a listed drug whose
# `amountuom` differs from the expected unit are treated as bad data and removed.
check_amountuom = {  # MIMIC-III and MIMIC-IV
    "Cefazolin": "dose",
    "Cefepime": "dose",
    "Ceftriaxone": "dose",
    "Ciprofloxacin": "dose",
    "Famotidine (Pepcid)": "dose",
    "Fentanyl (Concentrate)": "mg",
    "Heparin Sodium (Prophylaxis)": "dose",
    "Hydromorphone (Dilaudid)": "mg",
    "Magnesium Sulfate": "grams",
    "Metoprolol": "mg",
    "Metronidazole": "dose",
    "Pantoprazole (Protonix)": "dose",
    "Piperacillin/Tazobactam (Zosyn)": "dose",
    "Propofol": "mg",
    "Ranitidine (Prophylaxis)": "dose",
    "Vancomycin": "dose",
} | {  # MIMIC-IV specific
    "Acetaminophen-IV": "mg",
    "D5 1/2NS": "ml",
    "LR": "ml",
    "NaCl 0.9%": "ml",
    "OR Crystalloid Intake": "ml",
    "PO Intake": "ml",
    "Pre-Admission/Non-ICU Intake": "ml",
}

# Collects, over all drugs, the rows that violate the expected unit.
MASK = pd.Series(False, index=inputevents.index)

for drug, unit in check_amountuom.items():
    # Guards against typos in the table above: the label must be a known category.
    assert drug in inputevents["label"].cat.categories, f"{drug} not in categories!"
    mask = (inputevents["label"] == drug) & (inputevents["amountuom"] != unit)
    # The rows removed are those NOT in `unit`; the message now says so
    # (it previously read "drop {unit}", the opposite of what happens).
    print(f"Removing {mask.sum():5d} entries - bad data in {drug} (keep only {unit})")
    MASK |= mask

inputevents = inputevents[~MASK].copy()
print(f"Number of events remaining: {inputevents.shape}")
print(f"Number of patients remaining: {inputevents['subject_id'].nunique()}")

## Convert to standard units

In [None]:
# Bring the amounts of a few drugs to one unit each by rescaling `amount`
# and rewriting `amountuom` for the affected rows.

# Dexmedetomidine: mcg -> mg
key = "Dexmedetomidine (Precedex)"
mask = (inputevents["label"] == key) & (inputevents["amountuom"] == "mcg")
print(f"Modifying {mask.sum():5d} entries - bad data in {key} (cast all to mg)")
inputevents.loc[mask, "amount"] /= 1000
inputevents.loc[mask, "amountuom"] = "mg"


# Fentanyl (Concentrate): mg -> mcg. The message used to read
# "remove the non mg", although nothing is removed here.
key = "Fentanyl (Concentrate)"
mask = (inputevents["label"] == key) & (inputevents["amountuom"] == "mg")
print(f"Modifying {mask.sum():5d} entries - bad data in {key} (put the mg to mcg)")
inputevents.loc[mask, "amount"] *= 1000
inputevents.loc[mask, "amountuom"] = "mcg"


# Fentanyl: mg -> mcg
key = "Fentanyl"
mask = (inputevents["label"] == key) & (inputevents["amountuom"] == "mg")
print(f"Modifying {mask.sum():5d} entries - bad data in {key} (put the mg to mcg)")
inputevents.loc[mask, "amount"] *= 1000
inputevents.loc[mask, "amountuom"] = "mcg"

# Overview of the amount units per label after the conversion.
inputevents.groupby("label")["amountuom"].value_counts()

## Validate prescription rates

In [None]:
# Expected rate unit per drug label. Entries of a listed drug whose
# `rateuom` differs from the expected unit are treated as bad data and removed.
check_rateuom = {
    "Acetaminophen-IV": "mg/min",
    "Dextrose 5%": "mL/hour",
    "Fentanyl (Concentrate)": "mcg/hour",
    "Magnesium Sulfate (Bolus)": "mL/hour",
    "NaCl 0.9%": "mL/hour",
    "Packed Red Blood Cells": "mL/hour",
    "Phenylephrine": "mcg/kg/min",
    "Piggyback": "mL/hour",
    "Sterile Water": "mL/hour",
}

# Collects, over all drugs, the rows that violate the expected unit.
MASK = pd.Series(False, index=inputevents.index)

for drug, unit in check_rateuom.items():
    # Guards against typos in the table above: the label must be a known category.
    assert drug in inputevents["label"].cat.categories, f"{drug} not in categories!"
    mask = (inputevents["label"] == drug) & (inputevents["rateuom"] != unit)
    # The rows removed are those NOT in `unit`; the message now says so
    # (it previously read "drop {unit}", the opposite of what happens).
    print(f"Removing {mask.sum():5d} entries - bad data in {drug} (keep only {unit})")
    MASK |= mask

inputevents = inputevents[~MASK].copy()
print(f"Number of events remaining: {inputevents.shape}")
print(f"Number of patients remaining: {inputevents['subject_id'].nunique()}")

### Check that each drug uses a single rate unit

In [None]:
# Every label may use at most one rate unit. A label with no rate unit in use
# at all (all counts zero) also passes.
for drug, frame in inputevents.groupby("label"):
    unit_counts = frame["rateuom"].value_counts()
    n_units_in_use = (unit_counts != 0).sum()
    assert n_units_in_use <= 1

## Validate computed amounts agree with reported amounts

In [None]:
# For every rate unit, recompute the administered amount as rate x duration
# (x patient weight for per-kg rates) and compare it with the reported amount.
DURATION = inputevents.endtime - inputevents.starttime
total_hours = DURATION / np.timedelta64(1, "h")
units = inputevents["rateuom"].dropna().unique()

for unit in units:
    mask = inputevents["rate"].notnull() & (inputevents["rateuom"] == unit)
    df = inputevents[mask]
    # Express the duration in the time base of the rate: minutes when the
    # unit name contains "min", hours otherwise.
    time = total_hours[mask] * (60 if "min" in unit else 1)

    # The comparison below only makes sense if all rows share one amount unit.
    amount_units = df["amountuom"].unique()
    assert len(amount_units) == 1, f"{unit}: several amount units {list(amount_units)}"

    if "kg" in unit:
        amount = df["rate"] * time * df["patientweight"]
        # NOTE(review): the factor 1000 presumably converts mcg to mg for the
        # per-kg rates - confirm against the amount unit printed below.
        discrepancy = amount / 1000 - df["amount"]
    else:
        amount = df["rate"] * time
        discrepancy = amount - df["amount"]

    # Compare in absolute value: the previous one-sided test
    # (`discrepancy < 0.01`) let arbitrarily large negative deviations pass.
    print(f"{unit=:12} {amount_units[0]=:8} {discrepancy.abs().max()=:.6f}")
    assert (
        discrepancy.abs() < 0.01
    ).all(), f"{unit}: computed and reported amounts differ by 0.01 or more"

# Create data splits

In [None]:
FREQ = pd.Timedelta("30min")
has_rate = inputevents["rate"].notna()
is_late = DURATION > FREQ

# Partition the events along two axes: whether a rate is given, and whether
# the event lasts longer than FREQ (0.5 hour).
splits = {
    # 0: no rate, extended duration (over 0.5 hour)
    0: inputevents[~has_rate & is_late].copy(),
    # 1: no rate, short duration (at most 0.5 hour)
    1: inputevents[~has_rate & ~is_late].copy(),
    # 2: with a rate, extended duration (over 0.5 hour)
    2: inputevents[has_rate & is_late].copy(),
    # 3: with a rate, short duration (at most 0.5 hour)
    3: inputevents[has_rate & ~is_late].copy(),
}

# The four parts together must account for every event.
assert sum(len(part) for part in splits.values()) == len(inputevents)

### Add repeats to split 0 and 2

In [None]:
# Number of FREQ-spaced samples per event: the duration in units of FREQ,
# rounded up, plus one extra sample when the duration is an exact multiple of
# FREQ (the last sample then falls exactly on the end time).
REPEATS = DURATION / FREQ
IS_INT = np.ceil(REPEATS).astype(int) == REPEATS
REPEATS = np.ceil(REPEATS).astype(int) + IS_INT
REPEATS = REPEATS.astype("Int32")

# Only the extended-duration splits are expanded.
for k in (0, 2):
    df = splits[k].copy()
    df["Repeat"] = REPEATS[df.index]
    # Duplicate every row `Repeat` times; duplicated rows keep their index label.
    df = df.reindex(df.index.repeat(df["Repeat"]))
    # Running number 0, 1, 2, ... within each original event.
    step = df["starttime"].groupby(level=0).cumcount()
    df["charttime"] = df["starttime"] + step * FREQ
    # Spread the total amount evenly over the generated samples.
    df["amount"] = (df["amount"] / df["Repeat"]).astype("float32")
    # The last sample of each event must lie less than FREQ before its end.
    min_diff = (df["endtime"] - df["charttime"]).groupby(level=0).min()
    assert all(min_diff < FREQ), "Did not add enough steps!"
    splits[k] = df

### For the early splits, we do not add repeats.

In [None]:
# Short events are not expanded: they are charted once, at their start time.
for k in (1, 3):
    splits[k]["charttime"] = splits[k]["starttime"]

### Merge splits

In [None]:
# Stack the four splits. The dict keys become the outer index level "split";
# the inner level (the former row labels) is dropped. The rows are then
# ordered by patient, chart time and label.
inputevents = (
    pd.concat(splits, names=["split"])
    .reset_index(level=1, drop=True)
    .sort_values(by=["subject_id", "charttime", "label"])
)
inputevents

# Serialize Pre-processed DataFrame

In [None]:
# Clean categories


def clean_categories(df):
    """Remove unused categories from every categorical column of ``df``.

    The frame is modified in place and is also returned, so the call can be
    used in an assignment.
    """
    categorical_columns = [name for name in df if df[name].dtype == "category"]
    for name in categorical_columns:
        df[name] = df[name].cat.remove_unused_categories()
    return df


# Drop category levels that no longer occur after all the filtering above.
inputevents = clean_categories(inputevents)

# reset_index turns the index into a regular column so it is stored as data.
processed_file = dataset_path / "inputevents_processed.parquet"
inputevents.reset_index().to_parquet(processed_file)
inputevents.shape, inputevents.dtypes