# MIMIC 4 data - dataset construction inputevents

Code taken from GRU-ODE-Bayes preprocessing; simplified and adapted for MIMIC 4 1.0

In [None]:
import gzip
import os
from datetime import datetime
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow
import pyarrow.csv
import pyarrow.parquet

In [None]:
# Admissions table; its hadm_id column is used further down to select input events.
# NOTE(review): presumably written by a separate admissions-preprocessing step — verify.
admissions = pd.read_parquet("admissions_processed.parquet")
admissions.head()

In [None]:
# Zip archive of the MIMIC-IV 1.0 release; tables are read straight out of it.
dataset_file = "mimic-iv-1.0.zip"

# Archive member path of each gzip-compressed CSV, keyed by a short table name.
files = {
    "inputevents": "mimic-iv-1.0/icu/inputevents.csv.gz",
    "inputitems": "mimic-iv-1.0/icu/d_items.csv.gz",
}

# Arrow type name for every column of each table. The per-table mapping is
# handed to pyarrow.csv.ConvertOptions(column_types=...) when the CSV is parsed.
# For inputevents, columns typed "string" are dictionary-encoded after loading.
column_types = {
    "inputevents": {
        "amount": "float32",
        "amountuom": "string",
        "cancelreason": "int32",
        "continueinnextdept": "bool",
        "endtime": "timestamp[s]",
        "hadm_id": "int32",
        "isopenbag": "bool",
        "itemid": "int32",
        "linkorderid": "int32",
        "ordercategorydescription": "string",
        "ordercategoryname": "string",
        "ordercomponenttypedescription": "string",
        "orderid": "int32",
        "originalamount": "float32",
        "originalrate": "float32",
        "patientweight": "float32",
        "rate": "float32",
        "rateuom": "string",
        "secondaryordercategoryname": "string",
        "starttime": "timestamp[s]",
        "statusdescription": "string",
        "stay_id": "int32",
        "storetime": "timestamp[s]",
        "subject_id": "int32",
        "totalamount": "float32",
        "totalamountuom": "string",
    },
    "inputitems": {
        "abbreviation": "string",
        "category": "string",
        "highnormalvalue": "float32",
        "itemid": "int32",
        "label": "string",
        "linksto": "string",
        "lownormalvalue": "float32",
        "param_type": "string",
        "unitname": "string",
    },
}

In [None]:
# Expected shape: (10M, 26) ⇝ 0.3B values.
key = "inputevents"

# Parse with the declared column types; empty fields are read as null,
# including in string columns.
events_convert_options = pyarrow.csv.ConvertOptions(
    column_types=column_types[key],
    auto_dict_encode=True,
    strings_can_be_null=True,
    null_values=[""],
)

# The table is a gzip-compressed CSV stored inside the zip archive, so both
# layers are opened as streams before the file object is handed to Arrow.
with (
    ZipFile(dataset_file) as archive,
    archive.open(files[key]) as compressed_file,
    gzip.open(compressed_file) as file,
):
    inputevents = pyarrow.csv.read_csv(
        file, convert_options=events_convert_options
    )

In [None]:
# Dictionary-encode every column that was declared as "string" so that it is
# stored as a categorical.
string_columns = [
    name for name, type_name in column_types[key].items() if type_name == "string"
]
for name in string_columns:
    position = inputevents.column_names.index(name)
    encoded = inputevents[name].dictionary_encode()
    # Arrow tables are immutable: set_column returns a new table.
    inputevents = inputevents.set_column(position, name, encoded)

inputevents

### Store and reload as pandas

In [None]:
# Round-trip through parquet: write the Arrow table, release it, and read it
# back as a pandas frame with nullable dtypes and pyarrow string storage.
parquet_path = f"{key}.parquet"
pyarrow.parquet.write_table(inputevents, parquet_path)
del inputevents  # drop the Arrow table before the pandas copy is loaded
with pd.option_context("string_storage", "pyarrow"):
    inputevents = pd.read_parquet(parquet_path, use_nullable_dtypes=True)
print(f"Number of events remaining: {len(inputevents)}")

### Only keep inputevents associated with the selected patients

In [None]:
# Keep only the events whose admission id appears in the admissions table.
selected_hadm_ids = admissions["hadm_id"]
mask = inputevents["hadm_id"].isin(selected_hadm_ids)
inputevents = inputevents[mask]
print(f"Number of events remaining: {inputevents.shape}")
print(f"Number of patients remaining: {inputevents['subject_id'].nunique()}")

### Only keep columns of interest

In [None]:
# Columns kept for the rest of the notebook: amounts and rates with their
# units, start/end times, patient weight, the order category description and
# the subject / admission / item identifiers.
selected_columns = [
    "amount",
    "amountuom",
    "endtime",
    "hadm_id",
    "itemid",
    "ordercategorydescription",
    "patientweight",
    "rate",
    "rateuom",
    "starttime",
    "subject_id",
]

# Every other inputevents column is dropped here.
inputevents = inputevents[selected_columns]

### Load item table

In [None]:
key = "inputitems"

# Parse with the declared column types; empty fields are read as null,
# including in string columns.
items_convert_options = pyarrow.csv.ConvertOptions(
    column_types=column_types[key],
    strings_can_be_null=True,
    null_values=[""],
)

# Same access path as for the events: zip member -> gzip stream -> Arrow CSV reader.
with (
    ZipFile(dataset_file) as archive,
    archive.open(files[key]) as compressed_file,
    gzip.open(compressed_file) as file,
):
    inputitems = pyarrow.csv.read_csv(
        file, convert_options=items_convert_options
    )

inputitems.shape

### Store and reload as pandas

In [None]:
# Round-trip the item table through parquet: write the Arrow table, release
# it, and read it back as a pandas frame with nullable dtypes and pyarrow
# string storage.
pyarrow.parquet.write_table(inputitems, f"{key}.parquet")
del inputitems
with pd.option_context("string_storage", "pyarrow"):
    inputitems = pd.read_parquet(f"{key}.parquet", use_nullable_dtypes=True)
# This is the item dictionary, not the event table, so report it as such.
print(f"Shape of the item table: {inputitems.shape}")

### Merge with Items, keep only common items

In [None]:
# Inner join on itemid: attaches the item metadata columns to each event and
# discards events whose itemid has no entry in the item table.
inputevents = inputevents.merge(inputitems, how="inner", on="itemid")
print(f"Number of events remaining: {inputevents.shape}")
print(f"Number of patients remaining: {inputevents['subject_id'].nunique()}")

In [None]:
# Rank item labels by the number of distinct patients who received them and
# keep only the events of the 50 highest-ranked labels.
item_freq = inputevents.groupby("label")["subject_id"].nunique()
ranked_labels = item_freq.sort_values(ascending=False).index
common_items = ranked_labels[:50]
is_common = inputevents["label"].isin(common_items)
inputevents = inputevents[is_common]
print(f"Number of events remaining: {inputevents.shape}")
print(f"Number of patients remaining: {inputevents['subject_id'].nunique()}")

In [None]:
# For every item label, count how often each amount unit occurs, and print
# the full listing without row/column truncation.
amount_unit_counts = inputevents.groupby("label")["amountuom"].value_counts()
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(amount_unit_counts)

## Clean up bad data entries

In [None]:
# Snapshot of the merged and filtered events, taken before the clean-up cells
# below start dropping and rewriting rows.
BACKUP = inputevents.copy()

In [None]:
# Expected amount unit per item label. Rows of a listed item that report any
# other amount unit are treated as bad data and removed by the loop that
# consumes this mapping.
check_amountuom = {  # MIMIC-III and MIMIC-IV
    "Cefazolin": "dose",
    "Cefepime": "dose",
    "Ceftriaxone": "dose",
    "Ciprofloxacin": "dose",
    "Famotidine (Pepcid)": "dose",
    "Fentanyl (Concentrate)": "mg",
    "Heparin Sodium (Prophylaxis)": "dose",
    "Hydromorphone (Dilaudid)": "mg",
    "Magnesium Sulfate": "grams",
    "Propofol": "mg",
    "Metoprolol": "mg",
    "Piperacillin/Tazobactam (Zosyn)": "dose",
    "Metronidazole": "dose",
    "Ranitidine (Prophylaxis)": "dose",
    "Vancomycin": "dose",
    # The closing parenthesis was missing, so this key never matched the
    # "Pantoprazole (Protonix)" label and the check silently did nothing.
    "Pantoprazole (Protonix)": "dose",
} | {  # MIMIC-IV specific
    "Acetaminophen-IV": "mg",
    "D5 1/2NS": "ml",
    "LR": "ml",
    "NaCl 0.9%": "ml",
    "OR Crystalloid Intake": "ml",
    "PO Intake": "ml",
    "Pre-Admission/Non-ICU Intake": "ml",
}

# Accumulates every row that must be removed because its amount unit differs
# from the unit expected for its item.
MASK = pd.Series(False, index=inputevents.index)

# Dedicated loop names are used so the notebook-level `key` is not overwritten.
for label, expected_unit in check_amountuom.items():
    mask = (inputevents["label"] == label) & (
        inputevents["amountuom"] != expected_unit
    )
    # Each mask is restricted to one label, so the per-item counts never overlap.
    print(
        f"Removing {mask.sum():5d} entries - bad data in {label} "
        f"(keep only {expected_unit})"
    )
    MASK |= mask

inputevents = inputevents.drop(inputevents[MASK].index)

In [None]:
# Fentanyl (Concentrate): rows recorded in mg are re-expressed in mcg.
key = "Fentanyl (Concentrate)"
mask = (inputevents["label"] == key) & (inputevents["amountuom"] == "mg")
print(f"Modifying {mask.sum():5d} entries - bad data in {key} (put the mg to mcg)")
inputevents.loc[mask, "amount"] *= 1000
inputevents.loc[mask, "amountuom"] = "mcg"


# Fentanyl: same conversion, mg -> mcg.
key = "Fentanyl"
mask = (inputevents["label"] == key) & (inputevents["amountuom"] == "mg")
print(f"Modifying {mask.sum():5d} entries - bad data in {key} (put the mg to mcg)")
inputevents.loc[mask, "amount"] *= 1000
inputevents.loc[mask, "amountuom"] = "mcg"


# Pantoprazole: rows ordered as "Continuous Med" become a separate item with
# its own label and a new item id, distinct from the remaining rows.
key = "Pantoprazole (Protonix)"
mask = (inputevents["label"] == key) & (
    inputevents["ordercategorydescription"] == "Continuous Med"
)
print(
    f"Modifying {mask.sum():5d} entries - bad data in {key} (divide in two: "
    "drug shot or continuous treatment, and create a new item id for the "
    "continuous version)"
)
inputevents.loc[mask, "label"] = "Pantoprazole (Protonix) Continuous"
inputevents.loc[mask, "itemid"] = 2217441  # item id assigned to the continuous variant


# Dexmedetomidine: rows recorded in mcg are re-expressed in mg.
key = "Dexmedetomidine (Precedex)"
mask = (inputevents["label"] == key) & (inputevents["amountuom"] == "mcg")
print(f"Modifying {mask.sum():5d} entries - bad data in {key} (cast all to mg)")
inputevents.loc[mask, "amount"] /= 1000
inputevents.loc[mask, "amountuom"] = "mg"

In [None]:
# Verify: list the amount units per item label again after the clean-up,
# printed in full without truncation.
amount_unit_counts = inputevents.groupby("label")["amountuom"].value_counts()
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(amount_unit_counts)

In [None]:
# Same inspection for inputs given as rates: count the rate units per item
# label and print the full listing.
rate_unit_counts = inputevents.groupby("label")["rateuom"].value_counts()
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(rate_unit_counts)

In [None]:
# Expected rate unit per item label; most listed items are volume infusions
# that share the same unit.
_ML_PER_HOUR = "mL/hour"

check_rateuom = {
    "Dextrose 5%": _ML_PER_HOUR,
    "Magnesium Sulfate (Bolus)": _ML_PER_HOUR,
    "NaCl 0.9%": _ML_PER_HOUR,
    "Piggyback": _ML_PER_HOUR,
    "Packed Red Blood Cells": _ML_PER_HOUR,
    "Acetaminophen-IV": "mg/min",
    "Fentanyl (Concentrate)": "mcg/hour",
    "Phenylephrine": "mcg/kg/min",
    "Sterile Water": _ML_PER_HOUR,
}

In [None]:
# Accumulates every row that must be removed because its rate unit differs
# from the unit expected for its item.
MASK = pd.Series(False, index=inputevents.index)

# Dedicated loop names are used so the notebook-level `key` is not overwritten.
for label, expected_unit in check_rateuom.items():
    mask = (inputevents["label"] == label) & (
        inputevents["rateuom"] != expected_unit
    )
    # Each mask is restricted to one label, so the per-item counts never overlap.
    print(
        f"Removing {mask.sum():5d} entries - bad data in {label} "
        f"(keep only {expected_unit})"
    )
    MASK |= mask

inputevents = inputevents.drop(inputevents[MASK].index)

In [None]:
# Check that each drug is left with a single rate unit: print the per-label
# unit counts in full.
rate_unit_counts = inputevents.groupby("label")["rateuom"].value_counts()
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    print(rate_unit_counts)

## Testing if the computed amounts are OK

In [None]:
# Recompute each rate-based amount as rate x duration (x patient weight for
# per-kg units) and compare it with the recorded amount, one rate unit at a time.
# `duration` is reused by the split cells further down.
duration = inputevents.endtime - inputevents.starttime
total_hours = duration / np.timedelta64(1, "h")  # duration expressed in hours
units = inputevents["rateuom"].dropna().unique()

for unit in units:
    # rows that carry a rate expressed in this unit
    mask = inputevents["rate"].notnull() & (inputevents["rateuom"] == unit)
    df = inputevents[mask]
    # bool exponent: 60**True == 60 turns hours into minutes for per-minute
    # units, 60**False == 1 leaves the hours unchanged.
    time = total_hours[mask] * 60 ** ("min" in unit)

    # every rate unit is required to pair with exactly one amount unit
    amount_units = df["amountuom"].unique()
    assert len(amount_units) == 1

    if "kg" in unit:
        amount = df["rate"] * time * df["patientweight"]
        # NOTE(review): the /1000 presumably rescales mcg (rate) to mg (amount) — confirm.
        discrepancy = amount / 1000 - df["amount"]
    else:
        amount = df["rate"] * time
        discrepancy = amount - df["amount"]

    print(f"{unit=:12} {amount_units[0]=:8} {discrepancy.max()=:.6f}")
    # NOTE(review): one-sided check — it only fails when the computed amount
    # exceeds the recorded one by 0.01 or more; a large negative discrepancy
    # passes. Confirm whether an absolute-value comparison is intended.
    assert all(discrepancy < 0.01)

## Creating splits

In [None]:
# Partition the events along two axes: whether a rate is recorded, and whether
# the event lasts at most FREQ ("early") or longer.
FREQ = pd.Timedelta("30min")
has_rate = inputevents["rate"].notnull()
is_early = duration <= FREQ

splits = {
    # first dataframe: no rate, extended duration (over 0.5 hour)
    0: inputevents[~has_rate & ~is_early].copy(),
    # second dataframe: no rate, low duration (at most 0.5 hour)
    1: inputevents[~has_rate & is_early].copy(),
    # third dataframe: with a rate, extended duration (over 0.5 hour)
    2: inputevents[has_rate & ~is_early].copy(),
    # fourth dataframe: with a rate, low duration (at most 0.5 hour)
    3: inputevents[has_rate & is_early].copy(),
}

# Check that the split is complete: the four parts must add up to all events.
total_rows = sum(len(part) for part in splits.values())
assert total_rows == len(inputevents)

In [None]:
# We then create the administration time as a shifted version of the STARTTIME.


def create_range(x):
    """Return ``len(x)`` evenly spaced timestamps starting at the first value of ``x``.

    Consecutive timestamps are separated by the notebook-level ``FREQ``.
    The values of ``x`` after the first one are not used; only their count is.
    """
    first_timestamp = x.iat[0]
    n_periods = len(x)
    return pd.date_range(start=first_timestamp, periods=n_periods, freq=FREQ)


# Late splits (0: without rate, 2: with rate): every event is replaced by
# ceil(duration / FREQ) rows spaced FREQ apart from its start time, and its
# amount is divided evenly among those rows.
for split_id in (0, 2):
    df = splits[split_id]
    df["Repeat"] = np.ceil(duration[df.index] / FREQ).astype(int)
    # repeat each original row "Repeat" times; the original index is kept so
    # that the copies of one event can be grouped back together
    df = df.reindex(df.index.repeat(df["Repeat"]))
    df["charttime"] = df["starttime"].groupby(level=0).transform(create_range)
    df["amount"] /= df["Repeat"]
    splits[split_id] = df

# For the early splits, we do not add repeats: the chart time is the start time.
for split_id in (1, 3):
    splits[split_id]["charttime"] = splits[split_id]["starttime"]

In [None]:
# Finally, merge all 4 splits into one frame.
# The result is a dataset with discrete inputs for each treatment, indexed by
# split number, patient, chart time and item label, and sorted by that index.
result = (
    pd.concat(splits, names=["split"])
    .reset_index(level=1, drop=True)  # discard the original row index
    .reset_index()  # turn the split level into a regular column
    .set_index(["split", "subject_id", "charttime", "label"])
    .sort_index()
)
result

In [None]:
# Persist the processed inputs, then show how many distinct admissions they cover.
result.to_parquet("inputs_processed.parquet")
result["hadm_id"].nunique()