# MIMIC 4 data - dataset construction outputevents

Code taken from GRU-ODE-Bayes preprocessing; simplified and adapted for MIMIC 4 1.0

In [None]:
from zipfile import ZipFile

import pandas as pd

In [None]:
# Widen the notebook's DataFrame previews: show up to 50 rows and 300 columns.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 300

In [None]:
# Load the admissions table; its "hadm_id" column is used further down to
# restrict the output events.
admissions = pd.read_parquet(path="admissions_processed.parquet")

In [None]:
# Zip archive that contains the raw tables.
dataset_file = "mimic-iv-1.0.zip"

# Path of each table inside the archive, keyed by table name.
files = dict(outputevents="mimic-iv-1.0/icu/outputevents.csv.gz")

# Column schema per table. Columns marked "datetime64" are handed to the CSV
# reader as date columns; every other entry is used as an explicit dtype.
column_types = dict(
    outputevents=dict(
        subject_id="int32",
        hadm_id="int32",
        stay_id="int32",
        charttime="datetime64",
        storetime="datetime64",
        value="float32",
        valueuom="string",
    ),
)

In [None]:
key = "outputevents"

# Split the declared schema: "datetime64" columns are parsed as dates, all
# remaining columns are read with their explicit dtype.
date_columns = [
    column for column, kind in column_types[key].items() if kind == "datetime64"
]
dtypes = {
    column: kind for column, kind in column_types[key].items() if kind != "datetime64"
}

# The table is a gzip-compressed CSV stored inside the zip archive, so the
# archive member is opened as a stream and decompressed by pandas.
with ZipFile(dataset_file) as archive:
    with archive.open(files[key]) as file:
        outputevents = pd.read_csv(
            file,
            compression="gzip",
            parse_dates=date_columns,
            dtype=dtypes,
        )

outputevents

## Only choose previously selected admission ids

In [None]:
# Keep only the events whose admission id occurs in the admissions table.
in_selected_admissions = outputevents["hadm_id"].isin(admissions["hadm_id"])
outputevents = outputevents.loc[in_selected_admissions]
print(f"Number of patients remaining: {outputevents['subject_id'].nunique()}")

## Load the input items

In [None]:
key = "inputitems"
items_path = f"{key}.parquet"

# Read the item definitions with nullable dtypes and pyarrow-backed strings.
# NOTE(review): `use_nullable_dtypes` is deprecated since pandas 2.0 in favour
# of `dtype_backend` - revisit when the pandas version is pinned.
# NOTE(review): this loads the *input* items table although the notebook
# processes output events - confirm that it contains the output item ids.
with pd.option_context("string_storage", "pyarrow"):
    inputitems = pd.read_parquet(items_path, use_nullable_dtypes=True)
inputitems

## Merge on items & keep only common ones

In [None]:
# Number of item labels to keep.
n_best = 15

# Attach the item columns to every event via the shared "itemid" column.
outputevents = outputevents.merge(inputitems, on="itemid")

# Rank the labels by the number of distinct subjects they were recorded for,
# most frequent first, and keep the first `n_best` of them.
item_freq = (
    outputevents.groupby("label")["subject_id"]
    .nunique()
    .sort_values(ascending=False)
)
common_items = item_freq.index[:n_best]
common_items

In [None]:
# Keep only the events whose label is one of the selected `common_items`.
outputevents = outputevents[outputevents["label"].isin(common_items)]
# Report the number of rows; `.shape` would print a (rows, columns) tuple,
# which does not match the "Number of events" message.
print(f"Number of events remaining: {len(outputevents)}")
print(f"Number of patients remaining: {outputevents['subject_id'].nunique()}")

In [None]:
# Hard-coded list of 15 output item labels. Rebinding `common_items` here
# replaces the value computed from the data earlier in the notebook.
# NOTE(review): presumably a recorded snapshot of the computed ranking; it is
# not referenced again in the visible cells - confirm whether it is needed.
common_items = [
    "Chest Tube #1",
    "Emesis",
    "Fecal Bag",
    "Foley",
    "Jackson Pratt #1",
    "Nasogastric",
    "OR EBL",
    "OR Urine",
    "Oral Gastric",
    "Pre-Admission",
    "Stool",
    "Straight Cath",
    "TF Residual Output",
    "TF Residual",
    "Void",
]

In [None]:
# Second hard-coded list of 15 output item labels.
# NOTE(review): presumably an alternative label selection kept for
# comparison; it is not referenced in the visible cells - confirm its purpose.
alt_list = [
    "Chest Tube #1",
    "Chest Tube #2",
    "Condom Cath",
    "Fecal Bag",
    "Foley",
    "Gastric Gastric Tube",
    "Jackson Pratt #1",
    "OR EBL",
    "Ostomy (output)",
    "Pre-Admission",
    "Stool Out Stool",
    "TF Residual",
    "Ultrafiltrate Ultrafiltrate",
    "Urine Out Incontinent",
    "Void",
]

In [None]:
# Third hard-coded list of 15 labels; here "Chest Tube" and "Jackson Pratt"
# carry no "#1" suffix.
# NOTE(review): presumably the labels used by a reference pipeline; it is not
# referenced in the visible cells - confirm its purpose.
used_labels = [
    "Chest Tube",
    "Emesis",
    "Fecal Bag",
    "Foley",
    "Jackson Pratt",
    "Nasogastric",
    "OR EBL",
    "OR Urine",
    "Oral Gastric",
    "Pre-Admission",
    "Stool",
    "Straight Cath",
    "TF Residual Output",
    "TF Residual",
    "Void",
]

### Verification that all output labels have the same units

In [None]:
# All remaining events must share a single unit of measurement
# (missing units are not counted by `nunique`).
n_units = outputevents["valueuom"].nunique()
assert n_units == 1

# Show the unit counts per label for visual inspection.
outputevents.groupby("label")["valueuom"].value_counts()

In [None]:
# Persist the filtered output events as a parquet file.
outputevents.to_parquet(path="outputevents_processed.parquet")