# MIMIC 4 data - dataset construction outputevents

Code taken from GRU-ODE-Bayes preprocessing; simplified and adapted for MIMIC 4 1.0

In [None]:
import gzip
from pathlib import Path
from zipfile import ZipFile

import numpy as np
import pandas as pd
import pyarrow
import pyarrow.csv
import pyarrow.parquet

# Load `outputevents` and `inputitems` tables

## Table Schema

In [None]:
# Filesystem layout, all relative to the current working directory:
# the MIMIC-IV zip archive, the folder for raw parquet tables, and the
# folder holding the processed tables.
rawdata_file = Path.cwd().joinpath("mimic-iv-1.0.zip")
rawdata_path = Path.cwd().joinpath("raw")
dataset_path = Path.cwd().joinpath("processed")

# Archive member (gzip-compressed CSV) for each table read by this notebook.
files = dict(outputevents="mimic-iv-1.0/icu/outputevents.csv.gz")

# Arrow dictionary type used for the categorical columns below.
CATEGORY = pyarrow.dictionary("int32", "string")

# Explicit Arrow column types per table, handed to the CSV reader.
# Columns that are not listed here get no explicit type from this mapping.
column_types = {
    "outputevents": dict(
        subject_id="int32",
        hadm_id="int32",
        stay_id="int32",
        charttime="timestamp[s]",
        storetime="timestamp[s]",
        value="float32",
        valueuom=CATEGORY,
    ),
}

In [None]:
key = "outputevents"

# Read the table straight out of the zip archive: open the archive member,
# wrap it in a gzip reader, and let Arrow parse the decompressed CSV.
with ZipFile(rawdata_file) as archive:
    with archive.open(files[key]) as compressed_file:
        with gzip.open(compressed_file) as file:
            outputevents = pyarrow.csv.read_csv(
                file,
                convert_options=pyarrow.csv.ConvertOptions(
                    # Only the empty string is treated as a null marker,
                    # and string columns are allowed to hold nulls too.
                    null_values=[""],
                    strings_can_be_null=True,
                    column_types=column_types[key],
                ),
            )

# Bare expression: shown as the cell output when run in a notebook.
outputevents.schema

## Store and reload as pandas

In [None]:
# Persist the Arrow table as parquet first: the conversion below is done with
# `self_destruct=True`, after which the Arrow table should not be reused.
pyarrow.parquet.write_table(outputevents, rawdata_path / f"{key}.parquet")
# Rebind the name to the pandas DataFrame produced from the Arrow table.
outputevents = outputevents.to_pandas(self_destruct=True)
# Bare expression: shown as the cell output when run in a notebook.
outputevents

## Load the input items

In [None]:
# Load the item table from the raw parquet folder, with string storage set
# to pyarrow for the duration of the read only.
# NOTE(review): this notebook processes output events but joins against the
# `inputitems` table - confirm that this table also carries the ids and
# labels of the output items.
# NOTE(review): `use_nullable_dtypes` is reported as deprecated in newer
# pandas releases - confirm against the pinned pandas version.
key = "inputitems"
with pd.option_context("string_storage", "pyarrow"):
    inputitems = pd.read_parquet(
        rawdata_path.joinpath(f"{key}.parquet"),
        use_nullable_dtypes=True,
    )
# Bare expression: shown as the cell output when run in a notebook.
inputitems

# Filter Dataset

## Only choose previously selected admission ids

In [None]:
# Keep only events whose hospital admission id occurs in the previously
# processed admissions table.
admissions = pd.read_parquet(dataset_path.joinpath("admissions_processed.parquet"))
outputevents = outputevents.loc[outputevents["hadm_id"].isin(admissions["hadm_id"])]
print(f"Number of patients remaining: {outputevents['subject_id'].nunique()}")

## Merge on items & keep only common ones

In [None]:
# Attach the item metadata to every event (default merge on `itemid`).
outputevents = outputevents.merge(inputitems, on="itemid")

# Rank the labels by the number of distinct patients with at least one event
# of that label and keep the `n_best` most widespread ones.
n_best = 15
item_freq = (
    outputevents.groupby("label")["subject_id"]
    .nunique()
    .sort_values(ascending=False)
)
common_items = item_freq.index[:n_best]
# Bare expression: shown as the cell output when run in a notebook.
common_items

In [None]:
# Drop every event whose label is not among the selected common items,
# then report what is left.
outputevents = outputevents.loc[outputevents["label"].isin(common_items)]
print(f"Number of events remaining: {outputevents.shape}")
print(f"Number of patients remaining: {outputevents['subject_id'].nunique()}")

In [None]:
# Hard-coded list of 15 item labels. This rebinds `common_items`, which up to
# here held the index computed from the data; the label filter is not re-run
# with this list in the visible part of the notebook.
# NOTE(review): presumably a recorded snapshot of the computed labels - confirm.
common_items = [
    "Chest Tube #1",
    "Emesis",
    "Fecal Bag",
    "Foley",
    "Jackson Pratt #1",
    "Nasogastric",
    "OR EBL",
    "OR Urine",
    "Oral Gastric",
    "Pre-Admission",
    "Stool",
    "Straight Cath",
    "TF Residual Output",
    "TF Residual",
    "Void",
]

In [None]:
# Alternative hard-coded list of 15 item labels; it is not referenced
# anywhere else in the visible part of this notebook.
# NOTE(review): origin of this list is not shown here - confirm whether it is
# still needed.
alt_list = [
    "Chest Tube #1",
    "Chest Tube #2",
    "Condom Cath",
    "Fecal Bag",
    "Foley",
    "Gastric Gastric Tube",
    "Jackson Pratt #1",
    "OR EBL",
    "Ostomy (output)",
    "Pre-Admission",
    "Stool Out Stool",
    "TF Residual",
    "Ultrafiltrate Ultrafiltrate",
    "Urine Out Incontinent",
    "Void",
]

In [None]:
# Hard-coded list of 15 labels. It matches the hard-coded `common_items` list
# except that "Chest Tube" and "Jackson Pratt" carry no " #1" suffix. It is
# not referenced anywhere else in the visible part of this notebook.
used_labels = [
    "Chest Tube",
    "Emesis",
    "Fecal Bag",
    "Foley",
    "Jackson Pratt",
    "Nasogastric",
    "OR EBL",
    "OR Urine",
    "Oral Gastric",
    "Pre-Admission",
    "Stool",
    "Straight Cath",
    "TF Residual Output",
    "TF Residual",
    "Void",
]

## Verify that all output labels use the same unit of measurement

In [None]:
# Sanity check: every remaining event must be recorded in one single unit.
# The message lists the units that were found, because the per-label
# breakdown below is never displayed when the assertion fails.
assert outputevents["valueuom"].nunique() == 1, (
    "expected exactly one unit of measurement, found: "
    f"{list(outputevents['valueuom'].dropna().unique())}"
)
# Per-label count of each unit; shown as the cell output in a notebook.
outputevents.groupby("label")["valueuom"].value_counts()

# Serialize Pre-processed DataFrame

In [None]:
# Clean categories
def clean_categories(df):
    """Remove unused category levels from every categorical column of `df`.

    The frame is modified in place; the same object is returned so the call
    can be used in an assignment.
    """
    for name in df.columns:
        column = df[name]
        if column.dtype == "category":
            df[name] = column.cat.remove_unused_categories()
    return df


# Drop category levels that are no longer used after the filtering above,
# then write the processed table to the `processed` folder as parquet.
outputevents = clean_categories(outputevents)
outputevents.to_parquet(dataset_path / "outputevents_processed.parquet")