# MIMIC-IV data - dataset construction outputevents

In [None]:
import gzip
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
import pyarrow
import pyarrow.csv
import pyarrow.parquet

# Load tables

## Table Schema

In [None]:
# All inputs and outputs are resolved relative to the current working directory.
rawdata_file = Path.cwd().joinpath("mimic-iv-1.0.zip")  # zip archive holding the source tables
rawdata_path = Path.cwd().joinpath("raw")  # tables stored as parquet right after loading
dataset_path = Path.cwd().joinpath("processed")  # tables stored after filtering

# Path of each table inside the zip archive, keyed by table name.
files = {"outputevents": "mimic-iv-1.0/icu/outputevents.csv.gz"}

# Arrow type for categorical text columns: dictionary-encoded strings with int32 indices.
CATEGORY = pyarrow.dictionary("int32", "string")
# Identifier columns are read as plain 32-bit integers.
# Alternative kept for reference: pyarrow.dictionary("int32", "int32", ordered=True)
ID_TYPE = "int32"

# Per-table mapping of column name -> Arrow type, passed to the CSV reader.
column_types = {
    "outputevents": {
        **dict.fromkeys(("subject_id", "hadm_id", "stay_id"), ID_TYPE),
        "charttime": "timestamp[s]",
        "storetime": "timestamp[s]",
        "itemid": ID_TYPE,
        "value": "float32",
        "valueuom": CATEGORY,
    }
}

# Spellings that the CSV reader is told to interpret as missing values.
# Upper/lower/mixed-case variants are listed explicitly; every entry appears
# exactly once (a duplicated "NULL" entry was removed).
null_values = [
    "-",
    "-1.#IND",
    "-1.#QNAN",
    "-NaN",
    "-nan",
    "?",
    "",
    "#N/A N/A",
    "#N/A",
    "#NA",
    "#na",
    "<N/A>",
    "<n/a>",
    "<NA>",
    "<na>",
    "1.#IND",
    "1.#QNAN",
    "INFORMATION NOT AVAILABLE",
    "N/A",
    "n/a",
    "NA",
    "na",
    "NAN",
    "NaN",
    "nan",
    "NONE",
    "None",
    "none",
    "NULL",
    "Null",
    "null",
    "UNABLE TO OBTAIN",
    "UNKNOWN",
    "unknown",
]

# Type alias -> pandas nullable extension dtype.
# Looked up through `types_map.get` when the Arrow table is converted to pandas;
# aliases without an entry yield None.
# NOTE(review): the lookup presumably relies on pyarrow DataType objects hashing
# and comparing equal to these string aliases - confirm for the installed pyarrow.
types_map = {
    alias: dtype_cls()
    for alias, dtype_cls in (
        ("string", pd.StringDtype),
        ("bool", pd.BooleanDtype),
        ("int8", pd.Int8Dtype),
        ("int16", pd.Int16Dtype),
        ("int32", pd.Int32Dtype),
        ("int64", pd.Int64Dtype),
        ("uint8", pd.UInt8Dtype),
        ("uint16", pd.UInt16Dtype),
        ("uint32", pd.UInt32Dtype),
        ("uint64", pd.UInt64Dtype),
    )
}

In [None]:
# Read the gzipped CSV member directly out of the zip archive and parse it
# with the Arrow CSV reader, using the declared column types and null spellings.
key = "outputevents"
csv_options = pyarrow.csv.ConvertOptions(
    column_types=column_types[key],
    strings_can_be_null=True,
    null_values=null_values,
)
with (
    ZipFile(rawdata_file) as zip_archive,
    zip_archive.open(files[key]) as gz_member,
    gzip.open(gz_member) as csv_stream,
):
    outputevents = pyarrow.csv.read_csv(csv_stream, convert_options=csv_options)

# Notebook display: table shape and the resulting Arrow schema.
outputevents.shape, outputevents.schema

## Store and reload as pandas

In [None]:
# Persist the table exactly as loaded, then continue with a pandas DataFrame.
raw_parquet_file = rawdata_path / f"{key}.parquet"
pyarrow.parquet.write_table(outputevents, raw_parquet_file)

# `types_map.get` is the callback that picks the pandas dtype per Arrow type.
# NOTE(review): `self_destruct=True` is documented by pyarrow as experimental;
# the Arrow table must not be reused afterwards (the name is rebound here).
outputevents = outputevents.to_pandas(
    types_mapper=types_map.get,
    self_destruct=True,
)
outputevents

## Load the input items

In [None]:
# Load the item table that was previously stored under the raw-data directory.
# NOTE(review): this reads the *input* item table, which is later joined onto
# *output* events - possibly a copy-paste slip; confirm the intended item table.
key = "inputitems"
items_file = rawdata_path / f"{key}.parquet"
with pd.option_context("string_storage", "pyarrow"):
    inputitems = pd.read_parquet(items_file, use_nullable_dtypes=True)
inputitems

# Filter Dataset

## Only keep selected patients/admissions

In [None]:
# Keep only events whose admission and patient ids occur in the processed
# admissions table; report what is dropped and what remains after each pass.
admissions = pd.read_parquet(dataset_path / "admissions_processed.parquet")

for id_column in ("hadm_id", "subject_id"):
    keep = outputevents[id_column].isin(admissions[id_column])
    n_dropped = (~keep).sum()  # number of event rows failing this filter
    outputevents = outputevents[keep]
    print(f"Removing {n_dropped} {id_column}")
    print(f"Number of patients remaining: {outputevents['subject_id'].nunique()}")
    print(f"Number of admissions remaining: {outputevents['hadm_id'].nunique()}")
    print(f"Number of events remaining: {outputevents.shape}")

## Merge on items & keep only the most common ones

In [None]:
NTOP = 15  # number of most frequent item labels to keep

# Join the item table onto the events via `itemid` (default inner join, so
# events without a matching item are dropped).
outputevents = outputevents.merge(inputitems, on="itemid")

# Rank labels by the number of distinct patients having at least one event.
patients_per_label = outputevents.groupby("label")["subject_id"].nunique()
item_freq = patients_per_label.sort_values(ascending=False)
common_items = item_freq.index[:NTOP]

# Sanity check: every selected label must actually occur in the merged data.
observed_labels = outputevents["label"].values
for label in common_items:
    assert label in observed_labels, f"{label:20} missing in data!"

is_common = outputevents["label"].isin(common_items)
outputevents = outputevents[is_common]
print(f"Number of events remaining: {outputevents.shape}")
print(f"Number of patients remaining: {outputevents['subject_id'].nunique()}")
list(common_items)

## Verify that all output labels use the same unit of measurement

In [None]:
# Sanity check: exactly one distinct unit may remain across all events.
distinct_units = outputevents["valueuom"].nunique()
assert distinct_units == 1
# Notebook display: how often each unit occurs per label.
outputevents.groupby("label")["valueuom"].value_counts()

# Serialize Pre-processed DataFrame

In [None]:
# Clean categories


def clean_categories(df):
    """Drop unused categories from every categorical column of *df*.

    The frame is modified in place (each categorical column is reassigned)
    and the same object is returned for convenience.
    """
    categorical_columns = [
        name for name in df if isinstance(df[name].dtype, pd.CategoricalDtype)
    ]
    for name in categorical_columns:
        df[name] = df[name].cat.remove_unused_categories()
    return df


# Remove categories that no longer occur after filtering, then store the
# processed table next to the other processed datasets.
outputevents = clean_categories(outputevents)
processed_file = dataset_path / "outputevents_processed.parquet"
outputevents.to_parquet(processed_file)
# Notebook display: final shape and column dtypes.
outputevents.shape, outputevents.dtypes