# MIMIC-IV data - dataset construction: `labevents`

In [None]:
import gzip
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
import pyarrow
import pyarrow.csv
import pyarrow.parquet

# Load `labevents` and `labitems` tables

## Table Schema

In [None]:
# All locations are resolved against the current working directory:
# the downloaded archive, the output directory for processed tables and the
# output directory for the raw parquet dumps.
rawdata_file = Path.cwd().joinpath("mimic-iv-1.0.zip")
dataset_path = Path.cwd().joinpath("processed")
rawdata_path = Path.cwd().joinpath("raw")

# Table name -> path of the gzip-compressed CSV member inside the zip archive.
files = dict(
    labevents="mimic-iv-1.0/hosp/labevents.csv.gz",
    labitems="mimic-iv-1.0/hosp/d_labitems.csv.gz",
)

# Arrow type for repetitive text columns: dictionary-encoded strings with
# int32 indices.
CATEGORY = pyarrow.dictionary("int32", "string")
# Identifier columns are read as plain int32.
# Alternative kept for reference: pyarrow.dictionary("int32", "int32", ordered=True)
ID_TYPE = "int32"

# Table name -> {CSV column name -> Arrow type}; looked up by table name when
# building the CSV reader's ConvertOptions.
column_types = dict(
    labevents=dict(
        labevent_id=ID_TYPE,
        subject_id=ID_TYPE,
        hadm_id=ID_TYPE,
        specimen_id=ID_TYPE,
        itemid=ID_TYPE,
        charttime="timestamp[s]",
        storetime="timestamp[s]",
        value=CATEGORY,
        valuenum="float32",
        valueuom=CATEGORY,
        ref_range_lower="float32",
        ref_range_upper="float32",
        flag=CATEGORY,
        priority=CATEGORY,
        comments="string",
    ),
    labitems=dict(
        itemid=ID_TYPE,
        label=CATEGORY,
        fluid=CATEGORY,
        category=CATEGORY,
        loinc_code=CATEGORY,
    ),
)

# Spellings that are treated as missing values when parsing the CSV files.
# The list is handed to pyarrow's CSV reader as `null_values`, together with
# `strings_can_be_null=True`, so these spellings also become nulls in text
# columns. Every entry is listed exactly once.
null_values = [
    "-",
    "-1.#IND",
    "-1.#QNAN",
    "-NaN",
    "-nan",
    "?",
    "",
    "#N/A N/A",
    "#N/A",
    "#NA",
    "#na",
    "<N/A>",
    "<n/a>",
    "<NA>",
    "<na>",
    "1.#IND",
    "1.#QNAN",
    "INFORMATION NOT AVAILABLE",
    "N/A",
    "n/a",
    "NA",
    "na",
    "NAN",
    "NaN",
    "nan",
    "NONE",
    "None",
    "none",
    "NULL",
    "Null",
    "null",
    "UNABLE TO OBTAIN",
    "UNKNOWN",
    "unknown",
]

# Type name -> pandas nullable extension dtype. `types_map.get` is passed as
# `types_mapper` to `Table.to_pandas`, so columns of these types keep missing
# values instead of being cast to float/object.
# NOTE(review): the lookup presumably relies on pyarrow types hashing and
# comparing equal to their string names - confirm.
types_map = {
    "string": pd.StringDtype(),
    "bool": pd.BooleanDtype(),
}
# Signed ("int8" .. "int64") then unsigned ("uint8" .. "uint64") integers map
# to pd.Int8Dtype() .. pd.UInt64Dtype().
types_map.update(
    (f"{sign}int{bits}", getattr(pd, f"{sign.upper()}Int{bits}Dtype")())
    for sign in ("", "u")
    for bits in (8, 16, 32, 64)
)

## Load `labevents` table

In [None]:
# shape: (120M, 15) ⇝ 1.8B values.
key = "labevents"
# Parsing rules: fixed column types, the custom missing-value spellings, and
# nulls allowed in string columns as well.
csv_options = pyarrow.csv.ConvertOptions(
    column_types=column_types[key],
    strings_can_be_null=True,
    null_values=null_values,
)
# The CSV is a gzip stream stored inside the zip archive, so it is unwrapped in
# two steps (zip member -> gzip stream) and parsed straight from the stream.
with ZipFile(rawdata_file) as archive:
    with archive.open(files[key]) as compressed_file:
        with gzip.open(compressed_file) as file:
            labevents = pyarrow.csv.read_csv(file, convert_options=csv_options)

# Cell output: table dimensions and the resulting Arrow schema.
labevents.shape, labevents.schema

## Store as parquet and convert to pandas

In [None]:
# Write the Arrow table to disk *before* converting it: `self_destruct=True`
# lets pyarrow release the Arrow buffers during conversion, so the table must
# not be used afterwards.
# The file name is spelled out instead of being derived from the global `key`,
# which other cells reassign; re-running this cell out of order would
# otherwise write to the wrong file.
rawdata_path.mkdir(parents=True, exist_ok=True)  # make sure the target directory exists
pyarrow.parquet.write_table(labevents, rawdata_path / "labevents.parquet")

# `types_map.get` maps Arrow types to pandas nullable dtypes where one is
# registered; other columns fall back to pyarrow's default conversion.
labevents = labevents.to_pandas(self_destruct=True, types_mapper=types_map.get)
print(labevents.dtypes)
labevents

## Load `labitems` table

In [None]:
# NOTE(review): the shape remark previously here was copied from the
# `labevents` cell; `labitems` is declared with 5 columns in `column_types`.
key = "labitems"
# Parsing rules: fixed column types, the custom missing-value spellings, and
# nulls allowed in string columns as well.
csv_options = pyarrow.csv.ConvertOptions(
    column_types=column_types[key],
    strings_can_be_null=True,
    null_values=null_values,
)
# The CSV is a gzip stream stored inside the zip archive, so it is unwrapped in
# two steps (zip member -> gzip stream) and parsed straight from the stream.
with ZipFile(rawdata_file) as archive:
    with archive.open(files[key]) as compressed_file:
        with gzip.open(compressed_file) as file:
            labitems = pyarrow.csv.read_csv(file, convert_options=csv_options)

# Cell output: table dimensions and the resulting Arrow schema.
labitems.shape, labitems.schema

## Store as parquet and convert to pandas

In [None]:
# Write the Arrow table to disk *before* converting it: `self_destruct=True`
# lets pyarrow release the Arrow buffers during conversion, so the table must
# not be used afterwards.
# The file name is spelled out instead of being derived from the global `key`,
# which other cells reassign; re-running this cell out of order would
# otherwise write to the wrong file.
rawdata_path.mkdir(parents=True, exist_ok=True)  # make sure the target directory exists
pyarrow.parquet.write_table(labitems, rawdata_path / "labitems.parquet")

# `types_map.get` maps Arrow types to pandas nullable dtypes where one is
# registered; other columns fall back to pyarrow's default conversion.
labitems = labitems.to_pandas(self_destruct=True, types_mapper=types_map.get)
labitems

# Filter Dataset

## Only keep labevents associated with the selected patients

In [None]:
# Restrict the lab events to the admissions and patients present in the
# pre-processed admissions table.
admissions = pd.read_parquet(dataset_path / "admissions_processed.parquet")

# The loop variable is intentionally not named `key`: that global holds the
# table name used by the loading cells, and overwriting it here would leave it
# set to "subject_id" for any cell that is re-run afterwards.
for id_column in ["hadm_id", "subject_id"]:
    # Rows whose identifier is missing or not in `admissions` are dropped.
    mask = labevents[id_column].isin(admissions[id_column])
    labevents = labevents[mask]
    # `(~mask).sum()` is the number of *rows* removed by this filter.
    print(f"Removing {(~mask).sum()} {id_column}")
    print(f"Number of patients remaining: {labevents['subject_id'].nunique()}")
    print(f"Number of admissions remaining: {labevents['hadm_id'].nunique()}")
    print(f"Number of events remaining: {labevents.shape}")

## Only keep labevents with numeric measurements (non-null `valuenum`)

In [None]:
# Drop every event without a numeric result (`valuenum` is null), then report
# how much data is left.
labevents = labevents.loc[labevents["valuenum"].notna()]
print(f"Number of events remaining: {labevents.shape}")
print(f"Number of patients remaining: {labevents['subject_id'].nunique()}")

## Merge on `itemid`, only select tests that were given to many people

In [None]:
# Attach the item metadata (label, fluid, category, LOINC code) to every event.
# Default join type is inner, so events whose `itemid` has no entry in
# `labitems` are dropped.
labevents = labevents.merge(labitems, on="itemid")

In [None]:
# Labels of the lab tests to keep, one label per line (commas are part of the
# label). The labels are matched exactly against the `label` column.
# NOTE(review): "Asparate" looks like a misspelling of "Aspartate"; confirm
# against the labels in d_labitems before changing it, since a corrected
# spelling would no longer match if the source table uses this form.
selected_variables = """\
Albumin
Alanine Aminotransferase (ALT)
Alkaline Phosphatase
Anion Gap
Asparate Aminotransferase (AST)
Base Excess
Basophils
Bicarbonate
Bilirubin, Total
Calcium, Total
Calculated Total CO2
Chloride
Creatinine
Eosinophils
Glucose
Hematocrit
Hemoglobin
Lactate
Lymphocytes
MCH
MCV
Magnesium
Monocytes
Neutrophils
PT
PTT
Phosphate
Platelet Count
Potassium
RDW
Red Blood Cells
Sodium
Specific Gravity
Urea Nitrogen
White Blood Cells
pCO2
pH
pO2
""".splitlines()

# Keep only the events whose test label is in the hand-picked list above.
labevents = labevents.loc[labevents["label"].isin(selected_variables)]
print(f"Number of patients remaining: {labevents['subject_id'].nunique()}")

In [None]:
# test_freq = labevents.groupby("label")["subject_id"].nunique()
# common_tests = test_freq.sort_values(ascending=False).index[:150]
# labevents = labevents[labevents["label"].isin(common_tests)]
# print(f"Number of patients remaining: {labevents.subject_id.nunique()}")

# Serialize Pre-processed DataFrame

In [None]:
# Clean categories


def clean_categories(df):
    """Remove unused category levels from all categorical columns of ``df``.

    Columns of any other dtype are left untouched. The frame is modified in
    place and the same object is returned so the call can be chained.
    """
    categorical_columns = [name for name in df if df[name].dtype == "category"]
    for name in categorical_columns:
        trimmed = df[name].cat.remove_unused_categories()
        df[name] = trimmed
    return df


# Trim category levels that no longer occur after filtering, then persist the
# processed table next to the other processed datasets.
labevents = clean_categories(labevents)
labevents.to_parquet(dataset_path.joinpath("labevents_processed.parquet"))
# Cell output: final dimensions and column dtypes.
(labevents.shape, labevents.dtypes)

In [None]:
# Cell output: the patient identifier of every remaining lab event.
labevents["subject_id"]