# MIMIC-IV data – dataset construction: labevents

Code taken from the GRU-ODE-Bayes preprocessing scripts, simplified and adapted for MIMIC-IV v1.0.

In [None]:
%load_ext autotime

In [None]:
import os
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Notebook display limits: pandas shows up to 50 rows and 300 columns of a
# frame before truncating the printed output.
for option_name, option_value in (
    ("display.max_rows", 50),
    ("display.max_columns", 300),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the admissions table from its parquet file (presumably written by an
# earlier preprocessing step — verify) and preview its first rows.
admissions_path = "admissions_processed.parquet"
admissions = pd.read_parquet(admissions_path)
admissions.head()

In [None]:
import gzip
from zipfile import ZipFile

import pyarrow
import pyarrow.csv
import pyarrow.parquet

In [None]:
# Zip archive that holds the raw tables.
dataset_file = "mimic-iv-1.0.zip"

# Archive member (gzip-compressed CSV) for each table key.
files = {}
files["labevents"] = "mimic-iv-1.0/hosp/labevents.csv.gz"
files["labitems"] = "mimic-iv-1.0/hosp/d_labitems.csv.gz"

# Column name -> Arrow type alias for the lab-events table.
_labevents_column_types = {
    "labevent_id": "int32",
    "subject_id": "int32",
    "hadm_id": "int32",
    "specimen_id": "int32",
    "itemid": "int32",
    "charttime": "timestamp[s]",
    "storetime": "timestamp[s]",
    "value": "string",
    "valuenum": "float32",
    "valueuom": "string",
    "ref_range_lower": "float32",
    "ref_range_upper": "float32",
    "flag": "string",
    "priority": "string",
    "comments": "string",
}

# Column name -> Arrow type alias for the lab-items lookup table.
_labitems_column_types = {
    "itemid": "int32",
    "label": "string",
    "fluid": "string",
    "category": "string",
    "loinc_code": "string",
}

# Per-table schemas, keyed like `files`; passed to the pyarrow CSV reader.
column_types = {
    "labevents": _labevents_column_types,
    "labitems": _labitems_column_types,
}

In [None]:
# Parse the lab-events CSV directly out of the zip archive, without
# extracting it to disk first.
# shape: (120M, 15) ⇝ 1.8B values.
key = "labevents"

# Columns are typed per `column_types`; empty fields are read as nulls,
# including in string columns.
labevents_convert_options = pyarrow.csv.ConvertOptions(
    column_types=column_types[key],
    strings_can_be_null=True,
    null_values=[""],
)

with (
    ZipFile(dataset_file) as zip_archive,
    zip_archive.open(files[key]) as gz_member,
    gzip.open(gz_member) as csv_stream,
):
    labevents = pyarrow.csv.read_csv(
        csv_stream,
        convert_options=labevents_convert_options,
    )

### Store and reload as pandas

In [None]:
# Persist the Arrow table as "<key>.parquet", then drop the in-memory table.
labevents_parquet = f"{key}.parquet"
pyarrow.parquet.write_table(labevents, labevents_parquet)
del labevents

In [None]:
# Read the parquet file back as a pandas frame. The option context selects
# pyarrow-backed string storage for the duration of the load.
# NOTE(review): `use_nullable_dtypes` is deprecated in pandas >= 2.0 in favour
# of `dtype_backend` — revisit when upgrading pandas.
labevents_parquet_path = f"{key}.parquet"
with pd.option_context("string_storage", "pyarrow"):
    labevents = pd.read_parquet(
        labevents_parquet_path,
        use_nullable_dtypes=True,
    )

print(f"Number of events remaining in the dataframe: {len(labevents)}")

### Only keep labevents associated with the selected patients

In [None]:
# Keep only lab events whose hadm_id appears in the admissions table.
in_selected_admissions = labevents.hadm_id.isin(admissions.hadm_id)
labevents = labevents[in_selected_admissions]

print(f"Number of events remaining: {len(labevents)}")
print(f"Number of patients remaining: {labevents.subject_id.nunique()}")

### Load Labitems table

In [None]:
# Parse the lab-items lookup table (5 columns, see column_types["labitems"])
# directly out of the zip archive.
key = "labitems"

# Columns are typed per `column_types`; empty fields are read as nulls,
# including in string columns.
labitems_convert_options = pyarrow.csv.ConvertOptions(
    column_types=column_types[key],
    strings_can_be_null=True,
    null_values=[""],
)

with (
    ZipFile(dataset_file) as zip_archive,
    zip_archive.open(files[key]) as gz_member,
    gzip.open(gz_member) as csv_stream,
):
    labitems = pyarrow.csv.read_csv(
        csv_stream,
        convert_options=labitems_convert_options,
    )

#### Store and reload

In [None]:
# Round-trip the lab-items table through "<key>.parquet" so it comes back as
# a pandas frame with pyarrow-backed strings and nullable dtypes.
labitems_parquet = f"{key}.parquet"
pyarrow.parquet.write_table(labitems, labitems_parquet)

with pd.option_context("string_storage", "pyarrow"):
    labitems = pd.read_parquet(
        labitems_parquet,
        use_nullable_dtypes=True,
    )

# Show the resulting column dtypes as the cell output.
labitems.dtypes

### Merge on `itemid`, then keep only the selected lab tests (the frequency-based selection of tests given to many people is kept below, commented out)

In [None]:
# Lab tests to keep. Entries are compared for exact equality against the
# `label` column of the lab-items table, so spelling and capitalisation
# matter. NOTE(review): "Asparate" is kept exactly as written — presumably
# it mirrors the label used in the source table; verify before "fixing" it.
selected_variables = [
    "Albumin", "Alanine Aminotransferase (ALT)", "Alkaline Phosphatase",
    "Anion Gap", "Asparate Aminotransferase (AST)",
    "Base Excess", "Basophils", "Bicarbonate", "Bilirubin, Total",
    "Calcium, Total", "Calculated Total CO2", "Chloride", "Creatinine",
    "Eosinophils",
    "Glucose",
    "Hematocrit", "Hemoglobin",
    "Lactate", "Lymphocytes",
    "MCH", "MCV", "Magnesium", "Monocytes",
    "Neutrophils",
    "PT", "PTT", "Phosphate", "Platelet Count", "Potassium",
    "RDW", "Red Blood Cells",
    "Sodium", "Specific Gravity",
    "Urea Nitrogen",
    "White Blood Cells",
    "pCO2", "pH", "pO2",
]  # fmt: skip

In [None]:
# Attach the lab-item metadata (label, fluid, category, loinc_code) to every
# event via an inner join on itemid; events without a matching item row are
# dropped.
labevents = labevents.merge(labitems, how="inner", on="itemid")

In [None]:
# test_freq = labevents.groupby("label")["subject_id"].nunique()
# common_tests = test_freq.sort_values(ascending=False).index[:150]
# labevents = labevents[labevents["label"].isin(common_tests)]
# print(f"Number of patients remaining: {labevents.subject_id.nunique()}")

In [None]:
# Keep only events whose test label is one of the selected variables.
is_selected_test = labevents["label"].isin(selected_variables)
labevents = labevents[is_selected_test]

print(f"Number of patients remaining: {labevents.subject_id.nunique()}")

In [None]:
# Show the filtered lab-events frame as the cell output.
labevents

In [None]:
# Write the filtered, label-annotated lab events to disk.
processed_path = "labevents_processed.parquet"
labevents.to_parquet(processed_path)