# MIMIC 4 data - dataset construction admissions

Code taken from GRU-ODE-Bayes preprocessing; simplified and adapted for MIMIC 4 1.0

In [None]:
import gzip
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
import pyarrow as pa
import pyarrow.csv
import pyarrow.parquet

# Load `admissions` table

## Table Schema

In [None]:
# Input archive and output directories, all resolved against the current
# working directory.
rawdata_file = Path.cwd() / "mimic-iv-1.0.zip"
dataset_path = Path.cwd() / "processed"
rawdata_path = Path.cwd() / "raw"

# Parquet files are written into both directories further down; create them
# up front so those writes do not fail when the directories are missing.
dataset_path.mkdir(parents=True, exist_ok=True)
rawdata_path.mkdir(parents=True, exist_ok=True)

# Archive member that holds each table, keyed by table name.
files = {
    "admissions": "mimic-iv-1.0/core/admissions.csv.gz",
    "patients": "mimic-iv-1.0/core/patients.csv.gz",
    "chartevents": "mimic-iv-1.0/icu/chartevents.csv.gz",
}

# Arrow type for text columns that are stored dictionary-encoded
# (int32 indices into a table of strings).
CATEGORY = pyarrow.dictionary("int32", "string")

# Arrow column types per table, handed to the pyarrow CSV reader.
column_types = dict(
    chartevents=dict(
        subject_id="int32",
        hadm_id="int32",
        stay_id="int32",
        itemid="int32",
        charttime="timestamp[s]",
        storetime="timestamp[s]",
        value=CATEGORY,
        valuenum="float32",
        valueuom=CATEGORY,
        warning="bool",
    ),
)


# pandas dtypes per table and column. The value "datetime64" is only a marker:
# the loading code routes those columns through `parse_dates` instead of
# passing them as a dtype.
dtypes = dict(
    admissions=dict(
        subject_id="int32",
        hadm_id="int32",
        admittime="datetime64",
        dischtime="datetime64",
        deathtime="datetime64",
        admission_type="string",
        admission_location="string",
        discharge_location="string",
        insurance="string",
        language="string",
        marital_status="string",
        ethnicity="string",
        edregtime="datetime64",
        edouttime="datetime64",
        hospital_expire_flag="bool",
    ),
    patients=dict(
        subject_id="int32",
        gender="string",
        anchor_age="int32",
        anchor_year="int32",
        anchor_year_group="string",
        dod="datetime64",
    ),
)

# Extra strings to read as missing values, per table and column.
# An empty list means the column has no additional missing-value markers.
na_values = dict(
    admissions=dict(
        subject_id=[],
        hadm_id=[],
        admittime=[],
        dischtime=[],
        deathtime=[],
        admission_type=[],
        admission_location=["INFORMATION NOT AVAILABLE"],
        discharge_location=[],
        insurance=[],
        language=["?"],
        marital_status=[],
        ethnicity=["UNKNOWN", "UNABLE TO OBTAIN"],
        edregtime=[],
        edouttime=[],
        hospital_expire_flag=[],
    ),
    patients=dict(
        subject_id=[],
        gender=[],
        anchor_age=[],
        anchor_year=[],
        anchor_year_group=[],
        dod=[],
    ),
)

## Load `admissions` and `patients` tables

In [None]:
# Read both core tables from the archive with the declared schema, keep a
# parquet copy of each raw table, then attach the patient columns to every
# admission row.
tables = {}
with ZipFile(rawdata_file) as archive:
    for key in ("admissions", "patients"):
        schema = dtypes[key]
        # "datetime64" columns are parsed as dates; all others get their dtype.
        date_columns = [name for name, kind in schema.items() if kind == "datetime64"]
        plain_dtypes = {
            name: kind for name, kind in schema.items() if kind != "datetime64"
        }
        with archive.open(files[key], "r") as file:
            tables[key] = pd.read_csv(
                file,
                compression="gzip",
                parse_dates=date_columns,
                dtype=plain_dtypes,
                na_values=na_values[key],
            )
        tables[key].to_parquet(rawdata_path / f"{key}.parquet")

admissions = tables.pop("admissions")
patients = tables.pop("patients")

admissions = admissions.merge(patients, on="subject_id")
admissions

## Load `chartevents` table

In [None]:
# Table size: roughly 330M rows x 10 columns, i.e. about 3.3B values.
key = "chartevents"

# Use the declared Arrow types and read empty fields as nulls, including in
# string columns.
convert_options = pyarrow.csv.ConvertOptions(
    column_types=column_types[key],
    strings_can_be_null=True,
    null_values=[""],
)

# The table is a gzip stream stored inside the zip archive, so it is opened
# in two layers before being handed to the Arrow CSV reader.
with ZipFile(rawdata_file) as archive:
    with archive.open(files[key]) as compressed_file:
        with gzip.open(compressed_file) as file:
            chartevents = pyarrow.csv.read_csv(file, convert_options=convert_options)

chartevents.schema

In [None]:
# Store the Arrow table as parquet before converting it. The conversion uses
# `self_destruct=True`, so the Arrow table must not be used afterwards; the
# name is rebound to the resulting DataFrame.
chartevents_file = rawdata_path / f"{key}.parquet"
pyarrow.parquet.write_table(chartevents, chartevents_file)
chartevents = chartevents.to_pandas(self_destruct=True)
chartevents

# Filter Dataset

## Only keep patients with a single admission

In [None]:
# Count distinct admissions per subject and keep the rows of subjects that
# have exactly one.
num_admissions = admissions.groupby("subject_id")["hadm_id"].nunique()
selected_subjects = num_admissions.loc[lambda counts: counts == 1].index
mask = admissions["subject_id"].isin(selected_subjects)
admissions = admissions.loc[mask].copy()
print(f"Number of patients remainin in the dataframe: {admissions.shape}")

## Only keep patients that stayed at least 3 and fewer than 30 days

In [None]:
# Length of stay as a timedelta and as a whole number of days.
stay_length = admissions["dischtime"].sub(admissions["admittime"])
admissions["elapsed_time"] = stay_length
admissions["elapsed_days"] = stay_length.dt.days

# Keep stays of more than 2 and fewer than 30 whole days.
mask = admissions["elapsed_days"].gt(2) & admissions["elapsed_days"].lt(30)
admissions = admissions.loc[mask].copy()
print(f"Number of patients remainin in the dataframe: {admissions.shape}")

## Only keep patients older than 15

In [None]:
# Keep rows whose anchor age is strictly greater than 15.
admissions = admissions.loc[admissions["anchor_age"].gt(15)]
print(f"Number of patients remainin in the dataframe: {admissions.shape}")

## Only keep Patients that have time series data associated with them

In [None]:
# Keep admissions whose hadm_id occurs at least once in chartevents.
has_chartevents = admissions["hadm_id"].isin(chartevents["hadm_id"])
admissions = admissions.loc[has_chartevents]
print(f"Number of patients remainin in the dataframe: {admissions.shape}")

# Serialize Pre-processed DataFrame

In [None]:
# Clean categories
def clean_categories(df: pd.DataFrame) -> pd.DataFrame:
    """Remove unused categories from every categorical column of ``df``.

    Columns of any other dtype are left untouched.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    for col in df:
        # Test the dtype's class explicitly instead of comparing the dtype
        # with the string "category", which depends on how each dtype
        # implements comparison against strings.
        if isinstance(df[col].dtype, pd.CategoricalDtype):
            df[col] = df[col].cat.remove_unused_categories()
    return df


# Drop unused categories, then write the filtered admissions table.
processed_file = dataset_path / "admissions_processed.parquet"
admissions = clean_categories(admissions)
admissions.to_parquet(processed_file)