# MIMIC 4 data - dataset construction admissions

Code taken from the GRU-ODE-Bayes preprocessing; simplified and adapted for MIMIC-IV v1.0.

In [None]:
%load_ext autotime

In [None]:
import gzip
import os
from datetime import datetime, timedelta
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Archive holding the raw dataset; every table below is read from inside it.
dataset_file = "mimic-iv-1.0.zip"

# Path of each gzip-compressed CSV table inside the archive, keyed by table name.
files = {
    table: f"mimic-iv-1.0/{module}/{table}.csv.gz"
    for table, module in (
        ("admissions", "core"),
        ("patients", "core"),
        ("chartevents", "icu"),
    )
}

# Column -> dtype for every table. The marker "datetime64" is not passed to
# pandas as a dtype: the loading code routes those columns to `parse_dates`.
dtypes = {
    # hospital admissions
    "admissions": {
        "subject_id": "int32",
        "hadm_id": "int32",
        "admittime": "datetime64",
        "dischtime": "datetime64",
        "deathtime": "datetime64",
        "admission_type": "string",
        "admission_location": "string",
        "discharge_location": "string",
        "insurance": "string",
        "language": "string",
        "marital_status": "string",
        "ethnicity": "string",
        "edregtime": "datetime64",
        "edouttime": "datetime64",
        "hospital_expire_flag": "bool",
    },
    # one row of demographics per subject
    "patients": {
        "subject_id": "int32",
        "gender": "string",
        "anchor_age": "int32",
        "anchor_year": "int32",
        "anchor_year_group": "string",
        "dod": "datetime64",
    },
    # charted ICU measurements
    "chartevents": {
        "subject_id": "int32",
        "hadm_id": "int32",
        "stay_id": "int32",
        "itemid": "int32",
        "charttime": "datetime64",
        "storetime": "datetime64",
        "value": "string[pyarrow]",
        "valuenum": "float32",
        "valueuom": "string[pyarrow]",
        "warning": "bool",
    },
}

# Extra strings to treat as missing, listed only for the columns that have any.
_na_overrides = {
    "admissions": {
        "admission_location": ["INFORMATION NOT AVAILABLE"],
        "language": ["?"],
        "ethnicity": ["UNKNOWN", "UNABLE TO OBTAIN"],
    },
    "patients": {},
    "chartevents": {
        "value": [""],
        "valueuom": [""],
    },
}

# Column -> list of additional NA markers for every column of every table
# (an empty list means "no extra markers"); same column order as `dtypes`.
na_values = {
    table: {column: _na_overrides[table].get(column, []) for column in columns}
    for table, columns in dtypes.items()
}

### Load Admissions and Patients tables

In [None]:
def _read_table(archive, key):
    """Read the table named `key` from the open zip archive.

    Columns marked "datetime64" in `dtypes[key]` are parsed as dates, all
    other columns get their declared dtype, the per-column markers from
    `na_values[key]` are treated as missing, and `subject_id` becomes the
    index of the returned DataFrame.
    """
    schema = dtypes[key]
    date_columns = [column for column, dtype in schema.items() if dtype == "datetime64"]
    plain_dtypes = {column: dtype for column, dtype in schema.items() if dtype != "datetime64"}
    with archive.open(files[key], "r") as file:
        return pd.read_csv(
            file,
            compression="gzip",
            parse_dates=date_columns,
            dtype=plain_dtypes,
            na_values=na_values[key],
            index_col="subject_id",
        )


with ZipFile(dataset_file) as archive:
    admissions = _read_table(archive, "admissions")
    patients = _read_table(archive, "patients")

# Default (inner) merge on subject_id: one row per admission, carrying the
# patient's columns alongside the admission's columns.
data = patients.merge(admissions, on="subject_id")
data

### Only keep patients with single admission

In [None]:
# Count the distinct admissions (hadm_id) of every patient ...
num_admissions = data.groupby("subject_id")["hadm_id"].nunique()
# ... and keep only the patients for whom that count is exactly one.
subject_ids = num_admissions.loc[num_admissions.eq(1)].index
data = data.loc[subject_ids]
print(f"Number of patients remaining in the dataframe: {len(data)}")

### Only keep patients that stayed more than 2 and fewer than 30 days (i.e. 3 to 29 whole days)

In [None]:
# Length of stay as a timedelta, plus its whole-day component.
data = data.assign(
    elapsed_time=lambda frame: frame["dischtime"] - frame["admittime"],
    elapsed_days=lambda frame: frame["elapsed_time"].dt.days,
)
# Strict bounds on the whole-day count: 3, 4, ..., 29 days are kept.
data = data.loc[data["elapsed_days"].gt(2) & data["elapsed_days"].lt(30)]
print(f"Number of patients remaining in the dataframe: {len(data)}")

### Only keep patients older than 15

In [None]:
# Strictly older than 15: rows with anchor_age == 15 are dropped as well.
data = data.loc[data["anchor_age"].gt(15)]
print(f"Number of patients remaining in the dataframe: {len(data)}")

### Load Chartevents table

In [None]:
import pyarrow as pa
import pyarrow.csv
import pyarrow.parquet

# Arrow type for every chartevents column, given explicitly to the CSV reader.
_chartevents_column_types = {
    "subject_id": "int32",
    "hadm_id": "int32",
    "stay_id": "int32",
    "itemid": "int32",
    "charttime": "timestamp[s]",
    "storetime": "timestamp[s]",
    "value": "string",
    "valuenum": "float32",
    "valueuom": "string",
    "warning": "bool",
}

# The empty string is the only null marker, and it applies to string columns
# too (strings_can_be_null=True).
convert_options = pyarrow.csv.ConvertOptions(
    column_types=_chartevents_column_types,
    strings_can_be_null=True,
    null_values=[""],
)

In [None]:
# shape: (330M, 10) ⇝ 3.3B values
key = "chartevents"
# Open the zip member, decompress it with gzip on the fly and hand the
# resulting stream to pyarrow's CSV reader.
with ZipFile(dataset_file) as archive:
    with archive.open(files[key]) as compressed_file:
        with gzip.open(compressed_file) as file:
            chartevents = pyarrow.csv.read_csv(file, convert_options=convert_options)

In [None]:
# Write the Arrow table to a parquet file in the working directory, then drop
# the name so the in-memory table can be freed.
pa.parquet.write_table(chartevents, where="chartevents.parquet")
del chartevents

In [None]:
# Re-load the table as a pandas DataFrame; the option context selects
# pyarrow-backed storage for string columns while reading.
with pd.option_context("string_storage", "pyarrow"):
    chartevents = pd.read_parquet(
        path="chartevents.parquet",
        use_nullable_dtypes=True,
    )
chartevents.dtypes

### Only keep patients that have time series data associated with them

In [None]:
# Keep the rows whose admission id occurs at least once in chartevents.
data = data.loc[data["hadm_id"].isin(chartevents["hadm_id"])]
print(f"Number of patients remaining in the dataframe: {len(data)}")

## Store result

In [None]:
# Write the filtered table to a parquet file in the working directory.
# NOTE(review): `data` carries the timedelta column "elapsed_time"; confirm the
# parquet engine in use can serialize timedelta data.
data.to_parquet(path="admissions_processed.parquet")