# MIMIC 4 data - dataset construction prescriptions

Code taken from GRU-ODE-Bayes preprocessing; simplified and adapted for MIMIC 4 1.0

In [None]:
import gzip
from zipfile import ZipFile

import pandas as pd
import pyarrow
import pyarrow.csv
import pyarrow.parquet

In [None]:
# Notebook display limits: how many rows / columns pandas renders before truncating.
display_limits = {
    "display.max_rows": 50,
    "display.max_columns": 300,
}
for option_name, limit in display_limits.items():
    pd.set_option(option_name, limit)

In [None]:
# Load the admissions table from parquet (presumably written by an earlier
# preprocessing notebook - confirm). Its hadm_id column is used further down to
# restrict the prescriptions to the selected admissions.
admissions = pd.read_parquet("admissions_processed.parquet")

In [None]:
# Zip archive that holds the dataset, and the archive member to read for each table.
dataset_file = "mimic-iv-1.0.zip"

files = {
    "prescriptions": "mimic-iv-1.0/hosp/prescriptions.csv.gz",
}

# Explicit pyarrow column types per table, handed to the CSV reader so that no
# type inference is needed. Columns declared as "string" are dictionary-encoded
# after loading.
column_types = {
    "prescriptions": {
        "subject_id": "int32",
        "hadm_id": "int32",
        "pharmacy_id": "int32",
        "starttime": "timestamp[s]",
        "stoptime": "timestamp[s]",
        "drug_type": "string",
        "drug": "string",
        "gsn": "string",
        # NDC is an identifier, not a quantity. float32 represents integers
        # exactly only up to 2**24 (16_777_216), so multi-digit drug codes would
        # be silently rounded and leading zeros lost. Keep the code as text.
        "ndc": "string",
        "prod_strength": "string",
        "form_rx": "string",
        "dose_val_rx": "string",  # float or interval[float]
        "dose_unit_rx": "string",
        "form_val_disp": "string",  # float or interval[float]
        "form_unit_disp": "string",
        "doses_per_24_hrs": "int32",
        "route": "string",
    }
}

In [None]:
%%time
key = "prescriptions"

# Parsing rules for the CSV reader: use the declared column types and treat
# empty fields as nulls, including in string columns.
csv_options = pyarrow.csv.ConvertOptions(
    column_types=column_types[key],
    auto_dict_encode=True,
    strings_can_be_null=True,
    null_values=[""],
)

# The table is a gzip-compressed CSV stored inside the zip archive, so two
# layers are unwrapped before pyarrow reads the stream.
with ZipFile(dataset_file) as archive:
    with archive.open(files[key]) as compressed_file:
        with gzip.open(compressed_file) as file:
            prescriptions = pyarrow.csv.read_csv(file, convert_options=csv_options)

# Dictionary-encode (pyarrow's categorical representation) every column that
# was declared as "string", replacing the column in place at its position.
string_columns = [
    name for name, declared in column_types[key].items() if declared == "string"
]
for name in string_columns:
    position = prescriptions.column_names.index(name)
    encoded = prescriptions[name].dictionary_encode()
    prescriptions = prescriptions.set_column(position, name, encoded)

# Store the typed table and show its schema as the cell output.
pyarrow.parquet.write_table(prescriptions, f"{key}.parquet")
prescriptions.schema

## Store and reload as pandas

In [None]:
# Read the parquet file written above back into pandas, requesting nullable
# dtypes and pyarrow-backed string storage.
parquet_file = f"{key}.parquet"
with pd.option_context("string_storage", "pyarrow"):
    prescriptions = pd.read_parquet(parquet_file, use_nullable_dtypes=True)

# Overview table: dtype and number of distinct values for every column.
column_summary = pd.DataFrame(
    {"type": prescriptions.dtypes, "uniques": prescriptions.nunique()}
)
column_summary

### Only choose previously selected admission ids

In [None]:
# Keep only prescriptions whose hadm_id occurs in the admissions table.
in_selected_admissions = prescriptions["hadm_id"].isin(admissions["hadm_id"])
prescriptions = prescriptions[in_selected_admissions]

# Report how much data is left after the filter.
print(f"Number of events remaining: {prescriptions.shape}")
print(f"Number of patients remaining: {prescriptions['subject_id'].nunique()}")

# Select entries whose drug name is in the list from the paper.

In [None]:
# Drug names to keep; matching below is exact (including the double space in
# the sodium chloride flush entry).
drugs_list = [
    "Acetaminophen",
    "Aspirin",
    "Bisacodyl",
    "Insulin",
    "Heparin",
    "Docusate Sodium",
    "D5W",
    "Humulin-R Insulin",
    "Potassium Chloride",
    "Magnesium Sulfate",
    "Metoprolol Tartrate",
    "Sodium Chloride 0.9%  Flush",
    "Pantoprazole",
]

# Drop every prescription whose drug is not in the list.
is_listed_drug = prescriptions["drug"].isin(drugs_list)
prescriptions = prescriptions[is_listed_drug]

print(f"Number of events remaining: {prescriptions.shape}")
print(f"Number of patients remaining: {prescriptions['subject_id'].nunique()}")

# Cell output: how often each dose unit occurs per drug.
prescriptions.groupby("drug")["dose_unit_rx"].value_counts()

# Units correction

In [None]:
# Rows without a dose unit cannot be checked or normalised below, so remove them.
has_dose_unit = prescriptions["dose_unit_rx"].notna()
prescriptions = prescriptions[has_dose_unit]

print(f"Number of events remaining: {prescriptions.shape}")
print(f"Number of patients remaining: {prescriptions['subject_id'].nunique()}")

In [None]:
# Expected dose unit for each checked drug. Drugs from the selection list that
# do not appear here are left untouched by this check.
check_dose_unit_rx = {
    "Acetaminophen": "mg",
    "D5W": "mL",
    "Heparin": "UNIT",
    "Insulin": "UNIT",
    "Magnesium Sulfate": "gm",
    "Potassium Chloride": "mEq",
    "Bisacodyl": "mg",
    "Pantoprazole": "mg",
}

# Collect, across all checked drugs, the rows whose unit differs from the
# expected one; they are removed together after the loop.
rows_to_drop = pd.Series(False, index=prescriptions.index)
for drug, unit in check_dose_unit_rx.items():
    is_this_drug = prescriptions["drug"] == drug
    has_other_unit = prescriptions["dose_unit_rx"] != unit
    mask = is_this_drug & has_other_unit
    # NOTE: "(drop {unit})" in the message names the EXPECTED unit; the rows
    # being removed are the ones recorded with any other unit.
    print(f"Removing {mask.sum():5d} entries - bad data in {drug} (drop {unit})")
    rows_to_drop = rows_to_drop | mask

prescriptions = prescriptions[~rows_to_drop]

# Fixing units

In [None]:
# Normalise the spelling of millilitres ("mL" -> "ml").
# The replacement is assigned back explicitly: calling
# `prescriptions["dose_unit_rx"].replace(..., inplace=True)` is a chained
# in-place update on a selected column, which pandas warns about and which no
# longer modifies the DataFrame once Copy-on-Write is active.
prescriptions = prescriptions.assign(
    dose_unit_rx=prescriptions["dose_unit_rx"].replace("mL", "ml")
)

# Cell output: dose-unit counts per drug after the normalisation.
prescriptions.groupby("drug")["dose_unit_rx"].value_counts()

### To avoid confounding labels with labels from other tables, we add "drug" to the name

In [None]:
# Cell output: inspect the prescription start times.
prescriptions["starttime"]

In [None]:
# Add a `charttime` column that duplicates `starttime` (presumably so the time
# column carries the same name as in the other event tables - confirm).
# NOTE(review): the section heading mentions adding "drug" to the name, but no
# cell in this section renames anything - confirm whether that step is missing.
prescriptions = prescriptions.assign(charttime=prescriptions["starttime"])

In [None]:
# Write the filtered, unit-checked prescriptions (with the added charttime
# column) to parquet.
prescriptions.to_parquet("prescriptions_processed.parquet")