# MIMIC-IV data - dataset construction: prescriptions

In [None]:
import gzip
from pathlib import Path
from zipfile import ZipFile

import numpy as np
import pandas as pd
import pyarrow
import pyarrow.csv
import pyarrow.parquet

# Load `prescriptions` table

## Table Schema

In [None]:
# Arrow type used for every text-like column: dictionary-encoded strings with
# 32-bit indices.
CATEGORY = pyarrow.dictionary("int32", "string")

# All locations are resolved against the current working directory.
rawdata_file = Path.cwd().joinpath("mimic-iv-1.0.zip")
dataset_path = Path.cwd().joinpath("processed")
rawdata_path = Path.cwd().joinpath("raw")

# Archive member that holds each table, keyed by table name.
files = {"prescriptions": "mimic-iv-1.0/hosp/prescriptions.csv.gz"}

# Arrow types applied while parsing the CSV, keyed by table and then by column.
column_types = {
    "prescriptions": dict(
        subject_id="int32",
        hadm_id="int32",
        pharmacy_id="int32",
        starttime="timestamp[s]",
        stoptime="timestamp[s]",
        drug_type=CATEGORY,
        drug=CATEGORY,
        gsn=CATEGORY,
        # NOTE(review): float32 represents integers exactly only up to 2**24;
        # if `ndc` holds long numeric codes they will be rounded - confirm.
        ndc="float32",
        prod_strength=CATEGORY,
        form_rx=CATEGORY,
        dose_val_rx=CATEGORY,  # float or interval[float]
        dose_unit_rx=CATEGORY,
        form_val_disp=CATEGORY,  # float or interval[float]
        form_unit_disp=CATEGORY,
        doses_per_24_hrs="float32",
        route=CATEGORY,
    )
}

## Load Prescriptions

In [None]:
%%time
key = "prescriptions"
with (
    ZipFile(rawdata_file) as archive,
    archive.open(files[key]) as compressed_file,
    gzip.open(compressed_file) as file,
):
    prescriptions = pyarrow.csv.read_csv(
        file,
        convert_options=pyarrow.csv.ConvertOptions(
            column_types=column_types[key],
            strings_can_be_null=True,
            null_values=[""],
        ),
    )

prescriptions.schema

## Store as Parquet and convert to pandas

In [None]:
# Write the unfiltered Arrow table to a Parquet file named after the table
# (relative path, so it lands in the current working directory).
parquet_name = f"{key}.parquet"
pyarrow.parquet.write_table(prescriptions, parquet_name)

# Convert to pandas. `self_destruct=True` lets Arrow release its buffers during
# the conversion, so the Arrow table must not be reused; the name is rebound to
# the DataFrame right here.
prescriptions = prescriptions.to_pandas(self_destruct=True)
prescriptions

# Filter Dataset

## Keep only previously selected admission IDs

In [None]:
# Keep only the rows whose `hadm_id` occurs in admissions_processed.parquet.
admissions = pd.read_parquet(dataset_path / "admissions_processed.parquet")
in_selected_admission = prescriptions["hadm_id"].isin(admissions["hadm_id"])
prescriptions = prescriptions.loc[in_selected_admission]

# Report what is left; the first line prints the (rows, columns) shape.
n_patients = prescriptions["subject_id"].nunique()
print(f"Number of events remaining: {prescriptions.shape}")
print(f"Number of patients remaining: {n_patients}")

## Select entries whose drug name is in the list from the paper.

In [None]:
# Drug names to keep. Matching is exact (`isin`), so spelling, capitalisation
# and spacing inside each name matter.
drugs_list = [
    "Acetaminophen",
    "Aspirin",
    "Bisacodyl",
    "Insulin",
    "Heparin",
    "Docusate Sodium",
    "D5W",
    "Humulin-R Insulin",
    "Potassium Chloride",
    "Magnesium Sulfate",
    "Metoprolol Tartrate",
    "Sodium Chloride 0.9%  Flush",
    "Pantoprazole",
]
is_listed_drug = prescriptions["drug"].isin(drugs_list)
prescriptions = prescriptions.loc[is_listed_drug]

# Report what is left; the first line prints the (rows, columns) shape.
n_patients = prescriptions["subject_id"].nunique()
print(f"Number of events remaining: {prescriptions.shape}")
print(f"Number of patients remaining: {n_patients}")

# Cell output: how often each dose unit occurs for each drug.
units_by_drug = prescriptions.groupby("drug")["dose_unit_rx"]
units_by_drug.value_counts()

# Correct Units

In [None]:
# Discard prescriptions that have no dose unit recorded.
prescriptions = prescriptions.dropna(subset=["dose_unit_rx"])

# Report what is left; the first line prints the (rows, columns) shape.
n_patients = prescriptions["subject_id"].nunique()
print(f"Number of events remaining: {prescriptions.shape}")
print(f"Number of patients remaining: {n_patients}")

In [None]:
# Expected dose unit for each drug that is checked. Drugs in `drugs_list` that
# do not appear here are left untouched.
check_dose_unit_rx = {
    "Acetaminophen": "mg",
    "D5W": "mL",
    "Heparin": "UNIT",
    "Insulin": "UNIT",
    "Magnesium Sulfate": "gm",
    "Potassium Chloride": "mEq",
    "Bisacodyl": "mg",
    "Pantoprazole": "mg",
}

# Accumulates the rows to discard: rows of a checked drug whose recorded unit
# differs from the expected one.
MASK = pd.Series(False, index=prescriptions.index)
for drug, unit in check_dose_unit_rx.items():
    mask = (prescriptions["drug"] == drug) & (prescriptions["dose_unit_rx"] != unit)
    # The rows counted here are the ones NOT in `unit`; rows in `unit` are kept.
    print(f"Removing {mask.sum():5d} entries - bad data in {drug} (keep only {unit})")
    MASK |= mask

# `.copy()` detaches the result from the original frame so that later column
# assignments operate on an independent DataFrame.
prescriptions = prescriptions[~MASK].copy()

In [None]:
# Normalise the millilitre spelling "mL" -> "ml".
#
# The result is assigned back to the column rather than calling
# `.replace(..., inplace=True)` on `prescriptions["dose_unit_rx"]`: that form is
# chained assignment through an in-place method, which pandas warns about and
# which leaves the DataFrame unchanged under Copy-on-Write.
#
# The column is dictionary-encoded on load (categorical in pandas), and
# `replace` on categorical data is deprecated, so the values are replaced as
# plain objects and then re-encoded. This also works when "ml" is already one
# of the categories.
prescriptions["dose_unit_rx"] = (
    prescriptions["dose_unit_rx"]
    .astype(object)
    .replace("mL", "ml")
    .astype("category")
)

# Cell output: how often each dose unit occurs for each drug after the cleanup.
prescriptions.groupby("drug")["dose_unit_rx"].value_counts()

# Serialize Pre-processed DataFrame

In [None]:
# Add a `charttime` column holding the same values as `starttime`.
prescriptions = prescriptions.assign(charttime=prescriptions["starttime"])

# Clean categories
def clean_categories(df):
    """Remove unused categories from every categorical column of ``df``.

    The frame is updated in place and is also returned, so the call works both
    as a statement and on the right-hand side of an assignment. Columns that
    are not categorical are left as they are.
    """
    # Collect the column labels first so the frame is not inspected while its
    # columns are being reassigned.
    categorical_columns = list(df.select_dtypes(include="category").columns)
    for name in categorical_columns:
        df[name] = df[name].cat.remove_unused_categories()
    return df


# Drop categories that no longer occur after filtering, then write the result
# next to the other processed tables.
prescriptions = clean_categories(prescriptions)
output_file = dataset_path / "prescriptions_processed.parquet"
prescriptions.to_parquet(output_file)