# Preparing the MIMIC 3 dataset

MIMIC-III can be downloaded from [here](https://mimic.physionet.org/gettingstarted/access/).

We use the same preprocessing as the authors of the GRU-ODE-Bayes paper and provide the final data preparation notebook that is missing from the original repository. First, execute the notebooks *admissions*, *outputs*, *labevents* and *prescriptions* provided [here](https://github.com/edebrouwer/gru_ode_bayes/tree/master/data_preproc/MIMIC).

In [None]:
from datetime import timedelta

import numpy as np
import pandas as pd

In [None]:
# Load the processed event tables from parquet while the pandas
# "string_storage" option is set to "pyarrow".
# NOTE(review): the calls originally passed an empty `columns=` argument, which
# is a syntax error. `columns=None` (read every column) keeps the cell runnable;
# replace it with the intended column subsets once those are confirmed.
with pd.option_context("string_storage", "pyarrow"):
    labevents = pd.read_parquet("labevents_processed.parquet", columns=None)
    inputevents = pd.read_parquet("inputevents_processed.parquet", columns=None)
    outputevents = pd.read_parquet("outputevents_processed.parquet", columns=None)
    prescriptions = pd.read_parquet("prescriptions_processed.parquet", columns=None)

In [None]:
# Prefix of the processed input CSV files and prefix for the generated outputs.
# Both are concatenated directly with the file names below.
file_path = "path"
outfile_path = "path"

# Identifier / time columns kept from every table.
_KEY_COLUMNS = ["SUBJECT_ID", "HADM_ID", "CHARTTIME"]


def _load_events(csv_name, extra_columns):
    """Read one processed CSV and keep the key columns plus *extra_columns*."""
    table = pd.read_csv(file_path + csv_name)
    return table[_KEY_COLUMNS + extra_columns]


lab_df = _load_events("LAB_processed.csv", ["VALUENUM", "LABEL"])
inputs_df = _load_events("INPUTS_processed.csv", ["AMOUNT", "LABEL"])
outputs_df = _load_events("OUTPUTS_processed.csv", ["VALUE", "LABEL"])
presc_df = _load_events("PRESCRIPTIONS_processed.csv", ["DOSE_VAL_RX", "DRUG"])

In [None]:
# Bring the four event tables to one schema: the measured quantity is stored in
# "VALUENUM" and the variable name in "LABEL".
inputs_df = inputs_df.rename(columns={"AMOUNT": "VALUENUM"})
outputs_df = outputs_df.rename(columns={"VALUE": "VALUENUM"})
presc_df = presc_df.rename(columns={"DOSE_VAL_RX": "VALUENUM", "DRUG": "LABEL"})

# Remember which source table every row came from.
inputs_df["Origin"] = "Inputs"
lab_df["Origin"] = "Lab"
outputs_df["Origin"] = "Outputs"
presc_df["Origin"] = "Prescriptions"

# Stack the tables in the order inputs, lab, outputs, prescriptions.
# `DataFrame.append` was removed in pandas 2.0; a single `pd.concat` with
# `ignore_index=True` gives a clean 0..n-1 index without the leftover
# "index"/"level_0" columns that chained `reset_index()` calls produced.
merged_df = pd.concat(
    [inputs_df, lab_df, outputs_df, presc_df],
    ignore_index=True,
)

# Sanity check: no label name is shared between two source tables, otherwise
# the merged label count would be smaller than the sum of the per-table counts.
labels_per_source = sum(
    frame["LABEL"].nunique() for frame in (inputs_df, lab_df, outputs_df, presc_df)
)
assert (
    merged_df["LABEL"].nunique() == labels_per_source
), "Some LABEL values occur in more than one source table."

In [None]:
# Display the first rows of the merged table as a visual check.
merged_df.head()

In [None]:
# Parse the chart times, which are expected in "YYYY-MM-DD HH:MM:SS" form.
chart_time_format = "%Y-%m-%d %H:%M:%S"
merged_df["CHARTTIME"] = pd.to_datetime(merged_df["CHARTTIME"], format=chart_time_format)

# Earliest chart time of every admission; it serves as that admission's time origin.
ref_time = merged_df.groupby("HADM_ID")["CHARTTIME"].min()
reference_frame = ref_time.to_frame(name="REF_TIME")

# Attach the origin to every event row and express the event time relative to it.
merged_df_1 = reference_frame.merge(merged_df, left_index=True, right_on="HADM_ID")
merged_df_1["TIME_STAMP"] = merged_df_1["CHARTTIME"].sub(merged_df_1["REF_TIME"])

# By construction no event may precede its admission's origin.
before_origin = merged_df_1["TIME_STAMP"] < timedelta(hours=0)
assert not before_origin.any()

In [None]:
# Map every distinct label to an integer code, numbered in order of first
# appearance. The unique labels are computed once and reused.
unique_labels = merged_df_1["LABEL"].unique()
label_dict = {label: code for code, label in enumerate(unique_labels)}
merged_df_1["LABEL_CODE"] = merged_df_1["LABEL"].map(label_dict)

# Keep only the columns needed downstream. The explicit copy makes this an
# independent frame, so later column assignments on it do not trigger
# pandas' SettingWithCopyWarning.
merged_df_short = merged_df_1[
    ["HADM_ID", "VALUENUM", "TIME_STAMP", "LABEL_CODE", "Origin"]
].copy()

In [None]:
# Write the label -> integer code lookup table to disk, one row per distinct label.
distinct_labels = merged_df_1["LABEL"].unique()
label_dict_df = pd.DataFrame({"LABEL": distinct_labels})
label_dict_df["LABEL_CODE"] = label_dict_df["LABEL"].map(label_dict)
label_dict_df.to_csv(outfile_path + "label_dict.csv")

In [None]:
# Store the measurements as floats. The column is named "VALUENUM" (upper case)
# in merged_df_short; the lower-case spelling used before raised a KeyError.
merged_df_short["VALUENUM"] = merged_df_short["VALUENUM"].astype(float)

In [None]:
# Keep only events within the first 2880 minutes (48 hours) of each admission.
# TIME_STAMP is a timedelta (CHARTTIME - REF_TIME), so the limit must be a
# timedelta as well; comparing against the bare integer 2880 raises a TypeError.
observation_window = timedelta(minutes=2880)
merged_df_short = merged_df_short.loc[
    merged_df_short["TIME_STAMP"] < observation_window
]
print(f"Number of patients considered :{merged_df_short['HADM_ID'].nunique()}")

In [None]:
# The source-table tag is not needed from here on; remove it in place and show the result.
merged_df_short.drop(columns="Origin", inplace=True)
merged_df_short

In [None]:
# Start the wide-format table from the filtered long-format events.
# The assignment used to be reversed (`merged_df_short = complete_df`), which
# referenced `complete_df` before it existed. A copy is taken so the columns
# added below do not end up in `merged_df_short`.
complete_df = merged_df_short.copy()

# One value column and one mask column per label code, in order of first appearance.
labels = complete_df["LABEL_CODE"].unique()
value_columns = [f"Value_label_{code}" for code in labels]
mask_columns = [f"Mask_label_{code}" for code in labels]

# Values start as float zeros and masks as integer zeros; the column order is
# Value_label_a, Mask_label_a, Value_label_b, Mask_label_b, ...
for value_name, mask_name in zip(value_columns, mask_columns):
    complete_df[value_name] = 0.0
    complete_df[mask_name] = 0

In [None]:
# Rows with any missing entry cannot be placed into the wide table.
complete_df.dropna(inplace=True)

# For every label code, copy the measurements of the matching rows into that
# label's value column and set the mask to 1 there. This replaces a per-row
# `iterrows` loop: it is vectorised per label and does not depend on the dtype
# that `iterrows` assigns to a row (with a timedelta column the row becomes
# object dtype, whose plain Python scalars have no `.astype`).
label_codes = complete_df["LABEL_CODE"].astype(int)
for code in label_codes.unique():
    has_code = label_codes == code
    # `.to_numpy()` makes the assignment positional, so it does not rely on
    # index alignment.
    complete_df.loc[has_code, f"Value_label_{code}"] = complete_df.loc[
        has_code, "VALUENUM"
    ].to_numpy()
    complete_df.loc[has_code, f"Mask_label_{code}"] = 1

In [None]:
# The long-format columns have been spread into the per-label columns; remove them.
complete_df.drop(columns=["VALUENUM", "LABEL_CODE"], inplace=True)

# Collapse all rows that share an admission and a time stamp into one row by
# taking the column-wise maximum.
# NOTE(review): value columns use 0 as the "not measured" placeholder, so a
# negative measurement sharing a time stamp with another label would be
# reported as 0 by max() - confirm that inputs are non-negative.
group_keys = ["HADM_ID", "TIME_STAMP"]
complete_df = complete_df.groupby(group_keys, as_index=False).max()

# Masks are indicators, so no entry may exceed 1 after the collapse.
for mask_name in mask_columns:
    assert not (complete_df[mask_name] > 1).any()
complete_df

In [None]:
# The cells below address the key columns as "hadm_id" / "time_stamp", while
# the table built so far uses "HADM_ID" / "TIME_STAMP"; grouping on the
# lower-case names raised a KeyError. Rename the two key columns first
# (a no-op if they are already lower case), then collapse duplicate
# (admission, time) rows with the column-wise maximum.
complete_df = complete_df.rename(
    columns={"HADM_ID": "hadm_id", "TIME_STAMP": "time_stamp"}
)
complete_df = complete_df.groupby(["hadm_id", "time_stamp"], as_index=False).max()
complete_df

In [None]:
# Give every admission a randomly shuffled integer id in 0..n-1.
admission_ids = complete_df["hadm_id"].unique()
unique_ids = np.arange(complete_df["hadm_id"].nunique())
np.random.shuffle(unique_ids)
d = dict(zip(admission_ids, unique_ids))

# Persist the hadm_id -> unique_id table (with a running "index" column).
Unique_id_dict = pd.Series(admission_ids, name="hadm_id").reset_index()
Unique_id_dict["unique_id"] = Unique_id_dict["hadm_id"].map(d)
Unique_id_dict.to_csv("unique_id_dict.csv")

# Rebuild the mapping from the file that was just written, so the dictionary
# used below matches what is stored on disk.
unique_id_df = pd.read_csv("unique_id_dict.csv")
d = dict(
    zip(
        unique_id_df["hadm_id"].to_numpy(),
        unique_id_df["unique_id"].to_numpy(),
    )
)

In [None]:
# Build the per-admission "deathtag" table, keyed by the shuffled unique_id.
admissions = pd.read_csv("admissions_processed.csv")

# Every admission must carry exactly one deathtag value. This used to be
# enforced implicitly by `.unique().astype(int)`, which relies on converting
# 1-element arrays to scalars (deprecated since NumPy 1.25). Check explicitly
# and take the single value instead.
deathtag_by_admission = admissions.groupby("hadm_id")["deathtag"]
if (deathtag_by_admission.nunique(dropna=False) > 1).any():
    raise ValueError("Found admissions with conflicting 'deathtag' values.")
death_tags_s = deathtag_by_admission.first().astype(int).to_frame().reset_index()

# Restrict to admissions present in the dataset and switch to the shuffled ids.
death_tags_df = death_tags_s.loc[
    death_tags_s["hadm_id"].isin(complete_df["hadm_id"])
].copy()
death_tags_df["unique_id"] = death_tags_df["hadm_id"].map(d)
death_tags_df = (
    death_tags_df.sort_values(by="unique_id")
    .rename(columns={"deathtag": "Value"})
    .set_index("unique_id")
    .drop(columns="hadm_id")
)
death_tags_df.to_csv("complete_death_tags.csv")

# Attach the same shuffled id to every row of the main table.
complete_df["unique_id"] = complete_df["hadm_id"].map(d)

In [None]:
# Final layout: rows indexed by "ID" (the shuffled unique_id), the time column
# named "Time", and the original hadm_id removed.
complete_df = (
    complete_df.rename(columns={"unique_id": "ID", "time_stamp": "Time"})
    .drop(columns=["hadm_id"])
    .set_index(["ID"])
)
complete_df

In [None]:
# Write the final table (indexed by "ID") to "full_dataset.csv" in the working directory.
complete_df.to_csv("full_dataset.csv")