# Merging all data sources

In [None]:
import datetime
import gzip
from datetime import timedelta
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow
import pyarrow.csv
import pyarrow.parquet

Load all the previously processed tables derived from the MIMIC-IV source files. <br/>Unify the column names across the data sources and merge them into a single dataframe.

In [None]:
# Notebook display limits: show at most 50 rows and 300 columns per frame.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 300

In [None]:
# Identifier and time columns read from every processed parquet table.
_ID_COLUMNS = ["subject_id", "hadm_id", "charttime"]

# Read the four processed tables with pyarrow-backed string storage, keeping
# only the identifier/time columns plus each table's value and name columns.
with pd.option_context("string_storage", "pyarrow"):
    labevents = pd.read_parquet(
        "labevents_processed.parquet", columns=[*_ID_COLUMNS, "valuenum", "label"]
    )
    inputevents = pd.read_parquet(
        "inputevents_processed.parquet", columns=[*_ID_COLUMNS, "amount", "label"]
    )
    outputevents = pd.read_parquet(
        "outputevents_processed.parquet", columns=[*_ID_COLUMNS, "value", "label"]
    )
    prescriptions = pd.read_parquet(
        "prescriptions_processed.parquet",
        columns=[*_ID_COLUMNS, "dose_val_rx", "drug"],
    )

In [None]:
# Show, for every loaded table, each column's dtype next to its number of
# distinct values.
for frame in (labevents, inputevents, outputevents, prescriptions):
    column_summary = pd.concat(
        [frame.dtypes.rename("type"), frame.nunique().rename("uniques")], axis=1
    )
    display(column_summary)

In [None]:
# Identifier and time columns shared by every processed CSV table.
_KEY_COLUMNS = ["subject_id", "hadm_id", "charttime"]


def _read_processed_csv(path, value_column, label_column):
    """Read one processed CSV, keeping only the columns used for the merge.

    `usecols` stops pandas from parsing columns that would be thrown away
    immediately; the trailing selection fixes the column order, because
    `usecols` returns columns in file order rather than in the order given.
    """
    wanted = [*_KEY_COLUMNS, value_column, label_column]
    return pd.read_csv(path, usecols=wanted)[wanted]


# NOTE(review): these assignments rebind the same four names that the parquet
# cell above also defines — confirm which of the two sources is intended.
labevents = _read_processed_csv(
    "/path/processed/lab_processed.csv", "valuenum", "label"
)
inputevents = _read_processed_csv(
    "/path/processed/inputs_processed.csv", "amount", "label"
)
outputevents = _read_processed_csv(
    "/path/processed/outputs_processed.csv", "value", "label"
)
prescriptions = _read_processed_csv(
    "/path/processed/prescriptions_processed.csv", "dose_val_rx", "drug"
)

In [None]:
# Unify the schema: every table exposes its measurement as "valuenum" and the
# measured item as "label".
inputevents = inputevents.rename(columns={"amount": "valuenum"})
outputevents = outputevents.rename(columns={"value": "valuenum"})
prescriptions = prescriptions.rename(
    columns={"dose_val_rx": "valuenum", "drug": "label"}
)

# Remove the dose entries recorded as the range string "3-10". A boolean mask
# is required here: dropping by the `.index` of the comparison result would
# pass every row label and therefore delete the whole table.
prescriptions = prescriptions[prescriptions["valuenum"] != "3-10"]

# Tag each row with the table it came from.
inputevents["Origin"] = "Inputs"
labevents["Origin"] = "Lab"
outputevents["Origin"] = "Outputs"
prescriptions["Origin"] = "Prescriptions"

# Stack the four tables (inputs, lab, outputs, prescriptions — in that order).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0, and
# ignore_index=True gives a fresh unique RangeIndex without leaving behind
# "index"/"level_0" helper columns.
merged_df = pd.concat(
    [inputevents, labevents, outputevents, prescriptions], ignore_index=True
)

# Check that all labels have different names across the four tables: if any
# label occurred in two tables, the merged frame would have fewer distinct
# labels than the per-table counts added together.
assert merged_df["label"].nunique() == sum(
    table["label"].nunique()
    for table in (inputevents, labevents, outputevents, prescriptions)
), "a label name is shared between at least two source tables"

In [None]:
# Express every event time as an offset from the earliest chart time recorded
# for the same admission (hadm_id).
merged_df["charttime"] = pd.to_datetime(
    merged_df["charttime"], format="%Y-%m-%d %H:%M:%S"
)

# Earliest chart time per admission, indexed by hadm_id.
ref_time = merged_df.groupby("hadm_id")["charttime"].min()

# Attach that reference time to every event of the admission.
merged_df_1 = ref_time.to_frame(name="ref_time").merge(
    merged_df, left_index=True, right_on="hadm_id"
)
merged_df_1["time_stamp"] = merged_df_1["charttime"] - merged_df_1["ref_time"]

# Sanity check: no event may precede its admission's reference time.
assert not (merged_df_1["time_stamp"] < timedelta(hours=0)).any()

In [None]:
# Create a label code (int) for the labels: each distinct label gets the
# position of its first appearance in merged_df_1 (0, 1, 2, ...). The unique
# labels are computed once instead of twice.
label_dict = {
    label: code for code, label in enumerate(merged_df_1["label"].unique())
}
merged_df_1["label_code"] = merged_df_1["label"].map(label_dict)

# Keep only the columns needed downstream. The explicit copy makes
# merged_df_short an independent frame, so the column assignments performed on
# it in later cells do not trigger SettingWithCopyWarning.
merged_df_short = merged_df_1[
    ["hadm_id", "valuenum", "time_stamp", "label_code", "Origin"]
].copy()

In [None]:
# Persist the label -> label_code lookup table, one row per distinct label,
# in order of first appearance.
label_dict_df = pd.DataFrame({"label": merged_df_1["label"].unique()})
label_dict_df["label_code"] = label_dict_df["label"].map(label_dict)
label_dict_df.to_csv("/path/processed/label_dict.csv")

In [None]:
# Convert the measurement column to float.
# NOTE(review): astype(float) raises if any non-numeric string is still
# present in "valuenum" — confirm that upstream filtering removed them all.
merged_df_short["valuenum"] = merged_df_short["valuenum"].astype(float)

In [None]:
# Keep only events recorded strictly less than 48 hours after the admission's
# reference time.
within_window = merged_df_short["time_stamp"] < timedelta(hours=48)
merged_df_short = merged_df_short.loc[within_window]

# Convert the offset from a timedelta to whole minutes (truncated to int).
elapsed_seconds = merged_df_short["time_stamp"].dt.total_seconds()
merged_df_short["time_stamp"] = (elapsed_seconds / 60).astype(int)

print(f"Number of patients considered: {merged_df_short['hadm_id'].nunique()}")

# Sanity check: 48 hours is 2880 minutes, so nothing may exceed that.
assert not (merged_df_short["time_stamp"] > 2880).any()

In [None]:
# The source tag is not part of the final dataset; remove it in place.
merged_df_short.drop(columns=["Origin"], inplace=True)
# complete_df is a second name for the same frame, not a copy.
complete_df = merged_df_short

In [None]:
# Add one float "Value_label_<code>" column and one integer
# "Mask_label_<code>" column per label code, all initialised to zero.
# The columns are interleaved: value, mask, value, mask, ...
labels = complete_df["label_code"].unique()
value_columns = [f"Value_label_{code!s}" for code in labels]
mask_columns = [f"Mask_label_{code!s}" for code in labels]
for name, name2 in zip(value_columns, mask_columns):
    complete_df[name] = 0.0
    complete_df[name2] = 0

In [None]:
# Drop rows with any missing field, then copy each row's measurement into the
# value column of its label and flag it as observed in the matching mask
# column. This is done with one vectorised assignment per label rather than a
# Python-level loop over every row, which is very slow on a frame this wide.
complete_df.dropna(inplace=True)
label_codes = complete_df["label_code"].astype(int)
for code in label_codes.unique():
    rows = label_codes == code  # boolean selector for this label's rows
    name = f"Value_label_{code}"
    name2 = f"Mask_label_{code}"
    # to_numpy() assigns positionally, so no index alignment is involved.
    complete_df.loc[rows, name] = complete_df.loc[rows, "valuenum"].to_numpy()
    complete_df.loc[rows, name2] = 1

In [None]:
# Drop all unnecessary columns, collapse to one row per (hadm_id, time_stamp)
# and do a sanity check.
complete_df.drop(["valuenum", "label_code"], axis=1, inplace=True)

# Cells that were never observed hold the placeholder 0. Blank them out before
# aggregating: otherwise max() would pick that 0 over a genuine negative
# measurement whenever another row shares the same admission and time stamp.
# value_columns and mask_columns are paired position by position.
observed = complete_df[mask_columns].to_numpy() == 1
complete_df[value_columns] = complete_df[value_columns].where(observed)

# max() ignores NaN, so each value column keeps its largest observed value.
complete_df = complete_df.groupby(["hadm_id", "time_stamp"], as_index=False).max()

# Restore the placeholder 0 for value cells that were never observed.
complete_df[value_columns] = complete_df[value_columns].fillna(0)

# Masks must stay binary after the aggregation.
for x in mask_columns:
    assert not (complete_df[x] > 1).any()
complete_df

In [None]:
# Write the final dataset to CSV; index=False leaves the row index out of the
# file.
complete_df.to_csv("/path/processed/full_dataset.csv", index=False)