In [None]:
# import resource


# def limit_memory(maxsize):
#     soft, hard = resource.getrlimit(resource.RLIMIT_AS)
#     # MEMORY LIMIT IN BYTES
#     resource.setrlimit(resource.RLIMIT_AS, (maxsize, hard))


# # 30 GiB, expressed in bytes (limit_memory takes a byte count)
# limit_memory(30 * 1024 * 1024 * 1024)

In [None]:
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2
%matplotlib inline

import gzip
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
import pyarrow as pa
from pandas import DataFrame
from pyarrow import parquet
from tqdm.autonotebook import tqdm

from tsdm.datasets import MIMIC_IV

In [None]:
# NOTE: `ds` is not defined while the two lines below stay commented out.
# ds = MIMIC_IV(initialize=False)
# ds.rawdata_paths

# Directory that the parquet export further down writes into (relative to the working directory).
DATASET_PATH = Path("custom_processed")
# Zip archive that the loading helpers open to read the raw tables (relative to the working directory).
RAWDATA_PATH = Path("mimic-iv-2.2.zip")

In [None]:
# Expected members of the raw-data archive. The checks below compare this list to
# `ZipFile.namelist()` with `==`, so both the names and their order have to match.
filelist = [
    "mimic-iv-2.2/CHANGELOG.txt",
    "mimic-iv-2.2/LICENSE.txt",
    "mimic-iv-2.2/SHA256SUMS.txt",
    "mimic-iv-2.2/hosp/admissions.csv.gz",
    "mimic-iv-2.2/hosp/d_hcpcs.csv.gz",
    "mimic-iv-2.2/hosp/d_icd_diagnoses.csv.gz",
    "mimic-iv-2.2/hosp/d_icd_procedures.csv.gz",
    "mimic-iv-2.2/hosp/d_labitems.csv.gz",
    "mimic-iv-2.2/hosp/diagnoses_icd.csv.gz",
    "mimic-iv-2.2/hosp/drgcodes.csv.gz",
    "mimic-iv-2.2/hosp/emar.csv.gz",
    "mimic-iv-2.2/hosp/emar_detail.csv.gz",
    "mimic-iv-2.2/hosp/hcpcsevents.csv.gz",
    "mimic-iv-2.2/hosp/labevents.csv.gz",
    "mimic-iv-2.2/hosp/microbiologyevents.csv.gz",
    "mimic-iv-2.2/hosp/omr.csv.gz",
    "mimic-iv-2.2/hosp/patients.csv.gz",
    "mimic-iv-2.2/hosp/pharmacy.csv.gz",
    "mimic-iv-2.2/hosp/poe.csv.gz",
    "mimic-iv-2.2/hosp/poe_detail.csv.gz",
    "mimic-iv-2.2/hosp/prescriptions.csv.gz",
    "mimic-iv-2.2/hosp/procedures_icd.csv.gz",
    "mimic-iv-2.2/hosp/provider.csv.gz",
    "mimic-iv-2.2/hosp/services.csv.gz",
    "mimic-iv-2.2/hosp/transfers.csv.gz",
    "mimic-iv-2.2/icu/caregiver.csv.gz",
    "mimic-iv-2.2/icu/chartevents.csv.gz",
    "mimic-iv-2.2/icu/d_items.csv.gz",
    "mimic-iv-2.2/icu/datetimeevents.csv.gz",
    "mimic-iv-2.2/icu/icustays.csv.gz",
    "mimic-iv-2.2/icu/ingredientevents.csv.gz",
    "mimic-iv-2.2/icu/inputevents.csv.gz",
    "mimic-iv-2.2/icu/outputevents.csv.gz",
    "mimic-iv-2.2/icu/procedureevents.csv.gz",
]

In [None]:
# Sanity check: the archive listing must equal `filelist` (same names, same order).
with ZipFile(RAWDATA_PATH, "r") as archive:
    if archive.namelist() != filelist:
        raise ValueError("The dataset contains unknown files!")


def load_gz_table(archive, fname, **options):
    """Read a gzip-compressed CSV member of an open archive with pyarrow.

    Args:
        archive: Object exposing ``open(name, mode)``; a ``ZipFile`` in this notebook.
        fname: Name of the member inside the archive (converted with ``str``).
        **options: Passed through unchanged to ``pa.csv.read_csv``.

    Returns:
        The object returned by ``pa.csv.read_csv`` for the decompressed stream.
    """
    member = str(fname)
    # Both the archive member and the gzip wrapper are closed when the block exits.
    with archive.open(member, "r") as raw, gzip.open(raw, "r") as stream:
        return pa.csv.read_csv(stream, **options)


def load_txt(archive, fname):
    """Return the content of an archive member decoded as UTF-8 text.

    Args:
        archive: Object exposing ``open(name, mode)``; a ``ZipFile`` in this notebook.
        fname: Name of the member inside the archive (converted with ``str``).

    Returns:
        The member's bytes decoded with the ``utf8`` codec.
    """
    with archive.open(str(fname), "r") as stream:
        payload = stream.read()
    return payload.decode("utf8")


def get_table(fname, **options) -> "pa.Table | str":
    """Load one member of the raw-data archive, dispatching on its file suffix.

    Args:
        fname: Name of the member inside ``RAWDATA_PATH`` (string or path-like),
            e.g. ``"mimic-iv-2.2/icu/icustays.csv.gz"``.
        **options: Reader options for ``.gz`` tables, forwarded to
            ``load_gz_table``. Not supported for ``.txt`` members.

    Returns:
        The result of ``load_gz_table`` for ``.gz`` members, or the decoded text
        returned by ``load_txt`` for ``.txt`` members.

    Raises:
        TypeError: If reader options are given for a ``.txt`` member.
        ValueError: If the suffix is neither ``.gz`` nor ``.txt``.
    """
    fname = Path(fname)
    # Zip member names use forward slashes; str(Path) would yield backslashes on
    # Windows and the lookup inside the archive would fail.
    member = fname.as_posix()
    with ZipFile(RAWDATA_PATH, "r") as archive:
        match fname.suffix:
            case ".gz":
                return load_gz_table(archive, member, **options)
            case ".txt":
                if options:
                    # load_txt takes no reader options; fail with a clear message
                    # instead of an "unexpected keyword argument" error.
                    raise TypeError(
                        f"Reader options are not supported for text files: {sorted(options)}"
                    )
                return load_txt(archive, member)
            case _:
                raise ValueError(f"Unsupported file type {fname.suffix!r}: {member}")


def table_info(table) -> None:
    """Print the row count and a one-line summary for every column of a pyarrow table.

    For each column the line shows its name, Arrow dtype, number of distinct
    values and the fraction of null entries.

    Args:
        table: Table exposing ``column_names``, ``columns`` and ``num_columns``
            (a ``pa.Table`` in this notebook).
    """
    num_rows = len(table)
    print(num_rows)
    named_columns = zip(table.column_names, table.columns)
    # zip() has no length, so give tqdm the total explicitly.
    for name, col in tqdm(named_columns, total=table.num_columns):
        uniques = pa.compute.count_distinct(col).as_py()
        # null_count / length equals the mean of is_null() without an extra pass.
        # Guard the empty table: the mean would be null there and formatting
        # None with ".2f" raises a TypeError.
        nulls = col.null_count / num_rows if num_rows else float("nan")
        dtype = str(col.type)
        print(f"{name:32s}  {dtype=:12s}  {uniques=:12d}  {nulls=:.2f}")


# Validate the archive listing and collect the size recorded for every member.
with ZipFile(RAWDATA_PATH, "r") as archive:
    if archive.namelist() != filelist:
        raise ValueError("The dataset contains unknown files!")

    # ZipInfo.file_size is the uncompressed size of the member in bytes.
    filesizes = {info.filename: info.file_size for info in archive.infolist()}

# Last expression of the cell: shown as a Series indexed by member name.
pd.Series(filesizes)

In [None]:
# Arrow type aliases shared by the table schemas below.
ID_TYPE = "uint32"
VALUE_TYPE = "float32"
TIME_TYPE = "timestamp[s]"
BOOL_TYPE = "bool"
STRING_TYPE = "string"
# Dictionary-encoded strings, built from explicit DataType objects rather than string aliases.
CATEGORY = pa.dictionary(pa.uint32(), pa.string())


# Column name -> Arrow type for each table, keyed by table name. The loading cells
# pass one of these mappings as `column_types` to `pa.csv.ConvertOptions`.
column_types = dict(
    chartevents={
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "stay_id": ID_TYPE,
        "caregiver_id": ID_TYPE,
        "itemid": ID_TYPE,
        "charttime": TIME_TYPE,
        "storetime": TIME_TYPE,
        "value": STRING_TYPE,
        "valuenum": VALUE_TYPE,
        "valueuom": STRING_TYPE,
        "warning": BOOL_TYPE,
    },
    inputevents={
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "stay_id": ID_TYPE,
        "caregiver_id": ID_TYPE,
        "starttime": TIME_TYPE,
        "endtime": TIME_TYPE,
        "storetime": TIME_TYPE,
        "itemid": ID_TYPE,
        "amount": VALUE_TYPE,
        "amountuom": STRING_TYPE,
        "rate": VALUE_TYPE,
        "rateuom": STRING_TYPE,
        "orderid": ID_TYPE,
        "linkorderid": ID_TYPE,
        "ordercategoryname": STRING_TYPE,
        "secondaryordercategoryname": STRING_TYPE,
        "ordercomponenttypedescription": STRING_TYPE,
        "ordercategorydescription": STRING_TYPE,
        "patientweight": VALUE_TYPE,
        "totalamount": VALUE_TYPE,
        "totalamountuom": STRING_TYPE,
        "isopenbag": BOOL_TYPE,
        "continueinnextdept": BOOL_TYPE,
        "statusdescription": STRING_TYPE,
        "originalamount": VALUE_TYPE,
        "originalrate": VALUE_TYPE,
    },
    outputevents={
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "stay_id": ID_TYPE,
        "caregiver_id": ID_TYPE,
        "charttime": TIME_TYPE,
        "storetime": TIME_TYPE,
        "itemid": ID_TYPE,
        "value": VALUE_TYPE,
        "valueuom": STRING_TYPE,
    },
    procedureevents={
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "stay_id": ID_TYPE,
        "caregiver_id": ID_TYPE,
        "starttime": TIME_TYPE,
        "endtime": TIME_TYPE,
        # NOTE(review): the only nanosecond timestamp in this mapping; every other
        # time column uses TIME_TYPE ("timestamp[s]") - confirm this is intended.
        "storetime": "timestamp[ns]",
        "itemid": ID_TYPE,
        "value": VALUE_TYPE,
        "valueuom": STRING_TYPE,
        "location": STRING_TYPE,
        "locationcategory": STRING_TYPE,
        "orderid": ID_TYPE,
        "linkorderid": ID_TYPE,
        "ordercategoryname": STRING_TYPE,
        "ordercategorydescription": STRING_TYPE,
        "patientweight": VALUE_TYPE,
        "isopenbag": BOOL_TYPE,
        "continueinnextdept": BOOL_TYPE,
        "statusdescription": STRING_TYPE,
        "originalamount": VALUE_TYPE,
        # NOTE(review): typed as float in inputevents/ingredientevents - confirm
        # that bool is intended for this table.
        "originalrate": BOOL_TYPE,
    },
    datetimeevents={
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "stay_id": ID_TYPE,
        "caregiver_id": ID_TYPE,
        "charttime": TIME_TYPE,
        "storetime": TIME_TYPE,
        "itemid": ID_TYPE,
        "value": TIME_TYPE,
        "valueuom": STRING_TYPE,
        "warning": BOOL_TYPE,
    },
    ingredientevents={
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "stay_id": ID_TYPE,
        "caregiver_id": ID_TYPE,
        "starttime": TIME_TYPE,
        "endtime": TIME_TYPE,
        "storetime": TIME_TYPE,
        "itemid": ID_TYPE,
        "amount": VALUE_TYPE,
        "amountuom": STRING_TYPE,
        "rate": VALUE_TYPE,
        "rateuom": STRING_TYPE,
        "orderid": ID_TYPE,
        "linkorderid": ID_TYPE,
        "statusdescription": STRING_TYPE,
        "originalamount": VALUE_TYPE,
        "originalrate": VALUE_TYPE,
    },
    icustays={
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "stay_id": ID_TYPE,
        "first_careunit": STRING_TYPE,  # category
        "last_careunit": STRING_TYPE,  # category
        "intime": TIME_TYPE,
        "outtime": TIME_TYPE,
        "los": VALUE_TYPE,
    },
);

In [None]:
# mimic-iv-2.2/icu/caregiver.csv.gz                   35893
# mimic-iv-2.2/icu/d_items.csv.gz                     57476
# mimic-iv-2.2/icu/icustays.csv.gz                  2614571

## icustays

In [None]:
# No explicit schema is passed for icustays, so the CSV reader infers the column types.
convert_options = pa.csv.ConvertOptions(strings_can_be_null=True)
table = get_table("mimic-iv-2.2/icu/icustays.csv.gz", convert_options=convert_options)
table_info(table)
# Cell output: the distinct values of the `los` column.
pa.compute.unique(table["los"])

## ingredientevents

In [None]:
# Read ingredientevents with the explicit schema from `column_types`.
convert_options = pa.csv.ConvertOptions(
    column_types=column_types["ingredientevents"],
    strings_can_be_null=True,
)
table = get_table("mimic-iv-2.2/icu/ingredientevents.csv.gz", convert_options=convert_options)
table_info(table)
# pa.compute.unique(table["valueuom"])

## datetimeevents

In [None]:
# Read datetimeevents with the explicit schema from `column_types`.
convert_options = pa.csv.ConvertOptions(
    column_types=column_types["datetimeevents"],
    strings_can_be_null=True,
)
table = get_table("mimic-iv-2.2/icu/datetimeevents.csv.gz", convert_options=convert_options)
table_info(table)
# Cell output: the distinct values of the `valueuom` column.
pa.compute.unique(table["valueuom"])

## Procedureevents

In [None]:
# Read procedureevents with the explicit schema from `column_types`.
convert_options = pa.csv.ConvertOptions(
    column_types=column_types["procedureevents"],
    strings_can_be_null=True,
)
table = get_table("mimic-iv-2.2/icu/procedureevents.csv.gz", convert_options=convert_options)
table_info(table)
# Cell output: the distinct values of the `ordercategoryname` column.
pa.compute.unique(table["ordercategoryname"])

## Inputevents

In [None]:
# Read inputevents with the explicit schema from `column_types`.
convert_options = pa.csv.ConvertOptions(
    column_types=column_types["inputevents"],
    strings_can_be_null=True,
)
table = get_table("mimic-iv-2.2/icu/inputevents.csv.gz", convert_options=convert_options)
table_info(table)

## Outputevents

In [None]:
# Read outputevents with the explicit schema from `column_types`.
convert_options = pa.csv.ConvertOptions(
    column_types=column_types["outputevents"],
    strings_can_be_null=True,
)
table = get_table("mimic-iv-2.2/icu/outputevents.csv.gz", convert_options=convert_options)
table_info(table)

In [None]:
# Guard cell: presumably placed here so that "Run All" does not continue into the
# chartevents processing below - TODO confirm. A bare `raise` outside an `except`
# block only fails with "No active exception to reraise"; raise the same exception
# type with an explicit message instead.
raise RuntimeError("Notebook execution stopped by the guard cell; run the cells below manually.")

# Encode Chartevents

In [None]:
# Read chartevents with the explicit schema from `column_types`.
convert_options = pa.csv.ConvertOptions(
    column_types=column_types["chartevents"],
    strings_can_be_null=True,
)
table = get_table("mimic-iv-2.2/icu/chartevents.csv.gz", convert_options=convert_options)

# Per-column summary: distinct count and null fraction, printed as pyarrow scalars.
for name, col in tqdm(zip(table.column_names, table.columns)):
    nulls = pa.compute.mean(pa.compute.is_null(col))
    uniques = pa.compute.count_distinct(col)
    print(f"{name}, {uniques=}  {nulls=}")

In [None]:
# drop everything that has null value or null valueuom
# Keep only the rows in which both `value` and `valueuom` are present.
has_value = pa.compute.is_valid(table["value"])
has_uom = pa.compute.is_valid(table["valueuom"])
table = table.filter(pa.compute.and_(has_value, has_uom))
table.schema

# cast valueuom to dict and value to float

In [None]:
# Swap `value` for its float32 cast and `valueuom` for a dictionary-encoded copy,
# leaving both columns at their current positions.
# NOTE(review): presumably every remaining `value` string is numeric; otherwise the cast raises - verify.
value_pos = table.column_names.index("value")
value_as_float = table.column("value").cast(pa.float32())
table = table.set_column(value_pos, "value", value_as_float)

uom_pos = table.column_names.index("valueuom")
uom_encoded = table["valueuom"].dictionary_encode()
table = table.set_column(uom_pos, "valueuom", uom_encoded)
table.schema

In [None]:
# Create the output directory on first use; writing fails if the parent folder is missing.
DATASET_PATH.mkdir(parents=True, exist_ok=True)
parquet.write_table(table, DATASET_PATH / "chartevents.parquet")

# with polars

In [None]:
import polars as pl

# `ds` is never created in this notebook (its construction is commented out), so
# open the archive through RAWDATA_PATH like the pyarrow cells do.
with ZipFile(RAWDATA_PATH, "r") as archive:
    if archive.namelist() != filelist:
        raise ValueError("The dataset contains unknown files!")


def load_gz_table(archive, fname, **options):
    """Read a gzip-compressed CSV member of an open archive with polars.

    This definition replaces the pyarrow-based function of the same name that
    appears earlier in the notebook.

    Args:
        archive: Object exposing ``open(name, mode)``; a ``ZipFile`` in this notebook.
        fname: Name of the member inside the archive (converted with ``str``).
        **options: Passed through unchanged to ``pl.read_csv``.

    Returns:
        The object returned by ``pl.read_csv`` for the decompressed stream.
    """
    member = str(fname)
    # Both the archive member and the gzip wrapper are closed when the block exits.
    with archive.open(member, "r") as raw, gzip.open(raw, "r") as stream:
        return pl.read_csv(stream, **options)


def load_txt(archive, fname):
    """Read an archive member and return it as a UTF-8 decoded string.

    Args:
        archive: Object exposing ``open(name, mode)``; a ``ZipFile`` in this notebook.
        fname: Name of the member inside the archive (converted with ``str``).

    Returns:
        The member's bytes decoded with the ``utf8`` codec.
    """
    member = str(fname)
    with archive.open(member, "r") as handle:
        content = handle.read()
    return content.decode("utf8")


def get_table(fname, **options) -> "pl.DataFrame | str":
    """Load one member of the raw-data archive, dispatching on its file suffix.

    Args:
        fname: Name of the member inside ``RAWDATA_PATH`` (string or path-like).
        **options: Reader options for ``.gz`` tables, forwarded to
            ``load_gz_table``. Not supported for ``.txt`` members.

    Returns:
        The result of ``load_gz_table`` for ``.gz`` members, or the decoded text
        returned by ``load_txt`` for ``.txt`` members.

    Raises:
        TypeError: If reader options are given for a ``.txt`` member.
        ValueError: If the suffix is neither ``.gz`` nor ``.txt``.
    """
    fname = Path(fname)
    # Zip member names use forward slashes; str(Path) would yield backslashes on
    # Windows and the lookup inside the archive would fail.
    member = fname.as_posix()
    # `ds` is never created in this notebook (its construction is commented out),
    # so the archive is opened through RAWDATA_PATH.
    with ZipFile(RAWDATA_PATH, "r") as archive:
        match fname.suffix:
            case ".gz":
                return load_gz_table(archive, member, **options)
            case ".txt":
                if options:
                    # load_txt takes no reader options; fail with a clear message
                    # instead of an "unexpected keyword argument" error.
                    raise TypeError(
                        f"Reader options are not supported for text files: {sorted(options)}"
                    )
                return load_txt(archive, member)
            case _:
                raise ValueError(f"Unsupported file type {fname.suffix!r}: {member}")

In [None]:
# polars dtypes for the chartevents columns, matching the pyarrow schema in
# column_types["chartevents"] (which also lists caregiver_id).
dtypes = {
    "subject_id": pl.datatypes.UInt32(),
    "hadm_id": pl.datatypes.UInt32(),
    "stay_id": pl.datatypes.UInt32(),
    "caregiver_id": pl.datatypes.UInt32(),
    "itemid": pl.datatypes.UInt32(),
    "charttime": pl.datatypes.Datetime(),
    "storetime": pl.datatypes.Datetime(),
    "value": pl.datatypes.Utf8(),
    "valuenum": pl.datatypes.Float32(),
    "valueuom": pl.datatypes.Utf8(),
    "warning": pl.datatypes.Boolean(),
}


# The archive members live under "mimic-iv-2.2/" (see `filelist`); the former
# "mimic-iv-1.0/" prefix does not exist in the archive.
table = get_table("mimic-iv-2.2/icu/chartevents.csv.gz", dtypes=dtypes)