In [None]:
# import resource


# def limit_memory(maxsize):
#     soft, hard = resource.getrlimit(resource.RLIMIT_AS)
#     # MEMORY LIMIT IN BYTES
#     resource.setrlimit(resource.RLIMIT_AS, (maxsize, hard))


# # 30 GiB, expressed in bytes (limit_memory expects a byte count)
# limit_memory(30 * 1024 * 1024 * 1024)

In [None]:
# Notebook-wide display and reload configuration.
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
%config InlineBackend.figure_format = 'svg'
# autoreload mode 2: re-import modules before each execution so edits are picked up.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import logging

# Configure the root logger to emit INFO-level records and above.
logging.basicConfig(level=logging.INFO)

In [None]:
import gzip
import os
import subprocess
from getpass import getpass
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
import polars as pl
import pyarrow as pa
from pandas import DataFrame
from pyarrow import csv, parquet
from tqdm.autonotebook import tqdm

In [None]:
import tsdm

# Dataset handle, constructed with initialize=False.
self = tsdm.datasets.MIMIC_IV(initialize=False)
self.table_names

# Path of the raw zip archive that belongs to this dataset version.
archive_name = f"mimic-iv-{self.__version__}.zip"
RAWDATA_PATH = self.rawdata_paths[archive_name]

In [None]:
# Members the MIMIC-IV 2.2 zip archive is expected to contain.
filelist = [
    "mimic-iv-2.2/CHANGELOG.txt",
    "mimic-iv-2.2/LICENSE.txt",
    "mimic-iv-2.2/SHA256SUMS.txt",
    "mimic-iv-2.2/hosp/admissions.csv.gz",
    "mimic-iv-2.2/hosp/d_hcpcs.csv.gz",
    "mimic-iv-2.2/hosp/d_icd_diagnoses.csv.gz",
    "mimic-iv-2.2/hosp/d_icd_procedures.csv.gz",
    "mimic-iv-2.2/hosp/d_labitems.csv.gz",
    "mimic-iv-2.2/hosp/diagnoses_icd.csv.gz",
    "mimic-iv-2.2/hosp/drgcodes.csv.gz",
    "mimic-iv-2.2/hosp/emar.csv.gz",
    "mimic-iv-2.2/hosp/emar_detail.csv.gz",
    "mimic-iv-2.2/hosp/hcpcsevents.csv.gz",
    "mimic-iv-2.2/hosp/labevents.csv.gz",
    "mimic-iv-2.2/hosp/microbiologyevents.csv.gz",
    "mimic-iv-2.2/hosp/omr.csv.gz",
    "mimic-iv-2.2/hosp/patients.csv.gz",
    "mimic-iv-2.2/hosp/pharmacy.csv.gz",
    "mimic-iv-2.2/hosp/poe.csv.gz",
    "mimic-iv-2.2/hosp/poe_detail.csv.gz",
    "mimic-iv-2.2/hosp/prescriptions.csv.gz",
    "mimic-iv-2.2/hosp/procedures_icd.csv.gz",
    "mimic-iv-2.2/hosp/provider.csv.gz",
    "mimic-iv-2.2/hosp/services.csv.gz",
    "mimic-iv-2.2/hosp/transfers.csv.gz",
    "mimic-iv-2.2/icu/caregiver.csv.gz",
    "mimic-iv-2.2/icu/chartevents.csv.gz",
    "mimic-iv-2.2/icu/d_items.csv.gz",
    "mimic-iv-2.2/icu/datetimeevents.csv.gz",
    "mimic-iv-2.2/icu/icustays.csv.gz",
    "mimic-iv-2.2/icu/ingredientevents.csv.gz",
    "mimic-iv-2.2/icu/inputevents.csv.gz",
    "mimic-iv-2.2/icu/outputevents.csv.gz",
    "mimic-iv-2.2/icu/procedureevents.csv.gz",
]

# Validate the archive members against `filelist` and record each member's
# uncompressed size. A mismatch in either direction is an error, so both the
# unexpected and the missing members are reported.
with ZipFile(RAWDATA_PATH, "r") as file:
    archive_files = set(file.namelist())
    expected_files = set(filelist)
    if archive_files != expected_files:
        unknown = sorted(archive_files - expected_files)
        missing = sorted(expected_files - archive_files)
        raise ValueError(
            "The dataset archive does not match the expected file list:"
            f" unknown files {unknown}, missing files {missing}!"
        )
    filesizes = {info.filename: info.file_size for info in file.infolist()}

# Show the members ordered by ascending size (stable, so ties keep archive order).
pd.Series(filesizes).sort_values(kind="stable")

In [None]:
from tsdm.datasets.schema.mimic_iv import (
    FALSE_VALUES,
    KEYS,
    NULL_VALUES,
    SCHEMAS,
    TRUE_VALUES,
)

# Translates a pandas Arrow dtype string into the name of the matching type
# constant (ID_TYPE, STRING_TYPE, ...); consumed by `get_types`.
TYPES_MAP = dict(
    [
        ("int64[pyarrow]", "ID_TYPE"),
        ("string[pyarrow]", "STRING_TYPE"),
        ("timestamp[s][pyarrow]", "TIME_TYPE"),
        ("double[pyarrow]", "VALUE_TYPE"),
    ]
)

In [None]:
from tsdm.utils.data import (
    cast_columns,
    compute_entropy,
    filter_nulls,
    strip_whitespace,
    table_info,
)


def load_pandas(fname, **kwargs):
    """Read one gzip-compressed CSV member of the raw archive with pandas.

    Args:
        fname: Name of the member inside the zip archive at ``RAWDATA_PATH``.
        **kwargs: Extra ``pd.read_csv`` options; they take precedence over the
            defaults (``dtype_backend="pyarrow"``, ``compression="gzip"``).

    Returns:
        The result of ``pd.read_csv`` on the archive member.
    """
    read_options = {"dtype_backend": "pyarrow", "compression": "gzip"}
    read_options.update(kwargs)
    with ZipFile(RAWDATA_PATH, "r") as archive, archive.open(fname) as file:
        return pd.read_csv(file, **read_options)


def load_gz_table(archive, fname, **options):
    """Decompress a ``.csv.gz`` archive member and parse it with pyarrow.

    Args:
        archive: Open archive object providing ``open(name, mode)``.
        fname: Member name (converted with ``str``).
        **options: Forwarded unchanged to ``pyarrow.csv.read_csv``.

    Returns:
        The table produced by ``pyarrow.csv.read_csv``.
    """
    member = str(fname)
    with archive.open(member, "r") as compressed_file, gzip.open(
        compressed_file, "r"
    ) as file:
        return csv.read_csv(file, **options)


def load_txt(archive, fname):
    """Return the content of the archive member ``fname`` decoded as UTF-8.

    Args:
        archive: Open archive object providing ``open(name, mode)``.
        fname: Member name (converted with ``str``).
    """
    member = str(fname)
    with archive.open(member, "r") as handle:
        raw = handle.read()
    return raw.decode("utf8")


def get_table(fname, **options) -> DataFrame:
    fname = Path(fname)
    with ZipFile(RAWDATA_PATH, "r") as archive:
        match fname.suffix:
            case ".gz":
                return load_gz_table(archive, fname, **options)
            case ".txt":
                return load_txt(archive, fname, **options)
            case _:
                raise ValueError


def get_types(table):
    """Name the type constant that corresponds to each column of ``table``.

    The table is converted to pandas with Arrow-backed dtypes, each dtype is
    rendered as a string and looked up in ``TYPES_MAP``.

    Returns:
        A Series with one entry per column holding the mapped name.
    """
    frame = table.to_pandas(types_mapper=pd.ArrowDtype)
    dtype_names = frame.dtypes.astype(str)
    return dtype_names.map(TYPES_MAP)

In [None]:
# Arrow data types used when (re)casting table columns.

# -- numeric
ID_TYPE = pa.uint32()
VALUE_TYPE = pa.float32()
BOOL_TYPE = pa.bool_()

# -- temporal
TIME_TYPE = pa.timestamp("s")
DATE_TYPE = pa.date32()

# -- missing
NULL_TYPE = pa.null()

# -- textual: plain strings, int32-indexed dictionary strings, large strings
STRING_TYPE = pa.string()
DICT_TYPE = pa.dictionary(pa.int32(), pa.string())
TEXT_TYPE = pa.large_utf8()

# HOSP Tables

## admissions

In [None]:
# Parse hosp/admissions with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["admissions"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/admissions.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## d_hcpcs

In [None]:
# Parse hosp/d_hcpcs with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["d_hcpcs"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/d_hcpcs.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## d_icd_diagnoses

In [None]:
# Parse hosp/d_icd_diagnoses with its declared schema and the shared
# null/boolean markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["d_icd_diagnoses"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/d_icd_diagnoses.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## d_icd_procedures

In [None]:
# Parse hosp/d_icd_procedures with the shared null/boolean markers; column
# types are inferred by pyarrow here because the schema line is disabled.
# NOTE(review): every other table passes SCHEMAS[...] — confirm this is intended.
convert_options = pa.csv.ConvertOptions(
    # column_types=SCHEMAS["d_icd_procedures"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/d_icd_procedures.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## d_labitems

In [None]:
# Parse hosp/d_labitems with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["d_labitems"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/d_labitems.csv.gz", convert_options=convert_options
).combine_chunks()
# Recorded for reference only; this cell does not apply it.
null_cols = ["label"]
table_info(table)

## diagnoses_icd

In [None]:
# Parse hosp/diagnoses_icd with its declared schema and the shared
# null/boolean markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["diagnoses_icd"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/diagnoses_icd.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## drgcodes

In [None]:
from tsdm.datasets.schema.mimic_iv import SCHEMAS

In [None]:
# Parse hosp/drgcodes with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["drgcodes"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/drgcodes.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## emar

In [None]:
# Parse hosp/emar with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["emar"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/emar.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

In [None]:
# Apply filter_nulls on the admission id and the schedule time.
# NOTE(review): presumably this removes rows where these columns are null —
# confirm against tsdm.utils.data.filter_nulls.
emar_required = ["hadm_id", "scheduletime"]
table = filter_nulls(table, emar_required)
table_info(table)

## emar_detail

In [None]:
# Parse hosp/emar_detail with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["emar_detail"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/emar_detail.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

In [None]:
# Text columns of emar_detail to convert, with the Arrow type each one is cast to.
recast_columns = {
    "dose_due": VALUE_TYPE,
    "dose_given": VALUE_TYPE,
    "product_amount_given": VALUE_TYPE,
    "prior_infusion_rate": VALUE_TYPE,
    "infusion_rate": VALUE_TYPE,
    "infusion_rate_adjustment_amount": VALUE_TYPE,
}

for col, dtype in recast_columns.items():
    array = pa.compute.utf8_trim_whitespace(table[col])

    prior_null = pa.compute.is_null(array)
    # pandas is used only to find out which entries parse as numbers:
    # with errors="coerce", unparsable entries come back as null.
    post_null = pa.compute.is_null(
        pa.Array.from_pandas(
            pd.to_numeric(
                pd.Series(array, dtype="string[pyarrow]"),
                errors="coerce",
                dtype_backend="pyarrow",
                # documented string value; a pyarrow DataType is not part of the API
                downcast="float",
            )
        )
    )

    # mask the values that were initially null or can be cast to float.
    mask = pa.compute.or_(
        prior_null,
        pa.compute.invert(post_null),
    )
    dropped = 1 - pa.compute.mean(mask).as_py()
    print(f"Dropping {dropped:.4%} of rows: {col} can't be cast to float!")
    # Rows are removed from the whole table, so later columns see fewer rows.
    table = table.filter(mask)
    table = table.set_column(
        table.column_names.index(col),
        col,
        # cast to the type declared for this column in recast_columns
        pa.compute.cast(array.filter(mask), dtype),
    )
table_info(table)

## hcpcsevents

In [None]:
# Parse hosp/hcpcsevents with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["hcpcsevents"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/hcpcsevents.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)
# sorted(pa.compute.unique(table["los"]).drop_null().to_pylist())

## Labevents

In [None]:
# Parse hosp/labevents with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["labevents"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/labevents.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

In [None]:
# Columns handed to filter_nulls for labevents.
null_cols = [
    "hadm_id",
    "storetime",
    "value",
    "valuenum",
    "valueuom",
]
table = filter_nulls(table, null_cols)
table_info(table)

### cast value to float

In [None]:
# Replace the text column `value` by its float32 cast (same column position).
table = table.set_column(
    table.column_names.index("value"),
    "value",
    table["value"].cast(pa.float32()),
)
# Sanity check: the parsed values must agree with the `valuenum` column.
values_match = pa.compute.equal(table["value"], table["valuenum"])
assert pa.compute.all(values_match).as_py()
table_info(table)

## Unstack on value -- out of memory...

In [None]:
# Spread `value` into one column per unit of measurement (`valueuom`), keyed by
# (labevent_id, subject_id, hadm_id). The section heading reports that this
# runs out of memory on the full table.
# No dtype mapping is applied afterwards: `value` was already cast to float32,
# and the field-name mapping defined for poe_detail does not match these columns.
df = (
    table.to_pandas(types_mapper=pd.ArrowDtype)
    .pivot(
        index=["labevent_id", "subject_id", "hadm_id"],
        columns="valueuom",
        values="value",
    )
    .dropna(how="all")
    .reset_index()
)
table2 = pa.Table.from_pandas(df)
table_info(table2)

## microbiologyevents

In [None]:
# Parse hosp/microbiologyevents with its declared schema and the shared
# null/boolean markers.
# NOTE(review): unlike the other tables, the chunks are not combined here —
# confirm whether that is intentional.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["microbiologyevents"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/microbiologyevents.csv.gz", convert_options=convert_options
)
# Recorded for reference only; this cell does not apply it.
null_cols = ["storedate", "storetime", "spec_type_desc"]
table_info(table)
# sorted(pa.compute.unique(table["los"]).drop_null().to_pylist())

### Check dilution

In [None]:
# Strip the comparison symbols from `dilution_text`, parse the remainder as
# float32 and verify that it agrees with `dilution_value` wherever both exist.
dilution = table["dilution_text"]
for symbol in ("<", ">", "="):
    dilution = pa.compute.replace_substring(dilution, symbol, "")
dilution = pa.compute.cast(dilution, pa.float32())
matches = pa.compute.equal(dilution, table["dilution_value"]).drop_null()
assert pa.compute.all(matches).as_py() is True

## omr

In [None]:
# Parse hosp/omr with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["omr"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/omr.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

### split result_value (two values stored for blood pressure)

In [None]:
# Split every `result_value` on "/" so each entry becomes a list of strings.
table = table.set_column(
    table.column_names.index("result_value"),
    "result_value",
    pa.compute.split_pattern(table["result_value"], "/"),
)

# One row per (subject_id, seq_num, chartdate), one column per result_name.
df = table.to_pandas().pivot(
    index=["subject_id", "seq_num", "chartdate"],
    columns="result_name",
    values="result_value",
)

# Replace each list-valued column by numeric column(s): two for blood-pressure
# results (systolic / diastolic), one otherwise.
for col in (pbar := tqdm(df.columns)):
    pbar.set_postfix(column=f"{col!r}")
    s = df.pop(col).copy()
    m = s.isna()
    # Missing cells become empty lists so that every entry can be expanded.
    s.loc[m] = [[]] * m.sum()
    columns = (
        [f"{col} (systolic)", f"{col} (diastolic)"]
        if "blood pressure" in col.lower()
        else [col]
    )
    frame = pd.DataFrame(s.to_list(), columns=columns, index=s.index)
    for new_col in frame:
        df[new_col] = pd.to_numeric(
            frame[new_col],
            errors="coerce",
            # documented string value; a pyarrow DataType is not part of the API
            downcast="float",
            dtype_backend="pyarrow",
        ).astype(
            "float[pyarrow]"
        )  # required due to "Weight (Lbs)"
        # contains bad data: 210237760.0, the nearest float is 210237758.0
table = pa.Table.from_pandas(df.dropna(how="all").reset_index())
table_info(table)

## patients

In [None]:
# Parse hosp/patients with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["patients"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/patients.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## pharmacy

In [None]:
# Parse hosp/pharmacy with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["pharmacy"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/pharmacy.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

In [None]:
# Columns handed to filter_nulls for pharmacy. Deliberately left out:
# disp_sched, infusion_type, sliding_scale, lockout_interval, basal_rate,
# one_hr_max, doses_per_24_hrs, duration, expiration_value, expirationdate,
# fill_quantity.
null_cols = [
    # identifiers
    "subject_id", "hadm_id", "pharmacy_id", "poe_id",
    # order window
    "starttime", "stoptime",
    # order description
    "medication", "proc_type", "status",
    # bookkeeping timestamps
    "entertime", "verifiedtime",
    # administration
    "route", "frequency",
    "duration_interval",
    "expiration_unit",
    "dispensation",
]
# Only the summary of the filtered table is shown; `table` itself is unchanged.
table_info(filter_nulls(table, null_cols))

## poe

In [None]:
# Parse hosp/poe with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["poe"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/poe.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## poe_detail

In [None]:
# Parse hosp/poe_detail with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["poe_detail"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/poe_detail.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

### unstack fields

In [None]:
# Arrow type of every `field_name` that becomes a column after unstacking.
field_types = {
    "Admit category": DICT_TYPE,
    "Admit to": DICT_TYPE,
    "Code status": DICT_TYPE,
    "Consult Status": DICT_TYPE,
    "Consult Status Time": TIME_TYPE,
    "Discharge Planning": DICT_TYPE,
    "Discharge When": DICT_TYPE,
    "Indication": DICT_TYPE,
    "Level of Urgency": DICT_TYPE,
    "Transfer to": DICT_TYPE,
    "Tubes & Drains type": DICT_TYPE,
}

# pandas needs the Arrow types wrapped as extension dtypes.
unstacked_dtypes = {
    field: pd.ArrowDtype(arrow_type) for field, arrow_type in field_types.items()
}

# Spread `field_value` into one column per `field_name`, keyed by
# (poe_id, poe_seq, subject_id); rows without any value are discarded.
df = (
    table.to_pandas(types_mapper=pd.ArrowDtype)
    .pivot(
        values="field_value",
        columns="field_name",
        index=["poe_id", "poe_seq", "subject_id"],
    )
    .dropna(how="all")
    .astype(unstacked_dtypes)
    .reset_index()
)
table2 = pa.Table.from_pandas(df)
table_info(table2)

## prescriptions

In [None]:
# Parse hosp/prescriptions with its declared schema and the shared
# null/boolean markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["prescriptions"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/prescriptions.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

### coerce val columns to float (entries that are not numeric become null)

In [None]:
# Convert the two free-text amount columns to numbers in place. With
# errors="coerce", entries that do not parse become null rather than raising;
# no rows are removed here.
for col in ("dose_val_rx", "form_val_disp"):
    table = table.set_column(
        table.column_names.index(col),
        col,
        pa.Array.from_pandas(
            pd.to_numeric(
                pd.Series(table[col]),
                errors="coerce",
                dtype_backend="pyarrow",
                # documented string value; a pyarrow DataType is not part of the API
                downcast="float",
            )
        ),
    )
table_info(table)

In [None]:
# Columns handed to filter_nulls for prescriptions. Deliberately left out:
# gsn, form_rx, doses_per_24_hrs.
filter_null_cols = [
    # identifiers
    "subject_id", "hadm_id", "pharmacy_id", "poe_id", "poe_seq", "order_provider_id",
    # order window
    "starttime", "stoptime",
    # drug description
    "drug_type", "drug", "formulary_drug_cd", "ndc", "prod_strength",
    # dose and dispensed form
    "dose_val_rx", "dose_unit_rx", "form_val_disp", "form_unit_disp",
    "route",
]
# Only the summary of the filtered table is shown; `table` itself is unchanged.
table_info(filter_nulls(table, filter_null_cols))

## procedures_icd

In [None]:
# Parse hosp/procedures_icd with its declared schema and the shared
# null/boolean markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["procedures_icd"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/procedures_icd.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## provider

In [None]:
# Parse hosp/provider with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
# NOTE(review): strings_can_be_null is not set here, unlike the other tables —
# confirm whether that is intentional.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["provider"],
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/provider.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## services

In [None]:
# Parse hosp/services with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["services"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/services.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## transfers

In [None]:
# Parse hosp/transfers with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["transfers"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/hosp/transfers.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

# ICU tables

## caregiver

In [None]:
# Parse icu/caregiver with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["caregiver"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/icu/caregiver.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## d_items

In [None]:
# Parse icu/d_items with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["d_items"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/icu/d_items.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## Chartevents

In [None]:
# Parse icu/chartevents with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["chartevents"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/icu/chartevents.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

### drop everything that has no valueuom

In [None]:
# Apply filter_nulls on the unit column, then replace the text column `value`
# by its float32 cast (same column position).
unit_cols = ["valueuom"]
table = filter_nulls(table, unit_cols)
table = table.set_column(
    table.column_names.index("value"),
    "value",
    table["value"].cast(pa.float32()),
)
table_info(table)

In [None]:
# Wide view via polars: one row per charttime, one column per itemid.
df = pl.from_arrow(table).pivot(
    values="value",
    index="charttime",
    columns="itemid",
)

In [None]:
# Persist the processed chartevents table as a parquet file.
# NOTE(review): DATASET_PATH is not defined in any cell shown in this notebook —
# it must be defined before this cell can run.
parquet.write_table(table, DATASET_PATH / "chartevents.parquet")

## datetimeevents

In [None]:
# Parse icu/datetimeevents with its declared schema and the shared
# null/boolean markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["datetimeevents"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/icu/datetimeevents.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## icustays

In [None]:
# Parse icu/icustays with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["icustays"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/icu/icustays.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## ingredientevents

In [None]:
# Parse icu/ingredientevents with its declared schema and the shared
# null/boolean markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["ingredientevents"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/icu/ingredientevents.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## Inputevents

In [None]:
# Parse icu/inputevents with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["inputevents"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/icu/inputevents.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## Outputevents

In [None]:
# Parse icu/outputevents with its declared schema and the shared null/boolean
# markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["outputevents"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/icu/outputevents.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

## Procedureevents

In [None]:
# Parse icu/procedureevents with its declared schema and the shared
# null/boolean markers, then merge the chunks of every column.
convert_options = pa.csv.ConvertOptions(
    column_types=SCHEMAS["procedureevents"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iv-2.2/icu/procedureevents.csv.gz", convert_options=convert_options
).combine_chunks()
table_info(table)

### cast storetime to timestamp[s]

In [None]:
# Replace `storetime` by its cast to TIME_TYPE; safe=False turns off pyarrow's
# checks for lossy conversions.
table = table.set_column(
    table.column_names.index("storetime"),
    "storetime",
    pa.compute.cast(table["storetime"], TIME_TYPE, safe=False),
)

### unstack value - add procedure duration

In [None]:
# Number of seconds represented by one unit of each `valueuom` label.
seconds_per_unit = pd.Series(
    {
        "None": 0,
        "min": 60,
        "day": 60 * 60 * 24,
        "hour": 60 * 60,
    },
    dtype="duration[s][pyarrow]",
    name="time",
)

frame = table.to_pandas(types_mapper=pd.ArrowDtype)

# Per order: put `value` into the column of its unit, fill the other units with
# 0 and take the dot product with the per-unit seconds to get one duration.
s = (
    frame.pivot(
        index=["orderid"],
        columns="valueuom",
        values="value",
    )
    .fillna(0)
    .dot(seconds_per_unit)
    # pivot returns rows ordered by sorted orderid; bring them back into the
    # table's row order so the column appended below lines up row by row.
    .reindex(frame["orderid"])
)

table = table.set_column(
    len(table.column_names),
    "procedure_duration",
    pa.Array.from_pandas(s).cast("duration[s]", safe=False),
)

# with polars

In [None]:
import polars as pl

# Re-validate the archive members against `filelist`. The comparison is
# order-independent and reports mismatches in both directions.
with ZipFile(RAWDATA_PATH, "r") as file:
    archive_files = set(file.namelist())
    expected_files = set(filelist)
    if archive_files != expected_files:
        raise ValueError(
            "The dataset contains unknown files!"
            f" unknown: {sorted(archive_files - expected_files)},"
            f" missing: {sorted(expected_files - archive_files)}"
        )


def load_gz_table(archive, fname, **options):
    """Decompress a ``.csv.gz`` archive member and parse it with polars.

    Replaces the pyarrow-based loader of the same name defined earlier.

    Args:
        archive: Open archive object providing ``open(name, mode)``.
        fname: Member name (converted with ``str``).
        **options: Forwarded unchanged to ``pl.read_csv``.

    Returns:
        The frame produced by ``pl.read_csv``.
    """
    member = str(fname)
    with archive.open(member, "r") as compressed_file, gzip.open(
        compressed_file, "r"
    ) as file:
        return pl.read_csv(file, **options)


def load_txt(archive, fname):
    """Read the archive member ``fname`` and decode its bytes as UTF-8 text."""
    with archive.open(str(fname), "r") as stream:
        payload = stream.read()
        return payload.decode(encoding="utf8")


def get_table(fname, **options) -> DataFrame:
    fname = Path(fname)
    with ZipFile(ds.rawdata_paths, "r") as archive:
        match fname.suffix:
            case ".gz":
                return load_gz_table(archive, fname, **options)
            case ".txt":
                return load_txt(archive, fname, **options)
            case _:
                raise ValueError

In [None]:
# Polars column types for chartevents; `value` stays text, `valuenum` is numeric.
dtypes = {
    "subject_id": pl.datatypes.UInt32(),
    "hadm_id": pl.datatypes.UInt32(),
    "stay_id": pl.datatypes.UInt32(),
    "itemid": pl.datatypes.UInt32(),
    "charttime": pl.datatypes.Datetime(),
    "storetime": pl.datatypes.Datetime(),
    "value": pl.datatypes.Utf8(),
    "valuenum": pl.datatypes.Float32(),
    "valueuom": pl.datatypes.Utf8(),
    "warning": pl.datatypes.Boolean(),
}


# The archive validated against `filelist` holds its members under
# "mimic-iv-2.2/", so the member must be addressed with that prefix.
table = get_table("mimic-iv-2.2/icu/chartevents.csv.gz", dtypes=dtypes)