In [None]:
# import resource


# def limit_memory(maxsize):
#     soft, hard = resource.getrlimit(resource.RLIMIT_AS)
#     # MEMORY LIMIT IN BYTES
#     resource.setrlimit(resource.RLIMIT_AS, (maxsize, hard))


# # IN GIGABYTE
# limit_memory(30 * 1024 * 1024 * 1024)

In [None]:
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2
%matplotlib inline

import gzip
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
import pyarrow as pa
from pandas import DataFrame
from pyarrow import csv, parquet
from tqdm.autonotebook import tqdm

In [None]:
# ds = MIMIC_IV(initialize=False)
# ds.rawdata_paths

# Directory that processed tables are written to (see the parquet export below).
DATASET_PATH = Path("custom_processed")
# Zip archive with the raw MIMIC-IV 2.2 release; every loader reads from it.
RAWDATA_PATH = Path("mimic-iv-2.2.zip")

In [None]:
# Member names expected inside RAWDATA_PATH; the archive is validated against
# this list before any table is loaded.
filelist = [
    "mimic-iv-2.2/CHANGELOG.txt",
    "mimic-iv-2.2/LICENSE.txt",
    "mimic-iv-2.2/SHA256SUMS.txt",
    "mimic-iv-2.2/hosp/admissions.csv.gz",
    "mimic-iv-2.2/hosp/d_hcpcs.csv.gz",
    "mimic-iv-2.2/hosp/d_icd_diagnoses.csv.gz",
    "mimic-iv-2.2/hosp/d_icd_procedures.csv.gz",
    "mimic-iv-2.2/hosp/d_labitems.csv.gz",
    "mimic-iv-2.2/hosp/diagnoses_icd.csv.gz",
    "mimic-iv-2.2/hosp/drgcodes.csv.gz",
    "mimic-iv-2.2/hosp/emar.csv.gz",
    "mimic-iv-2.2/hosp/emar_detail.csv.gz",
    "mimic-iv-2.2/hosp/hcpcsevents.csv.gz",
    "mimic-iv-2.2/hosp/labevents.csv.gz",
    "mimic-iv-2.2/hosp/microbiologyevents.csv.gz",
    "mimic-iv-2.2/hosp/omr.csv.gz",
    "mimic-iv-2.2/hosp/patients.csv.gz",
    "mimic-iv-2.2/hosp/pharmacy.csv.gz",
    "mimic-iv-2.2/hosp/poe.csv.gz",
    "mimic-iv-2.2/hosp/poe_detail.csv.gz",
    "mimic-iv-2.2/hosp/prescriptions.csv.gz",
    "mimic-iv-2.2/hosp/procedures_icd.csv.gz",
    "mimic-iv-2.2/hosp/provider.csv.gz",
    "mimic-iv-2.2/hosp/services.csv.gz",
    "mimic-iv-2.2/hosp/transfers.csv.gz",
    "mimic-iv-2.2/icu/caregiver.csv.gz",
    "mimic-iv-2.2/icu/chartevents.csv.gz",
    "mimic-iv-2.2/icu/d_items.csv.gz",
    "mimic-iv-2.2/icu/datetimeevents.csv.gz",
    "mimic-iv-2.2/icu/icustays.csv.gz",
    "mimic-iv-2.2/icu/ingredientevents.csv.gz",
    "mimic-iv-2.2/icu/inputevents.csv.gz",
    "mimic-iv-2.2/icu/outputevents.csv.gz",
    "mimic-iv-2.2/icu/procedureevents.csv.gz",
]

In [None]:
# Validate the archive against the expected file list before loading anything.
# Both directions are reported: a missing file used to raise with an empty
# "unknown files" set, which hid the actual problem.
with ZipFile(RAWDATA_PATH, "r") as file:
    found_files = set(file.namelist())
    expected_files = set(filelist)
    if found_files != expected_files:
        unknown_files = sorted(found_files - expected_files)
        missing_files = sorted(expected_files - found_files)
        raise ValueError(
            "The dataset does not match the expected file list:"
            f" unknown files {unknown_files}, missing files {missing_files}!"
        )


def load_gz_table(archive, fname, **options):
    """Read a gzip-compressed CSV member of an open zip archive with pyarrow.

    Args:
        archive: An open ``ZipFile``.
        fname: Member name inside the archive (``str`` or path object).
        **options: Forwarded unchanged to ``pyarrow.csv.read_csv``.

    Returns:
        The table produced by ``pyarrow.csv.read_csv``.
    """
    from pathlib import PurePath

    # Zip member names always use forward slashes; str() of a Windows path
    # would produce backslashes and the lookup would fail with KeyError.
    member = fname.as_posix() if isinstance(fname, PurePath) else str(fname)
    with archive.open(member, "r") as compressed_file:
        with gzip.open(compressed_file, "r") as file:
            return csv.read_csv(file, **options)


def load_txt(archive, fname):
    """Return the UTF-8 decoded content of a text member of an open zip archive.

    Args:
        archive: An open ``ZipFile``.
        fname: Member name inside the archive (``str`` or path object).

    Returns:
        The member's content as ``str``.
    """
    from pathlib import PurePath

    # Zip member names always use forward slashes; str() of a Windows path
    # would produce backslashes and the lookup would fail with KeyError.
    member = fname.as_posix() if isinstance(fname, PurePath) else str(fname)
    with archive.open(member, "r") as file:
        return str(file.read(), encoding="utf8")


def compute_entropy(value_counts, normalize: bool = False) -> float:
    """Shannon entropy (in bits) of the distribution described by ``value_counts``.

    Args:
        value_counts: Struct array as produced by ``Array.value_counts()``;
            field 1 holds the number of occurrences of each distinct value.
        normalize: If true, divide the entropy by ``log2`` of the number of
            distinct values.

    Returns:
        The entropy as a Python float; ``0.0`` when there is at most one
        distinct value.
    """
    counts = pa.compute.struct_field(value_counts, 1)
    num_values = len(counts)

    # Degenerate cases carry no uncertainty. Handling them here also avoids
    # negating ``None`` (sum over an empty array is null) and the 0/0 that
    # normalization would otherwise produce for a single value (log2(1) == 0).
    if num_values <= 1:
        return 0.0

    freqs = pa.compute.divide(
        pa.compute.cast(counts, pa.float64()),
        pa.compute.sum(counts),
    )

    # value_counts never reports a zero count, so log2(freqs) is finite.
    H = pa.compute.sum(pa.compute.multiply(freqs, pa.compute.log2(freqs)))

    if normalize:
        H = pa.compute.divide(H, pa.compute.log2(num_values))

    return -H.as_py()


def get_table(fname, **options) -> DataFrame:
    fname = Path(fname)
    with ZipFile(RAWDATA_PATH, "r") as archive:
        match fname.suffix:
            case ".gz":
                return load_gz_table(archive, fname, **options)
            case ".txt":
                return load_txt(archive, fname, **options)
            case _:
                raise ValueError


def table_info(table) -> None:
    """Print the row count and one summary line per column.

    Each line shows the null ratio, the ratio of distinct non-null values to
    non-null entries, the entropy of the value counts, and the Arrow dtype.

    Args:
        table: A pyarrow table.
    """
    print(len(table))
    columns = zip(table.column_names, table.columns)
    for name, col in tqdm(columns, total=table.num_columns):
        num_total = len(col)
        num_null = col.null_count
        num_valid = num_total - num_null
        value_counts = col.value_counts()
        # value_counts reports null as its own entry; leave it out so the
        # ratio below relates distinct non-null values to non-null entries.
        num_uniques = len(value_counts) - (1 if num_null else 0)
        # Guard against empty tables and all-null columns (division by zero).
        nulls = num_null / num_total if num_total else 0.0
        uniques = num_uniques / num_valid if num_valid else 0.0
        entropy = compute_entropy(value_counts)
        dtype = str(col.type)
        print(f"{name:24s}  {nulls=:7.2%}  {uniques=:7.2%}  {entropy=:.3f}  {dtype=:s}")


# Collect the uncompressed size of every archive member.
with ZipFile(RAWDATA_PATH, "r") as file:
    # Compare as sets: the order in which the archive lists its members
    # carries no meaning and must not fail the check.
    if set(file.namelist()) != set(filelist):
        raise ValueError("The dataset does not match the expected file list!")

    filesizes = {info.filename: info.file_size for info in file.infolist()}

pd.Series(filesizes)

In [None]:
# Arrow type aliases shared by the per-table schemas in column_types.
ID_TYPE = "uint32"
VALUE_TYPE = "float32"
TIME_TYPE = "timestamp[s]"
DATE_TYPE = "date32[day]"
BOOL_TYPE = "bool"
STRING_TYPE = "string"
# Dictionary-encoded strings, used for columns the schemas treat as categorical.
DICT_TYPE = pa.dictionary("int32", "string")


# Explicit Arrow schema per MIMIC-IV table: table name -> {column name -> type}.
# Passed to csv.ConvertOptions(column_types=...) when a table is loaded.
column_types = {
    "chartevents": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "stay_id": ID_TYPE,
        "caregiver_id": ID_TYPE,
        "itemid": ID_TYPE,
        "charttime": TIME_TYPE,
        "storetime": TIME_TYPE,
        "value": STRING_TYPE,
        "valuenum": VALUE_TYPE,
        "valueuom": STRING_TYPE,
        "warning": BOOL_TYPE,
    },
    "inputevents": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "stay_id": ID_TYPE,
        "caregiver_id": ID_TYPE,
        "starttime": TIME_TYPE,
        "endtime": TIME_TYPE,
        "storetime": TIME_TYPE,
        "itemid": ID_TYPE,
        "amount": VALUE_TYPE,
        "amountuom": STRING_TYPE,
        "rate": VALUE_TYPE,
        "rateuom": STRING_TYPE,
        "orderid": ID_TYPE,
        "linkorderid": ID_TYPE,
        "ordercategoryname": STRING_TYPE,
        "secondaryordercategoryname": STRING_TYPE,
        "ordercomponenttypedescription": STRING_TYPE,
        "ordercategorydescription": STRING_TYPE,
        "patientweight": VALUE_TYPE,
        "totalamount": VALUE_TYPE,
        "totalamountuom": STRING_TYPE,
        "isopenbag": BOOL_TYPE,
        "continueinnextdept": BOOL_TYPE,
        "statusdescription": STRING_TYPE,
        "originalamount": VALUE_TYPE,
        "originalrate": VALUE_TYPE,
    },
    "outputevents": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "stay_id": ID_TYPE,
        "caregiver_id": ID_TYPE,
        "charttime": TIME_TYPE,
        "storetime": TIME_TYPE,
        "itemid": ID_TYPE,
        "value": VALUE_TYPE,
        "valueuom": STRING_TYPE,
    },
    "procedureevents": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "stay_id": ID_TYPE,
        "caregiver_id": ID_TYPE,
        "starttime": TIME_TYPE,
        "endtime": TIME_TYPE,
        # NOTE(review): nanosecond resolution, unlike every other time column
        # in this mapping - confirm this is intended.
        "storetime": "timestamp[ns]",
        "itemid": ID_TYPE,
        "value": VALUE_TYPE,
        "valueuom": STRING_TYPE,
        "location": STRING_TYPE,
        "locationcategory": STRING_TYPE,
        "orderid": ID_TYPE,
        "linkorderid": ID_TYPE,
        "ordercategoryname": STRING_TYPE,
        "ordercategorydescription": STRING_TYPE,
        "patientweight": VALUE_TYPE,
        "isopenbag": BOOL_TYPE,
        "continueinnextdept": BOOL_TYPE,
        "statusdescription": STRING_TYPE,
        "originalamount": VALUE_TYPE,
        # NOTE(review): bool here, float in inputevents/ingredientevents - confirm.
        "originalrate": BOOL_TYPE,
    },
    "datetimeevents": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "stay_id": ID_TYPE,
        "caregiver_id": ID_TYPE,
        "charttime": TIME_TYPE,
        "storetime": TIME_TYPE,
        "itemid": ID_TYPE,
        "value": TIME_TYPE,
        "valueuom": STRING_TYPE,
        "warning": BOOL_TYPE,
    },
    "ingredientevents": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "stay_id": ID_TYPE,
        "caregiver_id": ID_TYPE,
        "starttime": TIME_TYPE,
        "endtime": TIME_TYPE,
        "storetime": TIME_TYPE,
        "itemid": ID_TYPE,
        "amount": VALUE_TYPE,
        "amountuom": STRING_TYPE,
        "rate": VALUE_TYPE,
        "rateuom": STRING_TYPE,
        "orderid": ID_TYPE,
        "linkorderid": ID_TYPE,
        "statusdescription": STRING_TYPE,
        "originalamount": VALUE_TYPE,
        "originalrate": VALUE_TYPE,
    },
    "icustays": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "stay_id": ID_TYPE,
        "first_careunit": STRING_TYPE,  # category
        "last_careunit": STRING_TYPE,  # category
        "intime": TIME_TYPE,
        "outtime": TIME_TYPE,
        "los": VALUE_TYPE,
    },
    "d_items": {
        "itemid": ID_TYPE,
        "label": STRING_TYPE,
        "abbreviation": STRING_TYPE,
        "linksto": STRING_TYPE,
        "category": STRING_TYPE,
        "unitname": STRING_TYPE,
        "param_type": STRING_TYPE,
        "lownormalvalue": VALUE_TYPE,
        "highnormalvalue": VALUE_TYPE,
    },
    "caregiver": {
        "caregiver_id": ID_TYPE,
    },
    "transfers": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "transfer_id": ID_TYPE,
        "eventtype": STRING_TYPE,
        "careunit": STRING_TYPE,
        "intime": TIME_TYPE,
        "outtime": TIME_TYPE,
    },
    "admissions": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "admittime": TIME_TYPE,
        "dischtime": TIME_TYPE,
        "deathtime": TIME_TYPE,
        "admission_type": STRING_TYPE,
        "admit_provider_id": STRING_TYPE,
        "admission_location": STRING_TYPE,
        "discharge_location": STRING_TYPE,
        "insurance": STRING_TYPE,
        "language": STRING_TYPE,
        "marital_status": STRING_TYPE,
        "race": STRING_TYPE,
        "edregtime": TIME_TYPE,
        "edouttime": TIME_TYPE,
        "hospital_expire_flag": BOOL_TYPE,
    },
    "services": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "transfertime": TIME_TYPE,
        "prev_service": STRING_TYPE,
        "curr_service": STRING_TYPE,
    },
    "provider": {
        "provider_id": STRING_TYPE,
    },
    "procedures_icd": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "seq_num": ID_TYPE,
        "chartdate": DATE_TYPE,
        "icd_code": STRING_TYPE,
        "icd_version": ID_TYPE,
    },
    "prescriptions": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "pharmacy_id": ID_TYPE,
        "poe_id": STRING_TYPE,
        "poe_seq": ID_TYPE,
        "order_provider_id": STRING_TYPE,
        "starttime": TIME_TYPE,
        "stoptime": TIME_TYPE,
        "drug_type": STRING_TYPE,
        "drug": STRING_TYPE,
        "formulary_drug_cd": STRING_TYPE,
        "gsn": STRING_TYPE,
        "ndc": STRING_TYPE,
        "prod_strength": STRING_TYPE,
        "form_rx": STRING_TYPE,
        "dose_val_rx": STRING_TYPE,  # cast to float32
        "dose_unit_rx": STRING_TYPE,
        "form_val_disp": STRING_TYPE,  # cast to float32
        "form_unit_disp": STRING_TYPE,
        "doses_per_24_hrs": VALUE_TYPE,
        "route": STRING_TYPE,
    },
    "poe_detail": {
        "poe_id": STRING_TYPE,
        "poe_seq": ID_TYPE,
        "subject_id": ID_TYPE,
        "field_name": STRING_TYPE,
        "field_value": STRING_TYPE,
    },
    "poe": {
        "poe_id": STRING_TYPE,
        "poe_seq": ID_TYPE,
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "ordertime": TIME_TYPE,
        "order_type": STRING_TYPE,
        "order_subtype": STRING_TYPE,
        "transaction_type": STRING_TYPE,
        "discontinue_of_poe_id": STRING_TYPE,
        "discontinued_by_poe_id": STRING_TYPE,
        "order_provider_id": STRING_TYPE,
        "order_status": STRING_TYPE,
    },
    "pharmacy": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "pharmacy_id": ID_TYPE,
        "poe_id": STRING_TYPE,
        "starttime": TIME_TYPE,
        "stoptime": TIME_TYPE,
        "medication": STRING_TYPE,
        "proc_type": STRING_TYPE,
        "status": STRING_TYPE,
        "entertime": TIME_TYPE,
        "verifiedtime": TIME_TYPE,
        "route": STRING_TYPE,
        "frequency": STRING_TYPE,
        "disp_sched": STRING_TYPE,
        "infusion_type": STRING_TYPE,
        "sliding_scale": BOOL_TYPE,
        "lockout_interval": STRING_TYPE,
        "basal_rate": VALUE_TYPE,
        "one_hr_max": STRING_TYPE,
        "doses_per_24_hrs": VALUE_TYPE,
        "duration": VALUE_TYPE,
        "duration_interval": STRING_TYPE,
        "expiration_value": VALUE_TYPE,
        "expiration_unit": STRING_TYPE,
        "expirationdate": TIME_TYPE,
        "dispensation": STRING_TYPE,
        "fill_quantity": STRING_TYPE,
    },
    "patients": {
        "subject_id": ID_TYPE,
        "gender": STRING_TYPE,
        "anchor_age": ID_TYPE,
        "anchor_year": ID_TYPE,
        "anchor_year_group": STRING_TYPE,
        "dod": DATE_TYPE,
    },
    "omr": {
        "subject_id": ID_TYPE,
        "chartdate": DATE_TYPE,
        "seq_num": ID_TYPE,
        "result_name": STRING_TYPE,
        # split into 2 floats (blood pressure systolic/diastolic).
        "result_value": STRING_TYPE,
    },
    "microbiologyevents": {
        "microevent_id": ID_TYPE,
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "micro_specimen_id": ID_TYPE,
        "order_provider_id": STRING_TYPE,
        "chartdate": TIME_TYPE,
        "charttime": TIME_TYPE,
        "spec_itemid": ID_TYPE,
        "spec_type_desc": STRING_TYPE,
        "test_seq": ID_TYPE,
        "storedate": TIME_TYPE,
        "storetime": TIME_TYPE,
        "test_itemid": ID_TYPE,
        "test_name": STRING_TYPE,
        "org_itemid": ID_TYPE,
        "org_name": STRING_TYPE,
        "isolate_num": ID_TYPE,
        "quantity": STRING_TYPE,
        "ab_itemid": ID_TYPE,
        "ab_name": STRING_TYPE,
        "dilution_text": STRING_TYPE,  # convert to float
        "dilution_comparison": STRING_TYPE,
        "dilution_value": VALUE_TYPE,
        "interpretation": STRING_TYPE,
        "comments": STRING_TYPE,
    },
    "labevents": {
        "labevent_id": ID_TYPE,
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "specimen_id": ID_TYPE,
        "itemid": ID_TYPE,
        "order_provider_id": DICT_TYPE,
        "charttime": TIME_TYPE,
        "storetime": TIME_TYPE,
        "value": DICT_TYPE,  # cast Float32
        "valuenum": STRING_TYPE,  # cast Float32
        "valueuom": DICT_TYPE,
        "ref_range_lower": VALUE_TYPE,
        "ref_range_upper": VALUE_TYPE,
        "flag": DICT_TYPE,
        "priority": DICT_TYPE,
        "comments": STRING_TYPE,
    },
    "hcpcsevents": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "chartdate": DATE_TYPE,
        "hcpcs_cd": DICT_TYPE,
        "seq_num": ID_TYPE,
        "short_description": DICT_TYPE,
    },
    "emar_detail": {
        "subject_id": ID_TYPE,
        "emar_id": STRING_TYPE,
        "emar_seq": ID_TYPE,
        "parent_field_ordinal": DICT_TYPE,
        "administration_type": DICT_TYPE,
        "pharmacy_id": ID_TYPE,
        "barcode_type": DICT_TYPE,
        "reason_for_no_barcode": STRING_TYPE,
        "complete_dose_not_given": BOOL_TYPE,
        "dose_due": STRING_TYPE,  # cast float
        "dose_due_unit": DICT_TYPE,
        "dose_given": STRING_TYPE,  # cast float
        "dose_given_unit": DICT_TYPE,
        "will_remainder_of_dose_be_given": BOOL_TYPE,
        "product_amount_given": STRING_TYPE,  # cast float
        "product_unit": DICT_TYPE,
        "product_code": DICT_TYPE,
        "product_description": STRING_TYPE,
        "product_description_other": STRING_TYPE,
        "prior_infusion_rate": STRING_TYPE,  # cast float
        "infusion_rate": STRING_TYPE,  # cast float
        "infusion_rate_adjustment": DICT_TYPE,
        "infusion_rate_adjustment_amount": STRING_TYPE,  # cast float
        "infusion_rate_unit": DICT_TYPE,
        "route": DICT_TYPE,
        "infusion_complete": BOOL_TYPE,
        "completion_interval": DICT_TYPE,
        "new_iv_bag_hung": BOOL_TYPE,
        "continued_infusion_in_other_location": BOOL_TYPE,
        "restart_interval": DICT_TYPE,
        "side": DICT_TYPE,
        "site": DICT_TYPE,
        "non_formulary_visual_verification": BOOL_TYPE,
    },
    "emar": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "emar_id": STRING_TYPE,
        "emar_seq": ID_TYPE,
        "poe_id": STRING_TYPE,
        "pharmacy_id": ID_TYPE,
        "enter_provider_id": DICT_TYPE,
        "charttime": TIME_TYPE,
        "medication": DICT_TYPE,
        "event_txt": DICT_TYPE,
        "scheduletime": TIME_TYPE,
        "storetime": TIME_TYPE,
    },
    "drgcodes": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "drg_type": DICT_TYPE,
        "drg_code": ID_TYPE,
        "description": DICT_TYPE,
        "drg_severity": ID_TYPE,
        "drg_mortality": ID_TYPE,
    },
    "diagnoses_icd": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "seq_num": ID_TYPE,
        "icd_code": DICT_TYPE,
        "icd_version": ID_TYPE,
    },
    "d_labitems": {
        "itemid": ID_TYPE,
        "label": STRING_TYPE,
        "fluid": DICT_TYPE,
        "category": DICT_TYPE,
    },
    "d_icd_procedures": {
        "icd_code": STRING_TYPE,
        "icd_version": ID_TYPE,
        "long_title": STRING_TYPE,
    },
    "d_icd_diagnoses": {
        "icd_code": STRING_TYPE,
        "icd_version": ID_TYPE,
        "long_title": STRING_TYPE,
    },
    "d_hcpcs": {
        "code": STRING_TYPE,
        "category": ID_TYPE,
        "long_description": STRING_TYPE,
        "short_description": DICT_TYPE,
    },
};

## d_hcpcs

In [None]:
# Load d_hcpcs with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/d_hcpcs.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["d_hcpcs"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## d_icd_diagnoses

In [None]:
# Load d_icd_diagnoses with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/d_icd_diagnoses.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["d_icd_diagnoses"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## d_icd_procedures

In [None]:
# Load d_icd_procedures with its declared schema and print per-column statistics.
# The schema is applied like for every other table, so icd_code is always read
# as a string instead of depending on type inference.
table = get_table(
    "mimic-iv-2.2/hosp/d_icd_procedures.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["d_icd_procedures"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## d_labitems

In [None]:
# Load d_labitems with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/d_labitems.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["d_labitems"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## diagnoses_icd

In [None]:
# Load diagnoses_icd with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/diagnoses_icd.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["diagnoses_icd"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## drgcodes

In [None]:
# Load drgcodes with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/drgcodes.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["drgcodes"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## emar

In [None]:
# Load emar with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/emar.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["emar"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## emar_detail

In [None]:
# Load emar_detail with its declared schema and print per-column statistics.
# "___" is read as null in addition to the empty string, and the
# Yes/Y and No/N spellings are parsed as booleans.
table = get_table(
    "mimic-iv-2.2/hosp/emar_detail.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["emar_detail"],
        strings_can_be_null=True,
        null_values=["", "___"],
        true_values=["Yes", "Y"],
        false_values=["No", "N"],
    ),
)
table_info(table)

In [None]:
# emar_detail columns that are loaded as strings and converted to float afterwards.
recast_columns = dict.fromkeys(
    [
        "dose_due",
        "dose_given",
        "product_amount_given",
        "prior_infusion_rate",
        "infusion_rate",
        "infusion_rate_adjustment_amount",
    ],
    VALUE_TYPE,
)

In [None]:
# Convert the string-typed dose/rate columns of emar_detail to float32.
# Rows whose entry is present but not numeric are dropped from the table.
for col in recast_columns:
    array = pa.compute.utf8_trim_whitespace(table[col])
    column_index = table.column_names.index(col)

    # Fast path: every non-null entry parses as a float.
    try:
        casted = pa.compute.cast(array, pa.float32())
    except pa.ArrowInvalid:
        casted = None
    if casted is not None:
        table = table.set_column(column_index, col, casted)
        continue

    # Slow path: find the entries that fail to parse. errors="coerce" turns
    # them into nulls, so "null afterwards but not before" marks a bad entry.
    prior_null = pa.compute.is_null(array)
    numeric = pd.to_numeric(
        pd.Series(array, dtype="string[pyarrow]"),
        errors="coerce",
        dtype_backend="pyarrow",
        # pandas documents the strings "integer"/"signed"/"unsigned"/"float"
        # here; a pyarrow DataType is not one of the documented values.
        downcast="float",
    )
    post_null = pa.compute.is_null(pa.Array.from_pandas(numeric))

    # mask the values that were initially null or can be cast to float.
    mask = pa.compute.or_(
        prior_null,
        pa.compute.invert(post_null),
    )
    dropped = 1 - pa.compute.mean(mask).as_py()
    print(f"Dropping {dropped:.4%} of rows: {col} can't be cast to float!")
    table = table.filter(mask)
    # Filtering rows does not move columns, so column_index is still valid.
    table = table.set_column(
        column_index,
        col,
        pa.compute.cast(array.filter(mask), pa.float32()),
    )

## hcpcsevents

In [None]:
# Load hcpcsevents with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/hcpcsevents.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["hcpcsevents"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## Labevents

In [None]:
# Load labevents with its declared schema; "___" is read as null in addition
# to the empty string.
table = get_table(
    "mimic-iv-2.2/hosp/labevents.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["labevents"],
        strings_can_be_null=True,
        null_values=["", "___"],
    ),
)

In [None]:
# Show the schema the labevents table was loaded with.
table.schema

In [None]:
# Time value_counts directly on the chunked column.
%timeit table["value"].value_counts();

In [None]:
# Same measurement, but with the chunks combined into one array first.
%timeit table["value"].combine_chunks().value_counts();

In [None]:
# Number of distinct entries unique() reports for the value column.
len(table["value"].unique())

In [None]:
# Value counts of the value column after casting it to plain strings.
x = table["value"].cast(pa.string()).combine_chunks().value_counts()

In [None]:
# Print the entropy (in bits) of the value distribution of every column.
for col in table.column_names:
    print(col, compute_entropy(table[col].combine_chunks().value_counts()))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Relative frequency of each distinct non-null value, in ascending order.
vals = table["value"].drop_null().combine_chunks().value_counts()
# Field 1 of the value_counts struct holds the counts.
counts = vals.field(1).sort().cast(pa.float64())
freq = pa.compute.divide(counts, counts.sum())

In [None]:
# Entropy of the non-null values, normalized by log2 of the number of distinct values.
compute_entropy(vals, normalize=True)

In [None]:
# Histogram of the relative frequencies computed above, with log-scaled counts.
fig, ax = plt.subplots(figsize=(5, 3))
ax.set_xlabel("value_counts: frequency")
ax.set_ylabel("value_counts: likelihood")
# ax.set_xscale("log")
ax.hist(freq, bins=20, log=True);

### drop every row that has a null hadm_id, value, valuenum or valueuom

In [None]:
# Null indicators for the four columns a usable lab measurement needs.
adm_null, val_null, num_null, uom_null = (
    pa.compute.is_null(table[column])
    for column in ("hadm_id", "value", "valuenum", "valueuom")
)
# Keep a row only if none of the four is null
# (De Morgan: not (a or b or c or d) == not a and not b and not c and not d).
mask = pa.compute.and_(
    pa.compute.and_(pa.compute.invert(adm_null), pa.compute.invert(val_null)),
    pa.compute.and_(pa.compute.invert(num_null), pa.compute.invert(uom_null)),
)
table = table.filter(mask)
table_info(table)

### cast value/valuenum to float

In [None]:
# Replace value and valuenum in place with their float32 representation.
for name in ("value", "valuenum"):
    table = table.set_column(
        table.column_names.index(name),
        name,
        table.column(name).cast(pa.float32()),
    )
# After the cast both columns must agree on every row.
assert pa.compute.all(pa.compute.equal(table["value"], table["valuenum"])).as_py()
table.schema

## microbiologyevents

In [None]:
# Load microbiologyevents with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/microbiologyevents.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["microbiologyevents"],
        strings_can_be_null=True,
    ),
)
table_info(table)

### convert dilution_text to float

In [None]:
# Strip the comparison symbols from dilution_text, then parse what is left as float32.
dilution = table["dilution_text"]
for symbol in "<>=":
    dilution = pa.compute.replace_substring(dilution, symbol, "")
dilution = pa.compute.cast(dilution, pa.float32())

## omr

In [None]:
# Load omr with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/omr.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["omr"],
        strings_can_be_null=True,
    ),
)
table_info(table)

In [None]:
# Split result_value at "/" into two float columns: result_value_A (text before
# the slash, or the whole value) and result_value_B (text after the slash).
s = pd.Series(
    table["result_value"],
    dtype="string[pyarrow]",
).str.split("/", expand=True)

# Keep only the rows whose first component is numeric.
mask = pd.to_numeric(s[0], dtype_backend="pyarrow", errors="coerce").notnull()

value_index = table.column_names.index("result_value")

# result_value is replaced in place by result_value_A ...
table = table.set_column(
    value_index,
    "result_value_A",
    pa.array(s[0]),
)

# ... and result_value_B is inserted right after it. set_column would replace
# whatever column sits at that position (or fail when result_value is the last
# column), so the new column has to be added with add_column.
table = table.add_column(
    value_index + 1,
    "result_value_B",
    pa.array(s[1]),
)

# apply mask filtering
table = pa.compute.filter(table, pa.array(mask))

# cast values to float
for name in ("result_value_A", "result_value_B"):
    table = table.set_column(
        table.column_names.index(name),
        name,
        pa.compute.cast(table[name], pa.float32()),
    )
table_info(table)

## patients

In [None]:
# Load patients with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/patients.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["patients"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## pharmacy

In [None]:
# Load pharmacy with its declared schema and print per-column statistics.
# "Y" / "N" are parsed as booleans.
table = get_table(
    "mimic-iv-2.2/hosp/pharmacy.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["pharmacy"],
        strings_can_be_null=True,
        false_values=["N"],
        true_values=["Y"],
    ),
)
table_info(table)

## poe

In [None]:
# Load poe with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/poe.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["poe"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## poe_detail

In [None]:
# Load poe_detail with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/poe_detail.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["poe_detail"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## prescriptions

In [None]:
# Load prescriptions with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/prescriptions.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["prescriptions"],
        strings_can_be_null=True,
    ),
)
table_info(table)

### drop rows whose dose_val_rx or form_val_disp is not a float

In [None]:
# Keep only the rows where both value columns parse as numbers, then cast
# those columns to float32. Rows with a null in either column are dropped too,
# because notnull() is false for them.
mask_dose, mask_form = (
    pd.to_numeric(
        pd.Series(table[name]),
        errors="coerce",
        dtype_backend="pyarrow",
        # pandas documents the strings "integer"/"signed"/"unsigned"/"float"
        # here; a pyarrow DataType is not one of the documented values.
        downcast="float",
    ).notnull()
    for name in ("dose_val_rx", "form_val_disp")
)

mask = pa.array(mask_dose & mask_form)
table = pa.compute.filter(table, mask)

for name in ("dose_val_rx", "form_val_disp"):
    table = table.set_column(
        table.column_names.index(name),
        name,
        pa.compute.cast(table.column(name), pa.float32()),
    )
table_info(table)

## procedures_icd

In [None]:
# Load procedures_icd with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/procedures_icd.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["procedures_icd"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## provider

In [None]:
# Load provider with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/provider.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["provider"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## services

In [None]:
# Load services with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/services.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["services"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## admissions

In [None]:
# Load admissions with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/admissions.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["admissions"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## transfers

In [None]:
# Load transfers with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/hosp/transfers.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["transfers"],
        strings_can_be_null=True,
    ),
)
table_info(table)

# ICU tables

## caregiver

In [None]:
# Load caregiver with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/icu/caregiver.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["caregiver"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## d_items

In [None]:
# Load d_items with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/icu/d_items.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["d_items"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## icustays

In [None]:
# Load icustays with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/icu/icustays.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["icustays"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## ingredientevents

In [None]:
# Load ingredientevents with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/icu/ingredientevents.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["ingredientevents"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## datetimeevents

In [None]:
# Load datetimeevents with its declared schema, print per-column statistics,
# and show the distinct units of measurement.
table = get_table(
    "mimic-iv-2.2/icu/datetimeevents.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["datetimeevents"],
        strings_can_be_null=True,
    ),
)
table_info(table)
pa.compute.unique(table["valueuom"])

## Procedureevents

In [None]:
# Load procedureevents with its declared schema, print per-column statistics,
# and show the distinct order category names.
table = get_table(
    "mimic-iv-2.2/icu/procedureevents.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["procedureevents"],
        strings_can_be_null=True,
    ),
)
table_info(table)
pa.compute.unique(table["ordercategoryname"])

## Inputevents

In [None]:
# Load inputevents with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/icu/inputevents.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["inputevents"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## Outputevents

In [None]:
# Load outputevents with its declared schema and print per-column statistics.
table = get_table(
    "mimic-iv-2.2/icu/outputevents.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["outputevents"],
        strings_can_be_null=True,
    ),
)
table_info(table)

## Chartevents

In [None]:
# Load chartevents with its declared schema.
table = get_table(
    "mimic-iv-2.2/icu/chartevents.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["chartevents"],
        strings_can_be_null=True,
    ),
)

In [None]:
# Per column: number of distinct values and fraction of nulls.
# Uses count_distinct / mean directly instead of table_info, which also
# computes value counts and entropy.
for name, col in tqdm(zip(table.column_names, table.columns)):
    uniques = pa.compute.count_distinct(col)
    nulls = pa.compute.mean(pa.compute.is_null(col))
    print(f"{name}, {uniques=}  {nulls=}")

### drop everything that has null value or null valueuom

In [None]:
# Keep only the rows where both value and valueuom are present
# (De Morgan: not (a or b) == not a and not b).
val_null = pa.compute.is_null(table["value"])
uom_null = pa.compute.is_null(table["valueuom"])
mask = pa.compute.and_(
    pa.compute.invert(val_null),
    pa.compute.invert(uom_null),
)
table = table.filter(mask)
table.schema

### cast valueuom to dict and value to float

In [None]:
# Replace value with its float32 representation.
# NOTE(review): value is loaded as a string column; this cast raises if any
# remaining entry is not numeric - confirm the null filter above guarantees that.
table = table.set_column(
    table.column_names.index("value"),
    "value",
    pa.compute.cast(table.column("value"), pa.float32()),
)
# Dictionary-encode the unit column in place.
table = table.set_column(
    table.column_names.index("valueuom"),
    "valueuom",
    pa.compute.dictionary_encode(table["valueuom"]),
)
table.schema

In [None]:
# Nothing earlier creates the output directory, so writing would fail with
# FileNotFoundError on a fresh checkout.
DATASET_PATH.mkdir(parents=True, exist_ok=True)
parquet.write_table(table, DATASET_PATH / "chartevents.parquet")

# with polars

In [None]:
import polars as pl

# Re-validate the archive before the polars loaders use it.
# `ds` is never defined in this notebook (its definition is commented out),
# so the archive is opened through RAWDATA_PATH like everywhere else.
with ZipFile(RAWDATA_PATH, "r") as file:
    # Compare as sets: the member order of the archive carries no meaning.
    if set(file.namelist()) != set(filelist):
        raise ValueError("The dataset does not match the expected file list!")


def load_gz_table(archive, fname, **options):
    """Read a gzip-compressed CSV member of an open zip archive with polars.

    Note: this redefinition replaces the pyarrow-based loader of the same name
    defined earlier in the notebook.

    Args:
        archive: An open ``ZipFile``.
        fname: Member name inside the archive (``str`` or path object).
        **options: Forwarded unchanged to ``polars.read_csv``.

    Returns:
        The frame produced by ``polars.read_csv``.
    """
    from pathlib import PurePath

    # Zip member names always use forward slashes; str() of a Windows path
    # would produce backslashes and the lookup would fail with KeyError.
    member = fname.as_posix() if isinstance(fname, PurePath) else str(fname)
    with archive.open(member, "r") as compressed_file:
        with gzip.open(compressed_file, "r") as file:
            return pl.read_csv(file, **options)


def load_txt(archive, fname):
    """Return the UTF-8 decoded content of a text member of an open zip archive.

    Args:
        archive: An open ``ZipFile``.
        fname: Member name inside the archive (``str`` or path object).

    Returns:
        The member's content as ``str``.
    """
    from pathlib import PurePath

    # Zip member names always use forward slashes; str() of a Windows path
    # would produce backslashes and the lookup would fail with KeyError.
    member = fname.as_posix() if isinstance(fname, PurePath) else str(fname)
    with archive.open(member, "r") as file:
        return str(file.read(), encoding="utf8")


def get_table(fname, **options) -> DataFrame:
    fname = Path(fname)
    with ZipFile(ds.rawdata_paths, "r") as archive:
        match fname.suffix:
            case ".gz":
                return load_gz_table(archive, fname, **options)
            case ".txt":
                return load_txt(archive, fname, **options)
            case _:
                raise ValueError

In [None]:
# Polars dtypes for chartevents, mirroring column_types["chartevents"].
dtypes = {
    "subject_id": pl.datatypes.UInt32(),
    "hadm_id": pl.datatypes.UInt32(),
    "stay_id": pl.datatypes.UInt32(),
    "caregiver_id": pl.datatypes.UInt32(),
    "itemid": pl.datatypes.UInt32(),
    "charttime": pl.datatypes.Datetime(),
    "storetime": pl.datatypes.Datetime(),
    "value": pl.datatypes.Utf8(),
    "valuenum": pl.datatypes.Float32(),
    "valueuom": pl.datatypes.Utf8(),
    "warning": pl.datatypes.Boolean(),
}


# The archive holds the 2.2 release; the former "mimic-iv-1.0/..." member name
# does not exist in it.
table = get_table("mimic-iv-2.2/icu/chartevents.csv.gz", dtypes=dtypes)