In [None]:
import resource


def limit_memory(maxsize):
    """Set the soft address-space limit (``RLIMIT_AS``) of the current process.

    Args:
        maxsize: New soft limit in bytes. The existing hard limit is kept.
    """
    hard_limit = resource.getrlimit(resource.RLIMIT_AS)[1]
    resource.setrlimit(resource.RLIMIT_AS, (maxsize, hard_limit))


# Cap the address space at 30 GiB; the argument itself is expressed in bytes.
limit_memory(30 * 1024 * 1024 * 1024)

In [None]:
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
%config InlineBackend.figure_format = 'svg'
%load_ext autoreload
%autoreload 2
%matplotlib inline

import gzip
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
import pyarrow
from pandas import DataFrame
from tqdm.autonotebook import tqdm

from tsdm.datasets import MIMIC_IV

In [None]:
# Create the dataset handle with initialize=False and display its raw-data path(s).
# NOTE(review): presumably initialize=False skips loading/preprocessing — confirm in tsdm.
ds = MIMIC_IV(initialize=False)
ds.rawdata_paths

In [None]:
# Expected archive members (all under the "mimic-iv-1.0/" prefix).  Later cells
# compare this list against ``ZipFile.namelist()`` of the raw-data archive.
filelist = [
    "mimic-iv-1.0/LICENSE.txt",
    "mimic-iv-1.0/SHA256SUMS.txt",
    "mimic-iv-1.0/core/admissions.csv.gz",
    "mimic-iv-1.0/core/patients.csv.gz",
    "mimic-iv-1.0/core/transfers.csv.gz",
    "mimic-iv-1.0/hosp/d_hcpcs.csv.gz",
    "mimic-iv-1.0/hosp/d_icd_diagnoses.csv.gz",
    "mimic-iv-1.0/hosp/d_icd_procedures.csv.gz",
    "mimic-iv-1.0/hosp/d_labitems.csv.gz",
    "mimic-iv-1.0/hosp/diagnoses_icd.csv.gz",
    "mimic-iv-1.0/hosp/drgcodes.csv.gz",
    "mimic-iv-1.0/hosp/emar.csv.gz",
    "mimic-iv-1.0/hosp/emar_detail.csv.gz",
    "mimic-iv-1.0/hosp/hcpcsevents.csv.gz",
    "mimic-iv-1.0/hosp/labevents.csv.gz",
    "mimic-iv-1.0/hosp/microbiologyevents.csv.gz",
    "mimic-iv-1.0/hosp/pharmacy.csv.gz",
    "mimic-iv-1.0/hosp/poe.csv.gz",
    "mimic-iv-1.0/hosp/poe_detail.csv.gz",
    "mimic-iv-1.0/hosp/prescriptions.csv.gz",
    "mimic-iv-1.0/hosp/procedures_icd.csv.gz",
    "mimic-iv-1.0/hosp/services.csv.gz",
    "mimic-iv-1.0/icu/chartevents.csv.gz",
    "mimic-iv-1.0/icu/d_items.csv.gz",
    "mimic-iv-1.0/icu/datetimeevents.csv.gz",
    "mimic-iv-1.0/icu/icustays.csv.gz",
    "mimic-iv-1.0/icu/inputevents.csv.gz",
    "mimic-iv-1.0/icu/outputevents.csv.gz",
    "mimic-iv-1.0/icu/procedureevents.csv.gz",
]

In [None]:
PATH = "mimic-iv-2.2.zip"

# Record the uncompressed size of every archive member, keyed by member name,
# and print the member names in archive order.
with ZipFile(PATH, "r") as file:
    filesizes = dict((entry.filename, entry.file_size) for entry in file.infolist())
    print(file.namelist())

In [None]:
# Archive members ordered from smallest to largest uncompressed size.
dict(sorted(filesizes.items(), key=lambda item: item[1]))

In [None]:
# Validate the archive contents before any table is read.  The order of
# members inside a zip file carries no meaning, so compare as sets and report
# both unexpected and missing members instead of a generic message.
with ZipFile(ds.rawdata_paths, "r") as file:
    archive_members = set(file.namelist())

unexpected_files = sorted(archive_members - set(filelist))
missing_files = sorted(set(filelist) - archive_members)
if unexpected_files or missing_files:
    raise ValueError(
        "The dataset archive does not match the expected file list: "
        f"unexpected={unexpected_files}, missing={missing_files}"
    )


def load_gz_table(archive, fname, **options):
    """Read a gzip-compressed CSV member of a zip archive with pyarrow.

    Args:
        archive: An open ``ZipFile`` (anything with a compatible ``open``).
        fname: Name of the ``.csv.gz`` member inside ``archive``.
        **options: Forwarded unchanged to ``pyarrow.csv.read_csv``.

    Returns:
        The table produced by ``pyarrow.csv.read_csv``.
    """
    # A bare ``import pyarrow`` does not load the ``csv`` submodule, so import
    # it explicitly instead of relying on some other package having done so.
    from pyarrow import csv as pyarrow_csv

    with (
        archive.open(str(fname), "r") as compressed_file,
        gzip.open(compressed_file, "r") as file,
    ):
        return pyarrow_csv.read_csv(file, **options)


def load_txt(archive, fname):
    """Return the UTF-8 decoded contents of member ``fname`` in ``archive``."""
    member_name = str(fname)
    with archive.open(member_name, "r") as handle:
        raw_bytes = handle.read()
    return raw_bytes.decode("utf8")


def get_table(fname, **options) -> DataFrame:
    fname = Path(fname)
    with ZipFile(ds.rawdata_paths, "r") as archive:
        match fname.suffix:
            case ".gz":
                return load_gz_table(archive, fname, **options)
            case ".txt":
                return load_txt(archive, fname, **options)
            case _:
                raise ValueError

In [None]:
# Arrow type aliases reused by the schema below.
ID_TYPE = "uint32"
CATEGORY = pyarrow.dictionary("uint32", "string")

# Table name -> {CSV column name -> Arrow type} used when parsing the CSVs.
column_types = {
    "chartevents": {
        "subject_id": ID_TYPE,
        "hadm_id": ID_TYPE,
        "stay_id": ID_TYPE,
        "itemid": ID_TYPE,
        "charttime": "timestamp[s]",
        "storetime": "timestamp[s]",
        "value": "string",
        "valuenum": "float32",
        "valueuom": "string",
        "warning": "bool",
    },
}

# Conversion settings for chartevents: explicit column types, and string
# columns are allowed to contain nulls.
convert_options = pyarrow.csv.ConvertOptions(
    strings_can_be_null=True,
    column_types=column_types["chartevents"],
)

In [None]:
# Load chartevents with the explicit conversion options.
table = get_table(
    "mimic-iv-1.0/icu/chartevents.csv.gz",
    convert_options=convert_options,
)

# Replace the "valueuom" column by a dictionary-encoded copy with uint8 indices.
valueuom_position = table.column_names.index("valueuom")
valueuom_encoded = pyarrow.compute.cast(
    table.column("valueuom").dictionary_encode(),
    pyarrow.dictionary("uint8", "string"),
)
table = table.set_column(valueuom_position, "valueuom", valueuom_encoded)
table.schema

In [None]:
# Print the number of distinct values in every column.  A bare ``zip`` has no
# length, so the column count is passed to tqdm explicitly; otherwise it can
# only show an iteration counter instead of a progress bar.
for name, col in tqdm(
    zip(table.column_names, table.columns), total=table.num_columns
):
    n_distinct = pyarrow.compute.count_distinct(col)
    print(f"{name}, {n_distinct}")

In [None]:
# ~ 89M

In [None]:
# create version of table with numerical value

# Build a copy of the table restricted to rows whose "value" string is a
# number, with that column cast to float32.
#
# ``utf8_is_numeric`` only checks that every character is a Unicode numeric
# character, so it rejects ordinary decimals and negatives ("36.6", "-1") and
# admits non-ASCII numeric characters that the float cast cannot parse.
# Matching the literal syntax keeps exactly the strings the cast can handle.
NUMERIC_PATTERN = r"^-?[0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?$"

mask = pyarrow.compute.match_substring_regex(table["value"], NUMERIC_PATTERN)
# Null entries in the mask (null "value") are dropped by ``filter``'s default.
table_value_is_numeric = table.filter(mask)
table_value_is_numeric = table_value_is_numeric.set_column(
    table_value_is_numeric.column_names.index("value"),
    "value",
    pyarrow.compute.cast(table_value_is_numeric["value"], "float32"),
)

In [None]:
# Distinct values of the "valuenum" column.
pyarrow.compute.unique(table["valuenum"])

In [None]:
# NOTE(review): ``filter`` is called without a mask, so this cell is expected to
# raise TypeError as written.  Looks like an unfinished experiment — confirm and remove.
table.filter()

In [None]:
# NOTE(review): a bare ``raise`` with no active exception raises RuntimeError;
# presumably a deliberate stop for "Run All" before the scratch cells — confirm.
raise

In [None]:
# NOTE(review): bare attribute lookup left over from exploration.  I could not
# confirm that ``pyarrow.categorical`` exists; this may raise AttributeError — verify.
pyarrow.categorical

In [None]:
# NOTE(review): a bare ``raise`` with no active exception raises RuntimeError;
# presumably a deliberate stop for "Run All" at this point — confirm.
raise

In [None]:
# Distinct values of the "valueuom" column.
pyarrow.compute.unique(table["valueuom"])

# The cells below repeat the loading pipeline with polars instead of pyarrow.

In [None]:
import polars as pl

# Validate the archive contents before any table is read.  The order of
# members inside a zip file carries no meaning, so compare as sets and report
# both unexpected and missing members instead of a generic message.
with ZipFile(ds.rawdata_paths, "r") as file:
    archive_members = set(file.namelist())

unexpected_files = sorted(archive_members - set(filelist))
missing_files = sorted(set(filelist) - archive_members)
if unexpected_files or missing_files:
    raise ValueError(
        "The dataset archive does not match the expected file list: "
        f"unexpected={unexpected_files}, missing={missing_files}"
    )


def load_gz_table(archive, fname, **options):
    """Read a gzip-compressed CSV member of ``archive`` via ``pl.read_csv``.

    ``options`` are passed through to ``pl.read_csv`` unchanged.  This
    definition rebinds the ``load_gz_table`` name defined earlier in the
    notebook with a polars-based reader.
    """
    with (
        archive.open(str(fname), "r") as zipped_member,
        gzip.open(zipped_member, "r") as csv_stream,
    ):
        return pl.read_csv(csv_stream, **options)


def load_txt(archive, fname):
    """Read member ``fname`` from ``archive`` and decode it as UTF-8 text."""
    with archive.open(str(fname), "r") as stream:
        payload = stream.read()
    return payload.decode("utf8")


def get_table(fname, **options) -> DataFrame:
    fname = Path(fname)
    with ZipFile(ds.rawdata_paths, "r") as archive:
        match fname.suffix:
            case ".gz":
                return load_gz_table(archive, fname, **options)
            case ".txt":
                return load_txt(archive, fname, **options)
            case _:
                raise ValueError

In [None]:
# Explicit polars dtype for each chartevents column, handed to the reader below.
dtypes = dict(
    subject_id=pl.datatypes.UInt32(),
    hadm_id=pl.datatypes.UInt32(),
    stay_id=pl.datatypes.UInt32(),
    itemid=pl.datatypes.UInt32(),
    charttime=pl.datatypes.Datetime(),
    storetime=pl.datatypes.Datetime(),
    value=pl.datatypes.Utf8(),
    valuenum=pl.datatypes.Float32(),
    valueuom=pl.datatypes.Utf8(),
    warning=pl.datatypes.Boolean(),
)

table = get_table(
    "mimic-iv-1.0/icu/chartevents.csv.gz",
    dtypes=dtypes,
)

In [None]:
# NOTE(review): ``pl.datatypes.U`` looks like an abandoned tab-completion and
# probably raises AttributeError — confirm and remove.
pl.datatypes.U

In [None]:
# NOTE(review): when the cells run top to bottom, ``table`` was last assigned
# from the polars-based ``get_table`` — confirm this pyarrow call accepts it.
pyarrow.compute.count_distinct(table)

In [None]:
# Convert ``table`` with ``pl.from_arrow``.
# NOTE(review): ``table`` was last assigned from the polars-based ``get_table``;
# confirm it is an Arrow object at this point.
df = pl.from_arrow(table)

In [None]:
# IPython help lookup for ``count_distinct``; interactive leftover, not part of the analysis.
?pyarrow.compute.count_distinct

In [None]:
# ``name`` attribute of the first item produced by iterating over ``table``.
next(iter(table)).name