In [None]:
# --- Notebook configuration (IPython magics) ---
# Which expression results get displayed automatically at the end of a cell.
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
# Output format used for inline figures.
%config InlineBackend.figure_format = 'svg'
# Load the autoreload extension and set it to mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Show matplotlib figures inline in the notebook output.
%matplotlib inline

import gzip
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
import pyarrow
from pandas import DataFrame
from tqdm.autonotebook import tqdm

from tsdm.datasets import MIMIC_IV

In [None]:
# Create the dataset handle.
# NOTE(review): initialize=False presumably skips downloading/preprocessing —
# confirm against the tsdm implementation.
ds = MIMIC_IV(initialize=False)
# Display the raw-data location; later cells pass this value to ZipFile as the
# archive to open.
ds.rawdata_paths

In [None]:
# Expected member names of the raw zip archive. Later cells compare this list
# for exact equality (content and order) against ZipFile.namelist().
filelist = [
    "mimic-iv-1.0/LICENSE.txt",
    "mimic-iv-1.0/SHA256SUMS.txt",
    "mimic-iv-1.0/core/admissions.csv.gz",
    "mimic-iv-1.0/core/patients.csv.gz",
    "mimic-iv-1.0/core/transfers.csv.gz",
    "mimic-iv-1.0/hosp/d_hcpcs.csv.gz",
    "mimic-iv-1.0/hosp/d_icd_diagnoses.csv.gz",
    "mimic-iv-1.0/hosp/d_icd_procedures.csv.gz",
    "mimic-iv-1.0/hosp/d_labitems.csv.gz",
    "mimic-iv-1.0/hosp/diagnoses_icd.csv.gz",
    "mimic-iv-1.0/hosp/drgcodes.csv.gz",
    "mimic-iv-1.0/hosp/emar.csv.gz",
    "mimic-iv-1.0/hosp/emar_detail.csv.gz",
    "mimic-iv-1.0/hosp/hcpcsevents.csv.gz",
    "mimic-iv-1.0/hosp/labevents.csv.gz",
    "mimic-iv-1.0/hosp/microbiologyevents.csv.gz",
    "mimic-iv-1.0/hosp/pharmacy.csv.gz",
    "mimic-iv-1.0/hosp/poe.csv.gz",
    "mimic-iv-1.0/hosp/poe_detail.csv.gz",
    "mimic-iv-1.0/hosp/prescriptions.csv.gz",
    "mimic-iv-1.0/hosp/procedures_icd.csv.gz",
    "mimic-iv-1.0/hosp/services.csv.gz",
    "mimic-iv-1.0/icu/chartevents.csv.gz",
    "mimic-iv-1.0/icu/d_items.csv.gz",
    "mimic-iv-1.0/icu/datetimeevents.csv.gz",
    "mimic-iv-1.0/icu/icustays.csv.gz",
    "mimic-iv-1.0/icu/inputevents.csv.gz",
    "mimic-iv-1.0/icu/outputevents.csv.gz",
    "mimic-iv-1.0/icu/procedureevents.csv.gz",
]

In [None]:
# Validate the archive contents against the expected manifest, then record the
# uncompressed size of every member (used to order tables from small to large).
with ZipFile(ds.rawdata_paths, "r") as file:
    members = file.namelist()
    if members != filelist:
        # Spell out the difference so a failure is actionable; both lists may
        # be empty here if only the member order differs.
        unexpected = sorted(set(members) - set(filelist))
        missing = sorted(set(filelist) - set(members))
        raise ValueError(
            "The dataset contains unknown files!"
            f" unexpected={unexpected}, missing={missing}"
        )

    # Map member name -> uncompressed size in bytes (ZipInfo.file_size).
    filesizes = {info.filename: info.file_size for info in file.infolist()}

In [None]:
# Show the archive members ordered from smallest to largest uncompressed size.
dict(sorted(filesizes.items(), key=lambda name_and_size: name_and_size[1]))

In [None]:
# Re-check that the archive holds exactly the expected members before the
# loader helpers below are used.
with ZipFile(ds.rawdata_paths, "r") as file:
    if file.namelist() != filelist:
        raise ValueError("The dataset contains unknown files!")


def load_gz_table(archive, fname):
    """Read a gzip-compressed CSV member of an open archive with pyarrow.

    Parameters
    ----------
    archive
        Open archive object exposing ``open(name, mode)``; callers in this
        notebook pass a ``ZipFile``.
    fname
        Name of the member inside the archive; converted with ``str()``.

    Returns
    -------
    The table produced by ``pyarrow.csv.read_csv`` for the decompressed CSV.
    """
    # A plain `import pyarrow` does not load the `csv` submodule, so accessing
    # `pyarrow.csv` can fail with AttributeError; import it explicitly here.
    from pyarrow import csv as pyarrow_csv

    with archive.open(str(fname), "r") as compressed_file:
        # The member is itself gzip-compressed, so decompress it on the fly.
        with gzip.open(compressed_file, "r") as file:
            return pyarrow_csv.read_csv(file)


def load_txt(archive, fname):
    """Return the content of one archive member decoded as UTF-8 text.

    ``archive`` must expose ``open(name, mode)``; ``fname`` is converted with
    ``str()`` before the lookup.
    """
    member = str(fname)
    with archive.open(member, "r") as handle:
        raw = handle.read()
    return raw.decode("utf8")


def get_table(fname) -> DataFrame:
    fname = Path(fname)
    with ZipFile(ds.rawdata_paths, "r") as archive:
        match fname.suffix:
            case ".gz":
                return load_gz_table(archive, fname)
            case ".txt":
                return load_txt(archive, fname)
            case _:
                raise ValueError

In [None]:
# Column name -> type specification for the chartevents table.
# NOTE(review): ID_TYPE and CATEGORY are not defined or imported in any cell
# visible in this notebook, so this cell raises NameError on a fresh kernel —
# confirm where they are supposed to come from.
# NOTE(review): the string values look like pyarrow type aliases — presumably
# meant as a column schema for the CSV reader; verify.
chartevents = {
    "subject_id": ID_TYPE,
    "hadm_id": ID_TYPE,
    "stay_id": ID_TYPE,
    "itemid": ID_TYPE,
    "charttime": "timestamp[s]",
    "storetime": "timestamp[s]",
    "value": "string",
    "valuenum": "float32",
    "valueuom": CATEGORY,
    "warning": "bool",
}

# Load the chartevents table from the archive. The mapping above is not passed
# to get_table, so it has no effect on how the table is parsed here.
# The trailing ";" is presumably there to suppress the cell's output.
table = get_table("mimic-iv-1.0/icu/chartevents.csv.gz");