In [None]:
# Notebook-wide IPython setup.
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # display the last expression or assignment of each cell
# Render inline figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Reload edited modules automatically before running each cell.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import logging

# Emit log records at INFO level and above.
logging.basicConfig(level=logging.INFO)

In [None]:
import gzip
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
import pyarrow as pa
from pandas import DataFrame
from pyarrow import csv

import tsdm

In [None]:
# Create the dataset object with initialize=False, then look up the raw zip
# archive that belongs to this dataset version.
self = tsdm.datasets.MIMIC_III(initialize=False)
archive_name = f"mimic-iii-clinical-database-{self.__version__}.zip"
RAWDATA_PATH = self.rawdata_paths[archive_name]
self.table_names

In [None]:
from tsdm.datasets.schema.mimic_iii import (
    FALSE_VALUES,
    NULL_VALUES,
    SCHEMAS,
    TRUE_VALUES,
)

# Lookup from the string form of a pandas Arrow-backed dtype to a type-alias
# label; consumed by get_types.
TYPES_MAP = dict(
    [
        ("int64[pyarrow]", "ID_TYPE"),
        ("string[pyarrow]", "STRING_TYPE"),
        ("timestamp[s][pyarrow]", "TIME_TYPE"),
        ("double[pyarrow]", "VALUE_TYPE"),
    ]
)

In [None]:
from tsdm.utils.data import filter_nulls, table_info


def load_pandas(fname, **kwargs):
    """Read a gzip-compressed CSV member of the raw-data archive with pandas.

    Args:
        fname: Name of the member inside the zip archive at ``RAWDATA_PATH``.
        **kwargs: Extra keyword arguments for ``pd.read_csv``. They take
            precedence over the defaults ``dtype_backend="pyarrow"`` and
            ``compression="gzip"``.

    Returns:
        The result of ``pd.read_csv`` for the merged options.
    """
    read_options = {"dtype_backend": "pyarrow", "compression": "gzip", **kwargs}
    with ZipFile(RAWDATA_PATH, "r") as archive, archive.open(fname) as file:
        return pd.read_csv(file, **read_options)


def load_gz_table(archive, fname, **options):
    """Parse a gzipped CSV member of an already-open zip archive.

    Args:
        archive: Open archive object exposing ``open(name, mode)``.
        fname: Member name; converted with ``str`` before the lookup.
        **options: Keyword arguments forwarded unchanged to ``csv.read_csv``.

    Returns:
        The table produced by ``csv.read_csv``.
    """
    with (
        archive.open(str(fname), "r") as compressed_file,
        gzip.open(compressed_file, "r") as decompressed,
    ):
        return csv.read_csv(decompressed, **options)


def load_txt(archive, fname):
    """Return the UTF-8 decoded contents of a text member of an open zip archive.

    Args:
        archive: Open archive object exposing ``open(name, mode)``.
        fname: Member name; converted with ``str`` before the lookup.

    Returns:
        The member's full contents as a ``str``.
    """
    with archive.open(str(fname), "r") as file:
        raw_bytes = file.read()
    return raw_bytes.decode("utf8")


def get_table(fname, **options) -> "pa.Table | str":
    """Load one member of the raw-data archive, dispatching on its file suffix.

    Args:
        fname: Path of the member inside the zip archive at ``RAWDATA_PATH``.
        **options: Reader options forwarded to ``load_gz_table`` for ``.gz``
            members. ``.txt`` members accept no options.

    Returns:
        The parsed table for ``.gz`` members, or the decoded text for ``.txt``
        members.

    Raises:
        ValueError: If the suffix is neither ``.gz`` nor ``.txt``.
        TypeError: If reader options are supplied for a ``.txt`` member.
    """
    member = Path(fname)
    suffix = member.suffix

    # Validate before opening the archive so bad requests fail fast and with a
    # message that names the offending file.
    if suffix not in (".gz", ".txt"):
        raise ValueError(
            f"Unsupported file type {suffix!r} for {str(fname)!r};"
            " expected '.gz' or '.txt'."
        )
    if suffix == ".txt" and options:
        # load_txt takes no reader options; forwarding them previously failed
        # with an opaque "unexpected keyword argument" TypeError.
        raise TypeError(
            f"'.txt' members accept no reader options, got: {sorted(options)}"
        )

    # Zip member names use forward slashes regardless of the host OS.
    member_name = member.as_posix()
    with ZipFile(RAWDATA_PATH, "r") as archive:
        if suffix == ".gz":
            return load_gz_table(archive, member_name, **options)
        return load_txt(archive, member_name)


def get_types(table):
    """Map each column's Arrow-backed pandas dtype name to its ``TYPES_MAP`` label.

    Args:
        table: Object exposing ``to_pandas(types_mapper=...)``
            (presumably a pyarrow Table — confirm against callers).

    Returns:
        The frame's ``dtypes`` rendered as strings and passed through
        ``TYPES_MAP``.
    """
    frame = table.to_pandas(types_mapper=pd.ArrowDtype)
    dtype_names = frame.dtypes.astype(str)
    return dtype_names.map(TYPES_MAP)

## ADMISSIONS

In [None]:
# Load ADMISSIONS with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["ADMISSIONS"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/ADMISSIONS.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## CALLOUT

In [None]:
# Load CALLOUT with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["CALLOUT"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/CALLOUT.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## CAREGIVERS

In [None]:
# Load CAREGIVERS with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["CAREGIVERS"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/CAREGIVERS.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## CHARTEVENTS

In [None]:
# Load CHARTEVENTS with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["CHARTEVENTS"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/CHARTEVENTS.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

In [None]:
# Re-run table_info after filter_nulls on the identifier/value columns
# (presumably drops rows that are null there — confirm against filter_nulls).
null_check_columns = ["ICUSTAY_ID", "VALUE", "VALUENUM", "VALUEUOM"]
table_info(filter_nulls(table, null_check_columns))

## CPTEVENTS

In [None]:
# Load CPTEVENTS with its SCHEMAS column types, merge chunks, then re-cast
# CHARTDATE to a date type and report via table_info.
# NOTE(review): the original referenced DATE_TYPE, which is never defined or
# imported in this notebook (NameError on a fresh kernel). pa.date32() is
# assumed to be the intended type — confirm against tsdm.datasets.schema.mimic_iii.
date_type = pa.date32()
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["CPTEVENTS"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = (
    get_table(
        "mimic-iii-clinical-database-1.4/CPTEVENTS.csv.gz",
        convert_options=convert_options,
    )
    .combine_chunks()
    .cast(pa.schema(SCHEMAS["CPTEVENTS"] | {"CHARTDATE": date_type}))
)
table_info(table)

## DATETIMEEVENTS

In [None]:
# Load DATETIMEEVENTS with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["DATETIMEEVENTS"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/DATETIMEEVENTS.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## DIAGNOSES_ICD

In [None]:
# Load DIAGNOSES_ICD with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["DIAGNOSES_ICD"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/DIAGNOSES_ICD.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## DRGCODES

In [None]:
# Load DRGCODES with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["DRGCODES"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/DRGCODES.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## D_CPT

In [None]:
# Load D_CPT with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["D_CPT"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/D_CPT.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## D_ICD_DIAGNOSES

In [None]:
# Load D_ICD_DIAGNOSES with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["D_ICD_DIAGNOSES"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/D_ICD_DIAGNOSES.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## D_ICD_PROCEDURES

In [None]:
# Load D_ICD_PROCEDURES with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["D_ICD_PROCEDURES"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/D_ICD_PROCEDURES.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## D_ITEMS

In [None]:
# Load D_ITEMS with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["D_ITEMS"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/D_ITEMS.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## D_LABITEMS

In [None]:
# Load D_LABITEMS with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["D_LABITEMS"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/D_LABITEMS.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## ICUSTAYS

In [None]:
# Load ICUSTAYS with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["ICUSTAYS"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/ICUSTAYS.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## INPUTEVENTS_CV

In [None]:
# Load INPUTEVENTS_CV with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["INPUTEVENTS_CV"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/INPUTEVENTS_CV.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## INPUTEVENTS_MV

In [None]:
# Load INPUTEVENTS_MV with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["INPUTEVENTS_MV"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/INPUTEVENTS_MV.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## LABEVENTS

In [None]:
# Load LABEVENTS with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["LABEVENTS"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/LABEVENTS.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

In [None]:
# Re-run table_info after filter_nulls on the value columns
# (presumably drops rows that are null there — confirm against filter_nulls).
null_check_columns = ["VALUE", "VALUENUM", "VALUEUOM"]
table_info(filter_nulls(table, null_check_columns))

## MICROBIOLOGYEVENTS

In [None]:
# Load MICROBIOLOGYEVENTS with its SCHEMAS column types, merge chunks, then
# re-cast CHARTDATE to a date type and report via table_info.
# NOTE(review): the original referenced DATE_TYPE, which is never defined or
# imported in this notebook (NameError on a fresh kernel). pa.date32() is
# assumed to be the intended type — confirm against tsdm.datasets.schema.mimic_iii.
date_type = pa.date32()
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["MICROBIOLOGYEVENTS"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = (
    get_table(
        "mimic-iii-clinical-database-1.4/MICROBIOLOGYEVENTS.csv.gz",
        convert_options=convert_options,
    )
    .combine_chunks()
    .cast(pa.schema(SCHEMAS["MICROBIOLOGYEVENTS"] | {"CHARTDATE": date_type}))
)
table_info(table)

## NOTEEVENTS

In [None]:
# Load NOTEEVENTS with its SCHEMAS column types, merge chunks, report via table_info.
# The parser is told that values may contain newlines (newlines_in_values=True).
# Uses the imported `csv` alias like every other cell instead of `pa.csv`.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["NOTEEVENTS"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
parse_options = csv.ParseOptions(newlines_in_values=True)
table = get_table(
    "mimic-iii-clinical-database-1.4/NOTEEVENTS.csv.gz",
    convert_options=convert_options,
    parse_options=parse_options,
).combine_chunks()
table_info(table)

## OUTPUTEVENTS

In [None]:
# Load OUTPUTEVENTS with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["OUTPUTEVENTS"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/OUTPUTEVENTS.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

In [None]:
# Re-run table_info after filter_nulls on the value columns
# (presumably drops rows that are null there — confirm against filter_nulls).
null_check_columns = ["VALUE", "VALUEUOM"]
table_info(filter_nulls(table, null_check_columns))

## PATIENTS

In [None]:
# Load PATIENTS with its SCHEMAS column types, re-cast the birth/death columns
# to a date type, merge chunks, and report via table_info.
# NOTE(review): the original referenced DATE_TYPE, which is never defined or
# imported in this notebook (NameError on a fresh kernel). pa.date32() is
# assumed to be the intended type — confirm against tsdm.datasets.schema.mimic_iii.
date_type = pa.date32()
date_columns = ("DOB", "DOD", "DOD_HOSP", "DOD_SSN")
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["PATIENTS"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = (
    get_table(
        "mimic-iii-clinical-database-1.4/PATIENTS.csv.gz",
        convert_options=convert_options,
    )
    .cast(pa.schema(SCHEMAS["PATIENTS"] | {key: date_type for key in date_columns}))
    .combine_chunks()
)
table_info(table)

## PRESCRIPTIONS

In [None]:
# Load PRESCRIPTIONS with its SCHEMAS column types, merge chunks, then re-cast
# STARTDATE/ENDDATE to a date type and report via table_info.
# NOTE(review): the original referenced DATE_TYPE, which is never defined or
# imported in this notebook (NameError on a fresh kernel). pa.date32() is
# assumed to be the intended type — confirm against tsdm.datasets.schema.mimic_iii.
date_type = pa.date32()
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["PRESCRIPTIONS"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = (
    get_table(
        "mimic-iii-clinical-database-1.4/PRESCRIPTIONS.csv.gz",
        convert_options=convert_options,
    )
    .combine_chunks()
    .cast(
        pa.schema(
            SCHEMAS["PRESCRIPTIONS"] | {"STARTDATE": date_type, "ENDDATE": date_type}
        )
    )
)
table_info(table)

## PROCEDUREEVENTS_MV

In [None]:
# Load PROCEDUREEVENTS_MV with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["PROCEDUREEVENTS_MV"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/PROCEDUREEVENTS_MV.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## PROCEDURES_ICD

In [None]:
# Load PROCEDURES_ICD with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["PROCEDURES_ICD"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/PROCEDURES_ICD.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## SERVICES

In [None]:
# Load SERVICES with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["SERVICES"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/SERVICES.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## TRANSFERS

In [None]:
# Load TRANSFERS with its SCHEMAS column types, merge chunks, report via table_info.
convert_options = csv.ConvertOptions(
    column_types=SCHEMAS["TRANSFERS"],
    strings_can_be_null=True,
    null_values=NULL_VALUES,
    true_values=TRUE_VALUES,
    false_values=FALSE_VALUES,
)
table = get_table(
    "mimic-iii-clinical-database-1.4/TRANSFERS.csv.gz",
    convert_options=convert_options,
).combine_chunks()
table_info(table)

## FIN