In [None]:
# Display the value of a cell's final expression or assignment.
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'  # always print last expr.
# Render inline matplotlib figures as SVG.
%config InlineBackend.figure_format = 'svg'
# Reload imported modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
%matplotlib inline

import logging

# Emit log records of level INFO and above.
logging.basicConfig(level=logging.INFO)

In [None]:
import gzip
import os
import subprocess
from getpass import getpass
from pathlib import Path
from zipfile import ZipFile

import pandas as pd
import polars as pl
import pyarrow as pa
from pandas import DataFrame
from pyarrow import csv, parquet
from tqdm.autonotebook import tqdm

import tsdm

In [None]:
# Dataset handle; later cells read its `rawdata_paths` and `__version__`.
# NOTE(review): `initialize=False` presumably skips loading/preprocessing — confirm in tsdm.
self = tsdm.datasets.MIMIC_III(initialize=False)
self.table_names

In [None]:
# Location of the raw MIMIC-III zip archive, looked up by file name in the dataset's rawdata_paths.
RAWDATA_PATH = self.rawdata_paths[f"mimic-iii-clinical-database-{self.__version__}.zip"]

# Column types used by the `column_types` schema below.
# The plain strings are handed to pyarrow (csv.ConvertOptions / pa.schema) as type names.
ID_TYPE = "uint32"
VALUE_TYPE = "float32"
TIME_TYPE = "timestamp[s]"
DATE_TYPE = "date32[day]"
BOOL_TYPE = "bool"
STRING_TYPE = "string"
DICT_TYPE = pa.dictionary("int32", "string")
NULL_TYPE = pa.null()
TEXT_TYPE = pa.large_utf8()
# Maps pandas Arrow dtype strings to the *names* (as strings) of the constants above;
# get_types uses it to report which constant matches each inferred column dtype.
TYPES_MAP = {
    "int64[pyarrow]": "ID_TYPE",
    "string[pyarrow]": "STRING_TYPE",
    "timestamp[s][pyarrow]": "TIME_TYPE",
    "double[pyarrow]": "VALUE_TYPE",
}

In [None]:
def load_pandas(fname, **kwargs):
    """Read one gzip-compressed CSV member of the raw archive with pandas.

    Parameters
    ----------
    fname:
        Name of the member inside the zip archive at ``RAWDATA_PATH``.
    **kwargs:
        Extra keyword arguments for ``pd.read_csv``; they override the
        defaults ``dtype_backend="pyarrow"`` and ``compression="gzip"``.

    Returns
    -------
    The object returned by ``pd.read_csv``.
    """
    read_options = {"dtype_backend": "pyarrow", "compression": "gzip", **kwargs}
    with ZipFile(RAWDATA_PATH, "r") as archive, archive.open(fname) as member:
        return pd.read_csv(member, **read_options)


def load_gz_table(archive, fname, **options):
    """Read a gzip-compressed CSV member of an open zip archive with pyarrow.

    Parameters
    ----------
    archive:
        Open archive object providing ``open(name, mode)``.
    fname:
        Member name; converted with ``str`` before the lookup.
    **options:
        Forwarded unchanged to ``csv.read_csv``.

    Returns
    -------
    The table produced by ``csv.read_csv``.
    """
    member_name = str(fname)
    with archive.open(member_name, "r") as compressed:
        # The member itself is gzip data, so it is decompressed on the fly.
        with gzip.open(compressed, "r") as stream:
            return csv.read_csv(stream, **options)


def load_txt(archive, fname):
    """Return the content of a text member of an open zip archive.

    Parameters
    ----------
    archive:
        Open archive object providing ``open(name, mode)``.
    fname:
        Member name; converted with ``str`` before the lookup.

    Returns
    -------
    str
        The member's bytes decoded as UTF-8.
    """
    member_name = str(fname)
    with archive.open(member_name, "r") as handle:
        raw = handle.read()
    return raw.decode("utf8")


def get_types(table):
    """Report, per column, the name of the type constant matching its dtype.

    The table is converted to pandas with Arrow-backed dtypes, each dtype is
    rendered as a string and looked up in ``TYPES_MAP``. Dtypes without an
    entry in ``TYPES_MAP`` map to a missing value.
    """
    frame = table.to_pandas(types_mapper=pd.ArrowDtype)
    dtype_names = frame.dtypes.astype(str)
    return dtype_names.map(TYPES_MAP)


def compute_entropy(value_counts) -> float:
    """Compute the normalized Shannon entropy of a value-count distribution.

    Parameters
    ----------
    value_counts:
        Struct array as returned by ``value_counts()``; field 1 holds the
        number of occurrences of each distinct value.

    Returns
    -------
    float
        ``-sum(p * log2(p)) / log2(n)`` where ``p`` are the relative
        frequencies and ``n`` is the number of distinct values. With fewer
        than two distinct values the result is defined as ``0.0``.
    """
    counts = pa.compute.struct_field(value_counts, 1)
    n = len(counts)
    # With n == 1 the normalizer log2(1) is zero (0/0 -> NaN), and with n == 0
    # the sums are null; a constant or empty column carries no information.
    if n <= 1:
        return 0.0

    freqs = pa.compute.divide(
        pa.compute.cast(counts, pa.float64()),
        pa.compute.sum(counts),
    )

    H = pa.compute.divide(
        pa.compute.sum(pa.compute.multiply(freqs, pa.compute.log2(freqs))),
        pa.compute.log2(n),
    )
    return -H.as_py()


def get_table(fname, **options) -> DataFrame:
    fname = Path(fname)
    with ZipFile(RAWDATA_PATH, "r") as archive:
        match fname.suffix:
            case ".gz":
                return load_gz_table(archive, fname, **options)
            case ".txt":
                return load_txt(archive, fname, **options)
            case _:
                raise ValueError


def table_info(table) -> None:
    """Print a per-column summary of a pyarrow table.

    The first line shows the table shape and its in-memory size in GiB.
    Then, for every column, one line is printed with:

    - the fraction of null entries (``None`` if there are none),
    - the number of distinct non-null values and their ratio to the number
      of non-null entries,
    - the normalized entropy of the value distribution,
    - the column type, truncated to 10 characters.

    Empty tables and empty columns are reported with zero ratios instead of
    raising.
    """
    size = table.nbytes / (1024 * 1024 * 1024)
    print(f"shape={table.shape}  {size=:.3f} GiB")
    # default=0 keeps a table without columns from raising in max().
    max_key = max(map(len, table.column_names), default=0)
    columns = zip(table.column_names, table.columns)
    # zip() has no length, so the total is passed for a meaningful progress bar.
    for name, col in tqdm(columns, total=table.num_columns):
        num_total = len(col)
        # null_count is an int even for empty columns (summing an empty mask gives None).
        num_null = col.null_count
        value_counts = col.value_counts()
        # value_counts lists null as its own entry; exclude it from the uniques.
        num_uniques = len(value_counts) - bool(num_null)
        nulls = f"{num_null / num_total:7.2%}" if num_null else f"{'None':7s}"
        num_valid = num_total - num_null
        uniques = num_uniques / num_valid if num_valid else 0.0
        entropy = compute_entropy(value_counts) if len(value_counts) else 0.0
        dtype = str(col.type)[:10]
        print(
            f"{name:{max_key}s}  {nulls=:s}  {num_uniques=:9d} ({uniques:7.2%})  {entropy=:7.2%}  {dtype=:s}"
        )


def filter_nulls(table: pa.Table, cols: list[str], aggregation="or") -> pa.Table:
    """Drop rows of ``table`` depending on null entries in ``cols``.

    Parameters
    ----------
    table:
        Table to filter.
    cols:
        Names of the columns whose null entries are inspected.
    aggregation:
        ``"or"`` (default) drops a row if *any* of the columns is null;
        ``"and"`` drops a row only if *all* of the columns are null.

    Returns
    -------
    pa.Table
        The filtered table. With an empty ``cols`` the table is returned
        unchanged.

    Raises
    ------
    ValueError
        If ``aggregation`` is neither ``"or"`` nor ``"and"``.
    """
    combiners = {"or": pa.compute.or_, "and": pa.compute.and_}
    if aggregation not in combiners:
        raise ValueError(
            f"Unknown aggregation {aggregation!r}; expected 'or' or 'and'."
        )
    if not cols:
        # No columns to inspect means no row can be dropped.
        return table

    combine = combiners[aggregation]
    masks = [table[col].is_null() for col in cols]
    drop_mask = masks[0]
    for mask in masks[1:]:
        drop_mask = combine(drop_mask, mask)
    return table.filter(pa.compute.invert(drop_mask))


def trim_whitespace(table: pa.Table) -> pa.Table:
    """Strip leading and trailing whitespace from all text columns.

    A column counts as text if its type is string or large string, or a
    dictionary whose values are of such a type. Other columns are left
    untouched. Trimmed columns keep their original type.

    Parameters
    ----------
    table:
        Input table; it is not modified.

    Returns
    -------
    pa.Table
        A table with the text columns trimmed.
    """
    for index, column in enumerate(table.column_names):
        array = table.column(index)
        dtype = array.type
        is_dict = pa.types.is_dictionary(dtype)
        value_type = dtype.value_type if is_dict else dtype
        if not (
            pa.types.is_string(value_type) or pa.types.is_large_string(value_type)
        ):
            continue

        # Dictionary columns are decoded first, since trimming works on strings.
        values = array.cast(value_type) if is_dict else array
        trimmed = pa.compute.utf8_trim_whitespace(values)
        if is_dict:
            trimmed = trimmed.dictionary_encode()
            if trimmed.type != dtype:
                trimmed = trimmed.cast(dtype)
        # Tables are immutable: set_column returns a new table that must be kept.
        table = table.set_column(index, column, trimmed)
    return table

In [None]:
# Raw CSV strings treated as missing values; passed as `null_values` to
# csv.ConvertOptions in the loading cells below.
NULL_VALUES = [
    "",
    " ",
    "  ",
    "   ",
    "    ",
    "     ",
    "      ",
    "       ",
    "        ",
    "-",
    "---",
    "----",
    "-----",
    "-------",
    "?",
    "UNABLE TO OBTAIN",
    "UNKNOWN",
    "Unknown",
    "unknown",
    ".",
    ".*.",
    "___.",
    "_",
    "__",
    "___",
]
# Raw CSV strings parsed as boolean True / False (`true_values` / `false_values`).
TRUE_VALUES = ["Y", "Yes", "1", "T"]
FALSE_VALUES = ["N", "No", "0", "F"]

# Schema of the raw tables: table name -> {column name -> pyarrow type}.
# Each inner mapping is passed as `column_types` to csv.ConvertOptions when the
# corresponding table is loaded below.
# Columns annotated with "cast DATE"/"cast DATE_TYPE" are read as TIME_TYPE and
# cast to DATE_TYPE after loading (see the respective loading cells).
# NOTE(review): NULL_TYPE columns are presumably entirely empty in the raw data — confirm.
# The trailing ";" after the closing brace suppresses the cell's output.
column_types = {
    "ADMISSIONS": {
        # fmt: off
        "ROW_ID"               : ID_TYPE,
        "SUBJECT_ID"           : ID_TYPE,
        "HADM_ID"              : ID_TYPE,
        "ADMITTIME"            : TIME_TYPE,
        "DISCHTIME"            : TIME_TYPE,
        "DEATHTIME"            : TIME_TYPE,
        "ADMISSION_TYPE"       : DICT_TYPE,
        "ADMISSION_LOCATION"   : DICT_TYPE,
        "DISCHARGE_LOCATION"   : DICT_TYPE,
        "INSURANCE"            : DICT_TYPE,
        "LANGUAGE"             : DICT_TYPE,
        "RELIGION"             : DICT_TYPE,
        "MARITAL_STATUS"       : DICT_TYPE,
        "ETHNICITY"            : DICT_TYPE,
        "EDREGTIME"            : TIME_TYPE,
        "EDOUTTIME"            : TIME_TYPE,
        "DIAGNOSIS"            : STRING_TYPE,
        "HOSPITAL_EXPIRE_FLAG" : BOOL_TYPE,
        "HAS_CHARTEVENTS_DATA" : BOOL_TYPE,
        # fmt: on
    },
    "CALLOUT": {
        # fmt: off
        "ROW_ID"                 : ID_TYPE,
        "SUBJECT_ID"             : ID_TYPE,
        "HADM_ID"                : ID_TYPE,
        "SUBMIT_WARDID"          : ID_TYPE,
        "SUBMIT_CAREUNIT"        : DICT_TYPE,
        "CURR_WARDID"            : ID_TYPE,
        "CURR_CAREUNIT"          : DICT_TYPE,
        "CALLOUT_WARDID"         : ID_TYPE,
        "CALLOUT_SERVICE"        : DICT_TYPE,
        "REQUEST_TELE"           : BOOL_TYPE,
        "REQUEST_RESP"           : BOOL_TYPE,
        "REQUEST_CDIFF"          : BOOL_TYPE,
        "REQUEST_MRSA"           : BOOL_TYPE,
        "REQUEST_VRE"            : BOOL_TYPE,
        "CALLOUT_STATUS"         : DICT_TYPE,
        "CALLOUT_OUTCOME"        : DICT_TYPE,
        "DISCHARGE_WARDID"       : ID_TYPE,
        "ACKNOWLEDGE_STATUS"     : DICT_TYPE,
        "CREATETIME"             : TIME_TYPE,
        "UPDATETIME"             : TIME_TYPE,
        "ACKNOWLEDGETIME"        : TIME_TYPE,
        "OUTCOMETIME"            : TIME_TYPE,
        "FIRSTRESERVATIONTIME"   : TIME_TYPE,
        "CURRENTRESERVATIONTIME" : TIME_TYPE,
        # fmt: on
    },
    "CAREGIVERS": {
        # fmt: off
        "ROW_ID"      : ID_TYPE,
        "CGID"        : ID_TYPE,
        "LABEL"       : DICT_TYPE,
        "DESCRIPTION" : DICT_TYPE,
        # fmt: on
    },
    "CHARTEVENTS": {
        # fmt: off
        "ROW_ID"       : ID_TYPE,
        "SUBJECT_ID"   : ID_TYPE,
        "HADM_ID"      : ID_TYPE,
        "ICUSTAY_ID"   : ID_TYPE,
        "ITEMID"       : ID_TYPE,
        "CHARTTIME"    : TIME_TYPE,
        "STORETIME"    : TIME_TYPE,
        "CGID"         : ID_TYPE,
        "VALUE"        : STRING_TYPE,  # FILTER NULLS
        "VALUENUM"     : VALUE_TYPE,
        "VALUEUOM"     : DICT_TYPE,
        "WARNING"      : BOOL_TYPE,
        "ERROR"        : BOOL_TYPE,
        "RESULTSTATUS" : DICT_TYPE,
        "STOPPED"      : DICT_TYPE,
        # fmt: on
    },
    "CPTEVENTS": {
        # fmt: off
        "ROW_ID"           : ID_TYPE,
        "SUBJECT_ID"       : ID_TYPE,
        "HADM_ID"          : ID_TYPE,
        "COSTCENTER"       : DICT_TYPE,
        "CHARTDATE"        : TIME_TYPE,  # cast date_type
        "CPT_CD"           : DICT_TYPE,
        "CPT_NUMBER"       : ID_TYPE,
        "CPT_SUFFIX"       : DICT_TYPE,
        "TICKET_ID_SEQ"    : ID_TYPE,
        "SECTIONHEADER"    : DICT_TYPE,
        "SUBSECTIONHEADER" : DICT_TYPE,
        "DESCRIPTION"      : DICT_TYPE,
        # fmt: on
    },
    "DATETIMEEVENTS": {
        # fmt: off
        "ROW_ID"       : ID_TYPE,
        "SUBJECT_ID"   : ID_TYPE,
        "HADM_ID"      : ID_TYPE,
        "ICUSTAY_ID"   : ID_TYPE,
        "ITEMID"       : ID_TYPE,
        "CHARTTIME"    : TIME_TYPE,
        "STORETIME"    : TIME_TYPE,
        "CGID"         : ID_TYPE,
        "VALUE"        : TIME_TYPE,
        "VALUEUOM"     : DICT_TYPE,
        "WARNING"      : BOOL_TYPE,
        "ERROR"        : BOOL_TYPE,
        "RESULTSTATUS" : DICT_TYPE,
        "STOPPED"      : DICT_TYPE,
        # fmt: on
    },
    "D_CPT": {
        # fmt: off
        "ROW_ID"              : ID_TYPE,
        "CATEGORY"            : ID_TYPE,
        "SECTIONRANGE"        : DICT_TYPE,
        "SECTIONHEADER"       : DICT_TYPE,
        "SUBSECTIONRANGE"     : STRING_TYPE,
        "SUBSECTIONHEADER"    : STRING_TYPE,
        "CODESUFFIX"          : BOOL_TYPE,
        "MINCODEINSUBSECTION" : ID_TYPE,
        "MAXCODEINSUBSECTION" : ID_TYPE,
        # fmt: on
    },
    "DIAGNOSES_ICD": {
        # fmt: off
        "ROW_ID"     : ID_TYPE,
        "SUBJECT_ID" : ID_TYPE,
        "HADM_ID"    : ID_TYPE,
        "SEQ_NUM"    : ID_TYPE,
        "ICD9_CODE"  : DICT_TYPE,
        # fmt: on
    },
    "D_ICD_DIAGNOSES": {
        # fmt: off
        "ROW_ID"      : ID_TYPE,
        "ICD9_CODE"   : STRING_TYPE,
        "SHORT_TITLE" : STRING_TYPE,
        "LONG_TITLE"  : STRING_TYPE,
        # fmt: on
    },
    "D_ICD_PROCEDURES": {
        # fmt: off
        "ROW_ID"      : ID_TYPE,
        "ICD9_CODE"   : STRING_TYPE,
        "SHORT_TITLE" : STRING_TYPE,
        "LONG_TITLE"  : STRING_TYPE,
        # fmt: on
    },
    "D_ITEMS": {
        # fmt: off
        "ROW_ID"       : ID_TYPE,
        "ITEMID"       : ID_TYPE,
        "LABEL"        : STRING_TYPE,
        "ABBREVIATION" : STRING_TYPE,
        "DBSOURCE"     : DICT_TYPE,
        "LINKSTO"      : DICT_TYPE,
        "CATEGORY"     : DICT_TYPE,
        "UNITNAME"     : DICT_TYPE,
        "PARAM_TYPE"   : DICT_TYPE,
        "CONCEPTID"    : NULL_TYPE,
        # fmt: on
    },
    "D_LABITEMS": {
        # fmt: off
        "ROW_ID"     : ID_TYPE,
        "ITEMID"     : ID_TYPE,
        "LABEL"      : STRING_TYPE,
        "FLUID"      : DICT_TYPE,
        "CATEGORY"   : DICT_TYPE,
        "LOINC_CODE" : STRING_TYPE,
        # fmt: on
    },
    "DRGCODES": {
        # fmt: off
        "ROW_ID"        : ID_TYPE,
        "SUBJECT_ID"    : ID_TYPE,
        "HADM_ID"       : ID_TYPE,
        "DRG_TYPE"      : DICT_TYPE,
        "DRG_CODE"      : ID_TYPE,
        "DESCRIPTION"   : DICT_TYPE,
        "DRG_SEVERITY"  : ID_TYPE,
        "DRG_MORTALITY" : ID_TYPE,
        # fmt: on
    },
    "ICUSTAYS": {
        # fmt: off
        "ROW_ID"         : ID_TYPE,
        "SUBJECT_ID"     : ID_TYPE,
        "HADM_ID"        : ID_TYPE,
        "ICUSTAY_ID"     : ID_TYPE,
        "DBSOURCE"       : DICT_TYPE,
        "FIRST_CAREUNIT" : DICT_TYPE,
        "LAST_CAREUNIT"  : DICT_TYPE,
        "FIRST_WARDID"   : ID_TYPE,
        "LAST_WARDID"    : ID_TYPE,
        "INTIME"         : TIME_TYPE,
        "OUTTIME"        : TIME_TYPE,
        "LOS"            : VALUE_TYPE,
        # fmt: on
    },
    "INPUTEVENTS_CV": {
        # fmt: off
        "ROW_ID"            : ID_TYPE,
        "SUBJECT_ID"        : ID_TYPE,
        "HADM_ID"           : ID_TYPE,
        "ICUSTAY_ID"        : ID_TYPE,
        "CHARTTIME"         : TIME_TYPE,
        "ITEMID"            : ID_TYPE,
        "AMOUNT"            : VALUE_TYPE,
        "AMOUNTUOM"         : DICT_TYPE,
        "RATE"              : VALUE_TYPE,
        "RATEUOM"           : DICT_TYPE,
        "STORETIME"         : TIME_TYPE,
        "CGID"              : ID_TYPE,
        "ORDERID"           : ID_TYPE,
        "LINKORDERID"       : ID_TYPE,
        "STOPPED"           : DICT_TYPE,
        "NEWBOTTLE"         : BOOL_TYPE,
        "ORIGINALAMOUNT"    : VALUE_TYPE,
        "ORIGINALAMOUNTUOM" : DICT_TYPE,
        "ORIGINALROUTE"     : DICT_TYPE,
        "ORIGINALRATE"      : VALUE_TYPE,
        "ORIGINALRATEUOM"   : DICT_TYPE,
        "ORIGINALSITE"      : DICT_TYPE,
        # fmt: on
    },
    "INPUTEVENTS_MV": {
        # fmt: off
        "ROW_ID"                        : ID_TYPE,
        "SUBJECT_ID"                    : ID_TYPE,
        "HADM_ID"                       : ID_TYPE,
        "ICUSTAY_ID"                    : ID_TYPE,
        "STARTTIME"                     : TIME_TYPE,
        "ENDTIME"                       : TIME_TYPE,
        "ITEMID"                        : ID_TYPE,
        "AMOUNT"                        : VALUE_TYPE,
        "AMOUNTUOM"                     : DICT_TYPE,
        "RATE"                          : VALUE_TYPE,
        "RATEUOM"                       : DICT_TYPE,
        "STORETIME"                     : TIME_TYPE,
        "CGID"                          : ID_TYPE,
        "ORDERID"                       : ID_TYPE,
        "LINKORDERID"                   : ID_TYPE,
        "ORDERCATEGORYNAME"             : DICT_TYPE,
        "SECONDARYORDERCATEGORYNAME"    : DICT_TYPE,
        "ORDERCOMPONENTTYPEDESCRIPTION" : DICT_TYPE,
        "ORDERCATEGORYDESCRIPTION"      : DICT_TYPE,
        "PATIENTWEIGHT"                 : VALUE_TYPE,
        "TOTALAMOUNT"                   : VALUE_TYPE,
        "TOTALAMOUNTUOM"                : DICT_TYPE,
        "ISOPENBAG"                     : BOOL_TYPE,
        "CONTINUEINNEXTDEPT"            : BOOL_TYPE,
        "CANCELREASON"                  : ID_TYPE,
        "STATUSDESCRIPTION"             : DICT_TYPE,
        "COMMENTS_EDITEDBY"             : DICT_TYPE,
        "COMMENTS_CANCELEDBY"           : DICT_TYPE,
        "COMMENTS_DATE"                 : TIME_TYPE,
        "ORIGINALAMOUNT"                : VALUE_TYPE,
        "ORIGINALRATE"                  : VALUE_TYPE,
        # fmt: on
    },
    "LABEVENTS": {
        # fmt: off
        "ROW_ID"     : ID_TYPE,
        "SUBJECT_ID" : ID_TYPE,
        "HADM_ID"    : ID_TYPE,
        "ITEMID"     : ID_TYPE,
        "CHARTTIME"  : TIME_TYPE,
        "VALUE"      : DICT_TYPE,  # FILTER NULLS
        "VALUENUM"   : VALUE_TYPE,
        "VALUEUOM"   : DICT_TYPE,
        "FLAG"       : DICT_TYPE,
        # fmt: on
    },
    "MICROBIOLOGYEVENTS": {
        # fmt: off
        "ROW_ID"              : ID_TYPE,
        "SUBJECT_ID"          : ID_TYPE,
        "HADM_ID"             : ID_TYPE,
        "CHARTDATE"           : TIME_TYPE,  # cast DATE
        "CHARTTIME"           : TIME_TYPE,
        "SPEC_ITEMID"         : ID_TYPE,
        "SPEC_TYPE_DESC"      : DICT_TYPE,
        "ORG_ITEMID"          : ID_TYPE,
        "ORG_NAME"            : DICT_TYPE,
        "ISOLATE_NUM"         : ID_TYPE,
        "AB_ITEMID"           : ID_TYPE,
        "AB_NAME"             : DICT_TYPE,
        "DILUTION_TEXT"       : DICT_TYPE,
        "DILUTION_COMPARISON" : DICT_TYPE,
        "DILUTION_VALUE"      : VALUE_TYPE,
        "INTERPRETATION"      : DICT_TYPE,
        # fmt: on
    },
    "NOTEEVENTS": {
        # fmt: off
        "ROW_ID"      : ID_TYPE,
        "SUBJECT_ID"  : ID_TYPE,
        "HADM_ID"     : ID_TYPE,
        "CHARTDATE"   : DATE_TYPE,
        "CHARTTIME"   : TIME_TYPE,
        "STORETIME"   : TIME_TYPE,
        "CATEGORY"    : DICT_TYPE,
        "DESCRIPTION" : DICT_TYPE,
        "CGID"        : ID_TYPE,
        "ISERROR"     : BOOL_TYPE,
        "TEXT"        : TEXT_TYPE,
        # fmt: on
    },
    "OUTPUTEVENTS": {
        # fmt: off
        "ROW_ID"     : ID_TYPE,
        "SUBJECT_ID" : ID_TYPE,
        "HADM_ID"    : ID_TYPE,
        "ICUSTAY_ID" : ID_TYPE,
        "CHARTTIME"  : TIME_TYPE,
        "ITEMID"     : ID_TYPE,
        "VALUE"      : VALUE_TYPE,  # FILTER NULLS
        "VALUEUOM"   : DICT_TYPE,
        "STORETIME"  : TIME_TYPE,
        "CGID"       : ID_TYPE,
        "STOPPED"    : NULL_TYPE,
        "NEWBOTTLE"  : NULL_TYPE,
        "ISERROR"    : NULL_TYPE,
        # fmt: on
    },
    "PATIENTS": {
        # fmt: off
        "ROW_ID"      : ID_TYPE,
        "SUBJECT_ID"  : ID_TYPE,
        "GENDER"      : DICT_TYPE,
        "DOB"         : TIME_TYPE,  # cast DATE_TYPE
        "DOD"         : TIME_TYPE,  # cast DATE_TYPE
        "DOD_HOSP"    : TIME_TYPE,  # cast DATE_TYPE
        "DOD_SSN"     : TIME_TYPE,  # cast DATE_TYPE
        "EXPIRE_FLAG" : BOOL_TYPE,
        # fmt: on
    },
    "PRESCRIPTIONS": {
        # fmt: off
        "ROW_ID"            : ID_TYPE,
        "SUBJECT_ID"        : ID_TYPE,
        "HADM_ID"           : ID_TYPE,
        "ICUSTAY_ID"        : ID_TYPE,
        "STARTDATE"         : TIME_TYPE,  # cast DATE_TYPE
        "ENDDATE"           : TIME_TYPE,  # cast DATE_TYPE
        "DRUG_TYPE"         : DICT_TYPE,
        "DRUG"              : DICT_TYPE,
        "DRUG_NAME_POE"     : DICT_TYPE,
        "DRUG_NAME_GENERIC" : DICT_TYPE,
        "FORMULARY_DRUG_CD" : DICT_TYPE,
        "GSN"               : DICT_TYPE,
        "NDC"               : DICT_TYPE,
        "PROD_STRENGTH"     : DICT_TYPE,
        "DOSE_VAL_RX"       : DICT_TYPE,
        "DOSE_UNIT_RX"      : DICT_TYPE,
        "FORM_VAL_DISP"     : DICT_TYPE,
        "FORM_UNIT_DISP"    : DICT_TYPE,
        "ROUTE"             : DICT_TYPE,
        # fmt: on
    },
    "PROCEDUREEVENTS_MV": {
        # fmt: off
        "ROW_ID"                     : ID_TYPE,
        "SUBJECT_ID"                 : ID_TYPE,
        "HADM_ID"                    : ID_TYPE,
        "ICUSTAY_ID"                 : ID_TYPE,
        "STARTTIME"                  : TIME_TYPE,
        "ENDTIME"                    : TIME_TYPE,
        "ITEMID"                     : ID_TYPE,
        "VALUE"                      : VALUE_TYPE,
        "VALUEUOM"                   : DICT_TYPE,
        "LOCATION"                   : DICT_TYPE,
        "LOCATIONCATEGORY"           : DICT_TYPE,
        "STORETIME"                  : TIME_TYPE,
        "CGID"                       : ID_TYPE,
        "ORDERID"                    : ID_TYPE,
        "LINKORDERID"                : ID_TYPE,
        "ORDERCATEGORYNAME"          : DICT_TYPE,
        "SECONDARYORDERCATEGORYNAME" : NULL_TYPE,
        "ORDERCATEGORYDESCRIPTION"   : DICT_TYPE,
        "ISOPENBAG"                  : BOOL_TYPE,
        "CONTINUEINNEXTDEPT"         : BOOL_TYPE,
        "CANCELREASON"               : ID_TYPE,
        "STATUSDESCRIPTION"          : DICT_TYPE,
        "COMMENTS_EDITEDBY"          : DICT_TYPE,
        "COMMENTS_CANCELEDBY"        : DICT_TYPE,
        "COMMENTS_DATE"              : TIME_TYPE,
        # fmt: on
    },
    "PROCEDURES_ICD": {
        # fmt: off
        "ROW_ID"     : ID_TYPE,
        "SUBJECT_ID" : ID_TYPE,
        "HADM_ID"    : ID_TYPE,
        "SEQ_NUM"    : ID_TYPE,
        "ICD9_CODE"  : DICT_TYPE,
        # fmt: on
    },
    "SERVICES": {
        # fmt: off
        "ROW_ID"       : ID_TYPE,
        "SUBJECT_ID"   : ID_TYPE,
        "HADM_ID"      : ID_TYPE,
        "TRANSFERTIME" : TIME_TYPE,
        "PREV_SERVICE" : DICT_TYPE,
        "CURR_SERVICE" : DICT_TYPE,
        # fmt: on
    },
    "TRANSFERS": {
        # fmt: off
        "ROW_ID"        : ID_TYPE,
        "SUBJECT_ID"    : ID_TYPE,
        "HADM_ID"       : ID_TYPE,
        "ICUSTAY_ID"    : ID_TYPE,
        "DBSOURCE"      : DICT_TYPE,
        "EVENTTYPE"     : DICT_TYPE,
        "PREV_CAREUNIT" : DICT_TYPE,
        "CURR_CAREUNIT" : DICT_TYPE,
        "PREV_WARDID"   : ID_TYPE,
        "CURR_WARDID"   : ID_TYPE,
        "INTIME"        : TIME_TYPE,
        "OUTTIME"       : TIME_TYPE,
        "LOS"           : VALUE_TYPE,
        # fmt: on
    },
};

## ADMISSIONS

In [None]:
# Read ADMISSIONS with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/ADMISSIONS.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["ADMISSIONS"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## CALLOUT

In [None]:
# Read CALLOUT with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/CALLOUT.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["CALLOUT"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## CAREGIVERS

In [None]:
# Read CAREGIVERS with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/CAREGIVERS.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["CAREGIVERS"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)
# Show which type constant (via TYPES_MAP) matches each column's dtype.
get_types(table)

## CHARTEVENTS

In [None]:
# Read CHARTEVENTS with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/CHARTEVENTS.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["CHARTEVENTS"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

In [None]:
# Statistics after dropping rows in which any of the listed columns is null.
table_info(filter_nulls(table, ["ICUSTAY_ID", "VALUE", "VALUENUM", "VALUEUOM"]))

## CPTEVENTS

In [None]:
# Read CPTEVENTS; CHARTDATE is parsed as a timestamp and then cast to DATE_TYPE.
table = get_table(
    "mimic-iii-clinical-database-1.4/CPTEVENTS.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["CPTEVENTS"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
).cast(pa.schema(column_types["CPTEVENTS"] | {"CHARTDATE": DATE_TYPE}))
table_info(table)

## DATETIMEEVENTS

In [None]:
# Read DATETIMEEVENTS with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/DATETIMEEVENTS.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["DATETIMEEVENTS"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## DIAGNOSES_ICD

In [None]:
# Read DIAGNOSES_ICD with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/DIAGNOSES_ICD.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["DIAGNOSES_ICD"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## DRGCODES

In [None]:
# Read DRGCODES with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/DRGCODES.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["DRGCODES"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## D_CPT

In [None]:
# Read D_CPT with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/D_CPT.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["D_CPT"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## D_ICD_DIAGNOSES

In [None]:
# Read D_ICD_DIAGNOSES with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/D_ICD_DIAGNOSES.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["D_ICD_DIAGNOSES"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## D_ICD_PROCEDURES

In [None]:
# Read D_ICD_PROCEDURES with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/D_ICD_PROCEDURES.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["D_ICD_PROCEDURES"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## D_ITEMS

In [None]:
# Read D_ITEMS with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/D_ITEMS.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["D_ITEMS"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## D_LABITEMS

In [None]:
# Read D_LABITEMS with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/D_LABITEMS.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["D_LABITEMS"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## ICUSTAYS

In [None]:
# Read ICUSTAYS with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/ICUSTAYS.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["ICUSTAYS"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## INPUTEVENTS_CV

In [None]:
# Read INPUTEVENTS_CV with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/INPUTEVENTS_CV.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["INPUTEVENTS_CV"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## INPUTEVENTS_MV

In [None]:
# Read INPUTEVENTS_MV with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/INPUTEVENTS_MV.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["INPUTEVENTS_MV"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## LABEVENTS

In [None]:
# Read LABEVENTS with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/LABEVENTS.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["LABEVENTS"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

In [None]:
# Statistics after dropping rows in which any of the listed columns is null.
table_info(filter_nulls(table, ["VALUE", "VALUENUM", "VALUEUOM"]))

## MICROBIOLOGYEVENTS

In [None]:
# Read MICROBIOLOGYEVENTS; CHARTDATE is parsed as a timestamp and then cast to DATE_TYPE.
table = get_table(
    "mimic-iii-clinical-database-1.4/MICROBIOLOGYEVENTS.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["MICROBIOLOGYEVENTS"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
).cast(pa.schema(column_types["MICROBIOLOGYEVENTS"] | {"CHARTDATE": DATE_TYPE}))
table_info(table)

## NOTEEVENTS

In [None]:
# Read NOTEEVENTS with its declared column types and print per-column statistics.
# newlines_in_values=True allows CSV values that span several lines
# (presumably needed for the TEXT column — confirm against the raw data).
# Uses the imported `csv` module like every other loading cell.
table = get_table(
    "mimic-iii-clinical-database-1.4/NOTEEVENTS.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["NOTEEVENTS"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
    parse_options=csv.ParseOptions(newlines_in_values=True),
)
table_info(table)

## OUTPUTEVENTS

In [None]:
# Read OUTPUTEVENTS with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/OUTPUTEVENTS.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["OUTPUTEVENTS"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

In [None]:
# Statistics after dropping rows in which any of the listed columns is null.
table_info(filter_nulls(table, ["VALUE", "VALUEUOM"]))

## PATIENTS

In [None]:
# Read PATIENTS; the DOB/DOD* columns are parsed as timestamps and then cast to DATE_TYPE.
table = get_table(
    "mimic-iii-clinical-database-1.4/PATIENTS.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["PATIENTS"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
).cast(
    pa.schema(
        column_types["PATIENTS"]
        | {key: DATE_TYPE for key in ("DOB", "DOD", "DOD_HOSP", "DOD_SSN")}
    )
)
table_info(table)

## PRESCRIPTIONS

In [None]:
# Read PRESCRIPTIONS; STARTDATE/ENDDATE are parsed as timestamps and then cast to DATE_TYPE.
table = get_table(
    "mimic-iii-clinical-database-1.4/PRESCRIPTIONS.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["PRESCRIPTIONS"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
).cast(
    pa.schema(
        column_types["PRESCRIPTIONS"] | {"STARTDATE": DATE_TYPE, "ENDDATE": DATE_TYPE}
    )
)
table_info(table)

## PROCEDUREEVENTS_MV

In [None]:
# Read PROCEDUREEVENTS_MV with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/PROCEDUREEVENTS_MV.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["PROCEDUREEVENTS_MV"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## PROCEDURES_ICD

In [None]:
# Read PROCEDURES_ICD with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/PROCEDURES_ICD.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["PROCEDURES_ICD"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## SERVICES

In [None]:
# Read SERVICES with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/SERVICES.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["SERVICES"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

## TRANSFERS

In [None]:
# Read TRANSFERS with its declared column types and print per-column statistics.
table = get_table(
    "mimic-iii-clinical-database-1.4/TRANSFERS.csv.gz",
    convert_options=csv.ConvertOptions(
        column_types=column_types["TRANSFERS"],
        strings_can_be_null=True,
        null_values=NULL_VALUES,
        true_values=TRUE_VALUES,
        false_values=FALSE_VALUES,
    ),
)
table_info(table)

In [None]:
# NOTE(review): lists the attributes of the pyarrow parquet module; looks like
# leftover exploration — consider removing before sharing the notebook.
dir(parquet)

## FIN