In [None]:
# Reference links (the trailing words are short notes on each link):
# https://www.sciencedirect.com/science/article/pii/S2352711024002474 - imp curve
# https://www.nature.com/articles/s41598-024-61365-z
# https://ieeexplore.ieee.org/document/9658486 - representations
# https://proceedings.neurips.cc/paper_files/paper/2019/file/1c336b8080f82bcc2cd2499b4c57261d-Paper.pdf - calibration
# https://github.com/dirichletcal/dirichlet_python/blob/master/examples/calibration_example.py

In [None]:
from pathlib import Path

import ipynbname
import pandas as pd

from eicu_pipeline.load_data import EICULoad
from extra.mappings import equate_columns_mimic_and_eicu
from mimic_pipeline.load_data import MimicLoad
from preprocess_data import DataPreprocess

# Load data

In [None]:
# CCSR diagnosis category codes used throughout this notebook; the same list is
# passed to both the MIMIC and the eICU loaders.
# Edit this list to train a model on a different set of diagnoses.
diagnosis_codes = [
    "INF002",
    "CIR011",
    "END002",
    "RSP002",
    "NEO073",
    "INJ010",
    "NEO022",
    "DIG021",
    "CIR021",
    "INJ008",
]

In [None]:
# Locate this notebook on disk; the project root is the parent of the
# directory that contains the notebook.
nb_path = Path(ipynbname.path())
notebook_dir = nb_path.parent
project_root = notebook_dir.parent.resolve()

## Load the MIMIC-IV data using the pipeline

In [None]:
# Path to the MIMIC-IV 2.2 files under the project root
mimic_path = project_root.joinpath("physionet.org", "files", "mimiciv", "2.2")

# Build the MIMIC loader from that directory and the diagnosis codes defined above
mimic_data = MimicLoad(diagnosis_codes=diagnosis_codes, mimic_root=mimic_path)

In [None]:
# Show the loaded MIMIC data as this cell's output
mimic_data.data

## Load the eICU database

In [None]:
# Path to the eICU-CRD 2.0 files under the project root
eicu_path = project_root.joinpath("physionet.org", "files", "eicu-crd", "2.0")

# Build the eICU loader from that directory and the diagnosis codes defined above
eicu_data = EICULoad(eicu_path, diagnosis_codes)

In [None]:
# Show the loaded eICU data as this cell's output
eicu_data.data

# Preprocess datasets

## Make both datasets have the same columns

In [None]:
# Pass both datasets through the column-equating helper.
# NOTE: the results overwrite `.data` on both loader objects, so re-running this
# cell feeds the already-equated data back into the helper.
mimic_equated, eicu_equated = equate_columns_mimic_and_eicu(
    mimic_data=mimic_data.data,
    eicu_data=eicu_data.data,
)
mimic_data.data = mimic_equated
eicu_data.data = eicu_equated

In [None]:
# Show the MIMIC data after the column-equating step
mimic_data.data

In [None]:
# Show the eICU data after the column-equating step
eicu_data.data

## Filter NaNs and encode categorical columns

In [None]:
# These are measurements we want to keep
# even if they contain a lot of NaNs
base_features = [
    "Heart Rate",
    "O2 saturation pulseoxymetry",
    "Respiratory Rate",
    "Temperature Celsius",
    "Central Venous Pressure",
    "Pulmonary Artery Pressure mean",
    "Pulmonary Artery Pressure diastolic",
    "Pulmonary Artery Pressure systolic",
]

# Prefixes that are combined with every base feature name
prefixes = ["last_", "mean_", "median_", "min_", "max_"]

# Every prefix/feature combination, grouped by prefix:
# all "last_*" names first, then all "mean_*" names, and so on.
expanded_features = [
    stat_prefix + feature_name
    for stat_prefix in prefixes
    for feature_name in base_features
]

In [None]:
# Build the preprocessing object: MIMIC is passed as the main dataset and
# eICU as the external one.
preprocessed_data = DataPreprocess(
    main_data=mimic_data.data,
    external_data=eicu_data.data,
    # Name of the label column (CCSR category)
    label="CCSR CATEGORY 1",
    # Columns treated as categorical
    cat_variables=["gender"],
    # Prefixed measurement columns that should be kept even with many NaNs
    important_variables=expanded_features,
    # NOTE(review): presumably the NaN percentage above which a column is
    # filtered out (unless listed in important_variables) - confirm in DataPreprocess
    max_nan_percentage=10,
)