In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from MEDS_tabular_automl.scripts.launch_xgboost import Iterator

In [None]:
from hydra import compose, initialize
from omegaconf import OmegaConf

# Tabularization settings, listed separately so they are easy to locate and adjust.
window_sizes = ["1d", "30d", "365d", "full"]
aggregations = [
    "static/present",
    "code/count",
    "value/count",
    "value/sum",
    "value/sum_sqd",
    "value/min",
    "value/max",
]

# Values applied on top of the packaged `launch_xgboost` config. Each entry is later
# rendered as a `key=value` override string.
overrides_config = {
    "MEDS_cohort_dir": "/path/to/data",
    "output_cohort_dir": "/path/to/output_cohort",
    "do_overwrite": False,
    "seed": 1,
    "hydra.verbose": True,
    "tqdm": False,
    "loguru_init": True,
    "task_name": "long_los",
    "tabularization.window_sizes": window_sizes,
    "tabularization.aggs": aggregations,
}

# Directory that holds the config YAML files, passed to Hydra as `config_path`.
config_dir = "../src/MEDS_tabular_automl/configs/"
with initialize(version_base=None, config_path=config_dir):
    # Render every entry as the `key=value` string form that `compose` takes as overrides.
    overrides = ["{}={}".format(name, setting) for name, setting in overrides_config.items()]
    cfg = compose(config_name="launch_xgboost", overrides=overrides)

# Build one iterator per data split from the composed config.
iterator_train, iterator_tuning, iterator_held_out = (
    Iterator(cfg, split=split_name) for split_name in ("train", "tuning", "held_out")
)

In [None]:
# Ideally the iterator output would be loaded straight into autogluon TabularDatasets (after
# converting to a pandas dataframe). This is not possible since autogluon requires Python <=3.11,
# so until that dependency is resolved the data must be saved to disk here and loaded again later.

save_dir = Path("/path/to/saved/data")
# `DataFrame.to_csv` does not create missing parent directories, so make sure the target exists.
save_dir.mkdir(parents=True, exist_ok=True)


def export_split_to_csv(iterator, csv_path):
    """Collect one split in memory, convert it to a labelled dataframe, and write it as CSV.

    Args:
        iterator: Object exposing `collect_in_memory()` (returning a `(data, labels)` pair) and
            `get_all_column_names()`; here, one of the `Iterator` instances built for a split.
        csv_path: Destination path of the CSV file. Its parent directory must already exist.

    Returns:
        A `(data, labels, data_df)` tuple: the two objects returned by `collect_in_memory()` and
        the dataframe that was written, which holds the feature columns plus a `label` column.
    """
    data, labels = iterator.collect_in_memory()
    # autogluon only accepts data as a pandas dataframe.
    # NOTE(review): `data` is presumably a SciPy sparse matrix, in which case `.todense()`
    # materializes the full dense matrix in memory — confirm this fits for large cohorts.
    data_df = pd.DataFrame(data.todense(), columns=iterator.get_all_column_names())
    data_df["label"] = labels
    data_df.to_csv(csv_path, index=False)
    return data, labels, data_df


# Keep the per-split names bound at notebook level so later cells can keep using them.
train_data, train_labels, train_data_df = export_split_to_csv(
    iterator_train, save_dir / "train_data.csv"
)
tuning_data, tuning_labels, tuning_data_df = export_split_to_csv(
    iterator_tuning, save_dir / "tuning_data.csv"
)
held_out_data, held_out_labels, held_out_data_df = export_split_to_csv(
    iterator_held_out, save_dir / "held_out_data.csv"
)