# ACP Project - Systematic Model Comparison

In [None]:
# Execution-mode switch: when True the final cell parses its options from the
# command line with argparse; when False it uses the hard-coded notebook defaults.
SCRIPT = False


In [None]:
import warnings, pickle, os, itertools
from dataclasses import dataclass
from joblib import Parallel, delayed, parallel_backend

try:
    from sklearnex import patch_sklearn
    patch_sklearn()
except ImportError:
    pass

import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 300)

from IPython.display import display
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(rc={'figure.figsize':(10,10)})

import shap
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

%load_ext autoreload
%autoreload 1

In [None]:
from utils.evaluation import get_metrics, get_threshold_fpr
%aimport utils.evaluation

In [None]:
from sklearn.model_selection import train_test_split
from dataset import SCIData, SCICols
%aimport dataset

# Raw dataset as stored on disk.
sci = SCIData.load('data/sci.h5')

# Pre-processed dataset, ordered by admission time and passed through the
# project-specific SCIData preparation steps listed below.
scii = (
    SCIData(SCIData.quickload("data/sci_processed.h5").sort_values("AdmissionDateTime"))
    .mandate(SCICols.news_data_raw)
    .derive_critical_event(within=1, return_subcols=True)
    .augment_shmi(onehot=True)
    .omit_redundant()
    .raw_news()
    .derive_ae_diagnosis_stems(onehot=False)
    .categorize()
   # .onehot_encode_categories()
)

# shuffle=False keeps the row order, so the last 33% of rows form the test set.
# NOTE(review): this is a chronological hold-out only if the SCIData steps above
# preserve the AdmissionDateTime ordering - confirm.
# The training-side outcome vectors are discarded (`_`); only the test-side
# mortality and critical-care labels are kept.
sci_train, sci_test, _, y_test_mortality, _, y_test_criticalcare = train_test_split(
    scii,
    scii.DiedWithinThreshold,
    scii.CriticalCare,
    test_size=0.33,
    random_state=42,
    shuffle=False,
)
# Re-wrap the split frames so the SCIData helper methods are available on them.
sci_train, sci_test = SCIData(sci_train), SCIData(sci_test)
# (X_train, y_train), (X_test, y_test) = (
#     sci_train.xy(outcome="CriticalEvent", dropna=False, fillna=False),
#     sci_test.xy(outcome="CriticalEvent", dropna=False, fillna=False),
# )

In [None]:
# Feature matrices and "CriticalEvent" outcome vectors for both splits, with
# row dropping and NA filling switched off.
(X_train, y_train), (X_test, y_test) = (
    sci_train.xy(outcome="CriticalEvent", dropna=False, fillna=False),
    sci_test.xy(outcome="CriticalEvent", dropna=False, fillna=False),
)
# Capture the categorical column description before X_train is converted below.
categorical_cols, categories = X_train.describe_categories()
#X_train = X_train.ordinal_encode_categories()
# From here on X_train is the ordinal-encoded data converted with `.to_numpy()`.
X_train = X_train.ordinal_encode_categories().to_numpy()

In [None]:
from sklearn.base import BaseEstimator
from typing import Dict, Any, Iterable


class Estimator:
    """Base descriptor bundling an estimator class with its configuration.

    Subclasses set ``_name`` (used as the ``<name>__<param>`` prefix),
    ``_estimator`` (the class instantiated by :meth:`factory`) and
    ``_requirements``, and may override the parameter dictionaries below.
    """

    _name: str
    _estimator: BaseEstimator
    _requirements: Dict[str, bool]
    # Keyword arguments always passed to the estimator constructor.
    _static_params: Dict[str, Any] = {}
    # Fallback values for tunable parameters; explicit params override them.
    _tuning_params_default: Dict[str, Any] = {}
    # Source of the dictionary returned by ``fit_params``.
    _fit_params: Dict[str, Any] = {}

    def __init__(self, sci_train):
        """Accept the training data; the base class does nothing with it."""

    @classmethod
    def suggest_parameters(cls, trial):
        """Return the parameters to evaluate for ``trial`` (none by default)."""
        return {}

    @classmethod
    def compile_parameters(cls, params):
        """Merge static, default and given params and prefix every key.

        Later sources win: ``params`` overrides ``_tuning_params_default``,
        which overrides ``_static_params``. Keys become ``<_name>__<key>``.
        """
        merged = {}
        for layer in (cls._static_params, cls._tuning_params_default, params):
            merged.update(layer)

        prefix = f"{cls._name}__"
        return {f"{prefix}{key}": value for key, value in merged.items()}

    @classmethod
    def factory(cls):
        """Instantiate ``_estimator`` with the static parameters only."""
        estimator_class = cls._estimator
        return estimator_class(**cls._static_params)

    @classmethod
    def fit_params(cls, X_train, y_train):
        """Return ``_fit_params`` with every key prefixed by ``<_name>__``."""
        prefix = f"{cls._name}__"
        return {f"{prefix}{key}": value for key, value in cls._fit_params.items()}


In [None]:
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from imblearn import FunctionSampler
from functools import partial


class Resampler(Estimator):
    """Estimator descriptor for resampling steps wrapped in ``FunctionSampler``."""

    @classmethod
    def compile_parameters(cls, params):
        """Bundle all parameters into a single ``<_name>__kw_args`` entry.

        The merged dictionary (static < defaults < ``params``) is set as the
        sampler's ``kw_args`` rather than as individual prefixed keys.
        """
        kw_args = {**cls._static_params, **cls._tuning_params_default, **params}
        return {f"{cls._name}__kw_args": kw_args}

    @classmethod
    def factory(cls):
        """Build a ``FunctionSampler`` that calls ``SCIData.resample`` with ``_estimator``."""
        resample = partial(SCIData.resample, cls._estimator)
        return FunctionSampler(
            func=resample, validate=False, kw_args=cls._static_params
        )


class Resampler_SMOTE(Resampler):
    """SMOTE over-sampling step; tunes ``sampling_strategy`` and ``k_neighbors``."""

    _name = "SMOTE"
    _estimator = SMOTENC

    _static_params = {"random_state": 42, "n_jobs": None}

    _tuning_params_default = {"sampling_strategy": 0.1, "k_neighbors": 5}

    @classmethod
    def suggest_parameters(cls, trial):
        """Sample the SMOTE parameters from ``trial`` and compile them."""
        sampling_strategy = trial.suggest_float(
            f"{cls._name}__sampling_strategy", 0.1, 0.5
        )
        k_neighbors = trial.suggest_int(f"{cls._name}__k_neighbors", 2, 10)

        return cls.compile_parameters(
            {"sampling_strategy": sampling_strategy, "k_neighbors": k_neighbors}
        )

    @classmethod
    def factory(cls):
        """Build a ``FunctionSampler`` around ``SCIData.SMOTE``."""
        sampler_options = dict(validate=False, kw_args=cls._static_params)
        return FunctionSampler(func=SCIData.SMOTE, **sampler_options)


class Resampler_RandomUnderSampler(Resampler):
    """Random under-sampling step; tunes ``sampling_strategy`` only."""

    _name = "RandomUnderSampler"
    _estimator = RandomUnderSampler

    _static_params = {"random_state": 42, "replacement": False}

    _tuning_params_default = {"sampling_strategy": 0.1}

    @classmethod
    def suggest_parameters(cls, trial):
        """Sample ``sampling_strategy`` from ``trial`` and compile it."""
        sampling_strategy = trial.suggest_float(
            f"{cls._name}__sampling_strategy", 0.05, 0.5
        )
        return cls.compile_parameters({"sampling_strategy": sampling_strategy})


class No_Resampling(Resampler):
    """Placeholder resampling step that returns the data unchanged."""

    _name = "No_Resampling"

    @classmethod
    def suggest_parameters(cls, trial):
        """There is nothing to tune, so no parameters are suggested."""
        return {}

    @staticmethod
    def _(X, y):
        """Identity resampler: return the inputs as they were given."""
        return (X, y)

    @classmethod
    def factory(cls):
        """Build a ``FunctionSampler`` around the identity function."""
        identity = cls._
        return FunctionSampler(func=identity, validate=False)


In [None]:
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV


class Estimator_LightGBM(Estimator):
    """Configuration and Optuna search space for ``LGBMClassifier``."""

    _name = "LightGBM"
    _estimator = LGBMClassifier

    _requirements = {
        "onehot": False,
        "ordinal": False,
        "imputation": False,
        "fillna": False,
        "resampling": False,
    }

    _static_params = {
        "objective": "binary",
        "metric": ["l2", "auc"],
        "boosting_type": "gbdt",
        "n_jobs": 1,
        "random_state": 42,
        "verbose": -1,
    }

    _tuning_params_default = {
        "is_unbalance": True,
        "reg_alpha": 1.8e-3,
        "reg_lambda": 6e-4,
        "num_leaves": 14,
        "colsample_bytree": 0.4,
        "subsample": 0.97,
        "subsample_freq": 1,
        "min_child_samples": 6,
    }

    @classmethod
    def suggest_parameters(cls, trial):
        """Sample LightGBM hyperparameters from ``trial`` and compile them.

        When ``is_unbalance`` is sampled as False, ``scale_pos_weight`` is
        sampled as well and the ``is_unbalance`` entry is dropped from the
        result (presumably because LightGBM does not accept both at once).
        """
        prefix = f"{cls._name}__"

        suggestions = {
            "reg_alpha": trial.suggest_float(
                f"{prefix}reg_alpha", 1e-4, 10.0, log=True
            ),
            "reg_lambda": trial.suggest_float(
                f"{prefix}reg_lambda", 1e-4, 10.0, log=True
            ),
            "num_leaves": trial.suggest_int(f"{prefix}num_leaves", 2, 256),
            "colsample_bytree": trial.suggest_float(
                f"{prefix}colsample_bytree", 0.4, 1.0
            ),
            "subsample": trial.suggest_float(f"{prefix}subsample", 0.4, 1.0),
            "subsample_freq": trial.suggest_int(f"{prefix}subsample_freq", 1, 7),
            "min_child_samples": trial.suggest_int(
                f"{prefix}min_child_samples", 5, 150
            ),
            "is_unbalance": trial.suggest_categorical(
                f"{prefix}is_unbalance", [True, False]
            ),
        }

        if suggestions["is_unbalance"]:
            return cls.compile_parameters(suggestions)

        # Explicit positive-class weight replaces the automatic balancing flag.
        suggestions["scale_pos_weight"] = trial.suggest_int(
            f"{prefix}scale_pos_weight", 1, 100
        )
        compiled = cls.compile_parameters(suggestions)
        del compiled[f"{prefix}is_unbalance"]
        return compiled


In [None]:
from xgboost import XGBClassifier


class Estimator_XGBoost(Estimator):
    """Configuration and Optuna search space for ``XGBClassifier``."""

    _name = "XGBoost"
    _estimator = XGBClassifier

    _requirements = {
        "onehot": False,
        "ordinal": False,
        "imputation": False,
        "fillna": False,
        "resampling": False,
    }

    _static_params = {
        "verbosity": 0,
        "n_jobs": 1,
        "objective": "binary:logistic",
        "booster": "gbtree",
        "enable_categorical": True,
    }

    # A plain dict literal is used because "lambda" is a Python keyword and
    # therefore cannot be written as a keyword argument to ``dict(...)``.
    _tuning_params_default = {
        "tree_method": "hist",
        "alpha": 7e-05,
        "subsample": 0.42,
        "colsample_bytree": 0.87,
        "scale_pos_weight": 14,
        "max_depth": 7,
        "min_child_weight": 10,
        "eta": 0.035,
        "gamma": 4e-08,
        "grow_policy": "lossguide",
        "lambda": 7e-2,
    }

    @classmethod
    def suggest_parameters(cls, trial):
        """Sample XGBoost hyperparameters from ``trial`` and compile them."""

        def key(param):
            # Optuna parameter name, namespaced by this estimator's step name.
            return f"{cls._name}__{param}"

        suggestions = {
            "tree_method": trial.suggest_categorical(
                key("tree_method"), ["approx", "hist"]
            ),
            "alpha": trial.suggest_float(key("alpha"), 1e-8, 1.0, log=True),
            "subsample": trial.suggest_float(key("subsample"), 0.2, 1.0),
            "colsample_bytree": trial.suggest_float(
                key("colsample_bytree"), 0.2, 1.0
            ),
            "scale_pos_weight": trial.suggest_int(key("scale_pos_weight"), 1, 100),
            "max_depth": trial.suggest_int(key("max_depth"), 3, 9, step=2),
            "min_child_weight": trial.suggest_int(key("min_child_weight"), 2, 10),
            "eta": trial.suggest_float(key("eta"), 1e-8, 1.0, log=True),
            "gamma": trial.suggest_float(key("gamma"), 1e-8, 1.0, log=True),
            "grow_policy": trial.suggest_categorical(
                key("grow_policy"), ["depthwise", "lossguide"]
            ),
            "lambda": trial.suggest_float(key("lambda"), 1e-8, 1.0, log=True),
        }

        return cls.compile_parameters(suggestions)


In [None]:
from sklearn.linear_model import LogisticRegression


class Estimator_LogisticRegression(Estimator):
    """Configuration and Optuna search space for ``LogisticRegression``."""

    _name = "LogisticRegression"
    _estimator = LogisticRegression

    _requirements = {
        "onehot": True,
        "ordinal": False,
        "imputation": True,
        "fillna": True,
        "resampling": False,
    }

    _static_params = {
        "max_iter": 100,
        "solver": "lbfgs",
        "random_state": 42,
        "penalty": "l2",
    }

    _tuning_params_default = {"penalty": "l2", "C": 5.9, "class_weight": "balanced"}

    @classmethod
    def suggest_parameters(cls, trial):
        """Sample penalty, C and class weighting from ``trial`` and compile them."""
        prefix = f"{cls._name}__"

        penalty = trial.suggest_categorical(f"{prefix}penalty", ["l2", "none"])
        regularisation = trial.suggest_float(f"{prefix}C", 0.01, 10)
        class_weight = trial.suggest_categorical(
            f"{prefix}class_weight", [None, "balanced"]
        )

        # Elastic-net support is currently disabled:
        # if suggestions["penalty"] == "elasticnet":
        #     suggestions["l1_ratio"] = trial.suggest_float(
        #         f"{cls._name}__l1_ratio", 0.05, 0.95
        #     )

        return cls.compile_parameters(
            {"penalty": penalty, "C": regularisation, "class_weight": class_weight}
        )



In [None]:
from sklearn.ensemble import RandomForestClassifier


class Estimator_RandomForest(Estimator):
    """Configuration and Optuna search space for ``RandomForestClassifier``."""

    _estimator = RandomForestClassifier
    _name = "RandomForest"

    _requirements = {
        "onehot": False,
        "ordinal": True,
        "imputation": False,
        "fillna": True,
        "resampling": False,
    }
    _tuning_params_default = {
        "n_estimators": 250,
        "max_features": 0.56,
        "min_samples_split": 8,
        "min_samples_leaf": 3,
        "max_samples": 0.75,
        "class_weight": "balanced",
    }

    @classmethod
    def suggest_parameters(cls, trial):
        """Sample random-forest hyperparameters from ``trial`` and compile them."""

        def key(param):
            # Optuna parameter name, namespaced by this estimator's step name.
            return f"{cls._name}__{param}"

        suggestions = {
            "n_estimators": trial.suggest_int(key("n_estimators"), 25, 250),
            "max_features": trial.suggest_float(key("max_features"), 0.15, 1.0),
            "min_samples_split": trial.suggest_int(key("min_samples_split"), 2, 15),
            "min_samples_leaf": trial.suggest_int(key("min_samples_leaf"), 1, 15),
            "max_samples": trial.suggest_float(key("max_samples"), 0.5, 0.99),
            "class_weight": trial.suggest_categorical(
                key("class_weight"), [None, "balanced", "balanced_subsample"]
            ),
        }

        return cls.compile_parameters(suggestions)


In [None]:
from utils.isolation_forest_wrapper import IsolationForestWrapper


class Estimator_IsolationForest(Estimator):
    """Configuration and Optuna search space for ``IsolationForestWrapper``."""

    _name = "IsolationForest"
    _estimator = IsolationForestWrapper
    _requirements = {
        "onehot": True,
        "ordinal": False,
        "imputation": True,
        "fillna": True,
        "resampling": False,
    }

    _tuning_params_default = {
        "n_estimators": 140,
        "max_samples": 0.45,
        "contamination": 0.02,
        "max_features": 0.69,
        "bootstrap": False,
    }

    @classmethod
    def suggest_parameters(cls, trial):
        """Sample isolation-forest hyperparameters from ``trial`` and compile them."""
        prefix = f"{cls._name}__"

        n_estimators = trial.suggest_int(f"{prefix}n_estimators", 1, 200)
        max_samples = trial.suggest_float(f"{prefix}max_samples", 0.0, 1.0)
        contamination = trial.suggest_float(f"{prefix}contamination", 1e-6, 1e-1)
        max_features = trial.suggest_float(f"{prefix}max_features", 0.0, 1.0)
        bootstrap = trial.suggest_categorical(f"{prefix}bootstrap", [True, False])

        return cls.compile_parameters(
            {
                "n_estimators": n_estimators,
                "max_samples": max_samples,
                "contamination": contamination,
                "max_features": max_features,
                "bootstrap": bootstrap,
            }
        )



In [None]:
import torch
from pytorch_tabnet.tab_model import TabNetClassifier


@dataclass
class TabNetWrapper(TabNetClassifier):
    """``TabNetClassifier`` with a ``fit(X, y)`` / ``predict(X)`` style interface.

    The training options that ``TabNetClassifier.fit`` takes as arguments are
    stored as fields here, and inputs are converted with ``.to_numpy()``
    before being handed to the parent class.
    """

    weights: int = 0
    max_epochs: int = 100
    patience: int = 10
    batch_size: int = 1024
    virtual_batch_size: int = 128
    drop_last: bool = True
    eval_metric: str = None

    def fit(self, X, y):
        """Fit on ``X``/``y`` using the training options stored on the instance."""
        training_options = dict(
            eval_metric=self.eval_metric,
            weights=self.weights,
            max_epochs=self.max_epochs,
            patience=self.patience,
            batch_size=self.batch_size,
            virtual_batch_size=self.virtual_batch_size,
            drop_last=self.drop_last,
        )
        return super().fit(
            X_train=X.to_numpy(), y_train=y.to_numpy(), **training_options
        )

    def predict(self, X):
        """Return the parent's predictions for ``X`` converted to a NumPy array."""
        features = X.to_numpy()
        return super().predict(features)

    def predict_proba(self, X):
        """Return the parent's class probabilities for ``X`` as a NumPy input."""
        features = X.to_numpy()
        return super().predict_proba(features)

    def decision_function(self, X):
        """Return the second column of ``predict_proba`` as the decision score."""
        probabilities = self.predict_proba(X)
        return probabilities[:, 1]


In [None]:
class Estimator_TabNet(Estimator):
    """Configuration and Optuna search space for ``TabNetWrapper``.

    Unlike the other estimator descriptors, this one must be instantiated with
    the training data: ``factory`` needs the categorical column indices and
    dimensions, which are read from ``sci_train`` in ``__init__``.
    """

    _estimator = TabNetWrapper
    _name = "TabNet"
    _requirements = dict(
        onehot=False, ordinal=True, imputation=True, fillna=True, resampling=False
    )

    _static_params = dict(
        optimizer_fn=torch.optim.Adam,
        scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
        verbose=0,
        device_name="cuda" if torch.cuda.is_available() else "cpu",
        scheduler_params=dict(mode="min", min_lr=1e-5, factor=0.5),
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        cat_emb_dim=1,
        max_epochs=50,
        eval_metric="average_precision",
        weights=1,
        drop_last=False,
    )

    _tuning_params_default = dict(
        n_d=8,
        n_a=8,
        n_steps=3,
        gamma=1.2,
        lambda_sparse=8e-4,
        mask_type="sparsemax",
        n_shared=3,
        scheduler_params=dict(patience=5),
    )

    def __init__(self, sci_train):
        """Record the categorical column indices and dimensions of ``sci_train``."""
        self._categorical_idxs, self._categorical_dims = sci_train.describe_categories(
            dimensions=True
        )

    def factory(self):
        """Instantiate the wrapper with the static params and categorical layout.

        This is an instance method (not a classmethod as in the base class)
        because it depends on the per-dataset state captured in ``__init__``.
        """
        return self._estimator(
            cat_idxs=self._categorical_idxs,
            cat_dims=[
                _ + 1 for _ in self._categorical_dims
            ],  # Because we may add 1 category when we fill_na
            **self._static_params,
        )

    @classmethod
    def suggest_parameters(cls, trial):
        """Sample TabNet hyperparameters from ``trial`` and compile them.

        ``n_d`` and ``n_a`` are tied to a single sampled value (``n_da``).
        """
        suggestions = dict(
            n_steps=trial.suggest_int(f"{cls._name}__n_steps", 1, 10),
            n_shared=trial.suggest_int(f"{cls._name}__n_shared", 1, 10),
            gamma=trial.suggest_float(f"{cls._name}__gamma", 1, 1.5),
            lambda_sparse=trial.suggest_float(
                f"{cls._name}__lambda_sparse", 1e-6, 1e-3, log=True
            ),
            mask_type=trial.suggest_categorical(
                f"{cls._name}__mask_type", ["entmax", "sparsemax"]
            ),
            scheduler_params=dict(
                patience=trial.suggest_int(f"{cls._name}__scheduler__patience", 3, 10)
            ),
        )

        n_da = trial.suggest_int(f"{cls._name}__n_da", 4, 32,)
        suggestions["n_d"], suggestions["n_a"] = n_da, n_da

        return cls.compile_parameters(suggestions)

    @classmethod
    def compile_parameters(cls, params):
        """Merge static, default and given params and prefix every key.

        ``scheduler_params`` is a nested dictionary, so it is merged key by key
        (static < defaults < ``params``) instead of being replaced wholesale.
        ``params`` may omit ``scheduler_params``; the static and default
        scheduler settings are then used on their own.
        """
        scheduler_params = {
            **cls._static_params.get("scheduler_params", {}),
            **cls._tuning_params_default.get("scheduler_params", {}),
            **params.get("scheduler_params", {}),
        }
        r = {
            **cls._static_params,
            **cls._tuning_params_default,
            **params,
            "scheduler_params": scheduler_params,
        }
        return {f"{cls._name}__{key}": value for key, value in r.items()}


In [None]:
def get_feature_studies(sci_train):
    """Return the named feature subsets to compare, as ``(name, columns)`` pairs.

    Every subset after the first is ``news_extended`` plus phenotype columns,
    optionally combined with A&E notes, lab results, hospital metadata and the
    ``SHMI__``-prefixed columns found in ``sci_train``.
    """
    news = SCICols.news_data_raw
    news_extended = SCICols.news_data_raw + SCICols.news_data_extras
    labs = SCICols.blood
    hospital = [
        "AdmissionMethodDescription",
        "AdmissionSpecialty",
        "SentToSDEC",
        "Readmission",
    ]
    ae = ["AandEPresentingComplaint", "AandEMainDiagnosis"]
    diagnoses = [column for column in sci_train.columns if column.startswith("SHMI__")]
    phenotype = ["Female", "Age"]

    # Shared prefix of all the richer feature sets.
    base = news_extended + phenotype

    feature_sets = {
        "news": news,
        "news_extended": news_extended,
        "news_with_phenotype": base,
        "with_ae_notes": base + ae,
        "with_labs": base + labs,
        "with_notes_and_labs": base + ae + labs,
        "with_hospital": base + hospital,
        "with_notes_and_hospital": base + ae + hospital,
        "with_labs_and_hospital": base + labs + hospital,
        "with_labs_and_diagnoses": base + labs + diagnoses,
        "all": base + ae + labs + hospital + diagnoses,
    }
    return list(feature_sets.items())


def get_studies(sci_train, study_grid=None, cli_model_arg=None):
    """Expand a study grid into one keyword dictionary per combination.

    Args:
        sci_train: Training data; only used to derive the feature subsets when
            the default grid is built.
        study_grid: Mapping of argument name to the list of values to try.
            When None, the default grid (selected estimators x all resamplers
            x all feature subsets) is used.
        cli_model_arg: Estimator selection for the default grid: ``"cpu"``,
            ``"gpu"``, ``"all"``, None (same as ``"all"``) or a single
            estimator's ``_name``. Ignored when ``study_grid`` is given.

    Returns:
        A list of dictionaries, one per element of the Cartesian product of
        the grid values. An empty grid yields an empty list.

    Raises:
        KeyError: If ``cli_model_arg`` does not name a known selection.
    """
    if study_grid is None:
        # The registry is only needed for the default grid, so it is built here.
        estimators = dict(
            cpu=[
                Estimator_IsolationForest,
                Estimator_LightGBM,
                Estimator_LogisticRegression,
                Estimator_RandomForest,
                Estimator_XGBoost,
            ],
            gpu=[Estimator_TabNet],
        )
        estimators["all"] = estimators[None] = estimators["cpu"] + estimators["gpu"]
        # Allow selecting a single estimator by its step name.
        estimators.update({
            _._name: [_] for _ in estimators['all']
        })

        if cli_model_arg not in estimators:
            valid = sorted(key for key in estimators if key is not None)
            raise KeyError(
                f"Unknown model selection {cli_model_arg!r}; expected one of {valid}"
            )

        study_grid = dict(
            estimator=estimators[cli_model_arg],
            resampler=[No_Resampling, Resampler_RandomUnderSampler, Resampler_SMOTE],
            features=get_feature_studies(sci_train),
        )

    if not study_grid:
        # Nothing to combine; avoids unpacking an empty zip below.
        return []

    k, v = zip(*study_grid.items())
    return [dict(zip(k, _)) for _ in itertools.product(*v)]


In [None]:
from imblearn.pipeline import Pipeline as ImbPipeline
from typing import Optional


class Pipeline(ImbPipeline):
    """imblearn pipeline that can be pickled to and restored from a file."""

    def persist(self, filename):
        """Pickle this pipeline to ``filename``."""
        with open(filename, "wb") as handle:
            pickle.dump(self, handle)

    @classmethod
    def load(cls, filename):
        """Unpickle and return whatever object is stored in ``filename``."""
        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(filename, "rb") as handle:
            restored = pickle.load(handle)
        return restored


class PipelineFactory:
    """Callable that assembles a ``Pipeline`` from estimator descriptors."""

    def __init__(
        self,
        estimator: Estimator,
        X_train,
        y_train,
        resampler: Optional[Estimator] = None,
    ):
        """Store the descriptors and the training data."""
        self._estimator = estimator
        self._resampler = resampler
        self._X_train = X_train
        self._y_train = y_train

    def __call__(self, **kwargs):
        """Build a fresh pipeline and apply ``kwargs`` through ``set_params``.

        The resampler step, when present, is placed before the estimator step.
        Each step is named after its descriptor's ``_name``.
        """
        estimator_step = (self._estimator._name, self._estimator.factory())

        if self._resampler is None:
            steps = [estimator_step]
        else:
            resampler_step = (self._resampler._name, self._resampler.factory())
            steps = [resampler_step, estimator_step]

        pipeline = Pipeline(steps=steps)
        return pipeline.set_params(**kwargs)


In [None]:
from sklearn.model_selection import cross_validate


class Objective:
    """Optuna objective that cross-validates one pipeline per trial.

    Each call samples parameters for the resampler (if any) and the estimator,
    builds a pipeline with them and returns the mean cross-validated test
    score. Whenever a trial beats the best score seen so far, a fresh pipeline
    with the same parameters is fitted on the whole training set and kept in
    ``_best_model``.

    Args:
        estimator: Estimator descriptor providing ``suggest_parameters``,
            ``factory`` and ``fit_params``.
        resampler: Resampler descriptor, or a falsy value for no resampling.
        X_train: Training features.
        y_train: Training labels.
        cv: ``cv`` argument forwarded to ``cross_validate``.
        scoring: ``scoring`` argument forwarded to ``cross_validate``.
        cv_jobs: ``n_jobs`` argument forwarded to ``cross_validate``.
    """

    def __init__(
        self,
        estimator: Estimator,
        resampler: Estimator,
        X_train,
        y_train,
        cv=5,
        scoring="average_precision",
        cv_jobs=1,
    ):
        self._estimator = estimator
        self._resampler = resampler
        self._X_train = X_train
        self._y_train = y_train
        self._cv = cv
        self._scoring = scoring
        self._cv_jobs = cv_jobs

        # Start below every attainable score so that the first finite score is
        # always recorded, including scores of 0 or below (e.g. "neg_*" scorers).
        self._best_score = float("-inf")
        self._best_model = None

        self._pipeline_factory = PipelineFactory(
            estimator=self._estimator,
            resampler=self._resampler,
            X_train=self._X_train,
            y_train=self._y_train,
        )

        self._fit_params = self._estimator.fit_params(self._X_train, self._y_train)

    def __call__(self, trial):
        """Evaluate ``trial`` and return its mean cross-validated test score."""
        trial_params = {
            **(self._resampler.suggest_parameters(trial) if self._resampler else {}),
            **self._estimator.suggest_parameters(trial),
        }
        model = self._pipeline_factory(**trial_params)

        score = cross_validate(
            model,
            self._X_train,
            self._y_train,
            cv=self._cv,
            scoring=self._scoring,
            n_jobs=self._cv_jobs,
            fit_params=self._fit_params,
        )["test_score"].mean()

        # A NaN score compares False here and therefore never replaces the best.
        if score > self._best_score:
            self._best_score = score
            # Refit on the full training set with the winning parameters.
            self._best_model = self._pipeline_factory(**trial_params).fit(
                self._X_train, self._y_train
            )

        return score


In [None]:
from typing import Dict, Any, Iterable, Optional, Tuple


def construct_study(
    estimator: Estimator,
    sci_train: SCIData,
    sci_test: SCIData,
    features: Tuple[str, Iterable[str]] = ("All", []),
    resampler: Estimator = None,
    cv=5,
    scoring="average_precision",
    storage=None,
    model_persistence_path=None,
    cv_jobs=1,
    **kwargs,
):
    """Prepare an Optuna study for one estimator/resampler/feature-set combination.

    Args:
        estimator: Estimator descriptor class; its ``_requirements`` decide how
            the feature matrices are prepared.
        sci_train: Training data.
        sci_test: Held-out data used for the final metrics.
        features: ``(name, columns)`` pair naming the feature subset.
        resampler: Optional resampler descriptor class. None means the
            pipeline has no resampling step.
        cv: Forwarded to ``Objective``.
        scoring: Forwarded to ``Objective``.
        storage: Forwarded to ``optuna.create_study``.
        model_persistence_path: Accepted for call compatibility; the path
            actually used is the one passed to the returned callable.
        cv_jobs: Forwarded to ``Objective``.
        **kwargs: Ignored; lets callers pass a larger argument dictionary.

    Returns:
        A callable ``call(model_persistence_path=None, n_resamples=99, **kwargs)``
        that runs ``study.optimize(objective, **kwargs)`` and returns a
        dictionary of test-set metrics for the best model.
    """
    feature_set_name, feature_columns = features[0], features[1]

    sci_args = dict(
        x=feature_columns,
        imputation=estimator._requirements["imputation"],
        onehot_encoding=estimator._requirements["onehot"],
        ordinal_encoding=estimator._requirements["ordinal"],
        fillna=estimator._requirements["fillna"],
    )
    (X_train, y_train), (X_test, y_test) = sci_train.xy(**sci_args), sci_test.xy(**sci_args)

    # `resampler` is optional, so its name and instance are resolved once here.
    resampler_name = resampler._name if resampler else 'None'
    name = f"{estimator._name}_{resampler_name}_{feature_set_name}"
    objective = Objective(
        estimator=estimator(SCIData(sci_train[feature_columns])),
        resampler=resampler(SCIData(sci_train[feature_columns])) if resampler else None,
        X_train=X_train,
        y_train=y_train,
        cv=cv,
        scoring=scoring,
        cv_jobs=cv_jobs,
    )
    study = optuna.create_study(direction="maximize", study_name=name, storage=storage)

    def handle_study_result(model_persistence_path=None, n_resamples=99, **kwargs):
        """Optionally persist the best model and compute its test-set metrics."""
        model = objective._best_model
        if model is None:
            raise RuntimeError(
                f"Study {name!r} finished without a fitted model; "
                "no trial produced a usable score."
            )
        if model_persistence_path is not None:
            model.persist(f"{model_persistence_path}/{name}")

        # Fall back to decision scores when the model exposes no probabilities.
        try:
            y_pred_proba = model.predict_proba(X_test)[:, 1]
        except AttributeError:
            y_pred_proba = model.decision_function(X_test)

        # NOTE(review): the decision threshold is derived from the test labels
        # themselves via get_threshold_fpr(target=0.05) - confirm this is intended.
        y_pred = np.where(y_pred_proba > get_threshold_fpr(y_test, y_pred_proba, target=0.05), 1, 0)

        metrics = {
            **dict(
                name=name,
                estimator=estimator._name,
                resampler=resampler_name,
                features=feature_set_name
            ),
            **get_metrics(y_test, y_pred, y_pred_proba, n_resamples)
        }

        return metrics

    def call(model_persistence_path=None, n_resamples=99, **kwargs):
        """Run the optimisation (warnings silenced) and return the metrics."""
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore")
            study.optimize(objective, **kwargs)

        return handle_study_result(model_persistence_path, n_resamples)

    return call


In [None]:
# X_train, y_train = SCIData(sci_train.head(1000)).xy(imputation=False, fillna=True, onehot_encoding=False, ordinal_encoding=True)
# #XX, yy = Resampler_RandomUnderSampler(sci_train).factory().fit_resample(X_train, y_train)

In [None]:
# optuna.logging.set_verbosity(optuna.logging.INFO)
# study_grid = dict(
#     # estimator=[Estimator_LightGBM, Estimator_IsolationForest, Estimator_LogisticRegression, Estimator_RandomForest, Estimator_TabNet, Estimator_XGBoost],
#     estimator=[Estimator_IsolationForest],
#     resampler=[No_Resampling, Resampler_SMOTE, Resampler_RandomUnderSampler],
#     features=get_feature_studies(sci_train),
# )

# for _ in get_studies(sci_train, study_grid):
#     s = construct_study(**_, sci_train=SCIData(sci_train.head(1000)), sci_test=SCIData(sci_test.head(1000)))

#     r = s(n_trials=2)
#     break


In [None]:
# studies = [construct_study(**_, sci_train=SCIData(sci_train.head(1000)), sci_test=SCIData(sci_test.head(1000))) for _ in get_studies(sci_train, study_grid)]
# with parallel_backend("loky", inner_max_num_threads=1):
#             results = Parallel(n_jobs=1)(
#                 delayed(_)(n_trials=2) for _ in studies[:2]
#             )

In [None]:
import argparse

# Build the run configuration: parsed from the command line in script mode,
# otherwise a small hard-coded debug configuration for notebook use.
# Both branches must produce the same set of keys, because `run` and
# `construct_study` read them by name.
if SCRIPT:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-m", "--models", help="Can be 'all', 'cpu', or 'gpu'", type=str
    )
    parser.add_argument(
        "-j", "--njobs", help="Number of CPUs to use. Default=1", type=int, default=1
    )
    parser.add_argument(
        "-c", "--cv_jobs", help="Number of CV jobs. Default=5", type=int, default=5
    )
    parser.add_argument(
        "-p",
        "--persist",
        help="Filepath to save the models. If unset, wont save them",
        type=str,
        default=None,
    )
    parser.add_argument(
        "-d",
        "--debug",
        help="Whether to only use a small subset of data for debugging",
        action="store_true",
    )
    parser.add_argument(
        "-t", "--trials", help="Number of trials. Default=1000", type=int, default=1000
    )
    parser.add_argument(
        "-hr", "--hours", help="Trial timeout in hours", type=int, default=2
    )
    parser.add_argument(
        "-s",
        "--storage",
        help="Trial storage for optuna",
        default="sqlite:///models/studies.db",
    )
    parser.add_argument(
        "-o",
        "--output",
        help='Output path for final results',
        default='results.csv'
    )
    # type=int so a value given on the command line is not left as a string.
    parser.add_argument(
        '--n_resamples',
        help='Number of resamples for bootstrapping metrics',
        type=int,
        default=9999
    )
    parser.add_argument("-v", "--verbose", help="Optuna verbosity", action="store_true")

    args = vars(parser.parse_args())
else:
    args = dict(
        models="LightGBM",
        njobs=1,
        # Same key as the "--cv_jobs" CLI option above.
        cv_jobs=1,
        persist=None,
        debug=True,
        trials=2,
        hours=2,
        storage=None,
        verbose=True,
        output='results.csv',
        n_resamples=99,
    )


def run(args):
    """Construct every study, execute them all and write the metrics to CSV.

    ``args`` is the configuration dictionary built above. It is also forwarded
    wholesale to ``construct_study``. In debug mode the training data is a
    1000-row sample and every study runs only two trials.
    """
    if args["verbose"]:
        optuna.logging.set_verbosity(optuna.logging.INFO)

    persist_path = args["persist"]
    if persist_path is not None:
        try:
            os.makedirs(persist_path)
        except FileExistsError:
            pass

    debug = args["debug"]
    training_data = SCIData(sci_train.sample(1000)) if debug else sci_train

    # The study grid is derived from the full training data, while each study
    # itself is built on `training_data`.
    studies = []
    for study_spec in get_studies(sci_train, cli_model_arg=args["models"]):
        studies.append(
            construct_study(
                **study_spec, **args, sci_train=training_data, sci_test=sci_test
            )
        )

    study_args = {
        "model_persistence_path": persist_path,
        "n_resamples": args['n_resamples'],
        "n_trials": 2 if debug else args["trials"],
        "timeout": args["hours"] * 60 * 60,
    }

    n_jobs = args["njobs"]
    if n_jobs > 1:
        print("Starting execution (parallel)")
        with parallel_backend("loky", inner_max_num_threads=args["cv_jobs"]):
            results = Parallel(n_jobs=n_jobs)(
                delayed(study)(**study_args) for study in studies
            )
    else:
        print("Starting execution (linear)")
        results = [study(**study_args) for study in studies]

    pd.DataFrame(results).to_csv(args['output'])


# Execute all studies with the configuration assembled above.
run(args)
