# ACP Project - Systematic Model Comparison

In [1]:
import warnings, pickle, os, itertools
from dataclasses import dataclass
from abc import ABC, abstractmethod

try:
    from sklearnex import patch_sklearn
    patch_sklearn()
except ImportError:
    pass

import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 300)

from IPython.display import display
import matplotlib.pyplot as plt

import seaborn as sns
sns.set(rc={'figure.figsize':(10,10)})

import shap
import optuna
optuna.logging.set_verbosity(optuna.logging.INFO)

%load_ext autoreload
%autoreload 1

In [19]:
class Notebook:
    """Notebook-wide settings for the hyperparameter search.

    The values are read as class attributes (``Notebook.N_TRIALS`` etc.);
    the class is a namespace and is never instantiated in this notebook.
    """

    # Storage URL handed to ``optuna.create_study`` so studies are persisted.
    TRIAL_STORAGE = "sqlite:///models/studies.db"

    # ``n_jobs`` given to ``cross_validate`` inside every trial.
    CROSS_VAL_N_JOBS = 5

    # ``n_jobs`` given to ``study.optimize`` when a study is launched.
    N_PROCESSES = 5

    # Per-study budget: stop after this many trials ...
    N_TRIALS = 1000

    # ... or after this timeout, whichever comes first (2 h expressed in seconds).
    TRIALS_TIMEOUT = 2 * 60 * 60

In [3]:
from sklearn.model_selection import train_test_split
from dataset import SCIData, SCICols
%aimport dataset

# Load the dataset from disk. `sci` is not referenced again in the cells
# visible here; it may be used further down the notebook.
sci = SCIData.load('data/sci.h5')

# Build the modelling dataset from the pre-processed file. Rows are sorted by
# AdmissionDateTime first so that the unshuffled split below is chronological
# (assumes the chained SCIData methods preserve row order - TODO confirm).
# The individual steps are project-defined in `dataset.py`; their names suggest
# filtering to records with raw NEWS data, deriving the critical-event outcome,
# adding SHMI features and categorical encoding - verify against that module.
scii = (
    SCIData(SCIData.quickload("data/sci_processed.h5").sort_values("AdmissionDateTime"))
    .mandate(SCICols.news_data_raw)
    .derive_critical_event(within=1, return_subcols=True)
    .augment_shmi(onehot=True)
    .omit_redundant()
    .raw_news()
    .derive_ae_diagnosis_stems(onehot=False)
    .categorize()
   # .onehot_encode_categories()
)

# Hold out the last 33% of rows as the test set. With shuffle=False the split
# is positional, so `random_state` should have no effect here.
# Passing three arrays yields six outputs (train/test for each); only the test
# halves of the two outcome columns are kept, the train halves are discarded.
sci_train, sci_test, _, y_test_mortality, _, y_test_criticalcare = train_test_split(
    scii,
    scii.DiedWithinThreshold,
    scii.CriticalCare,
    test_size=0.33,
    random_state=42,
    shuffle=False,
)
# Re-wrap the split frames so the SCIData helper methods (e.g. `.xy`) are available.
sci_train, sci_test = SCIData(sci_train), SCIData(sci_test)
# Disabled: eager X/y extraction; each study now calls `.xy` itself in `construct_study`.
# (X_train, y_train), (X_test, y_test) = (
#     sci_train.xy(outcome="CriticalEvent", dropna=False, fillna=False),
#     sci_test.xy(outcome="CriticalEvent", dropna=False, fillna=False),
# )

In [4]:
from sklearn.base import BaseEstimator
from typing import Dict, Any, Iterable

class Model(ABC):
    """Class-level description of one pipeline step (estimator or resampler).

    Subclasses are used as plain namespaces: every method is a classmethod and
    the notebook passes the classes themselves around, never instances.

    Attributes subclasses are expected to define:
        _name: step name inside the pipeline; also the prefix of every
            ``<name>__<param>`` key produced by ``compile_params``.
        _estimator: callable that builds the underlying estimator.
        _requirements: preprocessing flags read when the training data is
            prepared (``construct_study`` reads ``onehot`` and ``fillna``).
        _static_params: parameters that are never tuned.
        _tuning_params_default: fallback values for the tunable parameters.
    """

    _name: str
    _estimator: BaseEstimator
    _requirements: Dict[str, bool]
    _static_params: Dict[str, Any]
    _tuning_params_default: Dict[str, Any]

    @classmethod
    @abstractmethod
    def suggest_parameters(cls, trial):
        """Sample tunable parameters from an Optuna ``trial``.

        Returns:
            A dict of pipeline parameters; the base implementation is empty.
        """
        return {}

    @classmethod
    def compile_params(cls, params):
        """Merge ``params`` over the class defaults and prefix every key.

        Precedence, lowest to highest: ``_static_params``,
        ``_tuning_params_default``, ``params``.

        Returns:
            A dict keyed ``'<_name>__<param>'``, ready for ``Pipeline.set_params``.
        """
        merged = {}
        for layer in (cls._static_params, cls._tuning_params_default, params):
            merged = {**merged, **layer}

        prefix = cls._name
        return {f'{prefix}__{key}': value for key, value in merged.items()}

    @classmethod
    @abstractmethod
    def get(cls, X_train, y_train):
        """Build the untuned estimator from the static parameters only.

        ``X_train`` and ``y_train`` are accepted for subclasses that need to
        inspect the data; this default implementation ignores them.
        """
        static_kwargs = cls._static_params
        return cls._estimator(**static_kwargs)


In [5]:
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from imblearn import FunctionSampler

class Resampler_SMOTE(Model):
    """SMOTE over-sampling step, applied through ``SCIData.SMOTE``.

    Subclasses ``Model`` like every other step class in this notebook, so it
    satisfies the ``Optional[Model]`` annotation used by ``construct_study``.

    Unlike the other steps, the sampler is an imblearn ``FunctionSampler``
    wrapping ``SCIData.SMOTE``; its options therefore travel as ONE pipeline
    parameter, ``SMOTE__kw_args``, holding the complete keyword dict.
    ``_estimator`` is recorded for reference but is not instantiated by ``get``.
    """

    _name = 'SMOTE'
    _estimator = SMOTENC

    # Never tuned.
    _static_params = dict(
        random_state=42,
        n_jobs=None,
    )

    # Fallbacks for the tunable options.
    _tuning_params_default = dict(
        sampling_strategy=0.1,
        k_neighbors=5
    )

    @classmethod
    def suggest_parameters(cls, trial):
        """Sample SMOTE options from an Optuna ``trial``.

        Returns:
            ``{'SMOTE__kw_args': {...}}`` where the inner dict is the static
            parameters, overlaid by the defaults, overlaid by the sampled values.
        """
        suggestions = dict(
            sampling_strategy = trial.suggest_float(
                f'{cls._name}__sampling_strategy', 0.1, 0.5
            ),
            k_neighbors = trial.suggest_int(
                f'{cls._name}__k_neighbors', 2, 10
            )
        )

        return {
            f'{cls._name}__kw_args': {
                **cls._static_params,
                **cls._tuning_params_default,
                **suggestions
            }
        }

    @classmethod
    def get(cls, X_train, y_train):
        """Build the untuned sampler (static parameters only).

        ``X_train`` / ``y_train`` are unused. A copy of ``_static_params`` is
        passed so the sampler never aliases the shared class-level dict.
        """
        return FunctionSampler(
            func=SCIData.SMOTE, validate=False, kw_args=dict(cls._static_params)
        )

class Resampler_RandomUnderSampler(Model):
    """Random under-sampling step backed by imblearn's ``RandomUnderSampler``."""

    _name = 'RandomUnderSampler'
    _estimator = RandomUnderSampler

    # Never tuned.
    _static_params = {
        'random_state': 42,
        'replacement': False,
    }

    # Fallback for the single tunable option.
    _tuning_params_default = {
        'sampling_strategy': 0.1,
    }

    @classmethod
    def suggest_parameters(cls, trial):
        """Sample ``sampling_strategy`` in [0.05, 0.5] and return prefixed params."""
        strategy = trial.suggest_float(
            f'{cls._name}__sampling_strategy', 0.05, 0.5
        )
        return cls.compile_params({'sampling_strategy': strategy})


In [6]:
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV

class Model_LightGBM(Model):
    """LightGBM binary classifier and its Optuna search space.

    Used purely as a namespace of class attributes and classmethods, in the
    same way as the other ``Model_*`` classes (no ``@dataclass``: the class
    declares no fields and is never instantiated).
    """

    _name = 'LightGBM'
    _estimator = LGBMClassifier

    # Preprocessing flags read when the training data is prepared.
    _requirements = dict(
        onehot = False,
        imputation = False,
        fillna = False,
        resampling = False
    )

    # Never tuned.
    _static_params = dict(
        objective='binary',
        metric=['l2', 'auc'],
        boosting_type='gbdt',
        n_jobs=1,
        random_state=42,
        verbose=-1,
    )

    # Fallbacks for the tunable parameters.
    _tuning_params_default = dict(
        is_unbalance=True,
        reg_alpha = 1.8e-3,
        reg_lambda=6e-4,
        num_leaves=14,
        colsample_bytree=0.4,
        subsample=0.97,
        subsample_freq=1,
        min_child_samples=6
    )

    @classmethod
    def suggest_parameters(cls, trial):
        """Sample LightGBM hyperparameters from an Optuna ``trial``.

        ``scale_pos_weight`` is only sampled when ``is_unbalance`` is False,
        so the two class-imbalance options are never suggested together.

        Returns:
            Pipeline parameters keyed ``'LightGBM__<param>'`` (static values
            and defaults included, see ``Model.compile_params``).
        """
        suggestions = dict(
            reg_alpha = trial.suggest_float(
                f'{cls._name}__reg_alpha', 1e-4, 10.0, log=True
            ),
            reg_lambda = trial.suggest_float(
                f'{cls._name}__reg_lambda', 1e-4, 10.0, log=True
            ),
            num_leaves = trial.suggest_int(
                f'{cls._name}__num_leaves', 2, 256
            ),
            colsample_bytree = trial.suggest_float(
                f'{cls._name}__colsample_bytree', 0.4, 1.0
            ),
            subsample = trial.suggest_float(
                f'{cls._name}__subsample', 0.4, 1.0
            ),
            subsample_freq = trial.suggest_int(
                f'{cls._name}__subsample_freq', 1, 7
            ),
            min_child_samples = trial.suggest_int(
                f'{cls._name}__min_child_samples', 5, 150
            ),
            is_unbalance = trial.suggest_categorical(
                f'{cls._name}__is_unbalance', [True, False]
            )
        )

        # Conditional parameter: an explicit positive-class weight replaces
        # the automatic re-balancing when the latter is switched off.
        if not suggestions['is_unbalance']:
            suggestions['scale_pos_weight'] = trial.suggest_int(
                f'{cls._name}__scale_pos_weight', 1, 100
            )

        return cls.compile_params(suggestions)

In [7]:
from xgboost import XGBClassifier
class Model_XGBoost(Model):
    """XGBoost binary classifier and its Optuna search space."""

    _name = 'XGBoost'
    _estimator = XGBClassifier

    # Preprocessing flags read when the training data is prepared.
    _requirements = {
        'onehot': False,
        'imputation': False,
        'fillna': False,
        'resampling': False,
    }

    # Never tuned.
    _static_params = {
        'verbosity': 0,
        'n_jobs': 1,
        'objective': 'binary:logistic',
        'booster': 'gbtree',
        'enable_categorical': True,
    }

    # Fallbacks for the tunable parameters. A dict literal is used because
    # 'lambda' is a Python keyword and cannot be a keyword argument.
    _tuning_params_default = {
        'tree_method': "hist",
        'alpha': 7e-05,
        'subsample': 0.42,
        'colsample_bytree': 0.87,
        'scale_pos_weight': 14,
        'max_depth': 7,
        'min_child_weight': 10,
        'eta': 0.035,
        'gamma': 4e-08,
        'grow_policy': 'lossguide',
        'lambda': 7e-2,
    }

    @classmethod
    def suggest_parameters(cls, trial):
        """Sample XGBoost hyperparameters from an Optuna ``trial``.

        Returns:
            Pipeline parameters keyed ``'XGBoost__<param>'`` (static values
            and defaults included, see ``Model.compile_params``).
        """
        def qualified(param):
            # Optuna parameter name, namespaced by the step name.
            return f"{cls._name}__{param}"

        chosen = {}
        chosen["tree_method"] = trial.suggest_categorical(
            qualified("tree_method"), ["approx", "hist"]
        )
        chosen["alpha"] = trial.suggest_float(qualified("alpha"), 1e-8, 1.0, log=True)
        chosen["subsample"] = trial.suggest_float(qualified("subsample"), 0.2, 1.0)
        chosen["colsample_bytree"] = trial.suggest_float(
            qualified("colsample_bytree"), 0.2, 1.0
        )
        chosen["scale_pos_weight"] = trial.suggest_int(
            qualified("scale_pos_weight"), 1, 100
        )
        chosen["max_depth"] = trial.suggest_int(qualified("max_depth"), 3, 9, step=2)
        chosen["min_child_weight"] = trial.suggest_int(
            qualified("min_child_weight"), 2, 10
        )
        chosen["eta"] = trial.suggest_float(qualified("eta"), 1e-8, 1.0, log=True)
        chosen["gamma"] = trial.suggest_float(qualified("gamma"), 1e-8, 1.0, log=True)
        chosen["grow_policy"] = trial.suggest_categorical(
            qualified("grow_policy"), ["depthwise", "lossguide"]
        )
        chosen["lambda"] = trial.suggest_float(qualified("lambda"), 1e-8, 1.0, log=True)

        return cls.compile_params(chosen)

In [8]:
from sklearn.linear_model import LogisticRegression

class Model_LogisticRegression(Model):
    """Logistic regression (saga solver) and its Optuna search space."""

    _name = "LogisticRegression"
    _estimator = LogisticRegression

    # Preprocessing flags read when the training data is prepared.
    _requirements = {
        'onehot': True,
        'imputation': True,
        'fillna': True,
        'resampling': False,
    }

    # Never tuned.
    _static_params = {
        'max_iter': 10000,
        'solver': 'saga',
        'random_state': 42,
    }

    # Fallbacks for the tunable parameters.
    _tuning_params_default = {
        'penalty': 'l2',
        'C': 5.9,
        'class_weight': 'balanced',
    }

    @classmethod
    def suggest_parameters(cls, trial):
        """Sample logistic-regression hyperparameters from an Optuna ``trial``.

        ``l1_ratio`` is only sampled when the elastic-net penalty is chosen.

        Returns:
            Pipeline parameters keyed ``'LogisticRegression__<param>'`` (static
            values and defaults included, see ``Model.compile_params``).
        """
        penalty = trial.suggest_categorical(
            f'{cls._name}__penalty', ['l1', 'l2', 'elasticnet']
        )
        inverse_strength = trial.suggest_float(f'{cls._name}__C', 0.01, 10)
        class_weight = trial.suggest_categorical(
            f'{cls._name}__class_weight', [None, 'balanced']
        )

        chosen = {
            'penalty': penalty,
            'C': inverse_strength,
            'class_weight': class_weight,
        }

        # Conditional parameter: the L1/L2 mix only exists for elastic-net.
        if penalty == 'elasticnet':
            chosen['l1_ratio'] = trial.suggest_float(
                f'{cls._name}__l1_ratio', 0.05, 0.95
            )

        return cls.compile_params(chosen)
    

In [9]:
def get_studies():
    """Enumerate every study configuration in the comparison grid.

    Returns:
        A list of dicts with keys ``model``, ``resampler`` and ``impute``: the
        Cartesian product of the three option lists, with ``model`` varying
        slowest and ``impute`` fastest.
    """
    models = (Model_LogisticRegression, Model_LightGBM, Model_XGBoost)
    resamplers = (None, Resampler_RandomUnderSampler, Resampler_SMOTE)
    impute_options = (True, False)

    return [
        {'model': model, 'resampler': resampler, 'impute': impute}
        for model, resampler, impute in itertools.product(
            models, resamplers, impute_options
        )
    ]

In [12]:
from imblearn.pipeline import Pipeline as ImbPipeline
from typing import Dict, Any, Iterable, Optional
from sklearn.model_selection import cross_validate

def construct_study(model: Model, sci_train: SCIData, sci_test: SCIData, features: Iterable[str], impute: bool, resampler: Optional[Model]=None, cv=5, scoring='average_precision', storage=Notebook.TRIAL_STORAGE):
    """Create an Optuna study for one model/resampler/imputation combination.

    Args:
        model: ``Model`` subclass describing the estimator and its search space.
        sci_train: training data; ``.xy(...)`` is called on it to obtain X and y.
        sci_test: accepted for interface symmetry; not used here.
        features: only used as a label inside the study name.
        impute: forwarded to ``sci_train.xy(imputation=...)``.
        resampler: optional ``Model`` subclass placed before the estimator.
        cv: cross-validation spec forwarded to ``cross_validate``.
        scoring: scorer name forwarded to ``cross_validate``.
        storage: Optuna storage URL the study is persisted to.

    Returns:
        A callable taking ``study.optimize`` keyword arguments (``n_trials``,
        ``timeout``, ``n_jobs``, ...) that runs the optimisation.

    Notes:
        * Every trial evaluates a fresh clone of the pipeline. Calling
          ``set_params`` on shared estimator instances would let conditional
          parameters from an earlier trial (e.g. ``scale_pos_weight``,
          ``l1_ratio``) leak into later ones, and would race when Optuna runs
          trials in several threads.
        * ``load_if_exists=True`` lets the notebook be re-run against an
          existing storage: the study of the same name is resumed instead of
          ``create_study`` raising a duplicate-study error.
    """
    # Function-scope import: keeps this cell runnable on its own.
    from sklearn.base import clone

    X_train, y_train = sci_train.xy(
        imputation=impute,
        onehot_encoding=model._requirements['onehot'],
        fillna=model._requirements['fillna'],
    )

    steps = [(model._name, model.get(X_train, y_train))]
    if resampler is not None:
        steps = [(resampler._name, resampler.get(X_train, y_train))] + steps

    # Pristine template; never mutated, only cloned.
    template = ImbPipeline(steps=steps)

    def objective(trial):
        # Resampler parameters are sampled first, then the model's.
        params = {
            **(resampler.suggest_parameters(trial) if resampler else {}),
            **model.suggest_parameters(trial)
        }
        pipeline = clone(template).set_params(**params)
        scores = cross_validate(
            pipeline, X_train, y_train,
            cv=cv, scoring=scoring, n_jobs=Notebook.CROSS_VAL_N_JOBS,
        )
        return scores['test_score'].mean()

    name = f"{model._name}:{resampler._name if resampler else 'None'}:{'Non' if not impute else ''}Imputed:{features}"
    study = optuna.create_study(
        direction='maximize', study_name=name, storage=storage, load_if_exists=True
    )

    def run(**optimize_kwargs):
        return study.optimize(objective, **optimize_kwargs)

    return run

In [13]:
from joblib import Parallel, delayed, parallel_backend

# One launcher per grid combination; each call registers its study in the
# Optuna storage as a side effect.
studies = [
    construct_study(
        **study_config,
        features=['features'],
        sci_train=sci_train,
        sci_test=sci_test,
    )
    for study_config in get_studies()
]

print('Starting execution..')
# Run the studies in 5 loky worker processes; each study additionally runs
# N_PROCESSES Optuna jobs, and every trial cross-validates with
# CROSS_VAL_N_JOBS workers.
with parallel_backend("loky", inner_max_num_threads=5):
    launch_jobs = (
        delayed(run_study)(
            n_trials=Notebook.N_TRIALS,
            timeout=Notebook.TRIALS_TIMEOUT,
            n_jobs=Notebook.N_PROCESSES,
        )
        for run_study in studies
    )
    # `results` holds whatever each launcher (i.e. `study.optimize`) returns.
    results = Parallel(n_jobs=5)(launch_jobs)

[32m[I 2022-10-30 17:50:26,820][0m A new study created in RDB with name: LogisticRegression:None:Imputed:['features'][0m
[32m[I 2022-10-30 17:50:28,180][0m A new study created in RDB with name: LogisticRegression:None:NonImputed:['features'][0m
[32m[I 2022-10-30 17:50:30,027][0m A new study created in RDB with name: LogisticRegression:RandomUnderSampler:Imputed:['features'][0m
