# ACP Project - Systematic Model Comparison

In [10]:
import warnings, pickle, os
from dataclasses import dataclass
from abc import ABC, abstractmethod

import numpy as np
import pandas as pd
# Widen pandas' display limits for notebook inspection: no cap on the
# number of columns shown, and up to 300 rows before output is truncated.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 300)

from IPython.display import display
import matplotlib.pyplot as plt

import seaborn as sns
# Apply seaborn's default theme and set the default figure size to 10x10.
sns.set(rc={'figure.figsize':(10,10)})

import shap
import optuna

%load_ext autoreload
%autoreload 1

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
from dataset import SCIData, SCICols
%aimport dataset

In [None]:
def construct_pipeline():
    """Placeholder for the pipeline factory; not implemented yet.

    The original cell contained only an unterminated ``def`` header, which
    is a syntax error. This stub keeps the name and the empty parameter
    list so the cell can be executed, and fails loudly if it is called.

    Raises:
        NotImplementedError: always, until a real body is written.
    """
    raise NotImplementedError("construct_pipeline is not implemented yet")

In [11]:
from imblearn.pipeline import Pipeline as ImbPipeline
from typing import Dict, Any

class Model(ABC):
    """Abstract base describing one candidate model in the comparison.

    The attributes below are only declared (annotated) here, not assigned;
    concrete subclasses are expected to supply the actual values.
    """

    # Short identifier for the model.
    _name: str
    # Estimator object associated with the model.
    _estimator: ImbPipeline
    # Boolean flags keyed by requirement name.
    _requirements: Dict[str, bool]
    # Parameters that are held fixed.
    _static_params: Dict[str, Any]
    # Default values for parameters that may be tuned.
    _tuning_params_default: Dict[str, Any]

    @abstractmethod
    def suggest_parameters(self, trial):
        """Return parameter suggestions drawn from ``trial``.

        Must be overridden by every concrete subclass; this base
        implementation does nothing and returns ``None``.
        """


In [12]:
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV
from imblearn.pipeline import Pipeline as ImbPipeline

@dataclass
class Model_LightGBM(Model):
    """LightGBM binary classifier entry for the model comparison.

    Bundles the estimator instance, its requirement flags, the fixed
    parameters, default values for the tunable parameters, and the search
    space sampled by ``suggest_parameters``.
    """

    _name = 'LightGBM'
    _estimator = LGBMClassifier()

    # Requirement flags, all disabled for this model.
    # NOTE(review): how these flags are consumed is not visible in this
    # part of the notebook — confirm against the pipeline construction code.
    _requirements = dict(
        onehot = False,
        imputation = False,
        fillna = False,
        resampling = False
    )

    # Parameters that are never tuned.
    _static_params = dict(
        objective='binary',
        metric=['l2', 'auc'],
        boosting_type='gbdt',
        n_jobs=1,
        random_state=42,
        verbose=-1,
        verbose_eval=-1
    )

    # Defaults for the tunable parameters; any value suggested by a trial
    # overrides the entry with the same key.
    _tuning_params_default = dict(
        is_unbalance=True,
        lambda_l1 = 1.8e-3,
        lambda_l2=6e-4,
        num_leaves=14,
        feature_fraction=0.4,
        bagging_fraction=0.97,
        bagging_freq=1,
        min_child_samples=6
    )

    def suggest_parameters(self, trial):
        """Sample tunable parameters from ``trial`` and return the full set.

        Args:
            trial: Object exposing ``suggest_float`` and ``suggest_int``
                (an Optuna trial is the presumed caller — ``optuna`` is
                imported by the notebook).

        Returns:
            dict: Every static, default and suggested parameter, with each
            key prefixed by ``"<_name>__"``. On key collisions the order of
            precedence is static < defaults < suggestions.
        """
        suggestions = dict(
            lambda_l1 = trial.suggest_float(
                f'{self._name}__lambda_l1', 1e-4, 10.0, log=True
            ),
            lambda_l2 = trial.suggest_float(
                f'{self._name}__lambda_l2', 1e-4, 10.0, log=True
            ),
            num_leaves = trial.suggest_int(
                f'{self._name}__num_leaves', 2, 256
            ),
            feature_fraction = trial.suggest_float(
                f'{self._name}__feature_fraction', 0.4, 1.0
            ),
            bagging_fraction = trial.suggest_float(
                f'{self._name}__bagging_fraction', 0.4, 1.0
            ),
            bagging_freq = trial.suggest_int(
                f'{self._name}__bagging_freq', 1, 7
            ),
            min_child_samples = trial.suggest_int(
                f'{self._name}__min_child_samples', 5, 150
            ),
        )

        # Later mappings win on duplicate keys. The comma after the defaults
        # is essential: without it the expression is parsed as
        # ``defaults ** suggestions`` (dict to the power of dict) and raises
        # TypeError.
        merged = {
            **self._static_params,
            **self._tuning_params_default,
            **suggestions,
        }

        return {
            f'{self._name}__{key}': value for key, value in merged.items()
        }

In [6]:
from sklearn.linear_model import LogisticRegression

class Model_LogisticRegression