# Setup

### Imports

In [None]:
# built-ins
import os
import json
import math
import time
import pickle
import traceback
import time
from os import path
from pathlib import Path
from datetime import datetime
from copy import deepcopy
from functools import partial

# common
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# misc
from IPython.display import display, clear_output, Markdown
from termcolor import colored

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.decomposition import PCA

# metrics
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# training
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from lightgbm import LGBMClassifier

### Initial tasks

In [None]:
# allow multiple outputs
# (sets IPython's node interactivity to "all" so a cell is not limited to
# showing only its last expression)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# suppress warnings
# NOTE: both calls install a blanket "ignore" filter in this process.
import sys, os, warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# also suppress warnings of parallel processes such as grid search cv
# (only when no -W option was given on the command line, so an explicit
# user choice is not overridden)
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affect subprocesses
    
# configure pandas
# (None removes the column limit when a DataFrame is displayed)
pd.set_option("display.max_columns", None)

### Utils / Helpers

In [None]:
def enum(*sequential, **named):
    """Build a lightweight enum-like class.

    Positional names are numbered 0, 1, 2, ... in the order given; keyword
    arguments are added with their explicit values and win over a positional
    name of the same spelling. The result is a new class called ``Enum``
    whose class attributes are the members.
    """
    members = {label: position for position, label in enumerate(sequential)}
    members.update(named)
    return type('Enum', (), members)

In [None]:
def merge(a, b):
    """Return a new dict holding the items of ``a`` overlaid with those of ``b``.

    Neither input is modified; on duplicate keys the value from ``b`` wins.
    """
    combined = dict(a)
    combined.update(b)
    return combined

def cprint(text, color):
    """Print ``text`` in bold using the given termcolor ``color`` name."""
    styled = colored(text, color, attrs=['bold'])
    print(styled)
    
def print_red(text):
    """Print ``text`` in bold red (delegates to ``cprint``)."""
    cprint(text, color='red')

def print_blue(text):
    """Print ``text`` in bold blue (delegates to ``cprint``)."""
    cprint(text, color='blue')
    
def print_dim(text):
    """Print ``text`` in grey, without the bold attribute."""
    dimmed = colored(text, 'grey')
    print(dimmed)

In [None]:
class Time(object):
    """Context manager that prints how long its ``with`` body took.

    Usage::

        with Time():
            do_work()

    The elapsed wall-clock time (seconds, two decimals) is printed on exit,
    whether or not the body raised. Exceptions are not suppressed.
    """

    def __enter__(self):
        self.start_time = time.time()
        # Return the instance so ``with Time() as t`` binds something usable
        # (previously it bound None).
        return self

    def __exit__(self, *args, **kwargs):
        print("--- took %.2f seconds ---" % (time.time() - self.start_time))

In [None]:
class Output:
    """A single notebook output slot that can be rewritten in place.

    The first ``update`` creates a display with an id; later calls update
    that same display instead of emitting a new one.
    """

    class printer(str):
        """``str`` whose repr is the raw text (so it is shown without quotes)."""

        def __repr__(self):
            return self

    def __init__(self):
        # display handle; stays None until the first update()
        self.out = None

    def update(self, output):
        """Show ``output`` in the slot, creating the slot on first use."""
        text = self.printer(output)

        if self.out is not None:
            self.out.update(text)
            return

        self.out = display(text, display_id=True)

In [None]:
class PrintDuration(object):
    """Context manager that renders a live progress / ETA line in a notebook.

    Entering the context returns the ``tick`` callable::

        with PrintDuration() as tick:
            for i in range(n):
                tick(i / n)
                ...

    Each ``tick(progress)`` call (``progress`` is a fraction between 0 and 1)
    updates the average time between ticks and the estimated remaining time,
    then re-renders the output line. On exit the total run time is rendered.
    """

    class printer(str):
        """``str`` whose repr is the raw text (so it is shown without quotes)."""

        def __repr__(self):
            return self

    def __enter__(self):
        self.start_time = datetime.now()
        self.last_tick = self.start_time
        self.tick_count = 0
        self.tick_times = 0

        self.completed = False
        self.progress = 0
        self.ert = 0  # estimated remaining time, seconds
        self.att = 0  # average tick time, seconds
        self.out = None  # display handle, created on first render

        return self.tick

    def __exit__(self, exc_type, exc_value, tb):
        # Print the traceback here; the exception is NOT suppressed (nothing
        # truthy is returned), so it still propagates to the caller.
        if exc_type is not None:
            traceback.print_exception(exc_type, exc_value, tb)

        self.completed = True
        self.render()

    def tdformat(self, seconds):
        """Format a duration in seconds as ``HH:MM:SS`` (fractions truncated)."""
        hours, remainder = divmod(seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return '{:02}:{:02}:{:02}'.format(int(hours), int(minutes), int(seconds))

    def render(self):
        """Write the current status line into the (single) display slot."""
        if self.completed:
            complete_time = (datetime.now() - self.start_time).total_seconds()
            complete_time = self.tdformat(complete_time)
            output = f'100% completed, total run time = {complete_time}'
        else:
            percent = round(self.progress * 100)
            att = self.tdformat(self.att)
            ert = self.tdformat(self.ert)
            output = f'{percent}% completed, remaining time = {ert}, avg ticktime = {att}'

        output = self.printer(output)

        if self.out is None:
            self.out = display(output, display_id=True)
        else:
            self.out.update(output)

    def tick(self, progress):
        """Record one progress report and refresh the status line.

        Args:
            progress: fraction of the work already done, in ``[0, 1]``.
        """
        now = datetime.now()

        # Time since the previous tick. ``last_tick`` must be advanced on every
        # call, otherwise each tick would add the total elapsed time again.
        tick_time = (now - self.last_tick).total_seconds()
        self.last_tick = now
        self.tick_count += 1
        self.tick_times += tick_time

        # True division: floor division would truncate sub-second averages
        # (and therefore the whole estimate) to zero.
        avg_tick_time = self.tick_times / self.tick_count

        if progress > 0:
            total_ticks = self.tick_count / progress
            remained_ticks = max(total_ticks - self.tick_count, 0)
            est_remain_time = avg_tick_time * remained_ticks
        else:
            # nothing finished yet, no basis for an estimate
            est_remain_time = 0

        # set
        self.progress = progress
        self.att = avg_tick_time
        self.ert = est_remain_time

        # render
        self.render()

### Detect Env

In [None]:
# True when the KAGGLE_KERNEL_RUN_TYPE environment variable is set.
ENV_KAGGLE = 'KAGGLE_KERNEL_RUN_TYPE' in os.environ

### Path Definitions

In [None]:
# Locations of the input CSV and of the pickled models / score files.
if ENV_KAGGLE:
    # Kaggle layout: dataset folder for input, working dir for output.
    PATH_ROOT = '/kaggle/working'
    PATH_DATASET = '/kaggle/input/personal-key-indicators-of-heart-disease'
    PATH_CSV = PATH_DATASET
else:
    # Local layout: everything relative to the current directory.
    PATH_ROOT = '.'
    PATH_DATASET = path.join(PATH_ROOT, 'dataset')
    PATH_CSV = path.join(PATH_DATASET, 'csv')

PATH_MODELS = path.join(PATH_ROOT, 'models')

# Create directories.
Path(PATH_MODELS).mkdir(parents=True, exist_ok=True)

### Constants

In [None]:
# Name of the label column and its raw class values.
TARGET_COLUMN = 'HeartDisease'
TARGET_CLASSES = ('No', 'Yes')
# (column, classes) pair in the form Model unpacks in parse_args().
TARGET = (TARGET_COLUMN, TARGET_CLASSES)

# Hyperparameter source used by Trainer.get_estimator_params():
# GRID_SEARCH=0, DEFAULTS=1, PREDEFINED=2 (values assigned by enum()).
PARAM_STRATEGY = enum('GRID_SEARCH', 'DEFAULTS', 'PREDEFINED')

### Configs

In [None]:
# File name of the dataset CSV inside PATH_CSV (differs on Kaggle).
CFG_CSV_NAME = 'heart_2020_cleaned.csv' if ENV_KAGGLE else 'data.csv'

# Hyperparameters

In [None]:
# (score_type, score_fn, score_cls) as unpacked by Model.parse_args():
# which score group to rank by ('raw' or 'moved'), the metric name, and the
# encoded class whose metrics get the unprefixed names.
HP_SCORING = ('moved', 'f1', 1)
# random_state used for the stratified splits
HP_SEED = 339
# (n_test_split, n_grid_split): outer shuffle splits, GridSearchCV folds
HP_CV_SPLITS = (10, 5)
# fraction of the data held out as test set in each outer split
HP_TEST_SIZE = 0.2

# Import Dataset

In [None]:
# read csv
data = pd.read_csv(path.join(PATH_CSV, CFG_CSV_NAME), encoding='utf-8')

# drop duplicates
# (the index is rebuilt as 0..n-1 afterwards; later code addresses rows with
# .loc using positions produced by the sklearn splitters)
data = data.drop_duplicates().reset_index(drop=True)

### Limit Data

In [None]:
# limit data helper
def use_limited_data(data, limit):
    """Return a stratified random subset of ``data`` with a fresh 0..n-1 index.

    Args:
        data: DataFrame containing the ``TARGET_COLUMN`` column.
        limit: size of the subset, passed as ``test_size`` to
            ``StratifiedShuffleSplit`` (an absolute row count or a fraction).

    Returns:
        A new DataFrame holding the selected rows; class proportions of
        ``TARGET_COLUMN`` are preserved by the stratified split. The
        selection is reproducible because the split is seeded with ``HP_SEED``.
    """
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=limit, random_state=HP_SEED)
    _, subset_positions = next(iter(splitter.split(data, data[TARGET_COLUMN])))

    # The splitter yields *positional* indices, so select with .iloc; .loc was
    # only correct when the frame happened to carry a 0..n-1 index.
    return data.iloc[subset_positions].reset_index(drop=True)

# always keep full data
# (same object as `data`, not a copy; it only stays "full" as long as `data`
# is rebound rather than modified in place)
full_data = data

# use limited data (for local tests)
# data = use_limited_data(full_data, round(len(full_data) * 0.1)) # percent

### Simple EDA

In [None]:
# (rows, columns), then per-column dtype and non-null summary
data.shape
data.info()

In [None]:
# first rows of the dataset
data.head()

In [None]:
# summary statistics, transposed so each described column is a row
data.describe().T

In [None]:
# number of missing values per column
data.isnull().sum()

In [None]:
# number of distinct values per column
data.nunique()

In [None]:
# Every column that is not float64 is treated as categorical here.
categorical_columns = data.select_dtypes(exclude=['float64']).columns

# For each such column build a table of relative and absolute value counts.
# NOTE(review): the table is a bare expression inside the loop; whether it is
# shown presumably relies on the "all" interactivity setting - confirm.
for col in categorical_columns:
    col = data[col]  # rebinds the loop variable from column name to Series
    pd.concat([col.value_counts(normalize=True), col.value_counts()], axis=1)

# Pipeline Setup

## Sampler

In [None]:
def sample(N, data, target, classes, indexes, random_state=None):
    """Resample the training rows per class and rebuild data + indexes.

    Args:
        N: one entry per class in ``classes``. An int is the absolute number
            of training rows to draw for that class; a float (Python or
            numpy) is a multiplier of that class's current training size.
        data: DataFrame holding both training and test rows.
        target: name of the label column.
        classes: class values, in the same order as ``N``.
        indexes: ``(test_index, train_index)`` index labels into ``data``
            (note the order: test first).
        random_state: optional seed forwarded to ``DataFrame.sample`` to make
            the draw and the shuffle reproducible. ``None`` keeps the
            original unseeded behaviour.

    Returns:
        ``(data, test_index, train_index)`` where ``data`` is a new frame with
        the shuffled, resampled training rows first and the untouched test
        rows after them, re-indexed 0..n-1, and the two index lists address
        those positions. A class is sampled with replacement only when more
        rows are requested than it has.

    Raises:
        ValueError: if ``N`` and ``classes`` differ in length.
    """
    requested = list(N)
    if len(requested) != len(classes):
        raise ValueError('N must provide exactly one size per class')

    # separate data
    test_index, train_index = indexes
    test = data.loc[test_index]
    train = data.loc[train_index]

    # per-class training rows and their current sizes
    groups = [train[train[target] == c] for c in classes]
    sizes = [len(group) for group in groups]

    # resolve fractional requests into absolute row counts; isinstance also
    # accepts numpy floats, which `type(n) is float` silently skipped
    for i, n in enumerate(requested):
        if isinstance(n, float):
            requested[i] = int(round(n * sizes[i]))

    # do sampling (oversample with replacement only when necessary)
    samples = [
        group.sample(n, replace=(size < n), random_state=random_state)
        for group, n, size in zip(groups, requested, sizes)
    ]

    # calculate final sizes
    train_size = sum(requested)
    test_size = len(test_index)

    # calculate next indexes: train rows come first, test rows follow
    train_index = list(range(train_size))
    test_index = list(range(train_size, train_size + test_size))

    # create shuffled train dataframe
    samples = (
        pd.concat(samples, axis=0)
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )

    # concatenate data
    data = pd.concat([samples, test], axis=0).reset_index(drop=True)

    # return data and indexes
    return data, test_index, train_index

## Preprocessor

In [None]:
class OutlierRemover:
    """Drop rows whose values fall outside the 1.5 * IQR fences of given columns.

    ``fit`` learns one ``(lower, upper)`` band per column, ``transform``
    removes every row that lies outside the band of any fitted column.
    """

    @staticmethod
    def numeric(data):
        """Create a remover covering all float64 / int64 columns of ``data``."""
        numeric_frame = data.select_dtypes(include=['float64', 'int64'])
        return OutlierRemover(numeric_frame.columns.to_list())

    def __init__(self, cols):
        self.cols = cols
        # column name -> (lower_band, upper_band), filled by fit()
        self.bands = {}

    def fit(self, data):
        """Compute and store the band of every configured column."""
        for name in self.cols:
            first_quartile, third_quartile = (
                data[name].quantile(q) for q in (0.25, 0.75)
            )
            spread = third_quartile - first_quartile
            self.bands[name] = (
                first_quartile - 1.5 * spread,
                third_quartile + 1.5 * spread,
            )

    def transform(self, data):
        """Return ``data`` without the rows that violate a fitted band."""
        for name in self.cols:
            low, high = self.bands[name]
            values = data[name]
            # keep a row unless it is strictly below or strictly above the band
            keep = ~(values < low) & ~(values > high)
            data = data[keep]

        return data

    def fit_transform(self, data):
        """Fit on ``data`` and return it filtered."""
        self.fit(data)
        return self.transform(data)
    
class MultiLabelEncoder():
    """Apply an independent ``LabelEncoder`` to each of several columns."""

    @staticmethod
    def binary(data):
        """Create an encoder for every column of ``data`` with exactly 2 distinct values."""
        cols = [col for col in data.columns if data[col].nunique() == 2]
        return MultiLabelEncoder(cols)

    def __init__(self, cols):
        self.cols = cols
        # one encoder per column so each keeps its own class mapping
        self.encoders = {col: LabelEncoder() for col in cols}

    def fit(self, data):
        """Fit each column's encoder on that column of ``data``."""
        for col in self.cols:
            self.encoders[col].fit(data[col])

    def transform(self, data):
        """Return a copy of ``data`` with the configured columns label-encoded.

        The input frame is left untouched; previously the columns were
        overwritten in place, which silently altered the caller's data.
        """
        data = data.copy()

        for col in self.cols:
            data[col] = self.encoders[col].transform(data[col])

        return data

    def fit_transform(self, data):
        """Fit on ``data`` and return the encoded copy."""
        self.fit(data)
        return self.transform(data)

class Preprocessor:
    """Holds a dataset plus train/test index labels and preprocesses it.

    ``apply`` runs, in order: outlier removal, label encoding of binary
    columns, one-hot encoding, ordinal encoding, standard scaling (fitted on
    the training rows) and optional PCA (fitted on the training rows). The
    processed frame is available as ``.data``; ``chop`` returns the numpy
    arrays used for training and evaluation.
    """

    @staticmethod
    def params(override=None):
        """Return the default options, overlaid with ``override`` if given."""
        defaults = {
            'target': 'HeartDisease',
            'outlier_strategy': 'all',
            'encode_labels': True,
            'pca': False,
            'onehot_encoding': ['Race', 'Diabetic'],
            'ordinal_encoding': {
                'GenHealth': ['Poor', 'Fair', 'Good', 'Very good','Excellent'],
                'AgeCategory': ['18-24', '25-29','30-34', '35-39', '40-44', '45-49', '50-54',
                                '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older']
            }
        }

        return merge(defaults, override if override is not None else {})

    @staticmethod
    def process(*args, **kwargs):
        """Construct a Preprocessor, run ``apply`` and return it."""
        processor = Preprocessor(*args, **kwargs)
        processor.apply()
        return processor

    def __init__(self, data, test_index=None, train_index=None, options=None):
        """
        Args:
            data: DataFrame containing features and the target column.
            test_index: index labels of the test rows (default: none).
            train_index: index labels of the training rows. When omitted,
                every row (0..len(data)-1) is training data and there is no
                test set, regardless of ``test_index``.
            options: option dict as produced by ``Preprocessor.params``;
                the defaults are used when omitted.
        """
        if options is None:
            options = Preprocessor.params()

        if train_index is None:
            train_index = list(range(len(data)))
            test_index = []
        elif test_index is None:
            # np.array(None) is 0-dimensional and would break update_meta()
            test_index = []

        self.data = data
        self.test_index = np.array(test_index)
        self.train_index = np.array(train_index)
        self.options = options

        self.target = self.options['target']
        self.update_meta()

    def update_meta(self):
        """Refresh the cached column information and has_train/has_test flags."""
        self.features_mask = self.data.columns != self.target
        self.columns = self.data.columns
        self.feature_columns = self.columns[self.features_mask]
        self.has_train = self.train_index.shape[0] > 0
        self.has_test = self.test_index.shape[0] > 0

    def override_data(self, value):
        """Replace all feature columns with ``value``, keeping index and labels."""
        labels = self.y
        self.data = pd.DataFrame(data=value, index=self.data.index)
        self.data[self.target] = labels
        self.update_meta()

    def get_x(self, df):
        """Feature matrix of ``df`` (everything except the target) as numpy."""
        return df.drop(self.target, axis=1).to_numpy()

    def get_y(self, df):
        """Target vector of ``df`` as numpy."""
        return df[self.target].to_numpy()

    # .x getter/setter
    @property
    def x(self):
        return self.get_x(self.data)

    @x.setter
    def x(self, value):
        self.data.loc[:, self.features_mask] = value

    # .y getter/setter
    @property
    def y(self):
        return self.get_y(self.data)

    @y.setter
    def y(self, value):
        self.data.loc[:, [self.target]] = value

    # .test getter/setter
    @property
    def test(self):
        return self.data.loc[self.test_index]

    @test.setter
    def test(self, value):
        self.data.loc[self.test_index] = value

    # .train getter/setter
    @property
    def train(self):
        return self.data.loc[self.train_index]

    @train.setter
    def train(self, value):
        self.data.loc[self.train_index] = value

    # .x_test getter/setter
    @property
    def x_test(self):
        return self.get_x(self.test)

    @x_test.setter
    def x_test(self, value):
        self.data.loc[self.test_index, self.features_mask] = value

    # .x_train getter/setter
    @property
    def x_train(self):
        return self.get_x(self.train)

    @x_train.setter
    def x_train(self, value):
        self.data.loc[self.train_index, self.features_mask] = value

    # .y_test getter
    @property
    def y_test(self):
        return self.get_y(self.test)

    # .y_train getter
    @property
    def y_train(self):
        return self.get_y(self.train)

    def chop(self):
        """Return ``(x_test, y_test, x_train, y_train)`` as numpy arrays."""
        return self.x_test, self.y_test, self.x_train, self.y_train

    def _remove_outliers(self):
        """Apply the configured outlier strategy to ``self.data``."""
        outlier_strategy = self.options.get('outlier_strategy', 'train_only')
        if outlier_strategy not in ('train_only', 'include_test', 'all'):
            # unknown / disabled strategy: keep every row
            return

        outlier_remover = OutlierRemover.numeric(self.data)
        if outlier_strategy == 'train_only':
            # Bands come from the training rows and only training rows are
            # dropped. Assigning the filtered frame back through .loc would
            # not remove anything, so drop the rejected labels explicitly.
            kept_train = outlier_remover.fit_transform(self.train)
            rejected = np.setdiff1d(self.train_index, kept_train.index.values)
            self.data = self.data.drop(index=rejected)
        elif outlier_strategy == 'include_test':
            # Bands come from the training rows, filtering hits all rows.
            # (`train` is a property; calling it raised a TypeError.)
            outlier_remover.fit(self.train)
            self.data = outlier_remover.transform(self.data)
        else:
            self.data = outlier_remover.fit_transform(self.data)

    def apply(self):
        """Run the whole preprocessing pipeline on ``self.data``."""
        source = self.data

        # remove outliers
        self._remove_outliers()

        # The steps below write into self.data. Work on a private copy when
        # outlier removal did not already produce a new frame, so the frame
        # handed in by the caller is never modified.
        if self.data is source:
            self.data = source.copy()

        # update removed indexes.
        indexes = self.data.index.values
        self.train_index = self.train_index[np.isin(self.train_index, indexes)]
        self.test_index = self.test_index[np.isin(self.test_index, indexes)]
        self.update_meta()

        # encode labels
        encode_labels = self.options.get('encode_labels', True)
        if encode_labels:
            self.data = MultiLabelEncoder.binary(self.data).fit_transform(self.data)

        # one-hot encoding (changes the column set, so refresh the metadata)
        onehot_encoding = self.options.get('onehot_encoding', None)
        if onehot_encoding is not None:
            cols = onehot_encoding
            self.data = pd.get_dummies(self.data, columns=cols, prefix=cols)
            self.update_meta()

        # ordinal encoding
        ordinal_encoding = self.options.get('ordinal_encoding', None)
        if ordinal_encoding is not None:
            for col, ordinals in ordinal_encoding.items():
                encoder = OrdinalEncoder(categories=[ordinals])
                self.data[[col]] = encoder.fit_transform(self.data[[col]])

        # scaler: fitted on the training rows only, then applied to the test rows
        scale = self.options.get('scale', True)
        if scale:
            scaler = StandardScaler()
            self.x_train = scaler.fit_transform(self.x_train)

            if self.has_test:
                self.x_test = scaler.transform(self.x_test)

        # PCA: fitted on the training rows, applied to every row
        pca_n = self.options.get('pca', False)
        if pca_n:
            pca = PCA(n_components=pca_n, svd_solver='full', copy=True)
            pca.fit(self.x_train)
            result = pca.transform(self.x)
            self.override_data(result)

## Model

In [None]:
class Model:
    """Trains one estimator class over several stratified shuffle splits.

    For every split the data is optionally resampled, preprocessed, fitted
    (directly or through GridSearchCV), evaluated on the split's test rows and
    stored in ``results``. After all splits the scores are computed and the
    result with the highest configured score becomes ``best_result``.
    """

    def __init__(self, estimator, data, target, scoring, n_splits, test_size, seed,
                 sampling=None, prep_params=None, hp_grid=None):
        """
        Args:
            estimator: estimator class (called to create instances).
            data: DataFrame with features and the target column.
            target: ``(target_column, target_classes)`` pair.
            scoring: ``(score_type, score_fn, score_cls)`` triple.
            n_splits: ``(n_test_split, n_grid_split)`` pair.
            test_size: test_size of the outer StratifiedShuffleSplit.
            seed: random_state of the outer StratifiedShuffleSplit.
            sampling: None, per-class sample sizes, or a callable that
                receives the split's training rows and returns the sizes.
            prep_params: options passed to the Preprocessor.
            hp_grid: None, a dict of single values, or a dict containing
                lists (which triggers a grid search).
        """
        
        self.estimator = estimator
        self.data = data
        self.target = target
        self.scoring = scoring
        self.n_splits = n_splits
        self.test_size = test_size
        self.seed = seed
        self.sampling = sampling
        self.prep_params = prep_params
        self.hp_grid = hp_grid

        self.trained = False
        self.results = []
        self.best_result = None
        
        self.parse_args()
    
    def parse_args(self):
        """Unpack the tuple arguments into individual attributes."""
        self.n_test_split, self.n_grid_split = self.n_splits
        
        self.score_type, self.score_fn, self.score_cls = self.scoring
        
        self.target_column, self.target_classes = self.target
        # classes are referred to by position: 0, 1, ...
        self.target_encoded_classes = list(range(len(self.target_classes)))
    
    # The properties below read from best_result, which is None until
    # train() has finished.
    @property
    def model(self):
        return self.best_result['model']
    
    @property
    def preprocessor(self):
        return self.best_result['preprocessor']
    
    @property
    def scores(self):
        return self.best_result['scores']
    
    @property
    def score(self):
        return self.scores[self.score_type][self.score_fn]
    
    def split(self):
        """Return the iterator of (train_index, test_index) outer splits."""
        split = StratifiedShuffleSplit(n_splits=self.n_test_split, test_size=self.test_size, random_state=self.seed)
        return split.split(self.data, self.data[self.target_column])
    
    def train(self, tick=None):
        """Fit and evaluate one estimator per outer split.

        Args:
            tick: optional callable, called at the start of every split with
                the fraction of splits already completed.

        Each call appends to ``results``; nothing is cleared beforehand.
        """
        for split_index, (train_index, test_index) in enumerate(self.split()):
            if tick is not None:
                tick(split_index/self.n_test_split)
            
            # get values
            data = self.data
            sampling = self.sampling
            target_column = self.target_column
            target_classes = self.target_classes
            
            # default values
            best_params = None
            best_estimator = None
            
            # do sampling
            if sampling is not None:
                # set sample sizes
                sample_sizes = sampling

                # if function
                if callable(sampling):
                    train_data = data.loc[train_index]
                    sample_sizes = sampling(train_data)

                # sample data
                # (sample() takes and returns the indexes test-first)
                data, test_index, train_index = sample(sample_sizes, data, target_column, target_classes, (test_index, train_index))
            
            # preprocess data
            preprocessor = Preprocessor.process(data, test_index, train_index, self.prep_params)
            X_test, Y_test, X_train, Y_train = preprocessor.chop()
                        
            # fix grid
            is_grid, hp_grid = self.fix_hp_grid(self.hp_grid)
            
            # if given parameters are grid do grid search
            if is_grid:
                # create grid searcher
                gscv = GridSearchCV(estimator=self.estimator(), param_grid=hp_grid,
                                  cv=self.n_grid_split, scoring=self.score_fn, n_jobs=-1)
                
                # fit
                gscv.fit(X_train, Y_train)
                
                # collect best results
                best_params = gscv.best_params_
                best_estimator = gscv.best_estimator_
            
            # if given parameters are singular or none do direct training.
            else:
                # create and fit estimator
                best_estimator = self.estimator(**hp_grid)
                best_estimator.fit(X_train, Y_train)
                best_params = hp_grid
            
            # get predictions
            # (the estimator must provide predict_proba)
            Y_pred = best_estimator.predict(X_test)
            Y_prob = best_estimator.predict_proba(X_test)
            
            # create result 
            self.results.append({
                'y_true': Y_test, 'y_pred': Y_pred, 'y_prob': Y_prob, 
                'params': best_params, 'model': best_estimator,
                'preprocessor': preprocessor, 'seed': self.seed,
            })
            
        self._calculate_scores()
        self._set_best_result()
        self.trained = True
    
    def fix_hp_grid(self, hp_grid=None):
        """Classify ``hp_grid`` and normalise it.

        Returns:
            ``(is_grid, hp_grid)``. ``None`` becomes ``(False, {})``. If any
            value is a sequence, the dict is a grid and every scalar value is
            wrapped in a one-element list; otherwise it is returned unchanged.
        """
        if hp_grid is None:
            return False, {}

        # check if there is multidimensional value.
        is_grid = sum([np.ndim(v) for v in hp_grid.values()]) > 0
        
        # fix singular values if suppose to be a grid.
        if is_grid:
            hp_grid = {k: [v] if np.ndim(v) == 0 else v for k, v in hp_grid.items()}

        return is_grid, hp_grid
        
    def _calculate_scores(self):
        """Attach 'raw' and 'moved' score dicts to every entry of ``results``.

        'raw' scores use the estimator's own predictions. 'moved' scores use
        predictions obtained by thresholding the probability of class 1 at
        the threshold (0.5 down to 0.01 in steps of 0.01) that maximises the
        configured score function; that threshold is stored as
        ``moved_threshold``.
        """
        classes = self.target_encoded_classes[::-1]

        score_fns = {
            'accuracy': accuracy_score,
            'f1_micro': partial(f1_score, average='micro'),
        }

        metrics = {
            'f1': f1_score,
            'recall': recall_score,
            'precision': precision_score,
        }

        # per-class metrics: plain name for score_cls, '<cls>.<metric>' otherwise
        for cls in classes:
            for metric, fn in metrics.items():
                name = metric if cls == self.score_cls else f'{cls}.{metric}'
                score_fns[name] = partial(fn, pos_label=cls)

        # calculate for each result.
        for result in self.results:
            # get targets
            Y_true, Y_pred, Y_prob = result['y_true'], result['y_pred'], result['y_prob']

            # create scores
            scores = result['scores'] = dict(raw={}, moved={})

            # calculate raw scores
            for name, fn in score_fns.items():
                scores['raw'][name] = fn(Y_true, Y_pred)

            # calculate threshold moved scores
            moved_score = -1
            moved_pred = None
            moved_threshold = None

            # find the best threshold
            # (strict '>' keeps the highest threshold among equal scores)
            for threshold in np.arange(0.5, 0, -0.01):
                pred = (Y_prob[:, 1] >= threshold).astype(int)
                score = score_fns[self.score_fn](Y_true, pred)

                if score > moved_score:
                    moved_pred = pred
                    moved_score = score
                    moved_threshold = threshold

            # calculate threshold moved scores
            for name, fn in score_fns.items():
                scores['moved'][name] = fn(Y_true, moved_pred)

            # keep the threshold info
            result['moved_threshold'] = moved_threshold
    
    def _set_best_result(self):
        """Select the result with the highest configured score (first wins on ties)."""
        best_score = -1
        best_result = None
        
        for result in self.results:
            score = result['scores'][self.score_type][self.score_fn]

            if score > best_score:
                best_score = score
                best_result = result
        
        self.best_result = best_result

## Trainer

In [None]:
class Trainer:
    """Trains a set of named estimators on one dataset / configuration.

    Trained ``Model`` objects are stored as attributes named after the
    estimator (``trainer.<name>``) and optionally pickled under
    ``PATH_MODELS`` as ``<trainer name>_<estimator name>.pickle``.
    """

    def __init__(self, name, data, target, scoring, n_splits, test_size, seed, sampling=None, prep_params=None):
        self.name = name
        self.data = data
        self.target = target
        self.scoring = scoring
        self.n_splits = n_splits
        self.test_size = test_size
        self.seed = seed
        self.sampling = sampling
        self.prep_params = prep_params

        # estimator name -> (name, estimator class, hyperparameter grid)
        self.estimators = {}
        self.param_strategy = PARAM_STRATEGY.DEFAULTS
        self.predefined_params = {}

    def add_estimator(self, name, estimator, hp_grid=None):
        """Register an estimator class and its grid under ``name``."""
        self.estimators[name] = (name, estimator, hp_grid)

    def set_predefined_params(self, params):
        """Set the ``{estimator name: params}`` dict used by PREDEFINED."""
        self.predefined_params = params

    def use_param_strategy(self, strategy):
        """Choose where hyperparameters come from (a PARAM_STRATEGY value)."""
        self.param_strategy = strategy

    def get_estimator_params(self, name):
        """Return the hyperparameters for ``name`` under the active strategy.

        Returns ``None`` for an unrecognised strategy.
        """
        if self.param_strategy == PARAM_STRATEGY.DEFAULTS:
            return {}

        if self.param_strategy == PARAM_STRATEGY.PREDEFINED:
            return self.predefined_params.get(name, {})

        if self.param_strategy == PARAM_STRATEGY.GRID_SEARCH:
            return self.estimators[name][2]

        return None

    def get_model_path(self, name):
        """Path of the pickle file for estimator ``name``."""
        return path.join(PATH_MODELS, f'{self.name}_{name}.pickle')

    def save_model(self, name, model):
        """Pickle ``model`` to its model path."""
        model_path = self.get_model_path(name)
        with open(model_path, 'wb') as file:
            pickle.dump(model, file)

    def load_model(self, name):
        """Unpickle and return the stored model for ``name``."""
        # NOTE: pickle executes code on load; only load files this notebook
        # wrote itself.
        model_path = self.get_model_path(name)
        with open(model_path, 'rb') as file:
            return pickle.load(file)

    def train_estimators(self, **kwargs):
        """Train several estimators lazily, yielding ``(name, model)`` pairs.

        Keyword Args:
            estimators: names to train (default: all registered ones).
            Everything else is forwarded to ``train_estimator``.
        """
        estimators = kwargs.pop('estimators', self.estimators.keys())
        for name in estimators:
            print_red(f'Estimator: {name}\n')
            model = self.train_estimator(name, **kwargs)
            yield (name, model)

    def train_estimator(self, name, reset=False, seed=None, save=True, print_duration=True):
        """Return a trained Model for ``name``, from disk if possible.

        Args:
            name: registered estimator name.
            reset: when False, a previously pickled model is loaded and
                returned instead of training; when True, always train.
            seed: split seed (default: the trainer's seed).
            save: pickle the model after training.
            print_duration: show a live progress line while training.
        """
        if seed is None:
            seed = self.seed

        if not reset:
            try:
                model = self.load_model(name)
            except Exception:
                # Loading is best effort: a missing or unreadable pickle just
                # means the model has to be trained. Catching Exception (not
                # a bare except) lets KeyboardInterrupt / SystemExit through.
                pass
            else:
                setattr(self, name, model)

                print(f'Model {name} is loaded from disk successfully.')
                return model

        name, estimator, _ = self.estimators[name]
        params = self.get_estimator_params(name)

        model = Model(estimator, self.data, self.target, self.scoring, self.n_splits,
                      self.test_size, seed, self.sampling, self.prep_params, params)

        if print_duration:
            with PrintDuration() as tick:
                model.train(tick)
        else:
            model.train()

        setattr(self, name, model)
        if save:
            self.save_model(name, model)

        return model

    def search_best_seed(self, name, seed_range=range(100)):
        """Train ``name`` once per seed and return the best-scoring seed.

        Returns 0 when ``seed_range`` is empty.
        """
        output = Output()

        best_score = -1
        best_seed = 0

        print(f'Searching best seed for {name}')

        for seed in seed_range:
            output.update(f'  -> Testing seed {seed}')
            # reset=True is essential: otherwise a pickled model would be
            # returned for every seed and the seed would have no effect.
            model = self.train_estimator(name=name, reset=True, seed=seed,
                                         save=False, print_duration=False)
            score = model.score

            if score > best_score:
                best_score = score
                best_seed = seed
                print(f'* {seed} -> {score}')

        print(f'Best seed found as {best_seed}')
        return best_seed

    def get_results_df(self, name, shuffle=False, ascending=False):
        """DataFrame of true label, prediction and |difference| for the best split.

        Rows are shuffled when ``shuffle`` is set, otherwise sorted by the
        difference (largest first unless ``ascending``).
        """
        model = getattr(self, name)

        true = model.best_result['y_true'].reshape(-1)
        pred = model.best_result['y_pred'].reshape(-1)

        df = pd.DataFrame(data={
            'true': true,
            'prediction': pred,
            'diff': np.absolute(true - pred)
        })

        if shuffle:
            df = df.sample(frac=1)
        else:
            df = df.sort_values('diff', ascending=ascending)

        return df

    def get_scores_df(self, name):
        """DataFrame of the best split's scores: one row per score group."""
        model = getattr(self, name)

        scores = model.scores
        index = list(scores.keys())
        cols = list(scores[index[0]].keys())
        values = [list(vals.values()) for vals in scores.values()]

        return pd.DataFrame(values, index, cols)

## SetTrainer

In [None]:
class SetTrainer:
    """Runs the same set of estimators through several ``Trainer`` configs.

    Estimators are registered once; every trainer added afterwards receives
    its own copy of them. Trainers are stored as attributes named after the
    trainer (``set_trainer.<name>``).
    """

    def __init__(self):
        # estimator name -> (name, estimator class, hyperparameter grid)
        self.estimators = {}
        self.trainer_names = []
        self.param_strategy = PARAM_STRATEGY.GRID_SEARCH

    def add_estimator(self, name, estimator, hp_grid=None):
        """Register an estimator for all trainers added later."""
        self.estimators[name] = (name, estimator, hp_grid)

    def add_trainer(self, **kwargs):
        """Create a ``Trainer(**kwargs)`` and give it the registered estimators.

        ``kwargs`` must contain ``name``. The trainer uses the parameter
        strategy that is active at the time of this call.
        """
        name = kwargs['name']
        trainer = Trainer(**kwargs)

        for estimator_name, (_, estimator, hp_grid) in self.estimators.items():
            # each trainer gets an independent copy of the grid
            trainer.add_estimator(estimator_name, estimator, deepcopy(hp_grid))

        trainer.use_param_strategy(self.param_strategy)

        self.trainer_names.append(name)
        setattr(self, name, trainer)

    def use_param_strategy(self, strategy):
        """Set the strategy handed to trainers added from now on."""
        self.param_strategy = strategy

    def run_trainer(self, name, **kwargs):
        """Train the estimators of one trainer.

        Yields ``(trainer name, trainer, model name, model)``; ``kwargs`` are
        forwarded to ``Trainer.train_estimators``.
        """
        trainer = getattr(self, name)
        for (model_name, model) in trainer.train_estimators(**kwargs):
            yield (name, trainer, model_name, model)

    def run_all_trainers(self, **kwargs):
        """Run several trainers in sequence, yielding like ``run_trainer``.

        Keyword Args:
            trainers: names to run (default: all added trainers).
            Everything else is forwarded to ``run_trainer``.
        """
        trainers = kwargs.pop('trainers', self.trainer_names)
        count = len(trainers)

        for index, name in enumerate(trainers):
            print_blue(f'Trainer {index+1}/{count}: {name}\n')
            yield from self.run_trainer(name, **kwargs)

    def save_scores(self):
        """Collect the scores of all trained models and write ``scores.csv``.

        Returns:
            DataFrame with one row per trained (trainer, model) pair and one
            column per score, named ``<first letter of group>/<metric>``.
            The CSV keeps collection order; the returned frame is sorted by
            model name. With no trained model the frame has only the
            ``trainer`` and ``model`` columns and no rows.
        """
        values = []
        score_cols = None

        for trainer_name in self.trainer_names:
            trainer = getattr(self, trainer_name)

            for model_name in trainer.estimators:
                model = getattr(trainer, model_name, None)

                if model is None or not model.trained:
                    continue

                # score column names are taken from the first trained model
                if score_cols is None:
                    score_cols = [f'{group[:1]}/{fn}' for group in model.scores for fn in model.scores[group]]

                value = [trainer_name, model_name[:10]]
                value += [round(val, 3) for group in model.scores.values() for val in group.values()]
                values.append(value)

        # score_cols stays None when nothing was trained; `+= None` would raise
        columns = ['trainer', 'model'] + (score_cols or [])

        df = pd.DataFrame(values, columns=columns)
        df.to_csv(path.join(PATH_MODELS, 'scores.csv'), index=False)
        df.sort_values(by='model', inplace=True)

        return df

## Helper Functions

### Feature Importance

In [None]:
def calculate_feature_importances(data, target, scoring, n_splits, test_size, seed):
    """Train a random forest with default parameters and plot its feature importances.

    The arguments are passed straight to ``Model``. The importances of the
    best split's forest are drawn as a horizontal bar chart, sorted
    ascending, labelled with the preprocessor's feature column names.
    """
    forest = Model(RandomForestClassifier, data, target, scoring, n_splits, test_size, seed)
    forest.train()

    weights = forest.model.feature_importances_
    order = np.argsort(weights)
    positions = range(len(order))
    labels = forest.preprocessor.feature_columns[order]

    plt.title('Feature Importances')
    plt.barh(positions, weights[order], color='b', align='center')
    plt.yticks(positions, labels)
    plt.xlabel('Relative Importance')
    plt.show()

# Run on a stratified 3200-row subset of the full data.
data_ = use_limited_data(full_data, 3200)
calculate_feature_importances(data=data_, target=TARGET, scoring=HP_SCORING,
                              n_splits=HP_CV_SPLITS, test_size=HP_TEST_SIZE, seed=HP_SEED)

### Explained Variance Ratio

In [None]:
def calculate_explained_variance_ratio(data, p=0.95):
    """Report how many PCA components explain at least ``p`` of the variance.

    ``data`` is preprocessed with the default ``Preprocessor`` options (all
    rows treated as training data). A full PCA is fitted on the feature
    columns only, the smallest sufficient number of components is printed and
    the cumulative explained variance curve is plotted.

    Args:
        data: raw DataFrame including the target column.
        p: required cumulative explained variance ratio, between 0 and 1.
    """
    # Use the feature matrix only: the Preprocessor's own PCA step is fitted
    # on features, so the target column must not take part here either.
    X = Preprocessor.process(data).x

    pca = PCA(n_components=None, svd_solver='full', copy=True)
    pca.fit(X)
    cumulative = np.cumsum(pca.explained_variance_ratio_)

    # extract the smallest number of components which
    # explain at least p% (e.g. 80%) of the variance
    n_components = 1 + np.argmax(cumulative >= p)
    print(f'For p={int(p*100)}% n_components should be {n_components}\n')

    plt.plot(cumulative)
    plt.xlabel('number of components')
    plt.ylabel('cumulative explained variance')
    plt.show()
    
# Run on a stratified 3200-row subset of the full data, for a 95% target.
data_ = use_limited_data(full_data, 3200)
calculate_explained_variance_ratio(data_, 0.95)

### Best Seed Calculation

In [None]:
def calculate_best_seed(seed_range, estimators, data, target, scoring, n_splits, test_size):
    """Run a seed search for each estimator class in `estimators`.

    A single `Trainer` named 'trainer' is created with `seed=0`. Each
    estimator class is registered on it under its own class name, and
    `search_best_seed` is then invoked for that name with `seed_range`.
    Nothing is returned.
    """
    seed_trainer = Trainer('trainer', data, target, scoring, n_splits, test_size, seed=0)

    # pair every estimator class with the name it is registered under
    named_estimators = ((cls.__name__, cls) for cls in estimators)

    for label, estimator_cls in named_estimators:
        seed_trainer.add_estimator(label, estimator_cls)
        seed_trainer.search_best_seed(label, seed_range)

# NOTE(review): range(0) is empty, so no seed values are passed to the search;
# presumably this disables the (expensive) seed search — widen the range to run it.
calculate_best_seed(seed_range=range(0), estimators=[LogisticRegression],
                    data=full_data, target=TARGET, scoring=HP_SCORING,
                    n_splits=(3, 1), test_size=HP_TEST_SIZE)

# Model Definitions

#### Parameter Strategies

- `PARAM_STRATEGY.GRID_SEARCH`: runs a grid search over the hyperparameter grid registered with each estimator (see the `add_estimator` calls below).
- `PARAM_STRATEGY.DEFAULTS`: uses an empty dict, `{}`, as parameters, so each estimator's default parameters are used.
- `PARAM_STRATEGY.PREDEFINED`: uses the predefined, single-valued parameters set on each trainer.

In [None]:
# shared SetTrainer instance; the estimators and trainers in the cells below are registered on it
set_trainer = SetTrainer()

# set default trainer parameter strategy
set_trainer.use_param_strategy(PARAM_STRATEGY.GRID_SEARCH)

## Logistic Regression 
[docs](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [None]:
# Add estimator
# The default 'lbfgs' solver only supports the 'l2' penalty, so the 'l1' and
# 'elasticnet' candidates could never be fitted. 'saga' is the solver that
# supports all three penalties, and 'elasticnet' additionally requires an
# l1_ratio (it is ignored for the other penalties).
set_trainer.add_estimator('logistic', LogisticRegression, {
    'penalty': ['l2', 'l1', 'elasticnet'],
    'solver': ['saga'],
    'l1_ratio': [0.5],
    'C': [0.8, 1.0, 1.2],
    'max_iter': [50, 100, 200],
})

## Gaussian Naive Bayes 
[docs](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html), [comparing naive bayes classification algorithms](https://towardsdatascience.com/comparing-a-variety-of-naive-bayes-classification-algorithms-fc5fa298379e)

In [None]:
# Add estimator: GaussianNB under the name 'nb_gaussian',
# with the candidate values for its hyperparameter grid.
set_trainer.add_estimator('nb_gaussian', GaussianNB, {
    'var_smoothing': [0, 1e-10, 1e-9, 1e-8]
})

## kNN 
[docs](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

In [None]:
# Add estimator: KNeighborsClassifier under the name 'knn',
# with the candidate values for its hyperparameter grid.
set_trainer.add_estimator('knn', KNeighborsClassifier, {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
})

## Decision Tree 
[docs](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

In [None]:
# Add estimator: DecisionTreeClassifier under the name 'dt',
# with the candidate values for its hyperparameter grid.
set_trainer.add_estimator('dt', DecisionTreeClassifier, {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    # an integer min_samples_split must be at least 2; a value of 1 is
    # rejected by scikit-learn, so the candidates match the 'rf'/'gb' grids
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': [None, 0.8, 0.5],
})

## MLP 
[docs](https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html)

In [None]:
# Add estimator: MLPClassifier under the name 'mlp',
# with the candidate values for its hyperparameter grid.
# Single-element lists pin a parameter to one value for every candidate.
set_trainer.add_estimator('mlp', MLPClassifier, {
    'hidden_layer_sizes': [(100,), (128,128), (256,)],
    'activation': ['relu'],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.001],
    'early_stopping': [True],
})

## Bagging
[docs](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html)

In [None]:
# Add estimator: BaggingClassifier under the name 'bag',
# with the candidate values for its hyperparameter grid.
set_trainer.add_estimator('bag', BaggingClassifier, {
    'n_estimators': [5, 10, 20],
    'max_samples': [0.7, 1.0],
    'max_features': [0.7, 1.0],
})

## Random Forest 
[docs](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

In [None]:
# Add estimator: RandomForestClassifier under the name 'rf',
# with the candidate values for its hyperparameter grid.
set_trainer.add_estimator('rf', RandomForestClassifier, {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'max_features': ["sqrt", 1.0, 0.7],
    'max_samples': [None, 1.0, 0.7],
})

## Gradient Boosting 
[docs](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)

In [None]:
# Add estimator: GradientBoostingClassifier under the name 'gb',
# with the candidate values for its hyperparameter grid.
set_trainer.add_estimator('gb', GradientBoostingClassifier, {
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
    'max_depth': [3, 5],
})

## AdaBoost 
[docs](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html)

In [None]:
# Add estimator: AdaBoostClassifier under the name 'ada',
# with the candidate values for its hyperparameter grid.
# (This section previously registered LGBMClassifier here, duplicating the
# 'lgbm' estimator instead of training AdaBoost.)
set_trainer.add_estimator('ada', AdaBoostClassifier, {
    'n_estimators': [30, 50, 100],
    'learning_rate': [0.8, 1.0, 1.2],
})

## LightGBM
[docs](https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html)

In [None]:
# Add estimator: LGBMClassifier under the name 'lgbm',
# with the candidate values for its hyperparameter grid.
# NOTE(review): LightGBM's 'rf' boosting type needs bagging and/or feature
# sub-sampling to be enabled, and 'subsample' presumably only takes effect
# when 'subsample_freq' > 0 (not set here) — confirm against the LightGBM
# parameter docs for the installed version.
set_trainer.add_estimator('lgbm', LGBMClassifier, {
    'boosting_type': ['gbdt', 'dart', 'goss', 'rf'],
    'learning_rate': [0.05, 0.1, 0.15],
    'n_estimators': [50, 100, 200],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
})

# Training

### Helpers

In [None]:
def default_sampling_size(train_data):
    """Derive the default sampling sizes from the training data.

    Counts the rows of `train_data` whose `TARGET_COLUMN` value equals
    `TARGET_CLASSES[0]` and returns a 2-tuple:

    - a fixed factor of 0.5 for the first class, and
    - 0.4 times that factor times the first-class row count, rounded to the
      nearest integer, for the second class.
    """
    class_counts = train_data[TARGET_COLUMN].value_counts()
    first_class_rows = class_counts[TARGET_CLASSES[0]]

    first_class_factor = 0.5
    second_class_size = round(0.4 * first_class_factor * first_class_rows)

    return (first_class_factor, second_class_size)

## Trainer Definitions

### Default Trainer

In [None]:
# generate preprocessor parameters
prep_params = Preprocessor.params({
    'outlier_strategy': 'all',
    'encode_labels': True,
    'pca': False,
})

# set sampling size (the function object itself is passed on; it is not called here)
sampling_size = default_sampling_size

# add trainer; it is afterwards reachable as `set_trainer.default`
set_trainer.add_trainer(name='default', data=data, target=TARGET, scoring=HP_SCORING,
                        n_splits=HP_CV_SPLITS, test_size=HP_TEST_SIZE, seed=HP_SEED,
                        sampling=sampling_size, prep_params=prep_params)

# set predefined parameters, keyed by the estimator names registered on set_trainer
# these parameters were found by grid search on 10% of the data.
set_trainer.default.set_predefined_params({
    'logistic': {'C': 1.0, 'max_iter': 50, 'penalty': 'l2'},
    'nb_gaussian': {'var_smoothing': 0},
    'knn': {'n_neighbors': 7, 'p': 2, 'weights': 'distance'},
    'dt': {'criterion': 'entropy', 'max_features': 0.5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'random'},
    'mlp': {'activation': 'relu', 'early_stopping': True, 'hidden_layer_sizes': (128, 128), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'solver': 'adam'},
    'bag': {'max_features': 0.7, 'max_samples': 1.0, 'n_estimators': 20},
    'rf': {'criterion': 'gini', 'max_features': 'sqrt', 'max_samples': 1.0, 'min_samples_leaf': 1, 'min_samples_split': 2},
    'gb': {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 4},
    'ada': {'learning_rate': 0.8, 'n_estimators': 100},
    'lgbm': {'boosting_type': 'gbdt', 'colsample_bytree': 0.8, 'learning_rate': 0.15, 'n_estimators': 200, 'subsample': 0.8},
})

# configure param strategy (otherwise uses set_trainer's default)
set_trainer.default.use_param_strategy(PARAM_STRATEGY.PREDEFINED)

### PCA Trainer

In [None]:
# generate preprocessor parameters
prep_params = Preprocessor.params({
    'outlier_strategy': 'all',
    'encode_labels': True,
    # presumably the explained-variance target for PCA — confirm in Preprocessor
    'pca': 0.95,
})

# set sampling size (the function object itself is passed on; it is not called here)
sampling_size = default_sampling_size

# add trainer; it is afterwards reachable as `set_trainer.pca`
set_trainer.add_trainer(name='pca', data=data, target=TARGET, scoring=HP_SCORING,
                        n_splits=HP_CV_SPLITS, test_size=HP_TEST_SIZE, seed=HP_SEED,
                        sampling=sampling_size, prep_params=prep_params)

# set predefined parameters, keyed by the estimator names registered on set_trainer
# these parameters were found by grid search on 10% of the data.
set_trainer.pca.set_predefined_params({
    'logistic': {'C': 0.8, 'max_iter': 50, 'penalty': 'l2'},
    'nb_gaussian': {'var_smoothing': 0},
    'knn': {'n_neighbors': 7, 'p': 2, 'weights': 'distance'},
    'dt': {'criterion': 'entropy', 'max_features': 0.8, 'min_samples_leaf': 1, 'min_samples_split': 2, 'splitter': 'best'},
    'mlp': {'activation': 'relu', 'early_stopping': True, 'hidden_layer_sizes': (128, 128), 'learning_rate': 'adaptive', 'learning_rate_init': 0.001, 'solver': 'adam'},
    'bag': {'max_features': 0.7, 'max_samples': 1.0, 'n_estimators': 20},
    'rf': {'criterion': 'gini', 'max_features': 'sqrt', 'max_samples': 1.0, 'min_samples_leaf': 1, 'min_samples_split': 2},
    'gb': {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2},
    'ada': {'learning_rate': 0.8, 'n_estimators': 50},
    'lgbm': {'boosting_type': 'gbdt', 'colsample_bytree': 1.0, 'learning_rate': 0.15, 'n_estimators': 200, 'subsample': 0.8},
})

# configure param strategy (otherwise uses set_trainer's default)
set_trainer.pca.use_param_strategy(PARAM_STRATEGY.PREDEFINED)

### Training

In [None]:
# training options, unpacked as keyword arguments into run_all_trainers
options = {
    # names of the trainers to run
    'trainers': ['default', 'pca'],
    
    # names of the estimators to train with each trainer
    'estimators': ['logistic', 'nb_gaussian', 'knn'],
    # further options, currently disabled:
    #'reset': False,
    #'seed': None,
    #'save': False,
}

# train generator: yields one (trainer_name, trainer, model_name, model) tuple at a time
train_gen = set_trainer.run_all_trainers(**options)

# train all 
for (trainer_name, trainer, model_name, model) in train_gen:
    # save all scores after every yielded model; the result is bound to `_`,
    # presumably to keep the returned value out of the cell output
    _ = set_trainer.save_scores()
    
    # Get best params if doing grid search.
    if trainer.param_strategy == PARAM_STRATEGY.GRID_SEARCH:
        print_dim(f"\nbest params: {model.best_result['params']}")
    
    # Show stats.
    # NOTE(review): this bare expression relies on the notebook's
    # ast_node_interactivity = "all" setting to be displayed from inside the loop.
    print()
    trainer.get_scores_df(model_name).head()
    print()

    # Show predicts.
    #trainer.get_results_df(model_name, ascending=True).head()

In [None]:
# Save the scores once more after the training loop above; as a bare
# expression, the returned value is also shown as the cell output.
set_trainer.save_scores()