## The Framingham Heart Study. Stacking.

### Connecting libraries and scripts

In [1]:
# 1. Core libraries
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_curve, auc
from sklearn.utils import shuffle

import xgboost as xgb
from catboost import CatBoostClassifier

from scipy import stats

import optuna

import pickle

from tqdm import tqdm_notebook

# 2. Constants
# Seed passed as random_state to the shuffle, the train/test splits and the models below.
RAND = 50
# Number of splits for the StratifiedKFold used by the stacking classifier.
N_FOLDS = 5

### Settings

In [2]:
# 1. General settings
# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)

# 2. Warnings
import warnings
from warnings import simplefilter  # NOTE(review): not referenced in the visible cells

# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')
# NOTE(review): the blanket filter above already ignores everything, so this
# message-specific filter looks redundant — confirm before removing.
warnings.filterwarnings('ignore', message=".*The 'nopython' keyword.*")

### Metrics

In [3]:
import get_metrics

### Reduce memory usage

In [4]:
import reduce_mem_usage

### Functions

**Helper that checks a model for overfitting by comparing a metric on the train and test sets.**<br>

In [5]:
def check_overfitting(model, X_train, y_train, X_test, y_test, metric_fun):
    """
    Check a fitted model for overfitting by comparing one metric on train and test.

    Prints the metric value on both sets and the gap between them expressed as
    a percentage of the test value.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_train, y_train : training features and targets.
    X_test, y_test : test features and targets.
    metric_fun : callable ``metric_fun(y_true, y_pred)`` returning a number;
        its ``__name__`` labels the printed lines.

    Returns
    -------
    None
        The report is printed, nothing is returned.
    """
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    value_train = metric_fun(y_train, y_pred_train)
    value_test = metric_fun(y_test, y_pred_test)

    # Pure f-strings: the previous f-string + '%' mix broke on metric names
    # that contain a '%' character.
    print(f'{metric_fun.__name__} train: {value_train:.3f}')
    print(f'{metric_fun.__name__} test: {value_test:.3f}')

    gap = abs(value_train - value_test)
    if value_test == 0:
        # The relative gap is undefined for a zero test metric; report it
        # explicitly instead of raising ZeroDivisionError.
        delta = '0.0' if gap == 0 else 'inf'
        print(f'delta = {delta} %')
    else:
        print(f'delta = {(gap / value_test * 100):.1f} %')

### Data import

In [6]:
# Load the preprocessed dataset; the path is relative to the working directory.
df = pd.read_csv('csv/preprocessed_dataset.csv')

In [7]:
# Preview the first rows to sanity-check the loaded columns.
df.head()

Unnamed: 0,RANDID,TOTCHOL,AGE,SYSBP,DIABP,CIGPDAY,BMI,HEARTRTE,GLUCOSE,HDLC,LDLC,SEX,educ,CVD,AGE_1,AGE_2,AGE_3,CURSMOKE_1,CURSMOKE_2,CURSMOKE_3,DIABETES_1,DIABETES_2,DIABETES_3,BPMEDS_1,BPMEDS_2,BPMEDS_3,PREVCHD_1,PREVCHD_2,PREVCHD_3,PREVAP_1,PREVAP_2,PREVAP_3,PREVMI_1,PREVMI_2,PREVMI_3,PREVSTRK_1,PREVSTRK_2,PREVSTRK_3,PREVHYP_1,PREVHYP_2,PREVHYP_3,CVD_BY_PERIOD_1,CVD_BY_PERIOD_2,CVD_BY_PERIOD_3
0,2448,202.0,45.5,113.5,68.0,0.0,26.97,74.5,84.5,39.5,173.0,0,4.0,1,39,0,52,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,6238,250.0,52.0,108.0,69.5,0.0,28.73,80.0,76.0,47.0,165.0,1,2.0,0,46,52,58,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,9428,264.0,51.0,134.25,84.5,25.0,25.34,75.0,78.5,47.5,175.5,0,1.0,0,48,54,0,1,1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,10552,228.5,64.0,166.5,102.0,25.0,29.38,62.5,96.0,46.5,178.5,1,3.0,1,61,67,0,1,1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0
4,11252,285.0,51.0,130.0,84.0,30.0,23.48,85.0,80.0,48.0,178.0,1,3.0,0,46,51,58,1,1,1,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


### Data processing

**Reducing memory usage.**<br>

In [8]:
# Replace df with the frame returned by the project helper, which prints the
# memory usage before and after.
# NOTE(review): presumably it downcasts column dtypes — the helper source is not visible here.
df = reduce_mem_usage.reduce_mem_usage(df)

Memory usage of dataframe is 1.49 MB
Memory usage after optimization is: 0.38 MB
Decreased by 74.7%


**Dataframe shuffling.**

In [9]:
# Shuffle the rows with a fixed seed so the resulting order is reproducible.
df = shuffle(df, random_state=RAND)

**Train / Test / Validation split.**<br>

In [10]:
# Features: every column except the target and the per-period CVD flags.
# NOTE(review): RANDID (visible in the df.head() preview) stays in X —
# confirm that it is meant to be used as a feature.
X = df.drop(columns=['CVD', 'CVD_BY_PERIOD_1', 'CVD_BY_PERIOD_2', 'CVD_BY_PERIOD_3'])

y = df['CVD']

# Hold out 25 % of the rows as the test set, keeping the class ratio of y.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    stratify=y,  # Stratification.
    random_state=RAND)

# Carve a 15 % validation set out of the training part. It is stratified as
# well, so the class ratio matches the train and test sets.
X_train_, X_val, y_train_, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=0.15,
                                                    shuffle=True,
                                                    stratify=y_train,
                                                    random_state=RAND)

### Stacking

**Define base models.**

In [11]:
# Settings shared by both CatBoost base models. The seed comes from RAND
# instead of a second hard-coded literal, so all models follow the same constant.
cb_common_params = {
    'verbose': False,
    'scale_pos_weight': 2.832324978392394,
    'random_state': RAND,
    'eval_metric': 'AUC',
    'allow_writing_files': False}

# Shared settings plus an explicit tree count and learning rate.
# NOTE(review): presumably the values found by an earlier hyper-parameter search — confirm.
best_params = {
    **cb_common_params,
    'n_estimators': 300,
    'learning_rate': 0.083}

# cb1 does not set n_estimators / learning_rate; cb2 uses the values above.
cb1 = CatBoostClassifier(**cb_common_params)

cb2 = CatBoostClassifier(**best_params)

# NOTE(review): lg is not referenced by the stacking cells visible in this notebook.
lg = LogisticRegression(random_state=RAND, class_weight='balanced')

**Define meta-model.**

In [12]:
# Class-balanced logistic regression, used as the final estimator of the stack.
meta_model = LogisticRegression(random_state=RAND, class_weight='balanced')

In [13]:
# Wrap each CatBoost model in an isotonic calibrator with 3-fold internal CV.
# The model is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional argument works on both.
clf1 = CalibratedClassifierCV(cb1, method='isotonic', cv=3)
clf2 = CalibratedClassifierCV(cb2, method='isotonic', cv=3)
clf3 = CalibratedClassifierCV(cb2, method='isotonic', cv=3)
clf4 = CalibratedClassifierCV(cb2, method='isotonic', cv=3)
clf5 = CalibratedClassifierCV(cb2, method='isotonic', cv=3)

# Each name maps to its own calibrator (cb4 / cb5 previously reused clf3).
# NOTE(review): clf2..clf5 all wrap the same cb2 with identical settings —
# confirm that four copies of one configuration are intended.
estimators = [('cb1', clf1), ('cb2', clf2), ('cb3', clf3), ('cb4', clf4), ('cb5', clf5)]

# Folds used by the stack to produce the base-model predictions the final estimator is trained on.
stf_cv = StratifiedKFold(n_splits=N_FOLDS)

meta = StackingClassifier(
    estimators=estimators,
    cv=stf_cv,
    verbose=False,
    final_estimator=meta_model)

In [14]:
# Fit the stack on the full training split (X_train, not the reduced X_train_).
meta.fit(X_train, y_train)

In [15]:
# Predicted class labels and class probabilities for the held-out test set.
y_pred = meta.predict(X_test)
y_score = meta.predict_proba(X_test)

In [16]:
# Build the metrics table for the stack with the project helper.
# NOTE(review): y_score is the full predict_proba output — confirm the helper
# expects all class columns rather than only the positive-class column.
metrics = get_metrics.get_metrics_classification(y_test,
                                                 y_pred,
                                                 y_score,
                                                 name='Stacking')

# Display the table indexed by model name, rounded to 3 decimals.
round(metrics.set_index('model'), 3)

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Stacking,0.755,0.806,0.523,0.668,0.587,0.52


<div class="alert alert-block alert-info"> 
<b>Comments</b><br>
- Stacking performed worse than CatBoost Random CV from the previous step.<br>
</div>