## The Framingham Heart Study. Tuning. Holdout. Saving.

### Connecting libraries and scripts

In [1]:
# 1. Core libraries
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn.utils import shuffle

import xgboost as xgb
from catboost import CatBoostClassifier

from scipy import stats

import optuna

import pickle

from tqdm import tqdm_notebook

# 2. Constants
RAND = 50
N_FOLDS = 5

### Settings

In [2]:
# 1. General settings
pd.set_option('display.max_columns', 100)

# 2. Warnings
import warnings
from warnings import simplefilter

warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', message=".*The 'nopython' keyword.*")

### Metrics

In [3]:
import get_metrics

### Reduce memory usage

In [4]:
import reduce_mem_usage

### Functions

**Function checks model overfitting.**<br>

In [5]:
def check_overfitting(model, X_train, y_train, X_test, y_test, metric_fun):
    """
    Checkong for overfitiing.
    """
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    value_train = metric_fun(y_train, y_pred_train)
    value_test = metric_fun(y_test, y_pred_test)

    print(f'{metric_fun.__name__} train: %.3f' % value_train)
    print(f'{metric_fun.__name__} test: %.3f' % value_test)
    print(f'delta = {(abs(value_train - value_test)/value_test*100):.1f} %')

### Data import

In [6]:
df = pd.read_csv('csv/preprocessed_dataset.csv')

In [7]:
df.head()

Unnamed: 0,RANDID,TOTCHOL,AGE,SYSBP,DIABP,CIGPDAY,BMI,HEARTRTE,GLUCOSE,HDLC,LDLC,SEX,educ,CVD,AGE_1,AGE_2,AGE_3,CURSMOKE_1,CURSMOKE_2,CURSMOKE_3,DIABETES_1,DIABETES_2,DIABETES_3,BPMEDS_1,BPMEDS_2,BPMEDS_3,PREVCHD_1,PREVCHD_2,PREVCHD_3,PREVAP_1,PREVAP_2,PREVAP_3,PREVMI_1,PREVMI_2,PREVMI_3,PREVSTRK_1,PREVSTRK_2,PREVSTRK_3,PREVHYP_1,PREVHYP_2,PREVHYP_3,CVD_BY_PERIOD_1,CVD_BY_PERIOD_2,CVD_BY_PERIOD_3
0,2448,202.0,45.5,113.5,68.0,0.0,26.97,74.5,84.5,39.5,173.0,0,4.0,1,39,0,52,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,6238,250.0,52.0,108.0,69.5,0.0,28.73,80.0,76.0,47.0,165.0,1,2.0,0,46,52,58,0,0,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,9428,264.0,51.0,134.25,84.5,25.0,25.34,75.0,78.5,47.5,175.5,0,1.0,0,48,54,0,1,1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,10552,228.5,64.0,166.5,102.0,25.0,29.38,62.5,96.0,46.5,178.5,1,3.0,1,61,67,0,1,1,0,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0
4,11252,285.0,51.0,130.0,84.0,30.0,23.48,85.0,80.0,48.0,178.0,1,3.0,0,46,51,58,1,1,1,0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


### Data processing

**Reducing memory usage.**<br>

In [8]:
df = reduce_mem_usage.reduce_mem_usage(df)

Memory usage of dataframe is 1.49 MB
Memory usage after optimization is: 0.38 MB
Decreased by 74.7%


**Dataframe shuffling.**

In [9]:
df = shuffle(df, random_state=RAND)

### Baselines

**Train / Test / Validation split.**<br>

In [10]:
X = df.drop(['CVD', 'CVD_BY_PERIOD_1', 'CVD_BY_PERIOD_2', 'CVD_BY_PERIOD_3'],
        axis=1)

y = df['CVD']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    stratify=y,  # Стратификация.
    random_state=RAND)

X_train_, X_val, y_train_, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=0.15,
                                                    shuffle=True,
                                                    random_state=RAND)

**Standart Scaler.**<br>

In [11]:
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

**Ratio calculation for scale pos weight.**<br>

In [12]:
# Calculate ratio for parameter scale_pos_weight.

# v. 1
sum_positive = sum(y == 1)
sum_negative = sum(y == 0)
ratio = sum_negative / sum_positive

# v. 2
# dtrain = xgb.DMatrix(X_train, label=y_train)
# train_labels = dtrain.get_label()
# ratio = float(np.sum(train_labels == 0)) / np.sum(train_labels == 1)

**Logistic Regression. Baseline.**<br>

In [13]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(random_state=10))
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
y_score = pipeline.predict_proba(X_test)

In [14]:
metrics = get_metrics.get_metrics_classification(y_test,
                                                 y_pred,
                                                 y_score,
                                                 name='Logistic Regression')

round(metrics.set_index('model'), 3)

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Logistic Regression,0.812,0.829,0.779,0.391,0.521,0.405


<div class="alert alert-block alert-info"> 
<b>Comments</b><br>
- We changed RAND = 50, shuffled dataframe and metric is better - 0.829 vs. 0.809 (baseline with RAND = 10).<br>
</div>

**CatBoost. Baseline.**<br>

In [15]:
cb_classifier = CatBoostClassifier(scale_pos_weight=ratio,
                                   eval_metric='AUC',
                                   allow_writing_files=False,
                                   random_state=RAND)

eval_set = [(X_val, y_val)]

cb_classifier.fit(X_train_,
                  y_train_,
                  eval_set=eval_set,
                  early_stopping_rounds=100,
                  verbose=False)

y_pred = cb_classifier.predict(X_test)
y_score = cb_classifier.predict_proba(X_test)

In [16]:
metrics = pd.concat([
    metrics,
    get_metrics.get_metrics_classification(y_test,
                                           y_pred,
                                           y_score,
                                           name='CatBoost')
])

round(metrics.set_index('model'), 3)

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Logistic Regression,0.812,0.829,0.779,0.391,0.521,0.405
CatBoost,0.752,0.815,0.519,0.654,0.579,0.486


<div class="alert alert-block alert-info"> 
<b>Comments</b><br>
- We changed RAND = 50, shuffled dataframe and metric is better - 0.815 vs. 0.802 (baseline with RAND = 10).<br>
</div>

### Tuning

**Logistic Regression. Tuning.**<br>

In [17]:
parameters_grid = {
    'C' : np.logspace(-4, 4, 20), # Regularization strength.
    'penalty' : ['l1', 'l2', 'elasticnet', 'none'], # Regularization type (L1 or L2).
    'solver' : ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'], # Optimization algorithm.
    'max_iter' : [100, 1000, 2500, 5000], # Maximum iterations for convergence.
    'random_state': [RAND]
}

lg_random_tuning = LogisticRegression(random_state=RAND)
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

rand_cv_lg = RandomizedSearchCV(lg_random_tuning,
                                parameters_grid,
                                scoring='roc_auc',
                                cv=cv,
                                verbose=1)

In [18]:
%%time
rand_cv_lg.fit(X_train_std, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
CPU times: user 8.54 s, sys: 1.08 s, total: 9.62 s
Wall time: 5.7 s


In [19]:
print(f'Best parameter values: {rand_cv_lg.best_params_}')
print(f'Best score: {rand_cv_lg.best_score_}')

Best parameter values: {'solver': 'newton-cg', 'random_state': 50, 'penalty': 'l2', 'max_iter': 1000, 'C': 4.281332398719396}
Best score: 0.8120632661858462


In [20]:
lg = LogisticRegression(**rand_cv_lg.best_params_)
lg.fit(X_train_std, y_train)

y_pred = lg.predict(X_test_std)
y_score = lg.predict_proba(X_test_std)

In [21]:
metrics = pd.concat([
    metrics,
    get_metrics.get_metrics_classification(y_test,
                                           y_pred,
                                           y_score,
                                           name='Logistic Regression RandomCV')
])

round(metrics.set_index('model'), 3)

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Logistic Regression,0.812,0.829,0.779,0.391,0.521,0.405
CatBoost,0.752,0.815,0.519,0.654,0.579,0.486
Logistic Regression RandomCV,0.812,0.83,0.779,0.391,0.521,0.405


<div class="alert alert-block alert-info"> 
<b>Comments</b><br>
- Logistic Regression RandomCV metric is better.<br>
</div>

**CatBoost. Tuning.**<br>

In [22]:
parameters_grid = {
    # 'n_estimators': list(range(100, 1100, 100)),
    # 'learning_rate': np.linspace(0.001, 1, 1000),
    'n_estimators': [300],
    'learning_rate': [0.083],
    'eval_metric': ['AUC'],
    'scale_pos_weight': [ratio],
    'allow_writing_files': [False],
    'random_state': [RAND],
    'verbose': [False]
}

cb_random_tuning = CatBoostClassifier(random_state=RAND)
cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)

rand_cv_catboost = RandomizedSearchCV(cb_random_tuning,
                                      parameters_grid,
                                      scoring='roc_auc',
                                      cv=cv,
                                      verbose=False)

In [23]:
%%time
rand_cv_catboost.fit(X_train_, y_train_)

CPU times: user 12.3 s, sys: 1.22 s, total: 13.6 s
Wall time: 2.46 s


In [24]:
print(f'Best parameter values: {rand_cv_catboost.best_params_}')
print(f'Best score: {rand_cv_catboost.best_score_}')

Best parameter values: {'verbose': False, 'scale_pos_weight': 2.832324978392394, 'random_state': 50, 'n_estimators': 300, 'learning_rate': 0.083, 'eval_metric': 'AUC', 'allow_writing_files': False}
Best score: 0.7893652007020727


In [25]:
cb_classifier = CatBoostClassifier(**rand_cv_catboost.best_params_)
cb_classifier.fit(X_train_,
                  y_train_,
                  eval_set=eval_set,
                  early_stopping_rounds=100,
                  verbose=False)

y_pred = cb_classifier.predict(X_test)
y_score = cb_classifier.predict_proba(X_test)

In [26]:
metrics = pd.concat([
    metrics,
    get_metrics.get_metrics_classification(y_test,
                                           y_pred,
                                           y_score,
                                           name='CatBoost RandomCV')
])

round(metrics.set_index('model'), 3)

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Logistic Regression,0.812,0.829,0.779,0.391,0.521,0.405
CatBoost,0.752,0.815,0.519,0.654,0.579,0.486
Logistic Regression RandomCV,0.812,0.83,0.779,0.391,0.521,0.405
CatBoost RandomCV,0.75,0.821,0.515,0.706,0.596,0.502


<div class="alert alert-block alert-info"> 
<b>Comments</b><br>
- First we selected the parameter - n_estimators.<br>
- Secondly, we selected the parameter - learning_rate.<br>
- The metric is better.<br>
</div>

### Holdout

**Logistic Regression.**<br>

In [27]:
finish_test_preds = []
finish_test_preds_proba = []

cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)
cv_predicts_val = np.empty(N_FOLDS)

for idx, (train_idx, test_idx) in enumerate(cv.split(X_train_std, y_train)):
    X_train_, X_val = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_, y_val = y_train.iloc[train_idx], y_train.iloc[test_idx]

    model = LogisticRegression(**rand_cv_lg.best_params_)
    model.fit(X_train_, y_train_)

    # OOF.
    preds_val_proba = model.predict_proba(X_val)[:, 1]
    cv_predicts_val[idx] = get_metrics.roc_auc_score(y_val, preds_val_proba)

    # Holdout.
    preds_test = model.predict(X_test_std)
    preds_test_proba = model.predict_proba(X_test_std)

    finish_test_preds.append(preds_test)
    finish_test_preds_proba.append(preds_test_proba)

    print(f'ID = {idx}, ROC-AUC score: {cv_predicts_val[idx]}.')
    print('---')

print(f'ROC-AUC score mean: {np.mean(cv_predicts_val)}')

ID = 0, ROC-AUC score: 0.8263103682374698.
---
ID = 1, ROC-AUC score: 0.8034155020249549.
---
ID = 2, ROC-AUC score: 0.8409298405786925.
---
ID = 3, ROC-AUC score: 0.7903684383664646.
---
ID = 4, ROC-AUC score: 0.7904154330560647.
---
ROC-AUC score mean: 0.8102879164527295


In [28]:
# Finding mode by class.
test_pred = stats.mode(np.column_stack(finish_test_preds), axis=1)[0]

# Finding the average according to the probabilities.
test_pred_proba = np.mean(finish_test_preds_proba, axis=0)

In [29]:
print(f'ROC-AUC mean OOF: {round(np.mean(cv_predicts_val), 3)}, std: {round(np.std(cv_predicts_val), 3)}')
print('---')
print(f'ROC-AUC HOLDOUT: {round(get_metrics.roc_auc_score(y_test, test_pred_proba[:,1]), 3)}')

ROC-AUC mean OOF: 0.81, std: 0.02
---
ROC-AUC HOLDOUT: 0.793


In [30]:
metrics = pd.concat([
    metrics,
    get_metrics.get_metrics_classification(y_test,
                                           test_pred,
                                           test_pred_proba,
                                           name='Logistic Regression Holdout')
])

round(metrics.set_index('model'), 3)

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Logistic Regression,0.812,0.829,0.779,0.391,0.521,0.405
CatBoost,0.752,0.815,0.519,0.654,0.579,0.486
Logistic Regression RandomCV,0.812,0.83,0.779,0.391,0.521,0.405
CatBoost RandomCV,0.75,0.821,0.515,0.706,0.596,0.502
Logistic Regression Holdout,0.817,0.793,0.922,0.325,0.481,0.781


<div class="alert alert-block alert-info"> 
<b>Comments</b><br>
- The Holdout metric is stable, but worse than Logistic Regression RandomCV	.<br>
</div>

**CatBoost.**<br>

In [31]:
finish_test_preds = []
finish_test_preds_proba = []

cv = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RAND)
cv_predicts_val = np.empty(N_FOLDS)

for idx, (train_idx, test_idx) in enumerate(cv.split(X_train, y_train)):
    X_train_, X_val = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_train_, y_val = y_train.iloc[train_idx], y_train.iloc[test_idx]

    model = CatBoostClassifier(**rand_cv_catboost.best_params_)
    model.fit(X_train_, y_train_)

    # OOF.
    preds_val_proba = model.predict_proba(X_val)[:, 1]
    cv_predicts_val[idx] = get_metrics.roc_auc_score(y_val, preds_val_proba)

    # Holdout.
    preds_test = model.predict(X_test)
    preds_test_proba = model.predict_proba(X_test)

    finish_test_preds.append(preds_test)
    finish_test_preds_proba.append(preds_test_proba)

    print(f'ID = {idx}, ROC-AUC score: {cv_predicts_val[idx]}.')
    print('---')

print(f'ROC-AUC score mean: {np.mean(cv_predicts_val)}')

ID = 0, ROC-AUC score: 0.7951986328627947.
---
ID = 1, ROC-AUC score: 0.769084907648009.
---
ID = 2, ROC-AUC score: 0.8367160615211743.
---
ID = 3, ROC-AUC score: 0.7719935147328352.
---
ID = 4, ROC-AUC score: 0.7447248460923916.
---
ROC-AUC score mean: 0.783543592571441


In [32]:
# Finding mode by class.
test_pred = stats.mode(np.column_stack(finish_test_preds), axis=1)[0]

# Finding the average according to the probabilities.
test_pred_proba = np.mean(finish_test_preds_proba, axis=0)

In [33]:
print(f'ROC-AUC mean OOF: {round(np.mean(cv_predicts_val), 3)}, std: {round(np.std(cv_predicts_val), 3)}')
print('---')
print(f'ROC-AUC HOLDOUT: {round(get_metrics.roc_auc_score(y_test, test_pred_proba[:,1]), 3)}')

ROC-AUC mean OOF: 0.784, std: 0.031
---
ROC-AUC HOLDOUT: 0.802


In [34]:
metrics = pd.concat([
    metrics,
    get_metrics.get_metrics_classification(y_test,
                                           test_pred,
                                           test_pred_proba,
                                           name='CatBoost Holdout')
])

round(metrics.set_index('model'), 3)

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Logistic Regression,0.812,0.829,0.779,0.391,0.521,0.405
CatBoost,0.752,0.815,0.519,0.654,0.579,0.486
Logistic Regression RandomCV,0.812,0.83,0.779,0.391,0.521,0.405
CatBoost RandomCV,0.75,0.821,0.515,0.706,0.596,0.502
Logistic Regression Holdout,0.817,0.793,0.922,0.325,0.481,0.781
CatBoost Holdout,0.783,0.802,0.585,0.571,0.578,0.456


<div class="alert alert-block alert-info"> 
<b>Comments</b><br>
- The Holdout metric is stable, but worse than CatBoost RandomCV.<br>
</div>

In [35]:
error_columns = ['Logloss']

evaluate_metrics = metrics.set_index('model').style.highlight_max(
    subset=['Accuracy','ROC_AUC', 'Precision', 'Recall', 'f1'], color='green', axis=0) \
    .highlight_min(subset=error_columns, color='lightgreen', axis=0)

evaluate_metrics

Unnamed: 0_level_0,Accuracy,ROC_AUC,Precision,Recall,f1,Logloss
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Logistic Regression,0.812444,0.829218,0.77931,0.391003,0.520737,0.404747
CatBoost,0.752029,0.815364,0.519231,0.653979,0.578867,0.485576
Logistic Regression RandomCV,0.812444,0.829838,0.77931,0.391003,0.520737,0.404612
CatBoost RandomCV,0.750225,0.820533,0.515152,0.705882,0.59562,0.501753
Logistic Regression Holdout,0.816952,0.792717,0.921569,0.32526,0.480818,0.780631
CatBoost Holdout,0.782687,0.801667,0.585106,0.570934,0.577933,0.456006


<div class="alert alert-block alert-info"> 
<b>Comments</b><br>
Precision is important when the cost of false positives is high.
In medical scenarios, false positives might lead to unnecessary treatments or interventions,
and precision becomes crucial to minimize such cases.

Recall is important when the cost of false negatives is high.
In medical situations, missing a positive case (false negatives) could have severe consequences,
and recall is essential to ensure that as many positive cases as possible are detected.

CatBoost RandomCV has better Recall metric, so we can detect more people with CVD.

Conclusion: we will choose as the main model - CatBoost RandomCV.
It has worse ROC-AUC than Logistic Regresion RandomCV, but more stable metric on Holdout and Recall is better.
</div>

### Model saving

In [36]:
with open('models/catboost_random_cv.pkl', 'wb') as file:
    pickle.dump(cb_classifier, file)