# Stacking & Random Search Example

In [1]:
%load_ext watermark
%watermark -p scikit-learn,mlxtend,xgboost

scikit-learn: 1.0
mlxtend     : 0.19.0
xgboost     : 1.5.0



## Dataset

In [2]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import datasets


data = datasets.load_breast_cancer()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

X_train_sub, X_valid, y_train_sub, y_valid = \
    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)

print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])

Train/Valid/Test sizes: 398 80 171


## Baseline

In [3]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.linear_model import LogisticRegression


forest = RandomForestClassifier(n_estimators=100,
                                random_state=123)

boost = XGBClassifier(random_state=123, verbosity=0, use_label_encoder=False)

metaclassifier = LogisticRegression(random_state=123)

sclf = StackingCVClassifier(classifiers=[forest, boost], 
                            meta_classifier=metaclassifier, 
                            random_state=123)


Random forest:

In [4]:
forest.fit(X_train_sub, y_train_sub)
print(f"Training Accuracy: {forest.score(X_train_sub, y_train_sub):0.2f}")
print(f"Validation Accuracy: {forest.score(X_valid, y_valid):0.2f}")
print(f"Test Accuracy: {forest.score(X_test, y_test):0.2f}")

Training Accuracy: 1.00
Validation Accuracy: 0.95
Test Accuracy: 0.96


Gradient boosting:

In [5]:
boost.fit(X_train_sub, y_train_sub)
print(f"Training Accuracy: {boost.score(X_train_sub, y_train_sub):0.2f}")
print(f"Validation Accuracy: {boost.score(X_valid, y_valid):0.2f}")
print(f"Test Accuracy: {boost.score(X_test, y_test):0.2f}")

Training Accuracy: 1.00
Validation Accuracy: 0.97
Test Accuracy: 0.95


Stacking:

In [6]:
sclf.fit(X_train_sub, y_train_sub)
print(f"Training Accuracy: {sclf.score(X_train_sub, y_train_sub):0.2f}")
print(f"Validation Accuracy: {sclf.score(X_valid, y_valid):0.2f}")
print(f"Test Accuracy: {sclf.score(X_test, y_test):0.2f}")

Training Accuracy: 1.00
Validation Accuracy: 0.97
Test Accuracy: 0.95


## Randomized Search

Example showing how to tune hyperparameters of a nested ensemble using randomized search 
- More info: 
  - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
  - https://scikit-learn.org/stable/modules/grid_search.html#randomized-parameter-search

In [7]:
import numpy as np
import scipy.stats
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import make_pipeline


pipe = make_pipeline(sclf)

params = {
    'stackingcvclassifier__use_probas': [True],
    'stackingcvclassifier__drop_proba_col': [None, 'last'],
    'stackingcvclassifier__xgbclassifier__reg_alpha': scipy.stats.loguniform(1e-5, 1),
    'stackingcvclassifier__xgbclassifier__max_depth': [2, 4, 6, 8],
    'stackingcvclassifier__randomforestclassifier__n_estimators': [100, 1000]
}


search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=params,
    n_iter=50,
    cv=10,
    verbose=2,
    n_jobs=1)

search.fit(X_train, y_train)
search.best_score_

Fitting 10 folds for each of 50 candidates, totalling 500 fits
[CV] END stackingcvclassifier__drop_proba_col=last, stackingcvclassifier__randomforestclassifier__n_estimators=1000, stackingcvclassifier__use_probas=True, stackingcvclassifier__xgbclassifier__max_depth=2, stackingcvclassifier__xgbclassifier__reg_alpha=2.0132350076392567e-05; total time=   2.0s
[CV] END stackingcvclassifier__drop_proba_col=last, stackingcvclassifier__randomforestclassifier__n_estimators=1000, stackingcvclassifier__use_probas=True, stackingcvclassifier__xgbclassifier__max_depth=2, stackingcvclassifier__xgbclassifier__reg_alpha=2.0132350076392567e-05; total time=   2.0s
[CV] END stackingcvclassifier__drop_proba_col=last, stackingcvclassifier__randomforestclassifier__n_estimators=1000, stackingcvclassifier__use_probas=True, stackingcvclassifier__xgbclassifier__max_depth=2, stackingcvclassifier__xgbclassifier__reg_alpha=2.0132350076392567e-05; total time=   2.0s
[CV] END stackingcvclassifier__drop_proba_col=las

0.97

In [8]:
search.best_params_

{'stackingcvclassifier__drop_proba_col': 'last',
 'stackingcvclassifier__randomforestclassifier__n_estimators': 1000,
 'stackingcvclassifier__use_probas': True,
 'stackingcvclassifier__xgbclassifier__max_depth': 2,
 'stackingcvclassifier__xgbclassifier__reg_alpha': 2.0132350076392567e-05}

In [9]:
print(f"Training Accuracy: {search.best_estimator_.score(X_train, y_train):0.2f}")
print(f"Test Accuracy: {search.best_estimator_.score(X_test, y_test):0.2f}")

Training Accuracy: 1.00
Test Accuracy: 0.96
