# Stacking & Hyperopt Example

In [1]:
%load_ext watermark
%watermark -p scikit-learn,mlxtend,xgboost

scikit-learn: 1.0
mlxtend     : 0.19.0
xgboost     : 1.5.0



## Dataset

In [2]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import datasets


data = datasets.load_breast_cancer()
X, y = data.data, data.target

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

X_train_sub, X_valid, y_train_sub, y_valid = \
    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)

print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])

Train/Valid/Test sizes: 398 80 171


## Baseline

In [3]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from mlxtend.classifier import StackingCVClassifier
from sklearn.linear_model import LogisticRegression


forest = RandomForestClassifier(n_estimators=100,
                                random_state=123)

boost = XGBClassifier(random_state=123, verbosity=0, use_label_encoder=False)

metaclassifier = LogisticRegression(random_state=123)

sclf = StackingCVClassifier(classifiers=[forest, boost], 
                            meta_classifier=metaclassifier, 
                            random_state=123)


Random forest:

In [4]:
forest.fit(X_train_sub, y_train_sub)
print(f"Training Accuracy: {forest.score(X_train_sub, y_train_sub):0.2f}")
print(f"Validation Accuracy: {forest.score(X_valid, y_valid):0.2f}")
print(f"Test Accuracy: {forest.score(X_test, y_test):0.2f}")

Training Accuracy: 1.00
Validation Accuracy: 0.95
Test Accuracy: 0.96


Gradient boosting:

In [5]:
boost.fit(X_train_sub, y_train_sub)
print(f"Training Accuracy: {boost.score(X_train_sub, y_train_sub):0.2f}")
print(f"Validation Accuracy: {boost.score(X_valid, y_valid):0.2f}")
print(f"Test Accuracy: {boost.score(X_test, y_test):0.2f}")

Training Accuracy: 1.00
Validation Accuracy: 0.97
Test Accuracy: 0.95


Stacking:

In [6]:
sclf.fit(X_train_sub, y_train_sub)
print(f"Training Accuracy: {sclf.score(X_train_sub, y_train_sub):0.2f}")
print(f"Validation Accuracy: {sclf.score(X_valid, y_valid):0.2f}")
print(f"Test Accuracy: {sclf.score(X_test, y_test):0.2f}")

Training Accuracy: 1.00
Validation Accuracy: 0.97
Test Accuracy: 0.95


## Hyperopt

Example showing how to tune hyperparameters of a nested ensemble using Bayesian optimization via hyperopt. 
- More info: 
  - https://hyperopt.github.io

In [7]:
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin
import hyperopt.pyll.stochastic

hyperopt.pyll.stochastic.sample(hp.loguniform('test', 1e-5, 1)) # range e^{low} to e^{high}

1.8708454833799104

In [8]:
hyperopt.pyll.stochastic.sample(hp.qloguniform('test', 1e-5, 1, 0.1)) # rounded to 0.1

2.2

In [9]:
hyperopt.pyll.stochastic.sample(hp.choice('1_n_estimators', [100, 1000]))

1000

In [10]:
from sklearn.model_selection import cross_val_score


params = {'stackingcvclassifier__use_probas': hp.choice('stackingcvclassifier__use_probas', [True, False]),
           'stackingcvclassifier__drop_proba_col': hp.choice('stackingcvclassifier__drop_proba_col', [None, 'last']),
           'stackingcvclassifier__xgbclassifier__reg_alpha': hp.qloguniform('stackingcvclassifier__xgbclassifier__reg_alpha', 1e-5, 1, 0.1),
           'stackingcvclassifier__xgbclassifier__max_depth': hp.choice('stackingcvclassifier__xgbclassifier__max_depth', [2, 4, 6, 8]),
           'stackingcvclassifier__randomforestclassifier__n_estimators': hp.choice('stackingcvclassifier__randomforestclassifier__n_estimators', [100, 1000])
}




def optimization_objective(params):

    #warnings.filterwarnings(action='ignore', category=DeprecationWarning)
    
    forest = RandomForestClassifier(
        n_estimators=params['stackingcvclassifier__randomforestclassifier__n_estimators'],
        random_state=123
    )

    boost = XGBClassifier(
        random_state=123, verbosity=0, use_label_encoder=False,
        reg_alpha=params['stackingcvclassifier__xgbclassifier__reg_alpha'],
        max_depth=params['stackingcvclassifier__xgbclassifier__max_depth']
        
    )

    metaclassifier = LogisticRegression(random_state=123)

    sclf = StackingCVClassifier(
        classifiers=[forest, boost], 
        meta_classifier=metaclassifier, 
        random_state=123,
        use_probas=params['stackingcvclassifier__use_probas'],
        drop_proba_col=params['stackingcvclassifier__drop_proba_col'],
    )
    

    sclf.fit(X_train, y_train)
    
    accuracies = cross_val_score(
        estimator=sclf, X=X_train, y=y_train, cv=10, n_jobs=-1)

    score = accuracies.mean()

    return {'loss':1-score, 'status': STATUS_OK}


In [11]:
trials = Trials()
best = fmin(fn=optimization_objective,
            space=params,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)

100%|████████████| 50/50 [05:10<00:00,  6.21s/trial, best loss: 0.030064102564102635]


In [12]:
best

{'stackingcvclassifier__drop_proba_col': 0,
 'stackingcvclassifier__randomforestclassifier__n_estimators': 1,
 'stackingcvclassifier__use_probas': 1,
 'stackingcvclassifier__xgbclassifier__max_depth': 2,
 'stackingcvclassifier__xgbclassifier__reg_alpha': 1.2000000000000002}

- Attention, `fmin` returns results from `hp.choice` as an index!

In [13]:
from hyperopt import space_eval

best_params = space_eval(params, best)
print(best_params)

{'stackingcvclassifier__drop_proba_col': None, 'stackingcvclassifier__randomforestclassifier__n_estimators': 1000, 'stackingcvclassifier__use_probas': False, 'stackingcvclassifier__xgbclassifier__max_depth': 6, 'stackingcvclassifier__xgbclassifier__reg_alpha': 1.2000000000000002}


In [14]:
forest = RandomForestClassifier(
    n_estimators=best_params['stackingcvclassifier__randomforestclassifier__n_estimators'],
    random_state=123
)

boost = XGBClassifier(
    random_state=123, verbosity=0, use_label_encoder=False,
    reg_alpha=best_params['stackingcvclassifier__xgbclassifier__reg_alpha'],
    max_depth=best_params['stackingcvclassifier__xgbclassifier__max_depth']

)

metaclassifier = LogisticRegression(random_state=123)

sclf = StackingCVClassifier(
    classifiers=[forest, boost], 
    meta_classifier=metaclassifier, 
    random_state=123,
    use_probas=best_params['stackingcvclassifier__use_probas'],
    drop_proba_col=best_params['stackingcvclassifier__drop_proba_col'],
)


sclf.fit(X_train, y_train);

In [15]:
print(f"Training Accuracy: {sclf.score(X_train, y_train):0.2f}")
print(f"Test Accuracy: {sclf.score(X_test, y_test):0.2f}")

Training Accuracy: 1.00
Test Accuracy: 0.95
