In [1]:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

from mlxtend.classifier import StackingCVClassifier, StackingClassifier

from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

import numpy as np

In [2]:
iris = load_iris()
X = iris.data
y = iris.target

In [3]:
RANDOM_SEED = 2

In [4]:
lr_pipe = Pipeline([('prep', StandardScaler()),
                    ('lr', LogisticRegression())])
knn_pipe = Pipeline([('prep', StandardScaler()),
                     ('knn', KNeighborsClassifier())])
rf = RandomForestClassifier(random_state=RANDOM_SEED)

In [5]:
np.random.seed(RANDOM_SEED)
sclf1 = StackingCVClassifier(classifiers=[lr_pipe, knn_pipe, rf], 
                             cv=3,
                             use_features_in_secondary=True, 
#                              use_probas=True,
                             meta_classifier=LogisticRegression(random_state=RANDOM_SEED))

sclf2 = StackingCVClassifier(classifiers=[lr_pipe, knn_pipe, rf], 
                             cv=3,
                             use_features_in_secondary=True, 
#                              use_probas=True,
                             meta_classifier=RandomForestClassifier(random_state=RANDOM_SEED))

In [6]:
sclf1.get_params()

{'meta-logisticregression': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=2, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 'meta-logisticregression__C': 1.0,
 'meta-logisticregression__class_weight': None,
 'meta-logisticregression__dual': False,
 'meta-logisticregression__fit_intercept': True,
 'meta-logisticregression__intercept_scaling': 1,
 'meta-logisticregression__max_iter': 100,
 'meta-logisticregression__multi_class': 'ovr',
 'meta-logisticregression__n_jobs': 1,
 'meta-logisticregression__penalty': 'l2',
 'meta-logisticregression__random_state': 2,
 'meta-logisticregression__solver': 'liblinear',
 'meta-logisticregression__tol': 0.0001,
 'meta-logisticregression__verbose': 0,
 'meta-logisticregression__warm_start': False,
 'pipeline-1': Pipeline(memory=None,
      steps=[('prep', StandardScaler(copy=True, wi

In [7]:
# instantiate pipeline to select model
pipe = Pipeline([('model', LogisticRegression())])

In [8]:
sclf1_search = {'model': Categorical([sclf1]),
                'model__pipeline-1__lr__C': Real(1e-6, 1e6, prior='log-uniform'),
                'model__pipeline-1__lr__penalty': Categorical(['l1', 'l2']),
                'model__pipeline-2__knn__n_neighbors': Integer(1, 20),
                'model__randomforestclassifier__max_features': Integer(1, 4),
                'model__meta-logisticregression__C': Real(1e-6, 1e6, prior='log-uniform'),
                'model__meta-logisticregression__penalty': Categorical(['l1', 'l2'])
               }

sclf2_search = {'model': Categorical([sclf2]),
                'model__pipeline-1__lr__C': Real(1e-6, 1e6, prior='log-uniform'),
                'model__pipeline-1__lr__penalty': Categorical(['l1', 'l2']),
                'model__pipeline-2__knn__n_neighbors': Integer(1, 20),
                'model__randomforestclassifier__max_features': Integer(1, 4),
                'model__meta-randomforestclassifier__max_features': Integer(1, 4)
                 }

In [9]:
opt = BayesSearchCV(pipe, search_spaces=[(sclf1_search, 50),
                                         (sclf2_search, 50)], 
                    n_jobs=-1, random_state=RANDOM_SEED,cv=5)

In [10]:
opt.fit(X, y)



In [11]:
opt.best_estimator_

Pipeline(memory=None,
     steps=[('model', StackingCVClassifier(classifiers=[Pipeline(memory=None,
     steps=[('prep', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1000000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          mu...uffle=True, stratify=True, use_features_in_secondary=True,
           use_probas=False, verbose=0))])

In [12]:
opt.best_params_

{'model': StackingCVClassifier(classifiers=[Pipeline(memory=None,
      steps=[('prep', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lr', LogisticRegression(C=1000000.0, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2..._estimators=10, n_jobs=1,
             oob_score=False, random_state=2, verbose=0, warm_start=False)],
            cv=3,
            meta_classifier=LogisticRegression(C=27768.248741250649, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=2,
           solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
            shuffle=True, stratify=True, use_features_in_secondary=True,
            use_probas=False, verbose=0),
 'model__meta-logisticregression__C': 27768.248741250649,
 'model__meta-logisticregression__penalty': 'l2',
 'model__p

## NOTE cv_results seem to be messed up

The cv_results seem to be a bit funky. One I get an error when I try to just pass in the cv_results to create a dataframe.  Trying the hack from SO that I found doesn't throw an error but results in messed up column values. For example even though the best estimator has a random forest top level meta-estimator, there is a non-null value for the number of leaves for a LightGBM classifier in that row.

In [13]:
import pandas as pd
cv_results = pd.DataFrame.from_dict(opt.cv_results_, orient='index').transpose()

In [14]:
cv_results.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112 entries, 0 to 111
Data columns (total 28 columns):
split0_test_score                                         112 non-null object
split1_test_score                                         112 non-null object
split2_test_score                                         112 non-null object
split3_test_score                                         112 non-null object
split4_test_score                                         112 non-null object
mean_test_score                                           112 non-null object
std_test_score                                            112 non-null object
rank_test_score                                           112 non-null object
split0_train_score                                        112 non-null object
split1_train_score                                        112 non-null object
split2_train_score                                        112 non-null object
split3_train_score                   

In [15]:
pd.options.display.max_columns = None
cv_results.sort_values('mean_test_score', ascending=False)

Unnamed: 0,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_model__meta-logisticregression__C,param_model__meta-logisticregression__penalty,param_model__pipeline-1__lr__C,param_model__pipeline-1__lr__penalty,param_model__pipeline-2__knn__n_neighbors,param_model__randomforestclassifier__max_features,params,param_model__meta-randomforestclassifier__max_features
50,1,1,0.933333,0.966667,1,0.98,0.0266667,1,0.933333,0.958333,0.983333,0.983333,0.966667,0.965,0.0185592,0.0933761,0.00370451,0.00230842,0.000193233,StackingCVClassifier(classifiers=[Pipeline(mem...,27768.2,l2,1e+06,l2,1,1,{'model': StackingCVClassifier(classifiers=[Pi...,4
31,1,1,0.933333,0.933333,1,0.973333,0.0326599,1,0.983333,0.966667,0.966667,1,0.941667,0.971667,0.0194365,0.109185,0.0115463,0.00168471,0.000341298,StackingCVClassifier(classifiers=[Pipeline(mem...,1145.64,l1,1e+06,l2,1,1,{'model': StackingCVClassifier(classifiers=[Pi...,4
2,1,1,0.933333,0.933333,1,0.973333,0.0326599,1,0.975,0.983333,0.983333,0.975,0.975,0.978333,0.00408248,0.0967443,0.000523307,0.00241852,7.52613e-05,StackingCVClassifier(classifiers=[Pipeline(mem...,2654.74,l2,0.85279,l2,3,3,{'model': StackingCVClassifier(classifiers=[Pi...,1
25,1,1,0.933333,0.933333,1,0.973333,0.0326599,1,0.966667,0.958333,0.991667,0.991667,0.966667,0.975,0.0139443,0.0991703,0.0117233,0.00235844,0.000133265,StackingCVClassifier(classifiers=[Pipeline(mem...,132632,l2,8.60158,l2,2,4,{'model': StackingCVClassifier(classifiers=[Pi...,4
7,1,1,0.966667,0.9,1,0.973333,0.038873,1,0.95,0.975,0.975,0.983333,0.975,0.971667,0.0113039,0.0760417,0.0152712,0.00179734,0.000371353,StackingCVClassifier(classifiers=[Pipeline(mem...,3.39233,l1,0.0248948,l2,9,1,{'model': StackingCVClassifier(classifiers=[Pi...,4
53,1,1,0.9,0.933333,1,0.966667,0.0421637,2,0.966667,0.983333,0.991667,1,0.983333,0.985,0.0110554,0.113453,0.0189017,0.00235591,0.000174889,StackingCVClassifier(classifiers=[Pipeline(mem...,184.569,l2,1e+06,l1,20,1,{'model': StackingCVClassifier(classifiers=[Pi...,4
27,1,0.966667,0.933333,0.933333,1,0.966667,0.0298142,3,0.966667,0.983333,0.991667,1,0.975,0.983333,0.0117851,0.152063,0.0428115,0.00251765,9.15194e-05,StackingCVClassifier(classifiers=[Pipeline(mem...,0.218166,l2,1e+06,l1,20,1,{'model': StackingCVClassifier(classifiers=[Pi...,4
48,1,1,0.9,0.933333,1,0.966667,0.0421637,2,0.975,0.966667,0.983333,0.991667,0.966667,0.976667,0.00971825,0.152184,0.0313248,0.00246234,0.000151615,StackingCVClassifier(classifiers=[Pipeline(mem...,4716.71,l2,1e+06,l1,1,1,{'model': StackingCVClassifier(classifiers=[Pi...,4
49,1,1,0.9,0.933333,1,0.966667,0.0421637,2,0.975,0.966667,0.983333,1,0.975,0.98,0.0113039,0.176939,0.0257984,0.0024507,0.000165355,StackingCVClassifier(classifiers=[Pipeline(mem...,68.1023,l1,1e+06,l1,1,1,{'model': StackingCVClassifier(classifiers=[Pi...,1
51,1,0.966667,0.933333,0.933333,1,0.966667,0.0298142,2,0.941667,0.975,0.975,1,0.966667,0.971667,0.0187083,0.136231,0.00387234,0.00231805,0.000119735,StackingCVClassifier(classifiers=[Pipeline(mem...,52.4214,l1,1e+06,l1,1,1,{'model': StackingCVClassifier(classifiers=[Pi...,4


In [16]:
# extract the meta estimator
cv_results['meta_estimator'] = cv_results.param_model.apply(lambda x: str(x.meta_classifier).split('(')[0])

In [17]:
cv_results.head()

Unnamed: 0,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_model__meta-logisticregression__C,param_model__meta-logisticregression__penalty,param_model__pipeline-1__lr__C,param_model__pipeline-1__lr__penalty,param_model__pipeline-2__knn__n_neighbors,param_model__randomforestclassifier__max_features,params,param_model__meta-randomforestclassifier__max_features,meta_estimator
0,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.0,6,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.0,0.122095,0.00628204,0.00238056,0.000352727,StackingCVClassifier(classifiers=[Pipeline(mem...,2.04699e-06,l2,0.167444,l1,7,2,{'model': StackingCVClassifier(classifiers=[Pi...,3,LogisticRegression
1,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.0,6,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.0,0.111792,0.0130548,0.00227761,0.000324271,StackingCVClassifier(classifiers=[Pipeline(mem...,0.00394327,l1,28.4188,l2,4,3,{'model': StackingCVClassifier(classifiers=[Pi...,4,LogisticRegression
2,1.0,1.0,0.933333,0.933333,1.0,0.973333,0.0326599,1,0.975,0.983333,0.983333,0.975,0.975,0.978333,0.00408248,0.0967443,0.000523307,0.00241852,7.52613e-05,StackingCVClassifier(classifiers=[Pipeline(mem...,2654.74,l2,0.85279,l2,3,3,{'model': StackingCVClassifier(classifiers=[Pi...,1,LogisticRegression
3,0.9,1.0,0.9,0.933333,0.8,0.906667,0.0646357,4,0.9,0.958333,0.933333,0.958333,0.966667,0.943333,0.0243812,0.095845,0.00298655,0.0024056,3.0555e-05,StackingCVClassifier(classifiers=[Pipeline(mem...,0.137236,l1,3.35676e-05,l2,5,1,{'model': StackingCVClassifier(classifiers=[Pi...,3,LogisticRegression
4,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.0,6,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,0.0,0.0930199,0.00062262,0.00234542,6.02615e-05,StackingCVClassifier(classifiers=[Pipeline(mem...,0.015773,l1,0.000263584,l2,10,3,{'model': StackingCVClassifier(classifiers=[Pi...,2,LogisticRegression


In [18]:
rf_cv_results = cv_results.loc[cv_results.meta_estimator=='RandomForestClassifier', :]
lr_cv_results = cv_results.loc[cv_results.meta_estimator!='RandomForestClassifier', :]

In [19]:
rf_cv_results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56 entries, 56 to 111
Data columns (total 29 columns):
split0_test_score                                         56 non-null object
split1_test_score                                         56 non-null object
split2_test_score                                         56 non-null object
split3_test_score                                         56 non-null object
split4_test_score                                         56 non-null object
mean_test_score                                           56 non-null object
std_test_score                                            56 non-null object
rank_test_score                                           56 non-null object
split0_train_score                                        56 non-null object
split1_train_score                                        56 non-null object
split2_train_score                                        56 non-null object
split3_train_score                              

In [20]:
lr_cv_results.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56 entries, 0 to 55
Data columns (total 29 columns):
split0_test_score                                         56 non-null object
split1_test_score                                         56 non-null object
split2_test_score                                         56 non-null object
split3_test_score                                         56 non-null object
split4_test_score                                         56 non-null object
mean_test_score                                           56 non-null object
std_test_score                                            56 non-null object
rank_test_score                                           56 non-null object
split0_train_score                                        56 non-null object
split1_train_score                                        56 non-null object
split2_train_score                                        56 non-null object
split3_train_score                                

Looks like the model parameters match the first row above

In [21]:
# extract the max feature values for meta-rf classifier and store them in df
max_feat = rf_cv_results.params.apply(lambda x: x['model__meta-randomforestclassifier__max_features'])
rf_cv_results['param_model__meta-randomforestclassifier__max_features'] = max_feat

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [22]:
rf_cv_results.head()

Unnamed: 0,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model,param_model__meta-logisticregression__C,param_model__meta-logisticregression__penalty,param_model__pipeline-1__lr__C,param_model__pipeline-1__lr__penalty,param_model__pipeline-2__knn__n_neighbors,param_model__randomforestclassifier__max_features,params,param_model__meta-randomforestclassifier__max_features,meta_estimator
56,0.966667,0.966667,0.9,0.933333,1,0.953333,0.0339935,3,0.991667,0.991667,1.0,0.991667,1.0,0.995,0.00408248,0.140893,0.0154289,0.00363278,5.02397e-05,StackingCVClassifier(classifiers=[Pipeline(mem...,,,46947.7,l2,16,4,{'model': StackingCVClassifier(classifiers=[Pi...,3,RandomForestClassifier
57,0.966667,0.966667,0.933333,0.966667,1,0.966667,0.0210819,1,0.991667,0.991667,1.0,0.991667,0.983333,0.991667,0.00527046,0.136802,0.0273076,0.00352283,7.21782e-05,StackingCVClassifier(classifiers=[Pipeline(mem...,,,0.00353923,l2,2,2,{'model': StackingCVClassifier(classifiers=[Pi...,4,RandomForestClassifier
58,0.966667,0.966667,0.933333,0.9,1,0.953333,0.0339935,3,0.983333,0.991667,0.991667,0.991667,0.975,0.986667,0.00666667,0.115806,0.00970817,0.00331521,0.000213492,StackingCVClassifier(classifiers=[Pipeline(mem...,,,0.00469033,l1,7,3,{'model': StackingCVClassifier(classifiers=[Pi...,1,RandomForestClassifier
59,0.966667,0.966667,0.9,0.9,1,0.946667,0.04,8,0.983333,0.966667,0.991667,0.983333,0.983333,0.981667,0.00816497,0.131787,0.00776716,0.00349274,8.15429e-05,StackingCVClassifier(classifiers=[Pipeline(mem...,,,111193.0,l1,1,1,{'model': StackingCVClassifier(classifiers=[Pi...,3,RandomForestClassifier
60,0.966667,0.966667,0.9,0.966667,1,0.96,0.0326599,2,0.975,0.983333,0.991667,0.991667,0.991667,0.986667,0.00666667,0.115576,0.00398703,0.00339971,8.76012e-05,StackingCVClassifier(classifiers=[Pipeline(mem...,,,0.0831697,l2,2,3,{'model': StackingCVClassifier(classifiers=[Pi...,2,RandomForestClassifier
