# STAT451: Machine Learning -- L07: Ensemble Methods Part 1/2

STAT 451: Intro to Machine Learning (Fall 2021)  
Instructor: Sebastian Raschka

In [1]:
# Load the `watermark` IPython extension and report the installed versions of
# the two packages this notebook relies on, so results can be tied to them.
%load_ext watermark
%watermark -p scikit-learn,mlxtend

scikit-learn: 1.0
mlxtend     : 0.19.0



# Stacking

## Dataset

In [2]:
from sklearn import datasets
from sklearn.model_selection import train_test_split

# Load the breast cancer dataset that ships with scikit-learn and pull out
# the feature matrix and the target vector.
data = datasets.load_breast_cancer()
X = data.data
y = data.target

# Split 1: hold out 30% of all examples as the test set. `stratify=y` asks
# for the split to be stratified on the class labels.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=123,
    stratify=y,
)

# Split 2: take 20% of the remaining examples as the validation set; the
# rest becomes the training set. Same seed and stratification as above.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp,
    test_size=0.2,
    random_state=123,
    stratify=y_temp,
)

# Report how many examples ended up in each subset.
print('Train/Valid/Test sizes:', len(y_train), len(y_valid), len(y_test))

Train/Valid/Test sizes: 318 80 171


## MLxtend standard Stacking (prone to overfitting)

In [3]:
from mlxtend.classifier import StackingClassifier
from sklearn import model_selection
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier


# Base (first-level) classifiers. Every estimator that accepts a seed is
# given random_state=123 so that reruns are repeatable.
clf1 = KNeighborsClassifier(n_neighbors=5)
clf2 = RandomForestClassifier(random_state=123)
clf3 = HistGradientBoostingClassifier(random_state=123)
clf4 = AdaBoostClassifier(random_state=123)
clf5 = DecisionTreeClassifier(max_depth=None, random_state=123)

# Meta (second-level) classifier that combines the base classifiers' outputs.
lr = LogisticRegression(random_state=123)

# Plain stacking: no `cv` argument is passed to this ensemble.
base_classifiers = [clf1, clf2, clf3, clf4, clf5]
sclf = StackingClassifier(classifiers=base_classifiers, meta_classifier=lr)
sclf.fit(X_train, y_train)

# Report accuracy on each of the three subsets, in a fixed order.
for template, features, labels in (
    ("Training Accuracy: %0.2f", X_train, y_train),
    ("Validation Accuracy: %0.2f", X_valid, y_valid),
    ("Test Accuracy: %0.2f", X_test, y_test),
):
    print(template % sclf.score(features, labels))

Training Accuracy: 1.00
Validation Accuracy: 0.96
Test Accuracy: 0.98


## MLxtend Stacking + CV

In [4]:
from mlxtend.classifier import StackingCVClassifier


# Fresh, unfitted base classifiers (same configuration as in the plain
# stacking cell) so this cell does not reuse previously fitted objects.
clf1 = KNeighborsClassifier(n_neighbors=5)
clf2 = RandomForestClassifier(random_state=123)
clf3 = HistGradientBoostingClassifier(random_state=123)
clf4 = AdaBoostClassifier(random_state=123)
clf5 = DecisionTreeClassifier(max_depth=None, random_state=123)

# Meta-classifier that is trained on the base classifiers' outputs.
lr = LogisticRegression(random_state=123)

# Stacking with cross-validation: cv=10 folds, seeded for repeatability.
base_classifiers = [clf1, clf2, clf3, clf4, clf5]
sclf = StackingCVClassifier(
    classifiers=base_classifiers,
    meta_classifier=lr,
    cv=10,
    random_state=123,
)
sclf.fit(X_train, y_train)

# Report accuracy on each of the three subsets, in a fixed order.
for template, features, labels in (
    ("Training Accuracy: %0.2f", X_train, y_train),
    ("Validation Accuracy: %0.2f", X_valid, y_valid),
    ("Test Accuracy: %0.2f", X_test, y_test),
):
    print(template % sclf.score(features, labels))

Training Accuracy: 1.00
Validation Accuracy: 0.97
Test Accuracy: 0.98


## Stacking Classifier from scikit-learn (also includes CV)

In [5]:
# NOTE: from here on the name `StackingClassifier` refers to scikit-learn's
# implementation, replacing the mlxtend class imported earlier under the
# same name.
from sklearn.ensemble import StackingClassifier


# Fresh, unfitted base classifiers with the same settings as before.
clf1 = KNeighborsClassifier(n_neighbors=5)
clf2 = RandomForestClassifier(random_state=123)
clf3 = HistGradientBoostingClassifier(random_state=123)
clf4 = AdaBoostClassifier(random_state=123)
clf5 = DecisionTreeClassifier(max_depth=None, random_state=123)

# Final estimator that combines the base classifiers' outputs.
lr = LogisticRegression(random_state=123)

# scikit-learn expects (name, estimator) pairs; an insertion-ordered dict
# keeps the pairs in clf1..clf5 order.
estimators = list({
    'clf1': clf1,
    'clf2': clf2,
    'clf3': clf3,
    'clf4': clf4,
    'clf5': clf5,
}.items())

sclf = StackingClassifier(
    estimators=estimators,
    final_estimator=lr,
    cv=10,
)
sclf.fit(X_train, y_train)

# Report accuracy on each of the three subsets, in a fixed order.
for template, features, labels in (
    ("Training Accuracy: %0.2f", X_train, y_train),
    ("Validation Accuracy: %0.2f", X_valid, y_valid),
    ("Test Accuracy: %0.2f", X_test, y_test),
):
    print(template % sclf.score(features, labels))

Training Accuracy: 1.00
Validation Accuracy: 0.99
Test Accuracy: 0.98


## MLxtend StackingCVClassifier with same behavior as scikit-learn above

In [6]:
# For reference, the scikit-learn StackingClassifier parameter being mirrored:
#   stack_method : {'auto', 'predict_proba', 'decision_function', 'predict'},
#                  default='auto'

from mlxtend.classifier import StackingCVClassifier


# Reuses clf1..clf5 and lr exactly as defined in the preceding
# scikit-learn stacking cell.
# NOTE(review): mlxtend's StackingCVClassifier also has a `shuffle` option
# that is left at its default here; confirm whether its fold assignment
# matches scikit-learn's integer `cv` if an exact fold-for-fold match with
# the scikit-learn result is required.
base_classifiers = [clf1, clf2, clf3, clf4, clf5]
sclf = StackingCVClassifier(
    classifiers=base_classifiers,
    meta_classifier=lr,
    use_probas=True,        # changed vs. the earlier StackingCVClassifier cell
    drop_proba_col='last',  # also new vs. the earlier StackingCVClassifier cell
    # use_features_in_secondary=True,  (optional; left disabled)
    cv=10,
    random_state=123,
)
sclf.fit(X_train, y_train)

# Report accuracy on each of the three subsets, in a fixed order.
for template, features, labels in (
    ("Training Accuracy: %0.2f", X_train, y_train),
    ("Validation Accuracy: %0.2f", X_valid, y_valid),
    ("Test Accuracy: %0.2f", X_test, y_test),
):
    print(template % sclf.score(features, labels))

Training Accuracy: 1.00
Validation Accuracy: 0.99
Test Accuracy: 0.98
