In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn import metrics

np.random.seed(123)

In [17]:
#import and split data
dx, dy = load_breast_cancer(return_X_y=True)
dx_std = StandardScaler().fit_transform(dx)
dx_train, dx_test, dy_train, dy_test = train_test_split(dx_std, dy, test_size=0.2, random_state=100)

#Check the data
print([[x,y] for x, y in zip(dx_train[0:5], dy_train[0:5])])

[[array([-0.30596615,  0.00473593, -0.38550046, -0.36316091, -1.12158738,
       -1.25934446, -1.1061848 , -1.15433606,  0.02694897, -1.10397473,
       -0.0042302 , -0.0051771 , -0.13418261, -0.16230835,  0.15001311,
       -0.94370263, -1.03455602, -1.23751977,  0.16438843, -0.30445597,
       -0.42228058, -0.55810202, -0.50699109, -0.45087495, -1.3268508 ,
       -1.22364683, -1.29697922, -1.57589532, -0.74701944, -1.16682532]), 1], [array([-0.35424856,  2.24104744, -0.39003138, -0.39984984, -1.07675315,
       -0.87368185, -0.33709238, -0.65746658, -0.89673971, -0.81053069,
       -0.6986422 ,  0.25971711, -0.67525107, -0.51719593,  0.45702556,
       -0.22065966,  0.23496007, -0.6774098 , -0.43495016, -0.37594547,
       -0.49268864,  1.63866073, -0.54869126, -0.50079967, -0.42383144,
       -0.58693459, -0.13571466, -0.75639985, -0.85541074, -0.63871258]), 1], [array([ 0.46087218, -0.01620767,  0.62324754,  0.29496443,  1.98834244,
        2.50271372,  2.54364574,  1.94179302,  2

In [18]:
base_classifiers = [('DT', DecisionTreeClassifier(max_depth=4)),
                    ('LR', LogisticRegression(C=0.1, max_iter=1000)),
                    ('RF', RandomForestClassifier(n_estimators=150, max_depth=3)),
                    ('SVM', SVC(C=0.1, gamma=0.01, kernel='linear')),
                    ('KNN', KNeighborsClassifier(n_neighbors=10))
                   ]

In [27]:
for bc in base_classifiers:
    lr = bc[1]
    lr.fit(dx_train, dy_train)
    
    predictions = lr.predict(dx_test)
    print(bc[0]+' cross validation accuracy:', cross_val_score(lr, dx_train, dy_train, cv=5).mean())
    print(bc[0]+' testing accuracy:',metrics.accuracy_score(dy_test, predictions))

DT cross validation accuracy: 0.9296703296703297
DT testing accuracy: 0.956140350877193
LR cross validation accuracy: 0.9780219780219781
LR testing accuracy: 0.9649122807017544
RF cross validation accuracy: 0.9494505494505494
RF testing accuracy: 0.956140350877193
SVM cross validation accuracy: 0.9802197802197803
SVM testing accuracy: 0.9649122807017544
KNN cross validation accuracy: 0.9670329670329669
KNN testing accuracy: 0.9649122807017544


In [29]:
# ensemble learning of Voting
ensemble = VotingClassifier(base_classifiers)
ensemble.fit(dx_train, dy_train)

predictions = ensemble.predict(dx_test)

print('Ensemble cross validation accuracy:', cross_val_score(ensemble, dx_train, dy_train, cv=5).mean())
print('Ensemble testing accuracy:', metrics.accuracy_score(dy_test, predictions))

Ensemble cross validation accuracy: 0.9758241758241757
Ensemble testing accuracy: 0.9649122807017544


In [31]:
# ensemble learning of Bootstrap Aggregation
each_learner_ensemble_size = 3
base_learners = []
base_predictions = []
base_accuracy = []

# train each learner
for bc in base_classifiers:
    for _ in range(each_learner_ensemble_size):
        lr = bc[1]
        
        # resample
        bootstrap_sample_indices = np.random.randint(0,len(dx_train), size = len(dx_train))
        bootstrap_x = dx_train[bootstrap_sample_indices]
        bootstrap_y = dy_train[bootstrap_sample_indices]
    
        lr.fit(bootstrap_x, bootstrap_y)
        base_learners.append(lr)
    
        predictions = lr.predict(dx_test)
        base_predictions.append(predictions)
    
        accuracy = metrics.accuracy_score(dy_test, predictions)
        base_accuracy.append(accuracy)

In [34]:
# ensemble all of the learning by voting
ensemble_predictions = []
for i in range(len(dy_test)):
    
    # count each number of different class
    counts = [0 for _ in range(len(base_learners))]
    for learner_p in base_predictions:
        counts[learner_p[i]] = counts[learner_p[i]] + 1
    final = np.argmax(counts)
    
    ensemble_predictions.append(final)

ensemble_acc = metrics.accuracy_score(dy_test, ensemble_predictions)

In [37]:
print('Base Learner:')
print('-'*50)
for index, acc in enumerate(sorted(base_accuracy)):
    print(f'Learner {index+1}: %.4f' % acc)
print('-'*50)
print(f'Ensemble model: %.4f' % ensemble_acc)

Base Learner:
--------------------------------------------------
Learner 1: 0.9123
Learner 2: 0.9123
Learner 3: 0.9474
Learner 4: 0.9474
Learner 5: 0.9474
Learner 6: 0.9474
Learner 7: 0.9561
Learner 8: 0.9561
Learner 9: 0.9561
Learner 10: 0.9649
Learner 11: 0.9649
Learner 12: 0.9649
Learner 13: 0.9737
Learner 14: 0.9825
Learner 15: 0.9825
--------------------------------------------------
Ensemble model: 0.9649
