<p><b>
    UFRN<br>
    Brain Institute<br>
    Computational Neurophysiology Lab<br>
    </b></p>
<p><i>Rodrigo Santiago</i></p>
<p> Natal, 2021 </p>
<br>
<br>
<p>DIM0872 — Machine Learning</p>

<h1>Check Point 5 — Ensemble Methods</h1>

<p>Python version:</p>

In [1]:
import sys
print(sys.version)

3.6.13 (default, Feb 20 2021, 21:42:50) 
[GCC 5.4.0 20160609]


<p>Versions of scientific modules:</p>

In [2]:
!pip3 freeze | grep numpy
!pip3 freeze | grep scipy
!pip3 freeze | grep scikit-learn

numpy==1.18.2
scipy==1.4.1
scikit-learn==0.22.2.post1


<h2>Loading modules and functions</h2>

In [3]:
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import minmax_scale
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier,AdaBoostClassifier,StackingClassifier,RandomForestClassifier
from sklearn.model_selection import KFold,StratifiedKFold
# from scipy.stats import sem, iqr, shapiro
from time import time
import matplotlib.pyplot as plt
%matplotlib inline

<h2>Loading database</h2>

In [4]:
def _load_pickle(path):
    """Return the object pickled in `path`, closing the file handle afterwards."""
    # NOTE: pickle.load can execute arbitrary code stored in the file;
    # only load files that come from a trusted source.
    with open(path, "rb") as handle:
        return pickle.load(handle)

BaseOriginal          = _load_pickle("BaseOriginal.txt")
BaseReduzida1         = _load_pickle("BaseReduzida1.txt")
BaseReduzida2         = _load_pickle("BaseReduzida2.txt")
BaseReduzida3         = _load_pickle("BaseReduzida3.txt")
BaseOriginal_classes  = _load_pickle("BaseOriginal_classes.txt")
BaseReduzida1_classes = _load_pickle("BaseReduzida1_classes.txt")
BaseReduzida2_classes = _load_pickle("BaseReduzida2_classes.txt")
BaseReduzida3_classes = _load_pickle("BaseReduzida3_classes.txt")

In [5]:
X = BaseReduzida3
y = BaseReduzida3_classes

<h2>Classification</h2>

<h3>Number of estimators</h3>

In [6]:
n_leaners_list = [10,15,20]

<h3>Bagging</h3>

<h4>Function</h4>

In [7]:
def baggingAcc(X,y,model,n_leaners):
    """Return the per-fold test accuracies of a bagging ensemble of `model`.

    A BaggingClassifier with `n_leaners` estimators (max_samples=0.7,
    random_state=0) is refitted and scored on every split of a shuffled,
    stratified 10-fold cross-validation. X and y are indexed with the fold
    index arrays, so both must support array indexing.

    Returns a list with one accuracy value per fold.
    """
    ensemble = BaggingClassifier(base_estimator=model,n_estimators=n_leaners,
                                 random_state=0,n_jobs=30,max_samples=0.7)
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

    # fit() returns the fitted ensemble, so fitting and scoring chain per fold
    return [ensemble.fit(X[fit_idx], y[fit_idx]).score(X[eval_idx], y[eval_idx])
            for fit_idx, eval_idx in folds.split(X,y)]

<h4>Decision Tree</h4>

In [8]:
model = DecisionTreeClassifier(random_state=0) # no pruning

# mean cross-validated accuracy of the bagged trees, one value per ensemble size
bag_acc_dt = [np.mean(baggingAcc(X,y,model,n_leaners)) for n_leaners in n_leaners_list]

np.round(bag_acc_dt,2)

array([0.89, 0.86, 0.89])

<h4>k-Nearest Neighbors</h4>

In [9]:
model = KNeighborsClassifier(n_neighbors=1,weights='distance')

# kNN works on distances, so every feature is rescaled to [0, 1] first.
# The selected dataset X is scaled (instead of naming BaseReduzida3 again) so
# that features and labels stay in sync if the dataset selection changes.
# NOTE(review): scaling is done before the cross-validation split, so the test
# folds contribute to the min/max used for scaling — consider scaling per fold.
X_scaled = minmax_scale(X)

# mean cross-validated accuracy of the bagged kNNs, one value per ensemble size
bag_acc_knn = [np.mean(baggingAcc(X_scaled,y,model,n_leaners)) for n_leaners in n_leaners_list]

np.round(bag_acc_knn,2)

array([0.94, 0.98, 0.95])

<h4>Gaussian Naïve Bayes</h4>

In [10]:
model = GaussianNB()

bag_acc_nb = [np.mean(baggingAcc(X,y,model,n_leaners)) for n_leaners in n_leaners_list]

np.round(bag_acc_nb,2)

array([0.82, 0.82, 0.82])

<h4>Multi-Layer Perceptron</h4>

In [11]:
model = MLPClassifier(hidden_layer_sizes=50,learning_rate_init=.001,max_iter=2000,random_state=0)

bag_acc_mlp = [np.mean(baggingAcc(X,y,model,n_leaners)) for n_leaners in n_leaners_list]

np.round(bag_acc_mlp,2)

array([0.93, 0.92, 0.91])

<h4>Compilation</h4>

In [12]:
bag_acc = np.vstack((bag_acc_dt,bag_acc_knn,bag_acc_nb,bag_acc_mlp))
bag_acc

array([[0.89166667, 0.85833333, 0.89166667],
       [0.94166667, 0.975     , 0.95      ],
       [0.825     , 0.825     , 0.825     ],
       [0.93333333, 0.91666667, 0.90833333]])

In [13]:
# Summary table for bagging: one row per base learner (rows of bag_acc), one
# column per ensemble size, a 'Mean (size)' column with each row's mean and a
# final 'Mean (class)' row with each column's mean.
mean_size = list(np.round(np.mean(bag_acc,axis=1),2))
mean_size.append('')  # the 'Mean (class)' row has no row mean

mean_class = np.round(np.mean(bag_acc,axis=0),2)

d = {'  ': ['DT','kNN','GNB','MLP','Mean (class)']}
# column labels come from n_leaners_list instead of being typed in by hand
for col, n_est in enumerate(n_leaners_list):
    d[str(n_est)] = list(np.round(bag_acc[:,col],2)) + [mean_class[col]]
d['Mean (size)'] = mean_size

In [14]:
df_bagging = pd.DataFrame(data=d)
print(df_bagging.to_string(index=False))

                 10    15    20 Mean (size)
           DT  0.89  0.86  0.89        0.88
          kNN  0.94  0.98  0.95        0.96
          GNB  0.82  0.82  0.82        0.82
          MLP  0.93  0.92  0.91        0.92
 Mean (class)  0.90  0.89  0.89            


<h3>AdaBoost</h3>

<h4>Function</h4>

In [15]:
def boostingAcc(X,y,model,n_leaners,algorithm='SAMME.R'):
    """Return the per-fold test accuracies of an AdaBoost ensemble of `model`.

    An AdaBoostClassifier with `n_leaners` estimators, random_state=0 and the
    given boosting `algorithm` is refitted and scored on every split of a
    shuffled, stratified 10-fold cross-validation.

    Returns a list with one accuracy value per fold.
    """
    ensemble = AdaBoostClassifier(base_estimator=model,n_estimators=n_leaners,
                                  random_state=0,algorithm=algorithm)
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

    fold_scores = []
    for fit_idx, eval_idx in folds.split(X,y):
        fitted = ensemble.fit(X[fit_idx], y[fit_idx])
        fold_scores.append(fitted.score(X[eval_idx], y[eval_idx]))
    return fold_scores

<h4>Decision Tree</h4>

In [16]:
model = DecisionTreeClassifier(random_state=0) # no pruning

# mean cross-validated accuracy of the boosted trees, one value per ensemble size
boo_acc_dt = [np.mean(boostingAcc(X,y,model,n_leaners)) for n_leaners in n_leaners_list]

np.round(boo_acc_dt,2)

array([0.82, 0.82, 0.82])

<h4>k-Nearest Neighbors</h4>

In [17]:
model = KNeighborsClassifier(n_neighbors=1,weights='distance')

# scale the selected dataset X so that features and labels stay in sync
X_scaled = minmax_scale(X)

# AdaBoost re-weights the samples at every round, so the base learner's fit()
# must accept sample_weight. kNN does not, and fitting raises ValueError.
# The error is the point of this cell: it is caught and shown so that
# "Restart & Run All" can continue past it.
try:
    boo_acc_knn = [np.mean(boostingAcc(X_scaled,y,model,n_leaners)) for n_leaners in n_leaners_list]
except ValueError as err:
    print(f"ValueError: {err}")
else:
    print(np.round(boo_acc_knn,2))

ValueError: KNeighborsClassifier doesn't support sample_weight.

<h4>Gaussian Naïve Bayes</h4>

In [18]:
model = GaussianNB()

boo_acc_nb = [np.mean(boostingAcc(X,y,model,n_leaners,'SAMME')) for n_leaners in n_leaners_list]

np.round(boo_acc_nb,2)

array([0.8 , 0.82, 0.82])

<h4>Multi-Layer Perceptron</h4>

In [19]:
model = MLPClassifier(hidden_layer_sizes=50,learning_rate_init=.001,max_iter=2000,random_state=0)

# AdaBoost needs a base learner whose fit() accepts sample_weight. MLPClassifier
# does not, and fitting raises ValueError. The error is the point of this
# cell: it is caught and shown so that "Restart & Run All" can continue past it.
try:
    boo_acc_mlp = [np.mean(boostingAcc(X,y,model,n_leaners)) for n_leaners in n_leaners_list]
except ValueError as err:
    print(f"ValueError: {err}")
else:
    print(np.round(boo_acc_mlp,2))

ValueError: MLPClassifier doesn't support sample_weight.

<h4>Compilation</h4>

In [20]:
# only DT and GNB are stacked: kNN and MLP could not be boosted (no sample_weight support)
boo_acc = np.vstack((boo_acc_dt,boo_acc_nb))
boo_acc

array([[0.825, 0.825, 0.825],
       [0.8  , 0.825, 0.825]])

In [21]:
# Summary table for AdaBoost: one row per boosted base learner (rows of
# boo_acc: DT, GNB), one column per ensemble size, a 'Mean (size)' column with
# each row's mean and a final 'Mean (class)' row with each column's mean.
mean_size = list(np.round(np.mean(boo_acc,axis=1),2))
mean_size.append('')  # the 'Mean (class)' row has no row mean

mean_class = np.round(np.mean(boo_acc,axis=0),2)

d = {'  ': ['DT','GNB','Mean (class)']}
# column labels come from n_leaners_list instead of being typed in by hand
for col, n_est in enumerate(n_leaners_list):
    d[str(n_est)] = list(np.round(boo_acc[:,col],2)) + [mean_class[col]]
d['Mean (size)'] = mean_size

In [22]:
df_boosting = pd.DataFrame(data=d)
print(df_boosting.to_string(index=False))

                 10    15    20 Mean (size)
           DT  0.82  0.82  0.82        0.82
          GNB  0.80  0.82  0.82        0.82
 Mean (class)  0.81  0.82  0.82            


<h4>Base learners that support "sample_weight" in the fitting method:</h4>

In [23]:
import inspect
# all_estimators is imported from sklearn.utils: the sklearn.utils.testing
# module is deprecated since scikit-learn 0.22 and removed in later releases.
from sklearn.utils import all_estimators

# print every scikit-learn classifier whose fit() declares a sample_weight argument
for name, clf in all_estimators(type_filter='classifier'):
    if 'sample_weight' in inspect.getfullargspec(clf.fit).args:
        print(name)



AdaBoostClassifier
BaggingClassifier
BernoulliNB
CalibratedClassifierCV
CategoricalNB
ComplementNB
DecisionTreeClassifier
DummyClassifier
ExtraTreeClassifier
ExtraTreesClassifier
GaussianNB
GradientBoostingClassifier
LinearSVC
LogisticRegression
LogisticRegressionCV
MultiOutputClassifier
MultinomialNB
NuSVC
Perceptron
RandomForestClassifier
RidgeClassifier
RidgeClassifierCV
SGDClassifier
SVC
StackingClassifier
VotingClassifier




In [24]:
from sklearn.svm import LinearSVC

model = LinearSVC(random_state=0,max_iter=5000)

# Stored under its own name rather than boo_acc_dt, so the decision-tree
# results that feed the AdaBoost summary table are not overwritten.
boo_acc_svm = [np.mean(boostingAcc(X,y,model,n_leaners,'SAMME')) for n_leaners in n_leaners_list]

np.round(boo_acc_svm,2)

array([0.94, 0.94, 0.94])

In [25]:
from sklearn.linear_model import Perceptron

model = Perceptron(random_state=0)

# Stored under its own name rather than boo_acc_dt, so the decision-tree
# results that feed the AdaBoost summary table are not overwritten.
boo_acc_perceptron = [np.mean(boostingAcc(X,y,model,n_leaners,'SAMME')) for n_leaners in n_leaners_list]

np.round(boo_acc_perceptron,2)

array([0.92, 0.92, 0.92])

In [26]:
# stratified 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# accuracy list
SVM_acc = []

SVM = LinearSVC(random_state=0,max_iter=5000)

for train_index, test_index in kf.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    SVM.fit(X_train, y_train)
    SVM_acc.append(SVM.score(X_test,y_test))

In [27]:
SVM_acc

[1.0, 0.75, 1.0, 1.0, 1.0, 1.0, 1.0, 0.6666666666666666, 1.0, 1.0]

In [28]:
np.mean(SVM_acc)

0.9416666666666667

In [29]:
# stratified 10-fold cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# accuracy list
perceptron_acc = []

perceptron = Perceptron(random_state=0)

for train_index, test_index in kf.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    perceptron.fit(X_train, y_train)
    perceptron_acc.append(perceptron.score(X_test,y_test))

In [30]:
perceptron_acc

[1.0, 1.0, 0.75, 1.0, 1.0, 1.0, 1.0, 0.6666666666666666, 1.0, 1.0]

In [31]:
np.mean(perceptron_acc)

0.9416666666666667

<h3>Homogeneous Stacking</h3>

<h4>Function</h4>

In [32]:
def stackingAcc(X,y,estimators,final_estimator):
    """Return the per-fold test accuracies of a stacking ensemble.

    A StackingClassifier built from the `estimators` list of (name, estimator)
    pairs and the given `final_estimator` is refitted and scored on every
    split of a shuffled, stratified 10-fold cross-validation.

    Returns a list with one accuracy value per fold.
    """
    stack = StackingClassifier(estimators=estimators,final_estimator=final_estimator,n_jobs=30)
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

    def fold_accuracy(fit_idx, eval_idx):
        # train on the fitting part of the split, report accuracy on the rest
        stack.fit(X[fit_idx], y[fit_idx])
        return stack.score(X[eval_idx], y[eval_idx])

    return [fold_accuracy(fit_idx, eval_idx) for fit_idx, eval_idx in folds.split(X,y)]

<h4>Decision Tree</h4>

In [33]:
estimators = [
    ('dt0', DecisionTreeClassifier(random_state=0,ccp_alpha=0.)),
    ('dt1', DecisionTreeClassifier(random_state=4,ccp_alpha=0.05)),
    ('dt2', DecisionTreeClassifier(random_state=0,ccp_alpha=0.1)),
    ('dt3', DecisionTreeClassifier(random_state=0,ccp_alpha=0.15)),
    ('dt4', DecisionTreeClassifier(random_state=0,ccp_alpha=0.2))
]

final_estimator = DecisionTreeClassifier(random_state=4)

sta_acc_dt = np.mean(stackingAcc(X,y,estimators,final_estimator))

np.round(sta_acc_dt,2)

0.85

<h4>k-Nearest Neighbors</h4>

In [34]:
# kNN works on distances, so every feature is rescaled to [0, 1] first.
# The selected dataset X is scaled (instead of naming BaseReduzida3 again) so
# that features and labels stay in sync if the dataset selection changes.
X_scaled = minmax_scale(X)

# base level: five kNN classifiers that differ only in the number of neighbors
estimators = [
    ('knn0', KNeighborsClassifier(n_neighbors=1,weights='distance')),
    ('knn1', KNeighborsClassifier(n_neighbors=2,weights='distance')),
    ('knn2', KNeighborsClassifier(n_neighbors=8,weights='distance')),
    ('knn3', KNeighborsClassifier(n_neighbors=16,weights='distance')),
    ('knn4', KNeighborsClassifier(n_neighbors=22,weights='distance'))
]

# meta level: another kNN combines the base predictions
final_estimator = KNeighborsClassifier(n_neighbors=1,weights='distance')

sta_acc_knn = np.mean(stackingAcc(X_scaled,y,estimators,final_estimator))

np.round(sta_acc_knn,2)

0.95

<h4>Gaussian Naïve Bayes</h4>

In [35]:
estimators = [
    ('nb0', GaussianNB(var_smoothing=1e-1)),
    ('nb1', GaussianNB(var_smoothing=5e-2)),
    ('nb2', GaussianNB(var_smoothing=1e-3)),
    ('nb3', GaussianNB(var_smoothing=1e-5)),
    ('nb4', GaussianNB(var_smoothing=1e-9))
]

final_estimator = GaussianNB()

sta_acc_nb = np.mean(stackingAcc(X,y,estimators,final_estimator))

np.round(sta_acc_nb,2)

0.94

<h4>Multi-Layer Perceptron</h4>

In [36]:
estimators = [
    ('mlp0', MLPClassifier(hidden_layer_sizes=50,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp1', MLPClassifier(hidden_layer_sizes=100,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp2', MLPClassifier(hidden_layer_sizes=150,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp3', MLPClassifier(hidden_layer_sizes=200,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp4', MLPClassifier(hidden_layer_sizes=300,learning_rate_init=.2,max_iter=2000,random_state=0))
]

final_estimator = MLPClassifier(hidden_layer_sizes=500,learning_rate_init=.2,max_iter=2000,random_state=0)

sta_acc_mlp = np.mean(stackingAcc(X,y,estimators,final_estimator))

np.round(sta_acc_mlp,2)

0.97

<h4>Compilation</h4>

In [37]:
sta_acc = np.vstack((sta_acc_dt,sta_acc_knn,sta_acc_nb,sta_acc_mlp))
sta_acc

array([[0.85      ],
       [0.95      ],
       [0.94166667],
       [0.96666667]])

In [38]:
# Summary table for homogeneous stacking: sta_acc holds one accuracy per base
# learner family (rows DT, kNN, GNB, MLP), so the table has a single accuracy
# column plus an overall mean row. No per-size means apply here, so the
# mean_size list copied from the bagging table is not computed.
d = {'  ': ['DT','kNN','GNB','MLP','Mean'],
     'Stacking': list(np.round(sta_acc[:,0],2)) + [np.round(np.mean(sta_acc),2)]
     }

In [39]:
df_sta = pd.DataFrame(data=d)
print(df_sta.to_string(index=False))

       Stacking
   DT      0.85
  kNN      0.95
  GNB      0.94
  MLP      0.97
 Mean      0.93


<h3>Heterogeneous Stacking</h3>

In [40]:
from sklearn.linear_model import LogisticRegression

<h4>MLP and kNN</h4>

In [41]:
# Heterogeneous stack: five MLPs (different hidden layer sizes) together with
# five kNNs (different numbers of neighbors).
estimators = [
    ('mlp0', MLPClassifier(hidden_layer_sizes=50,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp1', MLPClassifier(hidden_layer_sizes=100,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp2', MLPClassifier(hidden_layer_sizes=150,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp3', MLPClassifier(hidden_layer_sizes=200,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp4', MLPClassifier(hidden_layer_sizes=300,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('knn0', KNeighborsClassifier(n_neighbors=1,weights='distance')),
    ('knn1', KNeighborsClassifier(n_neighbors=2,weights='distance')),
    ('knn2', KNeighborsClassifier(n_neighbors=8,weights='distance')),
    ('knn3', KNeighborsClassifier(n_neighbors=16,weights='distance')),
    ('knn4', KNeighborsClassifier(n_neighbors=22,weights='distance'))
]

final_estimator = None # logistic regression (default)

# NOTE(review): the kNN members receive the unscaled X here, whereas the
# homogeneous kNN stack was given min-max scaled features — confirm this is intended.
sta_acc_het_mlp_knn = np.mean(stackingAcc(X,y,estimators,final_estimator))

np.round(sta_acc_het_mlp_knn,2)

0.93

<h4>MLP and DT</h4>

In [42]:
estimators = [
    ('mlp0', MLPClassifier(hidden_layer_sizes=50,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp1', MLPClassifier(hidden_layer_sizes=100,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp2', MLPClassifier(hidden_layer_sizes=150,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp3', MLPClassifier(hidden_layer_sizes=200,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp4', MLPClassifier(hidden_layer_sizes=300,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('dt0', DecisionTreeClassifier(random_state=0,ccp_alpha=0.)),
    ('dt1', DecisionTreeClassifier(random_state=4,ccp_alpha=0.05)),
    ('dt2', DecisionTreeClassifier(random_state=0,ccp_alpha=0.1)),
    ('dt3', DecisionTreeClassifier(random_state=0,ccp_alpha=0.15)),
    ('dt4', DecisionTreeClassifier(random_state=0,ccp_alpha=0.2))
]

final_estimator = None # logistic regression (default)

sta_acc_het_mlp_dt = np.mean(stackingAcc(X,y,estimators,final_estimator))

np.round(sta_acc_het_mlp_dt,2)

0.88

<h4>kNN and DT</h4>

In [43]:
# Heterogeneous stack: five kNNs (different numbers of neighbors) together
# with five decision trees (different ccp_alpha pruning strengths).
estimators = [
    ('knn0', KNeighborsClassifier(n_neighbors=1,weights='distance')),
    ('knn1', KNeighborsClassifier(n_neighbors=2,weights='distance')),
    ('knn2', KNeighborsClassifier(n_neighbors=8,weights='distance')),
    ('knn3', KNeighborsClassifier(n_neighbors=16,weights='distance')),
    ('knn4', KNeighborsClassifier(n_neighbors=22,weights='distance')),
    ('dt0', DecisionTreeClassifier(random_state=0,ccp_alpha=0.)),
    ('dt1', DecisionTreeClassifier(random_state=4,ccp_alpha=0.05)),
    ('dt2', DecisionTreeClassifier(random_state=0,ccp_alpha=0.1)),
    ('dt3', DecisionTreeClassifier(random_state=0,ccp_alpha=0.15)),
    ('dt4', DecisionTreeClassifier(random_state=0,ccp_alpha=0.2))
]

final_estimator = None # logistic regression (default)

# NOTE(review): the kNN members receive the unscaled X here, whereas the
# homogeneous kNN stack was given min-max scaled features — confirm this is intended.
sta_acc_het_knn_dt = np.mean(stackingAcc(X,y,estimators,final_estimator))

np.round(sta_acc_het_knn_dt,2)

0.82

<h4>MLP, kNN and DT</h4>

In [44]:
# Heterogeneous stack: five MLPs, five kNNs and five decision trees, each
# family varying one hyper-parameter (hidden layer size, n_neighbors, ccp_alpha).
estimators = [
    ('mlp0', MLPClassifier(hidden_layer_sizes=50,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp1', MLPClassifier(hidden_layer_sizes=100,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp2', MLPClassifier(hidden_layer_sizes=150,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp3', MLPClassifier(hidden_layer_sizes=200,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('mlp4', MLPClassifier(hidden_layer_sizes=300,learning_rate_init=.2,max_iter=2000,random_state=0)),
    ('knn0', KNeighborsClassifier(n_neighbors=1,weights='distance')),
    ('knn1', KNeighborsClassifier(n_neighbors=2,weights='distance')),
    ('knn2', KNeighborsClassifier(n_neighbors=8,weights='distance')),
    ('knn3', KNeighborsClassifier(n_neighbors=16,weights='distance')),
    ('knn4', KNeighborsClassifier(n_neighbors=22,weights='distance')),
    ('dt0', DecisionTreeClassifier(random_state=0,ccp_alpha=0.)),
    ('dt1', DecisionTreeClassifier(random_state=4,ccp_alpha=0.05)),
    ('dt2', DecisionTreeClassifier(random_state=0,ccp_alpha=0.1)),
    ('dt3', DecisionTreeClassifier(random_state=0,ccp_alpha=0.15)),
    ('dt4', DecisionTreeClassifier(random_state=0,ccp_alpha=0.2))
]

final_estimator = None # logistic regression (default)

# NOTE(review): the kNN members receive the unscaled X here, whereas the
# homogeneous kNN stack was given min-max scaled features — confirm this is intended.
sta_acc_het_mlp_knn_dt = np.mean(stackingAcc(X,y,estimators,final_estimator))

np.round(sta_acc_het_mlp_knn_dt,2)

0.88

<h3>Random Forest</h3>

<h4>Function</h4>

In [45]:
def randomforestAcc(X,y,n_estimators,seed):
    """Return the per-fold test accuracies of a random forest.

    A RandomForestClassifier with `n_estimators` trees and random_state=`seed`
    is refitted and scored on every split of a shuffled, stratified 10-fold
    cross-validation. The fold assignment itself always uses random_state=0,
    so `seed` only affects the forest.

    Returns a list with one accuracy value per fold.
    """
    forest = RandomForestClassifier(n_estimators=n_estimators,n_jobs=30,random_state=seed)
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

    # fit() returns the fitted forest, so fitting and scoring chain per fold
    return [forest.fit(X[fit_idx], y[fit_idx]).score(X[eval_idx], y[eval_idx])
            for fit_idx, eval_idx in folds.split(X,y)]

<h4>Accuracy for 10, 15 and 20 learners</h4>

In [46]:
n_seeds = 30  # each forest size is evaluated with seeds 0 .. n_seeds-1

# flat list of mean CV accuracies, ordered by forest size first and seed second
rf_acc = [np.mean(randomforestAcc(X,y,n_leaners,seed=seed))
          for n_leaners in n_leaners_list
          for seed in np.arange(n_seeds)]

# One row per forest size, averaged over the seeds of that row. The row count
# follows n_leaners_list instead of a hard-coded 3.
np.round(np.array(rf_acc).reshape(len(n_leaners_list),n_seeds).mean(axis=1),2)

array([0.8 , 0.82, 0.83])

<h3>Bagging with bootstrap of the features</h3>

<h4>Function</h4>

In [47]:
def baggingAcc(X,y,model,n_leaners,max_features=0.9,bootstrap_features=True):
    """Return the per-fold test accuracies of a bagging ensemble of `model`.

    NOTE: this redefines (and from here on replaces) the `baggingAcc` helper
    defined earlier in the notebook. Calling it with bootstrap_features=False
    and max_features=1.0 (the BaggingClassifier defaults) gives the same
    configuration as that earlier, sample-only version.

    Parameters
    ----------
    X, y : indexable arrays with the features and the class labels.
    model : base estimator that is bagged.
    n_leaners : number of estimators in the ensemble.
    max_features : number (int) or fraction (float) of features drawn for
        each estimator.
    bootstrap_features : whether the features are drawn with replacement.

    Returns
    -------
    list with one accuracy per fold of a shuffled, stratified 10-fold
    cross-validation.
    """
    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

    bagging = BaggingClassifier(base_estimator=model,n_estimators=n_leaners,
                                bootstrap_features=bootstrap_features,
                                max_features=max_features,
                                random_state=0,n_jobs=30,max_samples=0.7)

    bagging_acc = []
    for train_index, test_index in kf.split(X,y):
        # the ensemble is refitted from scratch on every training fold
        bagging.fit(X[train_index], y[train_index])
        bagging_acc.append(bagging.score(X[test_index], y[test_index]))

    return bagging_acc

<h4>Decision Tree</h4>

In [48]:
model = DecisionTreeClassifier(random_state=0) # no pruning

# Bagging with max_features=0.8, one mean CV accuracy per ensemble size.
# This reuses the name bag_acc_dt, replacing the earlier bagging results.
bag_acc_dt = [np.mean(baggingAcc(X,y,model,n_leaners,max_features=0.8)) for n_leaners in n_leaners_list]

np.round(bag_acc_dt,2)

array([0.92, 0.89, 0.92])

<h4>k-Nearest Neighbors</h4>

In [49]:
model = KNeighborsClassifier(n_neighbors=1,weights='distance')

# kNN works on distances, so every feature is rescaled to [0, 1] first.
# The selected dataset X is scaled (instead of naming BaseReduzida3 again) so
# that features and labels stay in sync if the dataset selection changes.
X_scaled = minmax_scale(X)

# bagging with max_features=0.8, one mean CV accuracy per ensemble size
bag_acc_knn = [np.mean(baggingAcc(X_scaled,y,model,n_leaners,max_features=0.8)) for n_leaners in n_leaners_list]

np.round(bag_acc_knn,2)

array([1.  , 0.95, 0.88])

<h4>Gaussian Naïve Bayes</h4>

In [50]:
model = GaussianNB()

bag_acc_nb = [np.mean(baggingAcc(X,y,model,n_leaners,max_features=0.8)) for n_leaners in n_leaners_list]

np.round(bag_acc_nb,2)

array([0.78, 0.81, 0.83])

<h4>Multi-Layer Perceptron</h4>

In [51]:
model = MLPClassifier(hidden_layer_sizes=50,learning_rate_init=.2,max_iter=2000,random_state=0)

bag_acc_mlp = [np.mean(baggingAcc(X,y,model,n_leaners,max_features=0.8)) for n_leaners in n_leaners_list]

np.round(bag_acc_mlp,2)

array([0.92, 0.84, 0.9 ])

<h4>Compilation</h4>

In [52]:
bag_acc = np.vstack((bag_acc_dt,bag_acc_knn,bag_acc_nb,bag_acc_mlp))
bag_acc

array([[0.91666667, 0.89166667, 0.925     ],
       [1.        , 0.95      , 0.88333333],
       [0.775     , 0.80833333, 0.83333333],
       [0.925     , 0.84166667, 0.9       ]])

In [53]:
# Summary table for bagging with feature bootstrapping: one row per base
# learner (rows of bag_acc), one column per ensemble size, a 'Mean (size)'
# column with each row's mean and a final 'Mean (class)' row with each
# column's mean.
mean_size = list(np.round(np.mean(bag_acc,axis=1),2))
mean_size.append('')  # the 'Mean (class)' row has no row mean

mean_class = np.round(np.mean(bag_acc,axis=0),2)

d = {'  ': ['DT','kNN','GNB','MLP','Mean (class)']}
# column labels come from n_leaners_list instead of being typed in by hand
for col, n_est in enumerate(n_leaners_list):
    d[str(n_est)] = list(np.round(bag_acc[:,col],2)) + [mean_class[col]]
d['Mean (size)'] = mean_size

In [54]:
df_bagging = pd.DataFrame(data=d)
print(df_bagging.to_string(index=False))

                 10    15    20 Mean (size)
           DT  0.92  0.89  0.92        0.91
          kNN  1.00  0.95  0.88        0.94
          GNB  0.78  0.81  0.83        0.81
          MLP  0.92  0.84  0.90        0.89
 Mean (class)  0.90  0.87  0.89            
