## Pre-processing and Visualization

In [1]:
# Import the required libraries.
import copy
import random
import numpy as np
import pandas as pd
from string import ascii_lowercase
from itertools import combinations
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support

In [2]:
# Seed the standard-library and NumPy global random generators with one
# shared value so the stochastic steps below repeat on a fresh kernel.
SEED = 15
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Load the dataset from a CSV path given relative to the working directory.
data = pd.read_csv(filepath_or_buffer = '../ensemble_data.csv')

In [4]:
# Cast the whole frame to strings first, then lowercase each column, so the
# letter lookup applied later only ever sees lowercase text.
data = data.astype(str).apply(lambda column: column.str.lower())

In [5]:
# Build the lookup used to encode categorical values: 'a'..'z' -> 1..26.
letter_map = dict(zip(ascii_lowercase, range(1, len(ascii_lowercase) + 1)))
# '?' gets the reserved code 0 (presumably the dataset's missing-value
# marker - confirm against the data description).
letter_map.update({'?': 0})

In [6]:
# Convert the categorical variables to numeric codes via letter_map.
#
# Series.map turns every value that is not a key of letter_map into NaN, so
# the encoded result is built in a scratch frame and validated before `data`
# is replaced. This leaves `data` untouched when validation fails, and makes
# an accidental re-run of this cell (whose input would already be numeric)
# fail loudly instead of silently turning the whole frame into NaN.
encoded = data.apply(lambda column: column.map(letter_map))
unmapped_columns = encoded.columns[encoded.isna().any()].tolist()
if unmapped_columns:
    raise ValueError(
        "Values missing from letter_map in columns: "
        + ", ".join(map(str, unmapped_columns))
    )
data = encoded

In [7]:
# Show the encoded frame; as the last expression in the cell it is rendered
# as the cell's output.
data

Unnamed: 0,type,cap_shape,cap_surface,cap_color,bruises,odor,gill_attachment,gill_spacing,gill_size,gill_color,...,stalk_surface_below_ring,stalk_color_above_ring,stalk_color_below_ring,veil_type,veil_color,ring_number,ring_type,spore_print_color,population,habitat
0,16,24,19,14,20,16,6,3,14,11,...,19,23,23,16,23,15,16,11,19,21
1,5,24,19,25,20,1,6,3,2,11,...,19,23,23,16,23,15,16,14,14,7
2,5,2,19,23,20,12,6,3,2,14,...,19,23,23,16,23,15,16,14,14,13
3,16,24,25,23,20,16,6,3,14,14,...,19,23,23,16,23,15,16,11,19,21
4,5,24,19,7,6,14,6,23,2,11,...,19,23,23,16,23,15,5,14,1,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,5,11,19,14,6,14,1,3,2,25,...,19,15,15,16,15,15,16,2,3,12
8120,5,24,19,14,6,14,1,3,2,25,...,19,15,15,16,14,15,16,2,22,12
8121,5,6,19,14,6,14,1,3,2,14,...,19,15,15,16,15,15,16,2,3,12
8122,16,11,25,14,6,25,6,3,14,2,...,11,23,23,16,23,15,5,23,22,12


In [8]:
# Split the frame column-wise with one shared mask: the 'type' column is the
# class dataset, every remaining column is an attribute.
is_class_column = data.columns == 'type'
X = data.loc[:, ~is_class_column]
Y = data.loc[:, is_class_column]

In [9]:
# Hold out 40% of the rows for testing; the fixed random_state pins which
# rows land in each part. The frames are passed as plain arrays.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    X.values,
    Y.values,
    test_size = 0.40,
    random_state = 42,
)

In [10]:
# Five shuffled folds for building the out-of-fold Level-1 training set.
# No random_state is passed here, so fold membership relies on the global
# NumPy seed set earlier (per scikit-learn's random_state convention).
kf = KFold(shuffle = True, n_splits = 5)

## Stacking Model

In [11]:
# Learners declared as (display name, unfitted estimator) pairs so a name can
# never drift out of step with its model; the two parallel lists consumed by
# the other cells are derived from the pairs.
base_models = [
    ("K-Nearest Neighbors", KNeighborsClassifier(2)),
    ("Linear SVM", SVC(kernel = "linear", C = 0.025)),
    ("Decision Tree", DecisionTreeClassifier(max_depth = 4)),
    ("Random Forest", RandomForestClassifier(max_depth = 4, n_estimators = 5, max_features = 1)),
    ("Neural Net", MLPClassifier(alpha = 1, max_iter = 100)),
    ("Naive Bayes", GaussianNB()),
    ("Logistic Regression", LogisticRegression(solver = 'liblinear', max_iter = 300)),
]
names, classifiers = (list(group) for group in zip(*base_models))

In [12]:
# Generate the training dataset for the Level-1 classifier.
# Every base model is fitted on the training part of each fold and predicts
# that fold's held-out rows. The predictions (one column per model) and the
# matching labels are written fold after fold into X1_Train / Y1_Train, so
# both arrays share the same fold-ordered row sequence.
X1_Train = np.zeros((X_Train.shape[0], len(names)))
Y1_Train = np.zeros((Y_Train.shape[0], 1))

row_start = 0
for fit_rows, holdout_rows in kf.split(X_Train):
    X_fit, Y_fit = X_Train[fit_rows], Y_Train[fit_rows]
    X_holdout, Y_holdout = X_Train[holdout_rows], Y_Train[holdout_rows]
    row_stop = row_start + X_holdout.shape[0]

    for column, classifier in enumerate(classifiers):
        classifier.fit(X_fit, Y_fit.ravel())
        X1_Train[row_start:row_stop, column] = classifier.predict(X_holdout)

    Y1_Train[row_start:row_stop] = Y_holdout
    row_start = row_stop



In [13]:
# Train the individual base models.
# Independent copies are fitted on the full training split, leaving the
# estimators in `classifiers` as they were.
trained_base_classifiers = [copy.deepcopy(model) for model in classifiers]
base_targets = Y_Train.ravel()
for base_model in trained_base_classifiers:
    base_model.fit(X_Train, base_targets)



In [14]:
# Generate the test dataset for the Level-1 classifier.
# Column k holds the k-th trained base model's predictions on X_Test; the
# Level-1 labels are the original test labels.
Y1_Test = Y_Test
X1_Test = np.zeros((X_Test.shape[0], len(names)))
for column, base_model in enumerate(trained_base_classifiers):
    X1_Test[:, column] = base_model.predict(X_Test)

In [15]:
# Train the individual meta models.
# Each meta model is an independent copy of one of the learner definitions,
# fitted on the Level-1 training set built from the base models' predictions.
trained_meta_classifiers = [copy.deepcopy(model) for model in classifiers]
meta_targets = Y1_Train.ravel()
for meta_model in trained_meta_classifiers:
    meta_model.fit(X1_Train, meta_targets)



In [16]:
# Output the scores of the trained base classifiers.
# Each model predicts the test set exactly once. Accuracy is taken as the
# fraction of matching predictions rather than via classifier.score, which
# would run a second, redundant predict pass over X_Test to obtain the same
# mean accuracy.
print("Base Learners:")
base_true = Y_Test.ravel()  # flatten the (n, 1) label column once
for name, classifier in zip(names, trained_base_classifiers):
    print(name + ":")
    Y_Pred = classifier.predict(X_Test)
    # Macro averaging: unweighted mean of the per-class metrics.
    prf = precision_recall_fscore_support(base_true, Y_Pred, average = 'macro')
    score = np.mean(Y_Pred == base_true)
    print("Accuracy: ", round(score, 4), " Precision: ", round(prf[0], 4), " Recall: ", round(prf[1], 4), " F-Score: ", round(prf[2], 4))

Base Learners:
K-Nearest Neighbors:
Accuracy:  0.9997  Precision:  0.9997  Recall:  0.9997  F-Score:  0.9997
Linear SVM:
Accuracy:  0.9665  Precision:  0.9663  Recall:  0.9668  F-Score:  0.9664
Decision Tree:
Accuracy:  0.9791  Precision:  0.979  Recall:  0.9794  F-Score:  0.9791
Random Forest:
Accuracy:  0.9268  Precision:  0.9288  Recall:  0.9257  F-Score:  0.9265
Neural Net:
Accuracy:  1.0  Precision:  1.0  Recall:  1.0  F-Score:  1.0
Naive Bayes:
Accuracy:  0.8643  Precision:  0.8743  Recall:  0.867  F-Score:  0.8639
Logistic Regression:
Accuracy:  0.9582  Precision:  0.958  Recall:  0.9583  F-Score:  0.9581


In [17]:
# Output the scores of the trained meta classifiers.
# Each meta model predicts the Level-1 test set exactly once. Accuracy is
# taken as the fraction of matching predictions rather than via
# classifier.score, which would run a second, redundant predict pass over
# X1_Test to obtain the same mean accuracy.
print("Meta-Learners:")
meta_true = Y1_Test.ravel()  # flatten the (n, 1) label column once
for name, classifier in zip(names, trained_meta_classifiers):
    print(name + ":")
    Y1_Pred = classifier.predict(X1_Test)
    # Macro averaging: unweighted mean of the per-class metrics.
    prf = precision_recall_fscore_support(meta_true, Y1_Pred, average = 'macro')
    score = np.mean(Y1_Pred == meta_true)
    print("Accuracy: ", round(score, 4), " Precision: ", round(prf[0], 4), " Recall: ", round(prf[1], 4), " F-Score: ", round(prf[2], 4))

Meta-Learners:
K-Nearest Neighbors:
Accuracy:  0.9997  Precision:  0.9997  Recall:  0.9997  F-Score:  0.9997
Linear SVM:
Accuracy:  1.0  Precision:  1.0  Recall:  1.0  F-Score:  1.0
Decision Tree:
Accuracy:  1.0  Precision:  1.0  Recall:  1.0  F-Score:  1.0
Random Forest:
Accuracy:  0.9895  Precision:  0.9894  Recall:  0.9899  F-Score:  0.9895
Neural Net:
Accuracy:  1.0  Precision:  1.0  Recall:  1.0  F-Score:  1.0
Naive Bayes:
Accuracy:  1.0  Precision:  1.0  Recall:  1.0  F-Score:  1.0
Logistic Regression:
Accuracy:  1.0  Precision:  1.0  Recall:  1.0  F-Score:  1.0
