# Introduction

In this notebook, I'll be demonstrating ensemble methods.

We will start with bagging, and then show an example of boosting using AdaBoost (Adaptive Boosting).

If you haven't already, please refer to `01-data-exploration.ipynb`, as that notebook describes most of the data loading and pre-processing steps that we'll perform at the beginning of this notebook.

Links of interest:
- [Scikit-Learn: Decision Trees](https://scikit-learn.org/stable/modules/tree.html)
- [Scikit-Learn: Bagging Meta Estimator](https://scikit-learn.org/stable/modules/ensemble.html#bagging-meta-estimator)
- [Scikit-Learn: Forests of Randomized Trees](https://scikit-learn.org/stable/modules/ensemble.html#forests-of-randomized-trees)
- [Scikit-Learn: Ensemble Methods: AdaBoost](https://scikit-learn.org/stable/modules/ensemble.html#adaboost)

# Imports, Data Access / Loading, and Pre-processing

In [None]:
import os
import pandas as pd
import numpy as np

# We use two different plotting libraries, depending on which kind of plot we want
import matplotlib.pyplot as plt
import seaborn as sns

# Render floats in Pandas output with two decimal places and a thousands separator
pd.options.display.float_format = '{:,.2f}'.format

# Turn off all warnings for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation and
# data-conversion warnings are hidden as well.
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn import metrics

In [None]:
# Mount Google Drive under /content/gdrive so files stored there can be read.
# This relies on the google.colab package, i.e. it is meant to run inside Colab.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Load the dataset into a Pandas dataframe.
# NOTE: despite its name, data_dir holds the full path of the CSV file itself;
# os.path.join receives a single argument here, so it returns that path unchanged.
data_dir = os.path.join('/content/gdrive/My Drive/classes/be432-2021/notebooks/wisconsin_breast_cancer_data.csv')
df = pd.read_csv(data_dir)

In [None]:
# Preview the first rows of the freshly loaded dataframe
df.head()

In [None]:
# Pull out the categorical diagnosis column and create an encoder for it
diagnosis_cat = df['diagnosis']
label_encoder = LabelEncoder()

# Learn the category -> integer mapping, then apply it to the same column
label_encoder.fit(diagnosis_cat)
diagnosis_lab = label_encoder.transform(diagnosis_cat)

# Store the integer labels back on the dataframe next to the original column
df['diagnosis_label'] = diagnosis_lab

In [None]:
# Create the splitting object: one stratified shuffle split that holds out 30%
# of the rows for testing, with a fixed seed so the split is repeatable
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=25)

# Stratify on the "diagnosis" column so both sets keep the same class balance.
# With n_splits=1 the generator yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(df, df["diagnosis"]))

# split() returns positional row indices, so rows must be selected by position
# (iloc). Label-based .loc only gives the same rows while df still has its
# default 0..n-1 index.
train_set = df.iloc[train_index]
test_set = df.iloc[test_index]

In [None]:
# Columns that identify a row or carry the target, and so are not input features
non_feature_cols = ['id', 'diagnosis', 'diagnosis_label']

# Training split: feature matrix and a single-column dataframe of encoded labels
training_labels = train_set[['diagnosis_label']].copy()
training_values = train_set.drop(columns=non_feature_cols)

# Testing split: same layout as the training split
testing_labels = test_set[['diagnosis_label']].copy()
testing_values = test_set.drop(columns=non_feature_cols)

In [None]:
# Boolean row masks over the training split, one per encoded class
is_malignant = training_labels['diagnosis_label'] == 1
is_benign = training_labels['diagnosis_label'] == 0

# Per-class feature frames, kept separate for easier plotting
malignant = training_values.loc[is_malignant, :]
benign = training_values.loc[is_benign, :]

# Bagging

## Training

In [None]:
# Import our classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier

# Import the bagging class
from sklearn.ensemble import BaggingClassifier

In [None]:
# Construct individual classifiers as well as the bagged version of each one.
# The decision tree and both bagging ensembles are randomised, so they are
# seeded to keep the reported scores repeatable under Restart & Run All.
tree_clf = DecisionTreeClassifier(random_state=25)
knn_clf = KNeighborsClassifier()

# Each ensemble member is trained on half of the samples and half of the features
tree_bag = BaggingClassifier(tree_clf,
                             max_samples=0.5, max_features=0.5,
                             random_state=25)
knn_bag = BaggingClassifier(knn_clf,
                            max_samples=0.5, max_features=0.5,
                            random_state=25)

# training_labels is a single-column dataframe; scikit-learn expects a 1-D
# target, so hand it the column itself rather than the 2-D frame
train_targets = training_labels['diagnosis_label']

# Train each of the classifiers on the training data
tree_clf.fit(training_values, train_targets)
knn_clf.fit(training_values, train_targets)

tree_bag.fit(training_values, train_targets)
knn_bag.fit(training_values, train_targets)

## Evaluation

In [None]:
# Run every fitted model on the test features, in the same order as before:
# plain tree, plain KNN, bagged tree, bagged KNN
tree_clf_predictions, knn_clf_predictions, tree_bag_predictions, knn_bag_predictions = (
    model.predict(testing_values)
    for model in (tree_clf, knn_clf, tree_bag, knn_bag)
)

In [None]:
# Show the raw predicted class labels from the single decision tree
print(tree_clf_predictions)

In [None]:
# Banner followed by the classification report for the single decision tree
print(55 * "=", "Decision Tree Classifier", 55 * "-", sep="\n")
tree_clf_report = metrics.classification_report(
    testing_labels,
    tree_clf_predictions,
    target_names=['Benign', 'Malignant'],
)
print(tree_clf_report)

In [None]:
# Banner followed by the classification report for the bagged decision trees
print(55 * "=", "Decision Tree Bagging", 55 * "-", sep="\n")
tree_bag_report = metrics.classification_report(
    testing_labels,
    tree_bag_predictions,
    target_names=['Benign', 'Malignant'],
)
print(tree_bag_report)

In [None]:
# Banner followed by the classification report for the single KNN classifier
print(55 * "=", "KNN Classifier", 55 * "-", sep="\n")
knn_clf_report = metrics.classification_report(
    testing_labels,
    knn_clf_predictions,
    target_names=['Benign', 'Malignant'],
)
print(knn_clf_report)

In [None]:
# Banner followed by the classification report for the bagged KNN classifiers
print(55 * "=", "KNN Bagging", 55 * "-", sep="\n")
knn_bag_report = metrics.classification_report(
    testing_labels,
    knn_bag_predictions,
    target_names=['Benign', 'Malignant'],
)
print(knn_bag_report)

# Testing Robustness and Stability

In [None]:
# Number of independent fit/score trials to run for each estimator
n_repeat = 50

# Number of training rows drawn from the training split in each trial
n_train = 200

In [None]:
# Neighbourhood size shared by the plain and the bagged KNN models
knn_k = 3

# (display name, unfitted model) pairs: each base learner next to its bagged version
estimators = [
    ("Tree", DecisionTreeClassifier()),
    ("Bagging(Tree)", BaggingClassifier(DecisionTreeClassifier())),
    ("KNN", KNeighborsClassifier(n_neighbors=knn_k)),
    ("Bagging(KNN)", BaggingClassifier(KNeighborsClassifier(n_neighbors=knn_k))),
]

n_estimators = len(estimators)

In [None]:
# Seeded generator so the random training subsets are repeatable across runs
# (the estimators themselves are left as they were constructed)
rng = np.random.default_rng(25)

# 1-D target array; training_labels itself is a single-column dataframe
trial_targets = training_labels['diagnosis_label'].to_numpy()

# Never request more rows than the training split actually holds
n_available = len(training_labels)
subset_size = min(n_train, n_available)

# Loop over estimators to compare
for name, estimator in estimators:

    # F1 score on the fixed test set, one entry per trial
    y_scores = []

    for _ in range(n_repeat):
        # Draw a random subset of training rows (positions, without replacement)
        training_idx = rng.choice(n_available, size=subset_size, replace=False)

        estimator.fit(training_values.iloc[training_idx, :], trial_targets[training_idx])

        y_predict = estimator.predict(testing_values)
        y_scores.append(metrics.f1_score(testing_labels, y_predict))

    # Print the results: the mean shows typical quality, the spread shows stability
    print(f'F-1 Scores for {name}:')
    print(f'Average Score: {np.mean(y_scores):.2}')
    print(f'STD of Score: {np.std(y_scores):.2}')
    print()

# AdaBoost

In [None]:
# Create and fit an AdaBoosted decision tree
from sklearn.ensemble import AdaBoostClassifier

# 200 boosted depth-1 trees (decision stumps); seeded so the fitted ensemble,
# and therefore the decision-boundary plot, is repeatable
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME",
                         n_estimators=200,
                         random_state=25)

# Train using just the first two feature columns so we can visualize in 2-D
X_train = training_values.iloc[:,:2]
y_train = training_labels.iloc[:]

# y_train stays a single-column dataframe; fit() is given its 1-D values,
# which is the target shape scikit-learn expects
bdt.fit(X_train, y_train.to_numpy().ravel())


In [None]:
# Plot the decision boundaries of the two-feature model
plt.figure(figsize=(10, 10))
plot_step = 0.02

# Pad the observed range of each feature by one unit on both sides
x_min, x_max = X_train.iloc[:, 0].min() - 1, X_train.iloc[:, 0].max() + 1
y_min, y_max = X_train.iloc[:, 1].min() - 1, X_train.iloc[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                     np.arange(y_min, y_max, plot_step))

# Classify every grid point. The grid is wrapped in a dataframe carrying the
# same column names as X_train, so the input has the same feature names as
# the data used for the two-feature fit.
grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=X_train.columns)
Z = bdt.predict(grid)
Z = Z.reshape(xx.shape)
cs = plt.contourf(xx, yy, Z)
plt.axis("tight")

# Plot the training points, one scatter series per class
for i, class_name in zip(range(2), ["Benign", "Malignant"]):
    # Row positions of the training samples whose label equals i
    idx = np.where(y_train == i)[0]
    plt.scatter(X_train.iloc[idx, 0], X_train.iloc[idx, 1],
                label="Class %s" % class_name)
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.legend(loc='upper right', frameon=True)
# NOTE(review): axis labels assume the first two feature columns are the mean
# radius and mean texture; confirm against the column order of the CSV
plt.xlabel('Radius Mean')
plt.ylabel('Texture Mean')
plt.title('Decision Boundary')
plt.show()

# Receiver Operating Characteristic Curves

In [None]:
# Re-train AdaBoost on every feature of the training split (fit returns the
# estimator itself), then get per-class probabilities for the test samples.
# Despite its name, testing_predictions holds probabilities, not class labels.
testing_predictions = (
    bdt
    .fit(training_values, training_labels)
    .predict_proba(testing_values)
)

In [None]:
# Show the predicted class probabilities (the output of predict_proba) for the test samples
print(testing_predictions)

In [None]:
# Compute ROC curve and ROC area for each class (one-vs-rest)
fpr = {}
tpr = {}
roc_auc = {}

# 1-D view of the true labels; testing_labels is a single-column dataframe
true_labels = testing_labels['diagnosis_label']

for i in range(2):
    # Column i of the probabilities is the score for class i, so class i must
    # also be treated as the positive class. Without pos_label the class-0
    # scores would be judged against class 1 as positive, which inverts that
    # curve and its AUC.
    # NOTE(review): assumes probability column i corresponds to label i
    # (i.e. the classifier's classes are [0, 1]) - confirm via bdt.classes_
    fpr[i], tpr[i], _thresholds = metrics.roc_curve(
        true_labels, testing_predictions[:, i], pos_label=i)
    roc_auc[i] = metrics.auc(fpr[i], tpr[i])

In [None]:
# Show the false-positive-rate arrays computed for class 0 and then class 1
for class_id in (0, 1):
    print(f"{fpr[class_id]}")

In [None]:
# Number of points on the class-1 ROC curve
len(fpr[1])

In [None]:
# Plot the ROC curve for class 1 against the chance diagonal
plt.figure()
lw = 2

# The AUC is rounded to two decimals so the legend stays readable
plt.plot(fpr[1], tpr[1], color='darkorange',
         lw=lw, label=f'ROC (AUC = {roc_auc[1]:.2f})')

# Diagonal reference line: the expected curve of a random classifier
plt.plot([0, 1], [0, 1], color='navy',
         lw=lw, linestyle='--', label='Random')

# Small margins so the curve is not hidden by the left and top axes
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ROC) curve')
plt.legend(loc="lower right")
plt.show()