# Introduction

In this notebook, I'll be demonstrating decision trees and random forests.

If you haven't already, please refer to `01-data-exploration.ipynb`, as that notebook describes most of the data loading and pre-processing steps that we'll perform at the beginning of this notebook.

Links of interest:
- [Scikit-Learn: Decision Trees](https://scikit-learn.org/stable/modules/tree.html)
- [Scikit-Learn: Bagging Meta Estimator](https://scikit-learn.org/stable/modules/ensemble.html#bagging-meta-estimator)
- [Scikit-Learn: Forests of Randomized Trees](https://scikit-learn.org/stable/modules/ensemble.html#forests-of-randomized-trees)

# Imports, Data Access / Loading, and Pre-processing

In [None]:
import os
import pandas as pd
import numpy as np

# We use two different plotting libraries, depending on which kind of plot we want
import matplotlib.pyplot as plt
import seaborn as sns

# Show floats in Pandas output with thousands separators and two decimal
# places (this is a display option only; the stored values are not rounded)
pd.options.display.float_format = '{:,.2f}'.format

# Silence every Python warning for the rest of the session.
# NOTE: this also hides warnings raised by scikit-learn, pandas and matplotlib,
# so comment it out when debugging.
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.decomposition import PCA
from sklearn import metrics

In [None]:
# Mount Google Drive inside the Colab runtime so files under it can be read
from google.colab import drive

gdrive_mount_point = '/content/gdrive'
drive.mount(gdrive_mount_point)

In [None]:
# Path of the dataset CSV on the mounted Google Drive.
# NOTE: despite its name, `data_dir` holds the path of the file itself.
data_dir = '/content/gdrive/My Drive/classes/be432-2021/notebooks/wisconsin_breast_cancer_data.csv'

# Read the CSV into a Pandas dataframe
df = pd.read_csv(data_dir)

In [None]:
# The categorical diagnosis column that will be turned into integer labels
diagnosis_cat = df['diagnosis']

# Fit the encoder to the categories, and immediately transform them into
# their integer codes
label_encoder = LabelEncoder()
diagnosis_lab = label_encoder.fit_transform(diagnosis_cat)

# Store the integer labels next to the original column in the dataframe
df['diagnosis_label'] = diagnosis_lab

In [None]:
# Display the dataframe, which now includes the diagnosis_label column added in the previous cell
df

In [None]:
# Create the splitting object: a single stratified 70/30 train/test split.
# The fixed random_state makes the split reproducible across runs.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=25)

# Stratify on the "diagnosis" column so both subsets keep the same class
# balance. n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(df, df["diagnosis"]))

# split() yields *positional* row indices, so select with .iloc rather than
# .loc: .loc is label-based and only agrees with positions while df still has
# its default 0..n-1 index.
train_set = df.iloc[train_index]
test_set = df.iloc[test_index]

In [None]:
# Columns that are identifiers or targets rather than input features
non_feature_columns = ['id', 'diagnosis', 'diagnosis_label']

# Feature tables: every column except the identifier / target columns
training_values = train_set.drop(columns=non_feature_columns)
testing_values = test_set.drop(columns=non_feature_columns)

# Targets are kept as single-column dataframes (note the double brackets)
training_labels = train_set[['diagnosis_label']].copy()
testing_labels = test_set[['diagnosis_label']].copy()

In [None]:
# Split the training features by class for easier plotting.
# NOTE(review): assumes the encoder mapped malignant -> 1 and benign -> 0;
# confirm against label_encoder.classes_
is_malignant = training_labels['diagnosis_label'] == 1
is_benign = training_labels['diagnosis_label'] == 0

malignant = training_values.loc[is_malignant, :]
benign = training_values.loc[is_benign, :]

# Decision Trees

## Training

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# Fix random_state so the fitted tree (and every metric derived from it) is
# identical on each Restart & Run All. The value matches the seed already
# used for the train/test split.
tree_clf = DecisionTreeClassifier(random_state=25)

# training_labels is a single-column dataframe; flatten it to the 1-D target
# vector that scikit-learn classifiers expect
tree_clf.fit(training_values, np.ravel(training_labels))

# Make predictions on the held-out test set
tree_predictions = tree_clf.predict(testing_values)

In [None]:
# Inspect the class labels predicted for the test set
tree_predictions

## Evaluation

In [None]:
# Per-class precision / recall / F1 report for the decision tree
class_names = ['Benign', 'Malignant']

print("=" * 55)
print("Decision Trees")
print("-" * 55)
tree_report = metrics.classification_report(
    testing_labels, tree_predictions, target_names=class_names
)
print(tree_report)

In [None]:
# Confusion matrix for the decision tree. Rows are the true labels and columns
# the predicted labels; class 1 is reported as the "positive" class.
print("=" * 55)
print("Decision Trees")
print("-" * 55)
tree_matrix = metrics.confusion_matrix(testing_labels, tree_predictions)

tree_counts = [
    ("True Positive", tree_matrix[1, 1]),
    ("True Negative", tree_matrix[0, 0]),
    ("False Positive", tree_matrix[0, 1]),
    ("False Negative", tree_matrix[1, 0]),
]
for outcome, count in tree_counts:
    print(f"{outcome}: {count}")


## Visualization

In [None]:
# Plot the decision surface of a tree trained on a single pair of features.
# (graphviz / pydotplus are imported here for the tree-rendering cells below.)
import graphviz
import pydotplus

plot_colors = "ryb"
plot_step = 0.02

for pairidx, pair in enumerate([[0, 1]]):
    # We only take the two corresponding features
    X = np.array(training_values.iloc[:, pair])
    # Flatten the single-column label dataframe to 1-D so that `y == i` below
    # is a plain per-row mask. With a 2-D (n, 1) array, np.where() returns a
    # (rows, cols) pair and X[idx, 0] would also select sample 0 once per
    # match, drawing that sample under every class.
    y = np.ravel(training_labels)

    # Train (seeded so the plotted surface is the same on every run)
    clf = DecisionTreeClassifier(random_state=25).fit(X, y)

    # Build a dense grid that covers both feature ranges with a margin of 1
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)

    # Predict the class of every grid point and draw the filled regions
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)

    plt.xlabel(training_values.columns[pair[0]])
    plt.ylabel(training_values.columns[pair[1]])

    # Plot the training points, one scatter call (and legend entry) per class.
    # No cmap is passed: it is ignored when `c` is a single colour string.
    for i, color in zip(range(len(label_encoder.classes_)), plot_colors):
        idx = y == i
        plt.scatter(X[idx, 0], X[idx, 1], c=color, label=label_encoder.classes_[i],
                    edgecolor='black', s=15)

plt.suptitle("Decision surface of a decision tree using paired features")
plt.legend(loc='lower right', borderpad=0, handletextpad=0)
plt.axis("tight")
plt.show()

In [None]:
# Export the fitted tree to Graphviz DOT source. Feature and class names are
# passed as plain lists of str: some scikit-learn releases validate these
# arguments as lists and reject a pandas Index / NumPy array.
dot_data = tree.export_graphviz(tree_clf, out_file=None,
                                feature_names=list(training_values.columns),
                                class_names=[str(name) for name in label_encoder.classes_],
                                filled=True, rounded=True,
                                special_characters=True)

# Create the graph from the dot data
pydot_graph = pydotplus.graph_from_dot_data(dot_data)

# Set the output size (the trailing "!" is part of the Graphviz size value)
pydot_graph.set_size('"10,10!"')

# Create the graphviz object from the resized graph's DOT source
gvz_graph = graphviz.Source(pydot_graph.to_string())

# Display
gvz_graph

# Decision Trees: Random Subsamples

Decision trees are sensitive to their training data -- we can see this by randomly splitting the training set into two halves and fitting a separate tree to each.

In [None]:
# Randomly split the training set into two disjoint halves, A and B.
# A seeded generator keeps the split reproducible across runs without
# touching NumPy's global random state.
n_training = len(training_values)
n_half = n_training // 2
split_rng = np.random.default_rng(25)

# Random ordering of the row positions 0 .. n_training-1
idx_split = split_rng.permutation(n_training)

# First half of the shuffled positions -> subset A
training_values_a = training_values.iloc[idx_split[:n_half], :]
training_labels_a = training_labels.iloc[idx_split[:n_half]]

# Remaining positions -> subset B
training_values_b = training_values.iloc[idx_split[n_half:], :]
training_labels_b = training_labels.iloc[idx_split[n_half:]]

In [None]:
# Fit one tree per half of the training set. Both trees use the same
# random_state, so any difference between them comes from the training data
# alone and not from the estimator's own randomness.
# The single-column label dataframes are flattened to 1-D targets.
tree_a = DecisionTreeClassifier(random_state=25)
tree_a.fit(training_values_a, np.ravel(training_labels_a))

tree_b = DecisionTreeClassifier(random_state=25)
tree_b.fit(training_values_b, np.ravel(training_labels_b))

In [None]:
# Export the tree trained on subset A to Graphviz DOT source. Feature and
# class names are passed as plain lists of str: some scikit-learn releases
# validate these arguments as lists and reject a pandas Index / NumPy array.
dot_data = tree.export_graphviz(tree_a, out_file=None,
                                feature_names=list(training_values.columns),
                                class_names=[str(name) for name in label_encoder.classes_],
                                filled=True, rounded=True,
                                special_characters=True)

# Create the graph from the dot data
pydot_graph = pydotplus.graph_from_dot_data(dot_data)

# Set the output size (the trailing "!" is part of the Graphviz size value)
pydot_graph.set_size('"10,10!"')

# Create the graphviz object from the resized graph's DOT source
gvz_graph = graphviz.Source(pydot_graph.to_string())

# Display
gvz_graph

In [None]:
# Export the tree trained on subset B to Graphviz DOT source. Feature and
# class names are passed as plain lists of str: some scikit-learn releases
# validate these arguments as lists and reject a pandas Index / NumPy array.
dot_data = tree.export_graphviz(tree_b, out_file=None,
                                feature_names=list(training_values.columns),
                                class_names=[str(name) for name in label_encoder.classes_],
                                filled=True, rounded=True,
                                special_characters=True)

# Create the graph from the dot data
pydot_graph = pydotplus.graph_from_dot_data(dot_data)

# Set the output size (the trailing "!" is part of the Graphviz size value)
pydot_graph.set_size('"10,10!"')

# Create the graphviz object from the resized graph's DOT source
gvz_graph = graphviz.Source(pydot_graph.to_string())

# Display
gvz_graph

# Random Forests

## Training

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest hyper-parameters: number of features considered at each split and
# number of trees in the ensemble
max_features = 10
n_estimators = 5

# Fix random_state so the forest, and every metric reported for it, is the
# same on each run. The value matches the seed used for the train/test split.
rf_clf = RandomForestClassifier(max_features=max_features, n_estimators=n_estimators,
                                random_state=25)

# training_labels is a single-column dataframe; flatten it to the 1-D target
# scikit-learn expects (a column vector triggers a DataConversionWarning)
rf_clf.fit(training_values, np.ravel(training_labels))

# Make predictions on the held-out test set
rf_predictions = rf_clf.predict(testing_values)

## Evaluation

In [None]:
# Per-class precision / recall / F1 report for the random forest
rf_class_names = ['Benign', 'Malignant']

print("=" * 55)
print(f"Random Forests with {max_features} features and {n_estimators} trees:")
print("-" * 55)
rf_report = metrics.classification_report(
    testing_labels, rf_predictions, target_names=rf_class_names
)
print(rf_report)

In [None]:
# Confusion matrix for the random forest. Rows are the true labels and columns
# the predicted labels; class 1 is reported as the "positive" class.
print("=" * 55)
print(f"Random Forests with {max_features} features and {n_estimators} trees:")
print("-" * 55)
rf_matrix = metrics.confusion_matrix(testing_labels, rf_predictions)

rf_counts = [
    ("True Positive", rf_matrix[1, 1]),
    ("True Negative", rf_matrix[0, 0]),
    ("False Positive", rf_matrix[0, 1]),
    ("False Negative", rf_matrix[1, 0]),
]
for outcome, count in rf_counts:
    print(f"{outcome}: {count}")


In [None]:
# Sweep max_features and n_estimators, printing the test-set classification
# report for every combination.
# NOTE: the loop reuses (and therefore overwrites) max_features, n_estimators,
# rf_clf and rf_predictions from the earlier random-forest cells; re-run those
# cells before re-running their evaluation output.
for max_features in [10, 20, 30]:
    for n_estimators in [5, 10, 30, 50, 100]:
        # Seed every forest so differences between reports reflect the
        # hyper-parameters rather than run-to-run randomness
        rf_clf = RandomForestClassifier(max_features=max_features, n_estimators=n_estimators,
                                        random_state=25)
        # Flatten the single-column label dataframe to a 1-D target
        rf_clf.fit(training_values, np.ravel(training_labels))

        # Make predictions
        rf_predictions = rf_clf.predict(testing_values)
        print(55 * "=")
        print(f"Random Forests with {max_features} features and {n_estimators} trees:")
        print(55 * "-")
        print(metrics.classification_report(testing_labels, rf_predictions, target_names=['Benign', 'Malignant']))