# Introduction

In this notebook, I'll be illustrating some common data science / machine learning algorithms, and showing how to use them in Python. Most of these algorithms are implemented in the `scikit-learn` package, so that's what we'll be using here.

If you haven't already, please refer to [01-data-exploration.ipynb](01-data-exploration.ipynb), as that notebook describes most of the data loading and pre-processing steps that we'll perform at the beginning of this notebook.

# Imports, Data Access / Loading, and Pre-processing

This is a big chunk of code that basically does what all the code in the previous notebook does:

- Loads the CSV data
- Splits the data into training and testing sets via stratified sampling
- Cleans the data:
  - Transforms the target variable
  - Drops NA values
  - Scales the attributes
- Creates a pipeline to handle the data preprocessing

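For the purposes of this section, we will not do any feature combination or correlation analysis just yet. The only dimensionality reduction we apply is a PCA projection of the scaled attributes onto two components, so that the clusters and classes can be plotted in 2-D.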

## Imports

In [None]:
import os
import pandas as pd
import numpy as np

# We use two different plotting libraries, depending on which kind of plot we want
import matplotlib.pyplot as plt
import seaborn as sns

# Make Pandas display floats with thousands separators and two decimal places
pd.options.display.float_format = '{:,.2f}'.format

# Turn off warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including any raised by the modelling cells below -- consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

## Data Loading

### Google Drive (for Colab)

In [None]:
# Need to get Google Drive access so the dataset stored there can be read.
# This cell belongs to the Colab path (see the section heading); when working
# from local files, use the "Local Files" cell instead.
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Read the dataset from the mounted Google Drive into a Pandas dataframe.
# Despite its name, data_dir holds the full path to the CSV file itself.
data_dir = '/content/gdrive/My Drive/2020-tata-memorial-workshop/wisconsin_breast_cancer_data.csv'
df = pd.read_csv(data_dir)

### Local Files

In [None]:
# Load the dataset from a path relative to the notebook's working directory.
# Running this cell replaces any `df` loaded by the Colab cell above.
data_path = os.path.join('data', 'bca_wisconsin', 'bca_wisconsin.csv')
df = pd.read_csv(data_path)

In [None]:
# Preview the first few rows to confirm the file loaded as expected
df.head()

## Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

# The categorical target column that we want to turn into integers
diagnosis_cat = df['diagnosis']

# Fit the encoder to the categories, and immediately transform them into integer labels
# (later cells treat label 1 as malignant and label 0 as benign)
diagnosis_lab = label_encoder.fit_transform(diagnosis_cat)

# Add the diagnosis label back to the dataframe
df['diagnosis_label'] = diagnosis_lab

## Train/Test Splitting

In [None]:
# Stratified Split
from sklearn.model_selection import StratifiedShuffleSplit

# Create the splitting object: one shuffle, 30% held out for testing, fixed seed
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# Apply the split to the data frame using the "diagnosis" column as our label.
# With n_splits=1 there is exactly one (train, test) pair, so take it directly.
train_index, test_index = next(split.split(df, df["diagnosis"]))

# split() yields *positional* row indices, so select rows with .iloc; .loc would
# only give the same rows while the dataframe keeps its default 0..n-1 index.
train_set = df.iloc[train_index]
test_set = df.iloc[test_index]

In [None]:
# Columns that identify or label a sample rather than describe it
non_feature_columns = ['id', 'diagnosis', 'diagnosis_label']

# Attributes only (model inputs)
training_values = train_set.drop(columns=non_feature_columns)
testing_values = test_set.drop(columns=non_feature_columns)

# Encoded target, kept as single-column dataframes
training_labels = train_set[['diagnosis_label']].copy()
testing_labels = test_set[['diagnosis_label']].copy()

## Data Scaling and PCA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA

# Preprocessing pipeline (step names are generated automatically):
# standardize each attribute, then project onto two principal components
input_pipeline = make_pipeline(StandardScaler(), PCA(n_components=2))

# Fit on the training data only, then reuse the fitted transform on the test data
training_values_transformed = input_pipeline.fit_transform(training_values)
testing_values_transformed = input_pipeline.transform(testing_values)

# Separate out our testing data into classes for easier plotting
testing_diagnosis = testing_labels['diagnosis_label'].to_numpy()
malignant = testing_values_transformed[testing_diagnosis == 1]
benign = testing_values_transformed[testing_diagnosis == 0]

# Clustering

Clustering is for unlabeled data, where you can decide on a label just based on the structure of the data.

## KMeans

In [None]:
from sklearn.cluster import KMeans

# Clustering pipeline - Start to finish.
# K-means starts from random centroids, so fix the seed (same value as the
# train/test split) to keep cluster ids and plots stable across re-runs, and
# pin n_init explicitly because its default differs between scikit-learn versions.
kmeans_pipeline = make_pipeline(KMeans(n_clusters=2, n_init=10, random_state=42))

# Cluster via K-means: learn the centroids on the training set,
# then assign each test sample to its nearest centroid
kmeans_model = kmeans_pipeline.fit(training_values_transformed)
kmeans_testing_prediction = kmeans_model.predict(testing_values_transformed)

In [None]:
# Separate the projected test points by their assigned K-means cluster for easier plotting
cluster_one = testing_values_transformed[kmeans_testing_prediction == 0,:]
cluster_two = testing_values_transformed[kmeans_testing_prediction == 1,:]
# Only relevant if the K-means model above is refit with three clusters
#cluster_three = testing_values_transformed[kmeans_testing_prediction == 2,:]


In [None]:
f, axes = plt.subplots(1, 2, sharey=True, figsize=(10,6))
predicted_ax, truth_ax = axes

# Left panel: test points grouped by the cluster K-means assigned them to
for points, legend_name in ((cluster_one, "Cluster 1"), (cluster_two, "Cluster 2")):
  predicted_ax.scatter(points[:, 0], points[:, 1], alpha=.8, label=legend_name)
predicted_ax.set_title("Cluster Prediction")

# Right panel: the same points grouped by their true diagnosis
for points, legend_name in ((malignant, "Malignant"), (benign, "Benign")):
  truth_ax.scatter(points[:, 0], points[:, 1], alpha=.8, label=legend_name)
truth_ax.set_title("True Class")

# Annotate Plot
for ax in axes:
  ax.set_xlabel(r'$x_{1}$')
  ax.set_ylabel(r'$x_{2}$')
  ax.legend(frameon=True)
  ax.grid(linestyle=':')

plt.tight_layout()
plt.show()

## Mean Shift

In [None]:
from sklearn.cluster import MeanShift

# Cluster via Mean Shift: fit on the training set, then label the test set
meanshift_pipeline = make_pipeline(MeanShift())
meanshift_model = meanshift_pipeline.fit(training_values_transformed)
meanshift_testing_prediction = meanshift_model.predict(testing_values_transformed)

# Distinct cluster ids that actually occur among the test predictions
clusters = np.unique(meanshift_testing_prediction)
nclusters = clusters.size
print("Mean Shift found {} clusters".format(nclusters))

In [None]:
f, axes = plt.subplots(1, 2, sharey=True, figsize=(10,6))
predicted_ax, truth_ax = axes

# Left panel: one scatter series per Mean Shift cluster
for cluster in clusters:
  in_cluster = meanshift_testing_prediction == cluster
  this_cluster = testing_values_transformed[in_cluster, :]
  predicted_ax.scatter(this_cluster[:, 0], this_cluster[:, 1], alpha=.8, label="Cluster {}".format(cluster))
predicted_ax.set_title("Cluster Prediction")

# Right panel: the same points grouped by their true diagnosis
for points, legend_name in ((malignant, "Malignant"), (benign, "Benign")):
  truth_ax.scatter(points[:, 0], points[:, 1], alpha=.8, label=legend_name)
truth_ax.set_title("True Class")

# Annotate Plot
for ax in axes:
  ax.set_xlabel(r'$x_{1}$')
  ax.set_ylabel(r'$x_{2}$')
  ax.legend(frameon=True)
  ax.grid(linestyle=':')

plt.tight_layout()
plt.show()

## DBSCAN

Good for data which contains clusters of similar density. This automatically finds "balls" of samples.

In [None]:
from sklearn.cluster import DBSCAN

# Cluster via DBSCAN.
# DBSCAN only labels the samples it was fitted on, so it is fitted directly
# on the (projected) test points that we want to plot.
dbscan_model = DBSCAN(eps=0.85).fit(testing_values_transformed)

dbscan_testing_prediction = dbscan_model.labels_

# Get the unique set of labels for this algorithm. The noise label (-1) is kept
# in `clusters` so that the plotting cell can still draw those points.
clusters = np.unique(dbscan_testing_prediction)

# DBSCAN marks samples that belong to no cluster with the label -1;
# those are noise, not a cluster, so leave them out of the count.
nclusters = int(np.sum(clusters != -1))
nnoise = int(np.sum(dbscan_testing_prediction == -1))
print("DBSCAN found {} clusters".format(nclusters))
print("DBSCAN labelled {} samples as noise".format(nnoise))

In [None]:
f, axes = plt.subplots(1, 2, sharey=True, figsize=(10,6))

# Plot out all the different clusters
for cluster in clusters:
  this_cluster = testing_values_transformed[dbscan_testing_prediction == cluster, :]
  # DBSCAN uses the label -1 for samples it considers noise; name them as such
  # in the legend instead of presenting them as a cluster of their own
  cluster_name = "Noise" if cluster == -1 else "Cluster {}".format(cluster)
  axes[0].scatter(this_cluster[:, 0], this_cluster[:, 1], alpha=.8, label=cluster_name)
axes[0].set_title("Cluster Prediction")

# Same points grouped by their true diagnosis, for comparison
axes[1].scatter(malignant[:,0], malignant[:,1], alpha=.8, label="Malignant")
axes[1].scatter(benign[:,0], benign[:,1], alpha=.8, label="Benign")
axes[1].set_title("True Class")

# Annotate Plot
for ax in axes:
  ax.set(xlabel=r'$x_{1}$',
         ylabel=r'$x_{2}$')

  ax.legend(frameon=True)
  ax.grid(linestyle=':')

plt.tight_layout()
plt.show()

## Evaluation

To evaluate, we have several metrics to choose from depending on whether or not we have ground truth labels.

In [None]:
from sklearn import metrics

If we **DO** have the labels:
- [Adjusted Rand index](https://scikit-learn.org/stable/modules/clustering.html#adjusted-rand-index)
- [Mutual Information](https://scikit-learn.org/stable/modules/clustering.html#mutual-information-based-scores)
- [Homogeneity, Completeness, and V-measure](https://scikit-learn.org/stable/modules/clustering.html#homogeneity-completeness-and-v-measure)
- [Fowlkes-Mallows Scores](https://scikit-learn.org/stable/modules/clustering.html#fowlkes-mallows-scores)

If we **DO NOT** have the labels:
- [Silhouette Coefficient](https://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient)
- [Calinski-Harabasz Index](https://scikit-learn.org/stable/modules/clustering.html#calinski-harabasz-index)
- [Davies-Bouldin Index](https://scikit-learn.org/stable/modules/clustering.html#davies-bouldin-index)

Let's compare the best- and worst-performing clustering approaches we saw above, and evaluate them in terms of these metrics.

In [None]:
# Score both fitted clustering models on the same projected test set
kmeans_prediction, meanshift_prediction = (
    model.predict(testing_values_transformed)
    for model in (kmeans_model, meanshift_model)
)

# Need to pull the true labels out of the pandas dataframe
testing_labels_array = testing_labels['diagnosis_label'].to_numpy()

In [None]:
def print_row_labels(name, prediction, labels):
  """Print one tab-separated table row of label-based clustering scores.

  The columns are, in order: homogeneity, completeness, V-measure,
  adjusted Rand index, adjusted mutual information and Fowlkes-Mallows score,
  each formatted to three decimal places.

  Args:
    name: Text shown in the first column (the algorithm name).
    prediction: Cluster assignment for each sample.
    labels: Ground-truth label for each sample.
  """
  scorers = (
      metrics.homogeneity_score,
      metrics.completeness_score,
      metrics.v_measure_score,
      metrics.adjusted_rand_score,
      metrics.adjusted_mutual_info_score,
      metrics.fowlkes_mallows_score,
  )
  scores = [scorer(labels, prediction) for scorer in scorers]
  print('{}\t\t{:.3f}\t\t\t{:.3f}\t\t\t{:.3f}\t\t{:.3f}\t\t\t{:.3f}\t\t\t\t{:.3f}'.format(name, *scores))

def print_row_nolabels(name, prediction, data):
  """Print one tab-separated table row of label-free clustering scores.

  The columns are, in order: silhouette coefficient, Calinski-Harabasz index
  and Davies-Bouldin index, each formatted to three decimal places.

  Args:
    name: Text shown in the first column (the algorithm name).
    prediction: Cluster assignment for each sample.
    data: The samples that were clustered.
  """
  scorers = (
      metrics.silhouette_score,
      metrics.calinski_harabasz_score,
      metrics.davies_bouldin_score,
  )
  scores = [scorer(data, prediction) for scorer in scorers]
  print('{}\t\t{:.3f}\t\t\t{:.3f}\t\t\t{:.3f}'.format(name, *scores))

In [None]:
# Table 1: scores that compare the cluster assignments against the true labels
print('Using Labels')
print('Algorithm\tHomogeneity Score\tCompleteness Score\tV-Measure\tAdjusted Rand Index\tAdjusted Mutual Information\tFowlkes-Mallows Score')
print(160 * '-')
print_row_labels('KMeans', kmeans_prediction, testing_labels_array)
print_row_labels('MShift', meanshift_prediction, testing_labels_array)
print('')

# Table 2: scores computed from the data and the cluster assignments alone
print('Without Labels')
# Header fixed: the last column name previously read "Davies-Bouldin Indexavies"
print('Algorithm\tSilhouette Coefficient\tCalinski-Harabasz Index\tDavies-Bouldin Index')
print(100 * '-')
print_row_nolabels('KMeans', kmeans_prediction, testing_values_transformed)
print_row_nolabels('MShift', meanshift_prediction, testing_values_transformed)
print('')

# Classification

Once we have labels, we can turn our attention to classification -- this will allow us to assign labels to our testing set.

We'll go through some common methods, training and calculating the evaluation performance for each of them using basic parameters.
For details on modifying / optimizing these, see individual notebooks or the `scikit-learn` User Guide.

## Decision Trees

In [None]:
def toy_decision_tree(sample):
  """Hand-written illustration of what a decision tree is: nested if/else rules.

  The thresholds are made up purely to show the shape of the logic; a real
  tree learns its own features and thresholds from the training data.

  Args:
    sample: A single record supporting lookup by column name, e.g. one
      dataframe row or a dict with 'radius_mean' and 'texture_mean' keys.

  Returns:
    'malignant' or 'benign'.
  """
  if sample['radius_mean'] < 100:
    if sample['texture_mean'] > 50:
      # The class is malignant
      return 'malignant'
    # The class is benign
    return 'benign'
  # The class is benign
  return 'benign'

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed so the fitted tree (and the scores reported below) is the same
# on every run; the value matches the one used for the train/test split.
tree_clf = DecisionTreeClassifier(random_state=42)

# Pass the target as a 1-D column rather than a single-column dataframe
tree_clf.fit(training_values_transformed, training_labels['diagnosis_label'])

# Make predictions
tree_predictions = tree_clf.predict(testing_values_transformed)

## Support Vector Machines

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default parameters
svm_model = SVC()

# Pass the target as a 1-D column: a single-column dataframe is a column vector,
# which the estimator has to reshape (normally with a warning that the global
# warning filter in this notebook would hide).
svm_model.fit(training_values_transformed, training_labels['diagnosis_label'])

# Make predictions
svm_predictions = svm_model.predict(testing_values_transformed)

In [None]:
# NOTE(review): this second, unfitted SVC is not referenced by any later cell
# visible here -- presumably a leftover; confirm before removing.
svm_site1 = SVC()

## Naive Bayesian Analysis

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default parameters
gnb_model = GaussianNB()

# Pass the target as a 1-D column: a single-column dataframe is a column vector,
# which the estimator has to reshape (normally with a warning that the global
# warning filter in this notebook would hide).
gnb_model.fit(training_values_transformed, training_labels['diagnosis_label'])

# Make predictions
nb_predictions = gnb_model.predict(testing_values_transformed)

## Evaluation

In [None]:
# Class names in label order: 0 is reported as Benign, 1 as Malignant
class_names = ['Benign', 'Malignant']

# (section title, test-set predictions) for each classifier trained above
reports = (
    ("Decision Trees", tree_predictions),
    ("Support Vector Machines", svm_predictions),
    ("Naive Bayes", nb_predictions),
)

# Print one per-class precision / recall / F1 report per classifier
for title, predictions in reports:
  print(55 * "=")
  print(title)
  print(55 * "-")
  print(metrics.classification_report(testing_labels, predictions, target_names=class_names))

In [None]:
# Print confusion matrix
def _print_confusion_counts(title, matrix):
  """Print the four cells of a 2x2 confusion matrix under a section header.

  Rows of `matrix` are the true classes and columns the predicted classes,
  both in label order (0 = benign, 1 = malignant). Malignant is treated as
  the positive class, so:

    matrix[1][1] -> true positive   (malignant predicted malignant)
    matrix[0][0] -> true negative   (benign predicted benign)
    matrix[0][1] -> false positive  (benign predicted malignant)
    matrix[1][0] -> false negative  (malignant predicted benign)

  Args:
    title: Section title (the classifier name).
    matrix: 2x2 confusion matrix as returned by metrics.confusion_matrix.
  """
  print(55 * "=")
  print(title)
  print(55 * "-")
  print("True Positive: {}".format(matrix[1][1]))
  print("True Negative: {}".format(matrix[0][0]))
  print("False Positive: {}".format(matrix[0][1]))
  print("False Negative: {}".format(matrix[1][0]))

tree_matrix = metrics.confusion_matrix(testing_labels, tree_predictions)
_print_confusion_counts("Decision Trees", tree_matrix)

svm_matrix = metrics.confusion_matrix(testing_labels, svm_predictions)
_print_confusion_counts("Support Vector Machine", svm_matrix)

nb_matrix = metrics.confusion_matrix(testing_labels, nb_predictions)
_print_confusion_counts("Naive Bayes", nb_matrix)
