# General Preamble Code

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mglearn
from IPython.display import display

# Additional Import Code for dataset BC

In [None]:
from sklearn.datasets import load_breast_cancer

# Fetch the feature matrix (x) and the label vector (y) directly as arrays.
x, y = load_breast_cancer(return_X_y=True)

# Show each array next to its Python type; the custom `end` reproduces the
# blank line that separates the two dumps.
print(x, type(x), end=" \n\n")
print(y, type(y))

# Question 1
## For the 'mean radius' and 'mean perimeter' features, plot histograms separated by the target class (i.e., create one histogram of 'mean radius' for benign tumors and another for malignant tumors). The GaussianNB classifier assumes that continuous features follow a Gaussian (normal) distribution. Based on your plots, does the assumption of a normal distribution appear reasonable for these features within each class?

In [None]:
print("############ Assignment 3 Question 1 BEGIN ############")

# Assemble a labelled DataFrame: one column per feature plus the class label.
data = load_breast_cancer()
df_bc = pd.DataFrame(x, columns=data.feature_names)
df_bc['target'] = y

# Features to inspect, and the class names indexed by target value
# (target 0 is plotted as 'malignant', target 1 as 'benign').
features = ['mean radius', 'mean perimeter']
target_names = ['malignant', 'benign']
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One panel per feature; inside each panel overlay one histogram per class so
# the two class-conditional distributions can be compared directly.
for ax, feature in zip(axes, features):
    for class_value, class_name in enumerate(target_names):
        class_rows = df_bc['target'] == class_value
        df_bc.loc[class_rows, feature].plot(
            kind='hist', alpha=0.6, ax=ax, label=class_name, bins=30
        )
    ax.set_title(f"histogram of {feature} by class")
    ax.set_xlabel(feature)
    ax.set_ylabel("Frequency")
    ax.legend()

plt.show()

print("\nBased on the histograms, both features for both classes appear roughly normal. The assumption is reasonable.")
print("############# Assignment 3 Question 1 END #############")

# Question 2
## The "naive" assumption in Naive Bayes is that features are conditionally independent of each other given the class. Generate a correlation matrix for the numerical features in the dataset. Do you see any highly correlated features? Explain how the model can still perform reasonably well even if this core assumption is violated.

In [None]:
print("############ Assignment 3 Question 2 BEGIN ############")

# generate a correlation matrix (pandas' default method) over every column of
# df_bc, which includes the 'target' label column
correlation_matrix = df_bc.corr()
print(correlation_matrix)
print()

# identify highly correlated features
# The pairs are derived from the matrix itself instead of being typed in by
# hand, so the listing can never drift out of sync with the data.
correlation_threshold = 0.9
# 'target' is the label, not a feature, so it is excluded from the pairing.
feature_columns = [col for col in correlation_matrix.columns if col != 'target']
highly_correlated_pairs = []
for position, first_feature in enumerate(feature_columns):
    # Only pair with later columns: each unordered pair is reported once and
    # the trivial self-correlation on the diagonal is skipped.
    for second_feature in feature_columns[position + 1:]:
        coefficient = correlation_matrix.loc[first_feature, second_feature]
        # Compare the magnitude so a strong negative relationship would be
        # reported as well.
        if abs(coefficient) > correlation_threshold:
            highly_correlated_pairs.append(
                (first_feature, second_feature, coefficient)
            )
# Alphabetical by feature names for a stable, easy-to-scan listing.
highly_correlated_pairs.sort()

print("Here are the highly correlated features from our correlation matrix:")
print("--------------------------------------------------------------------")
for first_feature, second_feature, coefficient in highly_correlated_pairs:
    print(f"{first_feature} and {second_feature}: {coefficient:.6f}")
print(f"All of these had a strong correlation of over {correlation_threshold}")
print()

# explain how model performs well even if features aren't independent
print("The model performs well even if the features aren't independent because Naive Bayes is only predicting which classification is more probable or more likely, rather than trying to calculate the true underlying probabilities. We just use independence to simplify the calculation.")

print("############# Assignment 3 Question 2 END #############")

# Question 3
## Train GaussianNB classifier and report its accuracy on the training and test set. Remember general test/train split and random_state instructions from before.

In [None]:
print("############ Assignment 3 Question 3 BEGIN ############")

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Hold out 25% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, train_size=0.75, random_state=0
)

# Fit the Gaussian Naive Bayes model on the training portion only.
gnb = GaussianNB().fit(x_train, y_train)

# Predict the held-out samples, then report accuracy on both portions and the
# raw count of test samples whose predicted label differs from the true one.
y_pred = gnb.predict(x_test)
train_accuracy = gnb.score(x_train, y_train)
test_accuracy = gnb.score(x_test, y_test)
n_test_points = x_test.shape[0]
n_test_mislabeled = (y_test != y_pred).sum()
print(f"Accuracy (train set): {train_accuracy:.4f}")
print(f"Accuracy (test set): {test_accuracy:.4f}")
print(f"Number of mislabeled test data points out of a total {n_test_points} points : {n_test_mislabeled}")

print("############# Assignment 3 Question 3 END #############")

# Question 4
## Plot the confusion matrix for this classifier, and also compute “recall” for the malignant class. In the context of cancer diagnosis, why is this metric (also known as sensitivity) often considered more critical than precision?

In [None]:
print("############ Assignment 3 Question 4 BEGIN ############")

# plot the confusion matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, recall_score
# Class names in target order (0 = malignant, 1 = benign). Passing them to the
# display replaces the bare 0/1 tick labels, so the malignant row/column can be
# read straight off the plot.
class_labels = ['malignant', 'benign']
# `labels=[0, 1]` pins the row/column order so it always lines up with
# `class_labels`.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(cmap='Blues')
plt.show()

# compute recall for the malignant class (class 0); pos_label=0 is required
# because recall_score treats label 1 as the positive class by default
recall_malignant = recall_score(y_test, y_pred, pos_label=0)
print(f"\nRecall of malignant class: {recall_malignant:.4f}")

# explain why sensitivity > precision for cancer diagnosis
print("Sensitivity/recall is more critical than precision in cancer diagnosis because missing a malignant case (false negative) can have severe consequences for the patient. It is more important to correctly identify all malignant cases, even if it means some benign cases are incorrectly flagged which are false positives.")

print("############# Assignment 3 Question 4 END #############")

# Question 5
## The CategoricalNB model cannot handle continuous data. To use it, we must first discretize (or "bin") our features. Using KBinsDiscretizer with n_bins=4 and encode='ordinal', transform the entire feature set X into a discretized version. Create a new training and test split using this fully discretized dataset and train a CategoricalNB model. 

In [None]:
print("############ Assignment 3 Question 5 BEGIN ############")

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.naive_bayes import CategoricalNB

# Bin every continuous feature into 4 ordinal-encoded bins. The discretizer is
# fitted on the complete feature matrix, as the question asks, before any
# train/test split is made.
encoder = KBinsDiscretizer(n_bins=4, encode='ordinal')
x_discrete = encoder.fit_transform(x)

# Same proportions and seed as the split used for the continuous data.
x_train_discrete, x_test_discrete, y_train_discrete, y_test_discrete = train_test_split(
    x_discrete, y, test_size=0.25, train_size=0.75, random_state=0
)

# Train the categorical Naive Bayes model on the binned training data and
# predict the binned test data.
cnb = CategoricalNB().fit(x_train_discrete, y_train_discrete)
y_pred_discrete = cnb.predict(x_test_discrete)
print("CategoricalNB model predictions w/ discretized features:\n", y_pred_discrete)

print("############# Assignment 3 Question 5 END #############")

# Question 6
## Report its accuracy on the discretized test set. How does its performance compare to the GaussianNB model? Briefly explain why their performances might differ.

In [None]:
print("############ Assignment 3 Question 6 BEGIN ############")

# evaluating the model
cnb_train_accuracy = cnb.score(x_train_discrete, y_train_discrete)
cnb_test_accuracy = cnb.score(x_test_discrete, y_test_discrete)
n_test_points_discrete = x_test_discrete.shape[0]
n_mislabeled_discrete = (y_test_discrete != y_pred_discrete).sum()
print("Accuracy on the discretized test set:")
print(f"Accuracy (train set): {cnb_train_accuracy:.4f}")
print(f"Accuracy (test set): {cnb_test_accuracy:.4f}")
print(f"Number of mislabeled points out of a total {n_test_points_discrete} points : {n_mislabeled_discrete}")

# compare and explain difference in performance
# Both accuracies and the verdict are computed from the fitted models instead
# of being hard-coded, so the reported comparison always matches the run.
gnb_test_accuracy = gnb.score(x_test, y_test)
if gnb_test_accuracy > cnb_test_accuracy:
    verdict = "GaussianNB performed better."
elif gnb_test_accuracy < cnb_test_accuracy:
    verdict = "CategoricalNB performed better."
else:
    verdict = "Both models performed equally well."
print(f"\nThe test accuracy for CategoricalNB was {cnb_test_accuracy:.4f} while the test accuracy of GaussianNB was {gnb_test_accuracy:.4f}. {verdict}")
# NOTE(review): the explanation below was written for the case where GaussianNB
# comes out ahead; revisit the wording if the verdict above ever changes.
print("This performance difference is due to the information lost from our dataset after binning; features went from having infinite distinctions (continuous numerical data) to being ordinal categories.")

print("############# Assignment 3 Question 6 END #############")

# Question 7
## The CategoricalNB classifier has a hyperparameter called alpha which controls Laplace smoothing. What problem does this smoothing solve in the context of our discretized data? What could happen if a specific bin for a feature (e.g., the highest bin for 'mean radius') was present in the test set but never appeared in the training set for malignant cases, and we were not using any smoothing (alpha=0)?

In [None]:
print("############ Assignment 3 Question 7 BEGIN ############")

# Written answer, part 1: the problem Laplace smoothing addresses for the
# binned data. The second string ends with a newline on purpose so a blank
# line separates the two parts of the answer.
smoothing_problem = (
    "The problem that smoothing solves in our discretized data is if a certain bin of a feature doesn't appear in our training data, then the model will predict a 'No' or 'Benign' tumor when it encounters that bin in testing data.",
    "This happens even if other features predict high probability of a 'malignant' tumor since the probabilities are multiplied (zeroed-out).\n",
)
print(*smoothing_problem, sep="\n")

# Written answer, part 2: consequence of a bin seen in the test set but never
# seen among malignant training samples when alpha=0.
unseen_bin_consequence = (
    "If there was a feature where a bin appeared in our test set but not in the training set for malignant cases, then the probability of YES (i.e. malignant diagnosis) given that feature (e.g. mean radius) will be 0 (false) no matter what.",
    "Without smoothing, that feature would zero-out the other features' probabilities of malignant tumor, leading to a false negative (we diagnose as benign when really it is malignant).",
)
print(*unseen_bin_consequence, sep="\n")

print("############# Assignment 3 Question 7 END #############")