# CS4448 — Breast Cancer Classification (Naive Bayes and Baselines)
**Includes:** data loading, shape checks, train/test split, model training, predictions, accuracy scoring, confusion matrix, classification report, confusion-matrix visualization, K-fold cross-validation, and a multiple-model comparison.

In [None]:
# Imports
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import ConfusionMatrixDisplay

# Seed shared by the train/test split and the K-fold shuffling so that
# repeated runs of the notebook produce the same partitions.
RANDOM_STATE: int = 11


## Loading the Dataset

In [None]:
# Fetch the bundled dataset object, then expose its feature matrix and
# label vector under the short names used by the rest of the notebook.
breast_cancer = load_breast_cancer()
X = breast_cancer.data
y = breast_cancer.target

# Quick sanity output: which dataset this is and how it is labelled.
print('Dataset loaded: Breast Cancer Wisconsin (Diagnostic)')
print('Feature names (first 5):', breast_cancer.feature_names[:5])
print('Target names:', breast_cancer.target_names)

## Checking the Sample and Target Sizes

In [None]:
# Report the dimensions of the feature matrix and the label vector.
print('X shape:', X.shape)  # (n_samples, n_features)
print('y shape:', y.shape)

# Count how many samples carry each distinct label value.
unique, counts = np.unique(y, return_counts=True)
print('Class distribution:', {label: count for label, count in zip(unique, counts)})

## Splitting the Data for Training and Testing

In [None]:
# Hold out 25% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=RANDOM_STATE,
)

print('Training and Testing Set Sizes')
print('X_train:', X_train.shape, 'y_train:', y_train.shape)
print('X_test:', X_test.shape, 'y_test:', y_test.shape)

## Creating the Model (GaussianNB)

In [None]:
# Gaussian Naive Bayes classifier constructed with its default settings
# (no arguments are passed).
nb = GaussianNB()
# Trailing bare name: in a notebook cell this displays the estimator.
nb

## Training the Model

In [None]:
# Fit the classifier on the training partition only; the test partition is
# reserved for the evaluation cells that follow.
nb.fit(X_train, y_train)

## Predicting

In [None]:
# Predict a class label for every held-out sample, then show a short
# preview of the results.
y_pred = nb.predict(X_test)
preview = y_pred[:10]
print('First 10 predictions:', preview)

## Estimator Method `score`

In [None]:
# Mean accuracy on the data the model was fitted on and on the held-out
# data; a large gap between the two would suggest overfitting.
train_score = nb.score(X_train, y_train)
test_score = nb.score(X_test, y_test)

print(f'Train accuracy: {train_score:.4f}')
print(f'Test  accuracy: {test_score:.4f}')

## Confusion Matrix

In [None]:
# Tabulate true labels against predicted labels, with both axes ordered
# explicitly as label 0 then label 1.
cm = confusion_matrix(
    y_true=y_test,
    y_pred=y_pred,
    labels=[0, 1],
)
print('Confusion Matrix (labels 0,1):\n', cm)

## Classification Report

In [None]:
# Build the text report for the test-set predictions, labelling each class
# with the dataset's own target names, then print it.
report = classification_report(
    y_test,
    y_pred,
    target_names=breast_cancer.target_names,
)
print(report)

## Visualizing the Confusion Matrix

In [None]:
# Create the figure first, then draw the previously computed confusion
# matrix onto its axes with the dataset's class names as tick labels.
fig, ax = plt.subplots()
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=breast_cancer.target_names,
)
disp.plot(ax=ax)
# Title the same axes the matrix was drawn on.
ax.set_title('GaussianNB Confusion Matrix')
plt.show()

## K-Fold Cross-Validation

In [None]:
# Ten shuffled folds with a fixed seed, so every estimator evaluated with
# this splitter sees exactly the same partitions.
kfold = KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

# Cross-validate the Naive Bayes estimator on the full dataset; one score
# is produced per fold.
scores = cross_val_score(nb, breast_cancer.data, breast_cancer.target, cv=kfold)

print('CV scores:', scores)
print(f'CV mean: {scores.mean():.4f} ± {scores.std():.4f}')

## Running Multiple Models to Find the Best One

In [None]:
# Candidate classifiers, keyed by display name. Each one is evaluated with
# the same `kfold` splitter so the scores are directly comparable.
# NOTE(review): KNeighborsClassifier, LogisticRegression and SVC are
# sensitive to feature scale and the features are passed in unscaled here;
# consider a StandardScaler pipeline — confirm whether raw features are
# intentional for this comparison.
estimators = {
    'GaussianNB': GaussianNB(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    # `multi_class` is deprecated in recent scikit-learn releases and is
    # omitted; the target is binary (labels 0 and 1), so leaving it out
    # does not change the fitted model.
    'LogisticRegression': LogisticRegression(solver='lbfgs', max_iter=10000),
    'SVC': SVC(gamma='scale'),
}

# Maps estimator name -> (mean CV accuracy, standard deviation of CV accuracy).
results = {}
for name, est in estimators.items():
    cv_scores = cross_val_score(estimator=est, X=breast_cancer.data, y=breast_cancer.target, cv=kfold)
    # Compute the summary statistics once and reuse them for storage and output.
    mean_score, std_score = cv_scores.mean(), cv_scores.std()
    results[name] = (mean_score, std_score)
    print(f'{name:>20}: mean={mean_score:.4f} ± {std_score:.4f}')

# Pick the estimator with the highest mean accuracy across the folds.
best_name = max(results, key=lambda k: results[k][0])
best_mean, best_std = results[best_name]
print('\nBest estimator by CV mean:', best_name, '→', f'{best_mean:.4f} ± {best_std:.4f}')