In [1]:
import pandas as pd
import numpy as np
import sklearn.model_selection as sms
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix

# ================================
# Naive Bayes Classifier using Scikit-Learn
# This script implements a Naive Bayes Classifier using Scikit-Learn's GaussianNB
# for the Breast Cancer Wisconsin dataset.
# ================================

# Load and preprocess dataset
# ================================
file_path = r'data.csv'  # Location of the dataset CSV

# Read the CSV into a DataFrame
raw_dataset = pd.read_csv(file_path)

# Show the dtype pandas inferred for every column
column_types = raw_dataset.dtypes
print(column_types)

# Report how many missing entries each column contains
missing_per_column = raw_dataset.isna().sum()
print(missing_per_column)

# Remove the columns that are not used as features: 'Unnamed: 32' and 'id'
raw_dataset = raw_dataset.drop(columns=['Unnamed: 32', 'id'])

# Convert qualitative labels to numerical ones
def label_to_numeric(column):
    """Encode a text column as integer codes; pass other columns through.

    Args:
        column: A pandas Series (one DataFrame column when used with
            ``DataFrame.apply``).

    Returns:
        For a column whose dtype is ``object`` or is recognised as a string
        dtype by ``pd.api.types.is_string_dtype``, a new Series of integer
        codes produced by ``pd.factorize`` (codes are assigned in order of
        first appearance), carrying the original index and name. Any other
        column is returned unchanged.
    """
    # Checking only ``dtype == 'object'`` misses columns stored with pandas'
    # dedicated string dtype, which would then be left unencoded.
    is_text = column.dtype == 'object' or pd.api.types.is_string_dtype(column)
    if not is_text:
        return column

    # pd.factorize returns (codes, uniques); only the codes are needed here.
    codes, _ = pd.factorize(column)
    return pd.Series(codes, index=column.index, name=column.name)

# Run the encoder over every column of the raw data
dataset = raw_dataset.apply(label_to_numeric)

# Pair each original diagnosis label with the code that replaced it
original_labels = raw_dataset['diagnosis']
encoded_labels = dataset['diagnosis']
label_map = dict(zip(original_labels, encoded_labels))
print("Label Mapping:", label_map)

# Feature extraction and train-test split
# ================================
# Every column except the target is a feature. Keep the DataFrame's own column
# order: iterating a set of strings can yield a different order on each
# interpreter run (string hash randomisation), which made the feature order
# non-reproducible.
features = [column for column in dataset.columns if column != 'diagnosis']
label = dataset['diagnosis']
data = dataset[features]

# Split the dataset into training and testing sets (20% held out, fixed seed)
data_trainset, data_testset, label_trainset, label_testset = sms.train_test_split(
    data, label, test_size=0.2, random_state=42
)

# Train the Naive Bayes classifier
# ================================
# Build the Gaussian Naive Bayes model and fit it on the training split
# (fit returns the fitted estimator itself)
naive_bayes_model = GaussianNB().fit(data_trainset, label_trainset)

# Classify every row of the test split
prediction = naive_bayes_model.predict(data_testset)

# Evaluate the model
# ================================
# Calculate accuracy
accuracy = accuracy_score(y_true=label_testset, y_pred=prediction)
print("Accuracy:", accuracy, '\n\n')

# Put true labels and predictions side by side on a fresh 0..n-1 index
Comparison = label_testset.reset_index(drop=True).to_frame()
Comparison['classifier'] = prediction

# Calculate confusion matrices
# ================================
# Confusion matrix for category M (Malignant)
# NOTE(review): the literals below treat code 0 as the positive class for this
# category and code 1 as the negative class; confirm against the
# "Label Mapping" printed earlier.
M_TP = ((Comparison['diagnosis'] == 0) & (Comparison['classifier'] == 0)).sum()
M_FN = ((Comparison['diagnosis'] == 0) & (Comparison['classifier'] == 1)).sum()
M_TN = ((Comparison['diagnosis'] == 1) & (Comparison['classifier'] == 1)).sum()
M_FP = ((Comparison['diagnosis'] == 1) & (Comparison['classifier'] == 0)).sum()

# sklearn's confusion_matrix puts actual classes on rows and predicted classes
# on columns. The frame below labels rows as "Predicted" and columns as
# "Actual", so the matrix is transposed to match (otherwise FP and FN are
# swapped in the printed table). labels=[0, 1] pins the class order and keeps
# the result 2x2 even if one class is absent from the test split.
M_confusion_matrix = confusion_matrix(label_testset, prediction, labels=[0, 1]).T
M_confusion_matrix = pd.DataFrame(
    M_confusion_matrix,
    columns=['Actual Positive', 'Actual Negative'],
    index=['Predicted Positive', 'Predicted Negative'],
)

# Confusion matrix for category B (Benign)
true_codes = Comparison['diagnosis']
predicted_codes = Comparison['classifier']

# Code 1 is the positive class here, code 0 the negative class
B_TP = ((true_codes == 1) & (predicted_codes == 1)).sum()
B_FN = ((true_codes == 1) & (predicted_codes == 0)).sum()
B_TN = ((true_codes == 0) & (predicted_codes == 0)).sum()
B_FP = ((true_codes == 0) & (predicted_codes == 1)).sum()

# Rows are predicted classes, columns are actual classes
B_confusion_matrix = pd.DataFrame(
    [
        [B_TP, B_FP],
        [B_FN, B_TN],
    ],
    index=['Predicted Positive', 'Predicted Negative'],
    columns=['Actual Positive', 'Actual Negative'],
)

# Show both tables
print('Category M\n', M_confusion_matrix, '\n\n', 'Category B\n', B_confusion_matrix, '\n\n')

# Calculate precision and recall for both categories
# Precision = TP / everything predicted positive;
# recall = TP / everything actually positive.
m_predicted_positive = M_TP + M_FP
m_actual_positive = M_TP + M_FN
M_Precision = M_TP / m_predicted_positive
M_Recall = M_TP / m_actual_positive
print('Category M', 'Precision =', M_Precision, 'Recall =', M_Recall)

b_predicted_positive = B_TP + B_FP
b_actual_positive = B_TP + B_FN
B_Precision = B_TP / b_predicted_positive
B_Recall = B_TP / b_actual_positive
print('Category B', 'Precision =', B_Precision, 'Recall =', B_Recall)


id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst     