In [1]:
import pandas as pd
import numpy as np
import math

# ================================
# Naive Bayes Classifier from Scratch
# This script implements a Naive Bayes Classifier
# from scratch using the Breast Cancer Wisconsin dataset.
# The classifier logic is written by hand (no machine-learning library);
# pandas and NumPy are used only for data handling and array math.
# ================================

# Dataset loading and initial clean-up
# ================================
file_path = r'data.csv'  # CSV file read below (relative path)

# Read the CSV into a DataFrame
raw_dataset = pd.read_csv(file_path)

# Sanity report: per-column dtypes first, then per-column count of missing values
column_dtypes = raw_dataset.dtypes
print(column_dtypes)
missing_per_column = raw_dataset.isna().sum()
print(missing_per_column)

# Remove the two columns that are not used as features
columns_to_discard = ['Unnamed: 32', 'id']
raw_dataset = raw_dataset.drop(columns=columns_to_discard)

# Convert qualitative labels (diagnosis) to numerical ones
def label_to_numeric(column):
    """Encode a text column as integer codes; pass other columns through.

    Args:
        column: A pandas Series (one DataFrame column when this function is
            used with ``DataFrame.apply``).

    Returns:
        For object- or string-typed columns, a new Series of integer codes on
        the same index as ``column``, numbered in order of first appearance
        (first distinct value -> 0, next -> 1, ...). Any other column is
        returned unchanged.
    """
    # Accept both the classic ``object`` dtype and pandas' dedicated string
    # dtypes; comparing the dtype to 'object' alone misses the latter.
    is_text = pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column)
    if not is_text:
        return column

    # pd.factorize returns (codes, uniques); only the codes are needed here.
    codes, _ = pd.factorize(column)
    return pd.Series(codes, index=column.index)

# Encode text columns (the diagnosis labels) as integers; numeric columns pass through
dataset = raw_dataset.apply(label_to_numeric)

# Create a mapping from original labels to numeric ones
# NOTE(review): the code below hard-codes class 0 as M (malignant) and class 1
# as B (benign). Codes are assigned in order of first appearance, so this
# presumably relies on the first row of the file being an 'M' sample —
# confirm by inspecting label_map.
label_map = dict(zip(raw_dataset['diagnosis'], dataset['diagnosis']))

# Shuffle the dataset randomly
np.random.seed(42)  # Set a seed for reproducibility
dataset = dataset.sample(frac=1).reset_index(drop=True)

# Split the dataset into training and testing sets
# ================================
train_ratio = 0.8  # 80% for training, 20% for testing
total_samples = len(dataset)
train_samples = int(train_ratio * total_samples)

# Training rows are stored per class: {class code: DataFrame of that class's rows}
train_split = dataset.iloc[:train_samples, :]
train_data = {label: frame for label, frame in train_split.groupby('diagnosis')}
test_data = dataset.iloc[train_samples:, :]

# Define parameters for the Naive Bayes classifier
# ================================
# Feature labels in the dataset's own column order. A set difference would
# give an arbitrary order that can change from run to run.
features = [column for column in dataset.columns if column != 'diagnosis']

# Per-class feature tables (class 0 -> M, class 1 -> B), re-indexed from 0
M_train_features = train_data[0][features].reset_index(drop=True)
B_train_features = train_data[1][features].reset_index(drop=True)

# Calculate mean and standard deviation for each class
M_mean_train_features = M_train_features.mean(axis=0)
B_mean_train_features = B_train_features.mean(axis=0)

# ddof=0 (population standard deviation) is spelled out because pandas'
# own default is ddof=1; np.std on a DataFrame uses ddof=0.
M_std_train_features = M_train_features.std(axis=0, ddof=0)
B_std_train_features = B_train_features.std(axis=0, ddof=0)

# Test data preparation (index reset so rows line up positionally later on)
test_features = test_data[features].reset_index(drop=True)

# Calculate prior probabilities for each class (M and B): the class's share of
# the training rows. Each prior is kept as a one-entry dict keyed by its class
# code, which is how the classification step looks it up.
train_total = len(train_data[0]) + len(train_data[1])
M_prior = {0: len(train_data[0]) / train_total}
B_prior = {1: len(train_data[1]) / train_total}

# Define function to calculate likelihood using Gaussian distribution
def calculate_likelihood(data, means, stds):
    """Evaluate the naive-Bayes Gaussian likelihood of every row in ``data``.

    Args:
        data: Table of samples, one row per sample and one column per feature.
        means: Per-feature means of the class being scored.
        stds: Per-feature standard deviations of the class being scored.

    Returns:
        A pandas Series named ``'Likelihood'`` holding, for each row, the
        product of the per-feature Gaussian densities.
    """
    # Standardised distance of each value from its feature mean
    z_scores = (data - means) / stds

    # Gaussian density per feature: normalising constant times the exponential term
    normaliser = 1 / (stds * np.sqrt(2 * np.pi))
    per_feature_density = normaliser * np.exp(-0.5 * z_scores ** 2)

    # Independence assumption: a row's joint likelihood is the product over its features
    row_likelihood = per_feature_density.prod(axis=1)

    return pd.Series(row_likelihood, name='Likelihood')

# Score the test set under each class model
# ================================
# Per-row Gaussian likelihoods: first with the M parameters, then with the B parameters
M_likelihood = calculate_likelihood(
    test_features,
    means=M_mean_train_features,
    stds=M_std_train_features,
)
B_likelihood = calculate_likelihood(
    test_features,
    means=B_mean_train_features,
    stds=B_std_train_features,
)

# Define function to classify each sample
def classify_sample(likelihood_class1, likelihood_class2, prior_class1, prior_class2):
    """Pick the class whose unnormalised posterior (likelihood * prior) is larger.

    Returns:
        0 when the first class's score is strictly greater than the second's;
        1 otherwise, so ties go to the second class.
    """
    score_first = likelihood_class1 * prior_class1
    score_second = likelihood_class2 * prior_class2

    if score_first > score_second:
        return 0
    return 1

# Label every test sample
# Unnormalised posterior per class = likelihood * class prior
M_class = (M_likelihood * M_prior[0]).to_frame(name='M_posterior')
B_class = (B_likelihood * B_prior[1]).to_frame(name='B_posterior')

# Placeholder column, overwritten with the predicted class just below
classifier = pd.DataFrame({'classifier': [None] * len(test_data)})
decision = pd.concat([M_class, B_class, classifier], axis=1)

# Predict 0 where the M posterior is strictly larger, 1 everywhere else
m_wins = decision['M_posterior'] > decision['B_posterior']
decision['classifier'] = (~m_wins).astype(int)

# Model evaluation
# ================================
# True labels (index reset to match `decision`) next to the predicted labels
actual_labels = test_data['diagnosis'].reset_index(drop=True)
Comparison = pd.concat([actual_labels, decision['classifier']], axis=1)

# Boolean masks shared by both confusion matrices (0 = M, 1 = B)
is_M = Comparison['diagnosis'] == 0
is_B = Comparison['diagnosis'] == 1
said_M = Comparison['classifier'] == 0
said_B = Comparison['classifier'] == 1

matrix_rows = ['Predicted Positive', 'Predicted Negative']
matrix_columns = ['Actual Positive', 'Actual Negative']

# Confusion matrix with M (Malignant) as the positive class
M_TP = (is_M & said_M).sum()
M_FN = (is_M & said_B).sum()
M_TN = (is_B & said_B).sum()
M_FP = (is_B & said_M).sum()

M_confusion_matrix = pd.DataFrame(
    [[M_TP, M_FP], [M_FN, M_TN]],
    index=matrix_rows,
    columns=matrix_columns,
)

# Confusion matrix with B (Benign) as the positive class
B_TP = (is_B & said_B).sum()
B_FN = (is_B & said_M).sum()
B_TN = (is_M & said_M).sum()
B_FP = (is_M & said_B).sum()

B_confusion_matrix = pd.DataFrame(
    [[B_TP, B_FP], [B_FN, B_TN]],
    index=matrix_rows,
    columns=matrix_columns,
)

# Show both matrices
print('Category M\n', M_confusion_matrix, '\n\n', 'Category B\n', B_confusion_matrix, '\n\n')

# Precision = TP / predicted positives; recall = TP / actual positives
M_Precision = M_TP / (M_TP + M_FP)
M_Recall = M_TP / (M_TP + M_FN)
print('Category M', 'Precision =', M_Precision, 'Recall =', M_Recall)

B_Precision = B_TP / (B_TP + B_FP)
B_Recall = B_TP / (B_TP + B_FN)
print('Category B', 'Precision =', B_Precision, 'Recall =', B_Recall)

# Overall accuracy: correctly labelled samples over all test samples
Accuracy = (M_TP + M_TN) / len(test_data)
print('Accuracy =', Accuracy)


id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst     