In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.stats import norm

# Step 1: Read the CSV; header=None means the first row is treated as data
# and the columns are addressed by position.
data = pd.read_csv('wdbc.data.mb.csv', header=None)

# Step 2: Every column except the last goes into the feature matrix X;
# the last column becomes the label vector y.
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

# Step 3: Hold out 30% of the rows for testing. The split is stratified on y
# and seeded (random_state=42) so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Step 4: Implement the Probability Calculation Module
class NaiveBayesClassifier:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated from the training
    data. Prediction picks the class with the highest log-posterior score
    (log prior plus the sum of per-feature Gaussian log-densities).

    Parameters
    ----------
    var_smoothing : float, default 1e-9
        Fraction of the largest per-feature variance of the training data
        that is added to every class-conditional variance. Without it, a
        feature that is constant within a class has variance 0, the Gaussian
        log-density evaluates to NaN, and every class score becomes NaN.

    Attributes (set by ``fit``)
    ---------------------------
    classes : sorted unique labels, as returned by ``np.unique``.
    class_probs : list with the prior probability of each class.
    mean_variances : one list per class of ``(mean, variance)`` tuples, one
        tuple per feature; the variance already includes the smoothing term.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate class priors and per-feature Gaussian parameters.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features), numeric.
        y : array-like of shape (n_samples,), class labels.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        self.classes = np.unique(y)
        self.class_probs = []
        self.mean_variances = []

        # Scale the smoothing term by the largest feature variance so it is
        # negligible for well-behaved features yet never zero.
        largest_variance = X.var(axis=0).max() if X.size else 0.0
        if not largest_variance > 0:
            largest_variance = 1.0
        epsilon = self.var_smoothing * largest_variance

        for c in self.classes:
            X_c = X[y == c]
            self.class_probs.append(len(X_c) / len(X))

            means = X_c.mean(axis=0)
            variances = X_c.var(axis=0) + epsilon
            self.mean_variances.append(list(zip(means, variances)))

    def predict(self, X):
        """Return the most probable class for every row of ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features), numeric.

        Returns
        -------
        list
            One label (taken from ``self.classes``) per input row; an empty
            list when ``X`` has no rows.
        """
        X = np.asarray(X, dtype=float)
        if len(X) == 0:
            return []

        class_scores = []
        for prior, mean_variance in zip(self.class_probs, self.mean_variances):
            stats = np.asarray(mean_variance, dtype=float).reshape(-1, 2)
            means, variances = stats[:, 0], stats[:, 1]
            # One vectorised call per class: the parameters broadcast across
            # the rows of X, and the feature log-densities are summed per row.
            log_likelihood = norm.logpdf(X, means, np.sqrt(variances)).sum(axis=1)
            class_scores.append(np.log(prior) + log_likelihood)

        # Columns are classes; argmax keeps the first class on ties.
        best = np.argmax(np.column_stack(class_scores), axis=1)
        return list(self.classes[best])

# Step 5: Instantiate the classifier and fit it on the training split
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(X_train, y_train)

# Step 6: Predict a label for every row of the held-out test split
y_pred = nb_classifier.predict(X_test)

# Step 7: Score the hold-out predictions against the true labels and print
# both metrics
accuracy, conf_matrix = (
    accuracy_score(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)

# Step 8: Perform k-fold cross-validation (K=5)
from sklearn.model_selection import StratifiedKFold

# Shuffled, stratified folds with a fixed seed so the folds are reproducible.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracies, confusion_matrices = [], []

for train_index, test_index in kfold.split(X, y):
    # Select this fold's training and validation rows from the full data set.
    X_train_fold, y_train_fold = X[train_index], y[train_index]
    X_test_fold, y_test_fold = X[test_index], y[test_index]

    # A fresh classifier is fitted for every fold.
    nb_classifier = NaiveBayesClassifier()
    nb_classifier.fit(X_train_fold, y_train_fold)
    y_pred_fold = nb_classifier.predict(X_test_fold)

    # Keep the per-fold metrics for the report below.
    accuracies.append(accuracy_score(y_test_fold, y_pred_fold))
    confusion_matrices.append(confusion_matrix(y_test_fold, y_pred_fold))

# Step 9: Report the results, one section per fold (numbered from 1)
for fold_number, (accuracy_fold, conf_matrix_fold) in enumerate(
    zip(accuracies, confusion_matrices), start=1
):
    print(f"Fold {fold_number}:")
    print("Accuracy:", accuracy_fold)
    print("Confusion Matrix:\n", conf_matrix_fold)




Accuracy: 0.9239130434782609
Confusion Matrix:
 [[105   8]
 [  6  65]]
Fold 1:
Accuracy: 0.9024390243902439
Confusion Matrix:
 [[68  8]
 [ 4 43]]
Fold 2:
Accuracy: 0.959349593495935
Confusion Matrix:
 [[74  2]
 [ 3 44]]
Fold 3:
Accuracy: 0.943089430894309
Confusion Matrix:
 [[75  1]
 [ 6 41]]
Fold 4:
Accuracy: 0.9180327868852459
Confusion Matrix:
 [[69  6]
 [ 4 43]]
Fold 5:
Accuracy: 0.9426229508196722
Confusion Matrix:
 [[73  2]
 [ 5 42]]
