In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Load the CSV data

In [2]:
data = pd.read_csv('iris.csv')
print(data.columns)
print(data.head)

Index(['sepal length', 'sepal width', 'petal length', 'petal width', 'class'], dtype='object')
<bound method NDFrame.head of     sepal length sepal width petal length petal width           class
0              r           r            r           r               n
1            5.1         3.5          1.4         0.2     Iris-setosa
2            4.9         3.0          1.4         0.2     Iris-setosa
3            4.7         3.2          1.3         0.2     Iris-setosa
4            4.6         3.1          1.5         0.2     Iris-setosa
..           ...         ...          ...         ...             ...
146          6.7         3.0          5.2         2.3  Iris-virginica
147          6.3         2.5          5.0         1.9  Iris-virginica
148          6.5         3.0          5.2         2.0  Iris-virginica
149          6.2         3.4          5.4         2.3  Iris-virginica
150          5.9         3.0          5.1         1.8  Iris-virginica

[151 rows x 5 columns]>


# Assuming the last column is the target and the rest are features

In [3]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Assuming `y` contains class labels as integers starting from 0

In [4]:
num_classes = len(np.unique(y))
print(np.unique(y))
print(num_classes)

['Iris-setosa' 'Iris-versicolor' 'Iris-virginica' 'n']
4


In [5]:
X = pd.get_dummies(X, drop_first=True)
y = y.astype('category').cat.codes

### Sub-Step 1: Binarize the labels

In [6]:
y_binarized = label_binarize(y, classes=range(num_classes))

# Preprocess the data

In [7]:
def check_nan_in_csv():
    try:
        # Check for NaN values
        nan_summary = data.isna().sum()
        total_nan = nan_summary.sum()
        
        if total_nan == 0:
            print("The CSV file has no NaN values.")
        else:
            print(f"The CSV file contains {total_nan} NaN values.")
            nan_columns = nan_summary[nan_summary > 0]
            print("Columns with NaN values:")
            print(nan_columns)
    except Exception as e:
        print(f"An error occurred: {e}")
check_nan_in_csv()


The CSV file has no NaN values.


# Naive Bayes Implementation

In [8]:
class NaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.feature_stats = {}
        self.class_prior = {}

        for c in self.classes:
            X_c = X[y == c]
            self.feature_stats[c] = {
                "mean": X_c.mean(axis=0),
                "var": X_c.var(axis=0)
            }
            self.class_prior[c] = len(X_c) / len(y)

    def predict(self, X):
        posteriors = []

        for x in X:
            class_probs = []
            for c in self.classes:
                prior = np.log(self.class_prior[c])
                likelihood = -0.5 * np.sum(
                    np.log(2 * np.pi * self.feature_stats[c]["var"]) +
                    ((x - self.feature_stats[c]["mean"]) ** 2) / (2 * self.feature_stats[c]["var"])
                )
                class_probs.append(prior + likelihood)
            posteriors.append(self.classes[np.argmax(class_probs)])

        return np.array(posteriors)

# Train and Predict

### Initialize KFold

In [9]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)
mean_fpr = np.linspace(0, 1, 100)  # Define a common set of FPR points for interpolation
tpr = {i: [] for i in range(num_classes)}  # True positive rates
roc_auc = {i: [] for i in range(num_classes)}  # AUC for each class
fold_accuracies = []

### Cross-Validation

In [10]:
for train_index, test_index in kf.split(X):
    # Split the data into training and testing for this fold
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]

    # Train and predict
    nb = NaiveBayes()
    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)

    # Calculate accuracy for this fold
    accuracy = np.mean(y_pred == y_test)
    fold_accuracies.append(accuracy)

  np.log(2 * np.pi * self.feature_stats[c]["var"]) +
  ((x - self.feature_stats[c]["mean"]) ** 2) / (2 * self.feature_stats[c]["var"])
  ((x - self.feature_stats[c]["mean"]) ** 2) / (2 * self.feature_stats[c]["var"])
  np.log(2 * np.pi * self.feature_stats[c]["var"]) +


In [11]:
mean_accuracy = np.mean(fold_accuracies)
print(f"Cross-Validation Accuracy: {mean_accuracy * 100:.2f}%")
print(f"Accuracy per fold: {fold_accuracies}")

Cross-Validation Accuracy: 33.04%
Accuracy per fold: [0.4375, 0.26666666666666666, 0.5333333333333333, 0.2, 0.4, 0.3333333333333333, 0.4, 0.26666666666666666, 0.3333333333333333, 0.13333333333333333]


# Plot ROC Curves

In [None]:
class NaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.feature_stats = {}
        self.class_prior = {}

        for c in self.classes:
            X_c = X[y == c]
            self.feature_stats[c] = {
                "mean": X_c.mean(axis=0),
                "var": X_c.var(axis=0)
            }
            self.class_prior[c] = len(X_c) / len(y)

    def predict_proba(self, X):
        probabilities = []

        for x in X:
            class_probs = []
            for c in self.classes:
                prior = np.log(self.class_prior[c])
                likelihood = -0.5 * np.sum(
                    np.log(2 * np.pi * self.feature_stats[c]["var"]) +
                    ((x - self.feature_stats[c]["mean"]) ** 2) / (2 * self.feature_stats[c]["var"])
                )
                class_probs.append(prior + likelihood)
            probabilities.append(np.exp(class_probs) / np.sum(np.exp(class_probs)))
        
        return np.array(probabilities)

In [None]:
for train_index, test_index in kf.split(X):
    # Split the data into training and testing for this fold
    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y_binarized[train_index], y_binarized[test_index]

    # Train the model
    nb = NaiveBayes()
    nb.fit(X_train, y_train.argmax(axis=1))  # Train with non-binarized labels
    y_proba = nb.predict_proba(X_test)

    # Step 3: Calculate ROC for each class
    for i in range(num_classes):
        fpr_class, tpr_class, _ = roc_curve(y_test[:, i], y_proba[:, i])
        interp_tpr = np.interp(mean_fpr, fpr_class, tpr_class)
        interp_tpr[0] = 0.0  # Ensure the TPR starts at 0
        tpr[i].append(interp_tpr)
        roc_auc[i].append(auc(fpr_class, tpr_class))

In [None]:
plt.figure(figsize=(10, 8))
colors = plt.cm.get_cmap("tab10", num_classes)

In [None]:
for i in range(num_classes):
    mean_tpr = np.mean(tpr[i], axis=0)
    mean_tpr[-1] = 1.0  # Ensure the TPR ends at 1
    mean_auc = np.mean(roc_auc[i])
    plt.plot(mean_fpr, mean_tpr, label=f'Class {i} (AUC = {mean_auc:.2f})', color=colors(i))

In [None]:
plt.plot([0, 1], [0, 1], 'k--', lw=1)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Iris Classification')
plt.legend(loc='lower right')
plt.grid(alpha=0.5)
plt.show()

  np.log(2 * np.pi * self.feature_stats[c]["var"]) +
  ((x - self.feature_stats[c]["mean"]) ** 2) / (2 * self.feature_stats[c]["var"])
  ((x - self.feature_stats[c]["mean"]) ** 2) / (2 * self.feature_stats[c]["var"])
  np.log(2 * np.pi * self.feature_stats[c]["var"]) +


# Step 6: Evaluate the Model

In [15]:
accuracy = np.mean(fold_accuracies)
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Accuracy per fold: {fold_accuracies}")

Accuracy: 33.04%
Accuracy per fold: [0.4375, 0.26666666666666666, 0.5333333333333333, 0.2, 0.4, 0.3333333333333333, 0.4, 0.26666666666666666, 0.3333333333333333, 0.13333333333333333]
