In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Step 1: Load the CSV data
# Replace 'data.csv' with your actual CSV file path

In [2]:
data = pd.read_csv('iris.csv')
print(data.columns)
print(data.head)

Index(['sepal length', 'sepal width', 'petal length', 'petal width', 'class'], dtype='object')
<bound method NDFrame.head of     sepal length sepal width petal length petal width           class
0              r           r            r           r               n
1            5.1         3.5          1.4         0.2     Iris-setosa
2            4.9         3.0          1.4         0.2     Iris-setosa
3            4.7         3.2          1.3         0.2     Iris-setosa
4            4.6         3.1          1.5         0.2     Iris-setosa
..           ...         ...          ...         ...             ...
146          6.7         3.0          5.2         2.3  Iris-virginica
147          6.3         2.5          5.0         1.9  Iris-virginica
148          6.5         3.0          5.2         2.0  Iris-virginica
149          6.2         3.4          5.4         2.3  Iris-virginica
150          5.9         3.0          5.1         1.8  Iris-virginica

[151 rows x 5 columns]>


# Assuming the last column is the target and the rest are features

In [3]:
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Step 2: Preprocess the data
# Convert categorical data to numerical if necessary

In [4]:
X = pd.get_dummies(X, drop_first=True)
y = y.astype('category').cat.codes

# Step 3: Naive Bayes Implementation

In [None]:
class NaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.feature_stats = {}
        self.class_prior = {}

        for c in self.classes:
            X_c = X[y == c]
            self.feature_stats[c] = {
                "mean": X_c.mean(axis=0),
                "var": X_c.var(axis=0)
            }
            self.class_prior[c] = len(X_c) / len(y)

    def predict(self, X):
        posteriors = []

        for x in X:
            class_probs = []
            for c in self.classes:
                prior = np.log(self.class_prior[c])
                likelihood = -0.5 * np.sum(
                    np.log(2 * np.pi * self.feature_stats[c]["var"]) +
                    ((x - self.feature_stats[c]["mean"]) ** 2) / (2 * self.feature_stats[c]["var"])
                )
                class_probs.append(prior + likelihood)
            posteriors.append(self.classes[np.argmax(class_probs)])

        return np.array(posteriors)

# Step 4: Split the data into train-test sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=0.3, random_state=42)

# Step 5: Train and Predict

In [None]:
nb = NaiveBayes()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)

# Step 6: Evaluate the Model

In [None]:
accuracy = np.mean(y_pred == y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Optional: Plot a confusion matrix

In [None]:
cm = confusion_matrix(y_test, y_pred, labels=np.unique(y))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=np.unique(y))
disp.plot(cmap=plt.cm.Blues)
plt.show()