In [1]:
import pandas as pd
import numpy as np
from math import log, sqrt, pi, exp
from ucimlrepo import fetch_ucirepo

class GaussianNaiveBayes:
    """Gaussian naive Bayes classifier for numeric feature vectors.

    Every feature is modelled, per class, as an independent normal
    distribution whose mean and variance are estimated in ``fit``.
    Prediction returns the class with the highest log-posterior score.

    State populated by ``fit``:
        classes: class labels, in order of first appearance in ``y``.
        class_priors: label -> fraction of training samples with that label.
        feature_means: label -> NumPy array of per-feature means.
        feature_vars: label -> NumPy array of per-feature (smoothed) variances.
    """

    def __init__(self, var_smoothing=1e-9):
        """Create an unfitted model.

        Args:
            var_smoothing: Fraction of the largest overall feature variance
                that is added to every per-class variance. This keeps the
                variance strictly positive when a feature is constant
                within a class.
        """
        self.var_smoothing = var_smoothing
        self.class_priors = {}
        self.feature_means = {}
        self.feature_vars = {}
        self.classes = []

    def fit(self, X, y):
        """Estimate class priors and per-class feature means/variances.

        Args:
            X: 2-D numeric data, one row per sample (list of lists or array).
            y: Iterable of hashable class labels, one per row of ``X``.

        Raises:
            ValueError: If the training set is empty, ``X`` is not 2-D, or
                ``X`` and ``y`` have different lengths.
        """
        features = np.asarray(X, dtype=float)
        labels = list(y)
        n_samples = len(labels)

        if n_samples == 0:
            raise ValueError("cannot fit on an empty training set")
        if features.ndim != 2 or len(features) != n_samples:
            raise ValueError("X must be 2-D with exactly one row per label in y")

        # Scale the smoothing term by the data's largest variance so it is
        # negligible for well-behaved features yet still positive; fall back
        # to the raw value when the data has no spread at all.
        column_vars = features.var(axis=0)
        largest_var = float(column_vars.max()) if column_vars.size else 0.0
        epsilon = self.var_smoothing * (largest_var if largest_var > 0 else 1.0)

        # dict.fromkeys keeps first-appearance order, which makes the class
        # order (and therefore tie-breaking in predict) reproducible.
        classes = list(dict.fromkeys(labels))
        priors, means, variances = {}, {}, {}

        for cls in classes:
            mask = np.array([label == cls for label in labels], dtype=bool)
            rows = features[mask]
            priors[cls] = len(rows) / n_samples
            means[cls] = rows.mean(axis=0)
            variances[cls] = rows.var(axis=0) + epsilon

        # Replace all state at once so refitting never leaves stale classes.
        self.classes = classes
        self.class_priors = priors
        self.feature_means = means
        self.feature_vars = variances

    def gaussian_pdf(self, x, mean, var):
        """Return the normal density at ``x`` for the given mean and variance.

        ``var`` must be positive; a zero variance raises ZeroDivisionError.
        """
        return (1 / sqrt(2 * pi * var)) * exp(-0.5 * ((x - mean) ** 2) / var)

    @staticmethod
    def _log_gaussian_pdf(x, mean, var):
        """Return the log of the normal density, computed without exp().

        Working in log space avoids the underflow to 0.0 that makes
        ``log(gaussian_pdf(...))`` fail for values far from the mean.
        """
        return -0.5 * (log(2 * pi * var) + ((x - mean) ** 2) / var)

    def predict_single(self, sample):
        """Return the most probable class label for one feature vector.

        Args:
            sample: Sequence of numeric feature values.

        Raises:
            ValueError: If the model has not been fitted.
        """
        if not self.classes:
            raise ValueError("model has not been fitted; call fit() first")

        class_scores = {}

        for cls in self.classes:
            means = self.feature_means[cls]
            variances = self.feature_vars[cls]
            score = log(self.class_priors[cls])
            for i, feature_value in enumerate(sample):
                score += self._log_gaussian_pdf(
                    feature_value, means[i], variances[i]
                )
            class_scores[cls] = score

        # On a tie, max() keeps the first class in self.classes order.
        return max(class_scores, key=class_scores.get)

    def predict(self, X):
        """Return a list with the predicted label for every sample in ``X``."""
        return [self.predict_single(sample) for sample in X]

def train_test_split(X, y, test_size=0.3, random_state=42):
    """Shuffle the samples and split them into train and test portions.

    Args:
        X: Indexable sequence of samples.
        y: Indexable sequence of labels, aligned with ``X``.
        test_size: Fraction of samples placed in the test split; the test
            count is ``int(len(X) * test_size)``.
        random_state: Seed for the shuffle. The default of 42 reproduces the
            permutation obtained from ``np.random.seed(42)``; pass ``None``
            for a non-deterministic split.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)`` of lists.

    Raises:
        ValueError: If ``X`` and ``y`` have different lengths.
    """
    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError("X and y must contain the same number of samples")

    n_test = int(n_samples * test_size)

    # A private generator keeps the split reproducible without reseeding
    # NumPy's global random state as a side effect.
    rng = np.random.RandomState(random_state)
    indices = rng.permutation(n_samples)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    X_train = [X[i] for i in train_indices]
    X_test = [X[i] for i in test_indices]
    y_train = [y[i] for i in train_indices]
    y_test = [y[i] for i in test_indices]

    return X_train, X_test, y_train, y_test

def accuracy_score(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: Iterable of ground-truth labels.
        y_pred: Iterable of predicted labels, aligned with ``y_true``.

    Raises:
        ValueError: If the two inputs differ in length (previously the
            comparison was silently truncated to the shorter one while the
            divisor stayed ``len(y_true)``, yielding a wrong score).
        ZeroDivisionError: If both inputs are empty.
    """
    truths = list(y_true)
    predictions = list(y_pred)
    if len(truths) != len(predictions):
        raise ValueError(
            f"length mismatch: {len(truths)} true labels vs "
            f"{len(predictions)} predictions"
        )

    correct = sum(1 for true, pred in zip(truths, predictions) if true == pred)
    return correct / len(truths)

def load_iris_dataset():
    """Fetch the Iris dataset (UCI repository id 53) as plain Python lists.

    Returns:
        Tuple ``(X, y)``: ``X`` is the feature table converted to a list of
        rows, ``y`` is the flattened target column converted to a list.
    """
    dataset = fetch_ucirepo(id=53).data
    feature_rows = dataset.features.values.tolist()
    # The targets table is flattened so each sample maps to a single label.
    labels = dataset.targets.values.flatten().tolist()
    return feature_rows, labels

def main():
    """Run the end-to-end demo: load data, split, train, evaluate, report."""
    X, y = load_iris_dataset()
    print(f"dataset loaded: {len(X)} samples, {len(X[0])} features")
    print(f"classes: {set(y)}")

    X_train, X_test, y_train, y_test = train_test_split(X, y)
    print(f"training samples: {len(X_train)}")
    print(f"test samples: {len(X_test)}")

    # Train on the training split, then score on the held-out split.
    model = GaussianNaiveBayes()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"accuracy: {accuracy:.4f}")

    # Show at most the first five test predictions next to their true labels.
    print("sample predictions:")
    for i, (actual, predicted) in enumerate(zip(y_test[:5], y_pred[:5])):
        print(f"sample {i+1}: actual={actual}, predicted={predicted}")

# Run the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

dataset loaded: 150 samples, 4 features
classes: {'Iris-setosa', 'Iris-versicolor', 'Iris-virginica'}
training samples: 105
test samples: 45
accuracy: 0.9778
sample predictions:
sample 1: actual=Iris-versicolor, predicted=Iris-versicolor
sample 2: actual=Iris-setosa, predicted=Iris-setosa
sample 3: actual=Iris-virginica, predicted=Iris-virginica
sample 4: actual=Iris-versicolor, predicted=Iris-versicolor
sample 5: actual=Iris-versicolor, predicted=Iris-versicolor
