In [None]:
import numpy as np
import pandas as pd
import math

In [None]:
# Location of the modified Iris CSV used throughout this notebook.
IRIS_CSV_URL = "https://raw.githubusercontent.com/aiforsec/fds-datasets/main/iris-modified.csv"

# Load the dataset into a DataFrame.
iris = pd.read_csv(IRIS_CSV_URL)

# Preview the first five rows together with the column headers.
iris.head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
iris_summary = iris.describe()
iris_summary

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width
count,100.0,100.0,100.0,100.0
mean,5.471,3.094,2.862,0.785
std,0.641698,0.476057,1.448565,0.566288
min,4.3,2.0,1.0,0.1
25%,5.0,2.8,1.5,0.2
50%,5.4,3.05,2.45,0.8
75%,5.9,3.4,4.325,1.3
max,7.0,4.4,5.1,1.8


In [None]:
from sklearn.model_selection import train_test_split

# Split the frame column-wise: every column except the last is a feature,
# the last column holds the class label.
feature_frame, label_series = iris.iloc[:, :-1], iris.iloc[:, -1]
X = feature_frame.values
y = label_series.values

# Hold out 20% of the rows for testing (80% remain for training);
# the fixed random_state keeps the split reproducible across runs.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

In [None]:
# Training-set dimensions: rows are samples, columns are features.
samples, feature = X_train.shape  # 80 samples, 4 features

In [None]:
# Distinct class labels present in the training labels (np.unique returns them sorted).
classes = np.unique(y_train)

# How many distinct classes there are.
number_of_classes = classes.shape[0]
number_of_classes

2

Start of naive bayes

In [None]:
# Prior P(class): the share of training samples that carry each class label.
priors = {c: (y_train == c).sum() / samples for c in classes}
priors

{'Iris-setosa': np.float64(0.5), 'Iris-versicolor': np.float64(0.5)}

In [None]:
# Gaussian parameters for every (class, feature) pair:
# feature_stats[class][feature_idx] -> {'mean': ..., 'variance': ...}
feature_stats = {}
for c in classes:
    # Training rows that belong to the current class only.
    rows_of_class = X_train[y_train == c]

    per_feature = {}
    for feature_idx in range(feature):
        column = rows_of_class[:, feature_idx]
        per_feature[feature_idx] = {
            'mean': np.mean(column),
            # Population variance (ddof=0), floored at 1e-9 so that a
            # constant feature never leads to a division by zero later on.
            'variance': max(np.var(column, ddof=0), 1e-9),
        }

    feature_stats[c] = per_feature

feature_stats

{'Iris-setosa': {0: {'mean': np.float64(5.045),
   'variance': np.float64(0.11047500000000002)},
  1: {'mean': np.float64(3.4200000000000004),
   'variance': np.float64(0.14109999999999998)},
  2: {'mean': np.float64(1.4775),
   'variance': np.float64(0.020743750000000002)},
  3: {'mean': np.float64(0.24749999999999997),
   'variance': np.float64(0.011993750000000001)}},
 'Iris-versicolor': {0: {'mean': np.float64(5.93),
   'variance': np.float64(0.26210000000000006)},
  1: {'mean': np.float64(2.7725), 'variance': np.float64(0.10299375)},
  2: {'mean': np.float64(4.2524999999999995),
   'variance': np.float64(0.22799375)},
  3: {'mean': np.float64(1.33), 'variance': np.float64(0.0416)}}}

In [None]:
def gaussian_prob(x, mean, variance):
    """Evaluate the univariate Gaussian density N(x; mean, variance).

    Parameters:
        x: point at which the density is evaluated.
        mean: mean of the distribution.
        variance: variance of the distribution.

    Returns:
        The density value, or 0 when ``variance`` is exactly 0 (the formula
        would otherwise divide by zero).
    """
    if variance == 0:
        return 0
    deviation = x - mean
    # 1 / sqrt(2 * pi * sigma^2): the normalising constant of the density.
    normaliser = 1 / math.sqrt(2 * math.pi * variance)
    return normaliser * math.exp(-(deviation ** 2) / (2 * variance))


# Quick sanity check of the density on a hand-picked example.
print("Eg Gauss:", gaussian_prob(5.0, mean=5.5, variance=1.2))

Eg Gauss: 0.3281560642004626


In [None]:

# Naive Bayes prediction: score every test sample against every class and
# pick the class with the highest posterior.


def _gaussian_log_prob(x, mean, variance):
    """Return log N(x; mean, variance), evaluated directly in log space.

    Taking math.log of an already-evaluated density raises
    ``ValueError: math domain error`` as soon as that density underflows to
    0.0, which happens whenever x lies far from the mean relative to a small
    variance. The closed form below has no such failure mode.
    """
    return -0.5 * math.log(2 * math.pi * variance) - ((x - mean) ** 2) / (2 * variance)


# Posterior probabilities of the FIRST test sample (class -> probability).
feature_probability = {}
# Predicted class label for every test sample, in X_test order.
predictions = []

for sample_idx, test_sample in enumerate(X_test):
    # Work with log-probabilities so the product over features cannot collapse to 0.
    log_probs = {}

    for c in classes:
        # Start from the log prior, log P(class).
        log_probs[c] = math.log(priors[c])

        # Naive independence assumption: add log P(x_i | class) for every feature,
        # with P(x_i | class) = Gaussian(x_i; mean_class, var_class).
        for feature_index in range(feature):
            stats = feature_stats[c][feature_index]
            log_probs[c] += _gaussian_log_prob(
                test_sample[feature_index], stats['mean'], stats['variance']
            )

    # The class with the highest joint log-probability is the prediction.
    predicted_class = max(log_probs, key=log_probs.get)
    predictions.append(predicted_class)

    # Convert the log scores of the first sample back to normalised probabilities.
    if sample_idx == 0:
        # Log-sum-exp trick: shift by the largest log score before exponentiating
        # so that at least one term equals exp(0) == 1 and the normaliser can
        # never underflow to zero.
        max_log = log_probs[predicted_class]
        shifted = {label: math.exp(lp - max_log) for label, lp in log_probs.items()}
        normaliser = sum(shifted.values())
        feature_probability = {label: value / normaliser for label, value in shifted.items()}

# check accuracy: fraction of test samples whose prediction matches the true label
correct_predictions = sum(1 for p, t in zip(predictions, y_test) if p == t)
accuracy = correct_predictions / len(y_test)
print("Accuracy: ", accuracy) # 1.0, checked with sk learn library

Accuracy:  1.0


In [None]:
# Output: the class probabilities for each data entry in the test dataset (20%). Write code to enumerate them.
# Example output format: x,sf,we, -> class (versicolor): 0.6



Probabilities for the first test sample:
5.0,3.4,1.6,0.4 -> {'Iris-setosa': 0.9999999999999186, 'Iris-versicolor': 8.136742027800733e-14}
