In [87]:
# 03/02/2023
# Author: Pushpraj Katiyar
# email: pk825@snu.edu.in
# Roll no: 2220120001

#let's import all useful packages 

import numpy as np  # import numpy for linear algebra operations
import pandas as pd # pandas for data processing, CSV file I/O (e.g. pd.read_csv)
# from scipy.stats import norm

In [88]:
class BayesianClassifier:
    def fit(self, X, y, class_probs, means, variances):
        self.X = X
        self.y = y
        self.classes = np.unique(y)
        self.class_probs = class_probs
        self.means = means
        self.variances = variances
        
    def predict(self, X_test):
        y_pred = []
        for x in X_test:
            posteriors = []
            for i, c in enumerate(self.classes):
                mean = self.means[i]
                var = self.variances[i]
                posteriors.append(self.gaussian(mean, var, x)*self.class_probs[i])
#                 print(posteriors)
#                 print(np.argmax(posteriors))
            y_pred.append(self.classes[np.argmax(posteriors)])
        return np.array(y_pred)
    
    def gaussian(self, mean, var, sample):
        return np.exp(-0.5 * np.dot(np.dot((sample - mean), np.linalg.inv(np.diag(var))), (sample - mean))) / np.sqrt(2 * np.pi * np.prod(var))
    
    def accuracy(self, y_true, y_pred):
        return np.mean(y_true == y_pred)
    
    def error_rate(self, y_true, y_pred):
        return 1 - self.accuracy(y_true, y_pred)

In [89]:
# Load data
dataset = pd.read_csv("dataset/Dataset1_Assignment1.csv")
X = dataset[["x1", "x2"]].to_numpy()
y = dataset["y"].to_numpy()

# Prior probabilities
class_probs = [0.1, 0.3, 0.25, 0.35]

# Gaussian means
means = [[-1, -1], [-1, 1], [1, 1], [1, 1]]

# Gaussian variances
variances = [[0.1, 0.1], [0.1, 0.1], [0.1, 0.1], [0.1, 0.1]]

# Fit the model
clf = BayesianClassifier()
clf.fit(X, y, class_probs, means, variances)

# Predict on the test data
y_pred = clf.predict(X)
print("Pridiction of classifier:", y_pred)

# Calculate the error rate
error_rate = clf.error_rate(y, y_pred)
print("Error rate:", error_rate)


Pridiction of classifier: [1. 0. 1. ... 3. 1. 1.]
Error rate: 0.4024


In [90]:
#Now using half of the data. first 5000 samples

X_firsthalf = X[:5000]
y_firsthalf = y[:5000]
# print(X_firsthalf)
# print(y_firsthalf)
prior0 = np.mean(y_firsthalf == 0)
prior1 = np.mean(y_firsthalf == 1)
prior2 = np.mean(y_firsthalf == 2)
prior3 = np.mean(y_firsthalf == 3)

#new prior probabilities based on first 5000 records
# Prior probabilities
New_class_probs = [prior0, prior1, prior2, prior3]
print("New Prior probabilities of classes y ∈ {0,1,2,3} are : ", New_class_probs)

New Prior probabilities of classes y ∈ {0,1,2,3} are :  [0.1034, 0.2926, 0.256, 0.348]


In [91]:
# Compute the sample covariance matrices for each class
# calculate mean

# def calculateVarience(results):
#     mean = sum(results) / len(   )
#     # calculate variance using a list comprehension
#     var_res = sum((xi - mean) ** 2 for xi in results) / len(results)  
#     return var_res
# variance1 = calculateVarience(X_firsthalf[0])
# variance2 = calculateVarience(X_firsthalf[1])

# Compute the sample covariance matrices for each class
cov0 = np.cov(X_firsthalf[y_firsthalf == 0,:].T)
cov1 = np.cov(X_firsthalf[y_firsthalf == 1,:].T)
cov2 = np.cov(X_firsthalf[y_firsthalf == 2,:].T)
cov3 = np.cov(X_firsthalf[y_firsthalf == 3,:].T)
variance1 = (cov0[0][0] + cov1[0][0] + cov2[0][0] + cov3[0][0])/4
variance2 = (cov0[1][1] + cov3[1][1] + cov3[1][1] + cov3[1][1])/4
#New Gaussian varience are 
print("New Gaussian varience are :", variance1, variance2)

New Gaussian varience are : 0.10042071606970722 0.09940819599510224


In [92]:
#obtain the probability of classification error using the Bayesian classifier for second 5000 samples
# Fit the model
clf = BayesianClassifier()
clf.fit(X[5000:], y[5000:], class_probs, means, variances)

# Predict on the test data
y_pred = clf.predict(X)
print("Pridiction of classifier:", y_pred)

# Calculate the error rate
error_rate = clf.error_rate(y, y_pred)
print("Error rate:", error_rate)

Pridiction of classifier: [1. 0. 1. ... 3. 1. 1.]
Error rate: 0.4024


In [93]:
# obtain the Bayesian classifier using the obtained πˆ(i), Pˆ(x|i) and σˆ2 j , and obtain the probability of 
#classification error using this classifier for first 5000 samples

new_variances = [[variance1, variance2], [variance1, variance2], [variance1, variance2], [variance1, variance2]]
clf = BayesianClassifier()
clf.fit(X[:5000], y[:5000], New_class_probs, means, new_variances)

# Predict on the test data
y_pred = clf.predict(X)
print("Pridiction of classifier:", y_pred)

# Calculate the error rate
error_rate = clf.error_rate(y, y_pred)
print("Error rate:", error_rate)

Pridiction of classifier: [1. 0. 1. ... 3. 1. 1.]
Error rate: 0.4054


In [96]:
print(">>>>>>>>>>  COMPLETED   <<<<<<<<<<<<")

>>>>>>>>>>  COMPLETED   <<<<<<<<<<<<
