In [None]:
import numpy as np 
import pandas as pd 
import math
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_blobs
from scipy.stats import norm

In [None]:
''' 
## Continuous Data (Done in Class)

X, y = make_blobs(n_samples=10000, centers=2, n_features = 2, random_state=1)

print(X.shape, y.shape)
print(X[:5])
print(y[:5])
'''

(10000, 2) (10000,)
[[-3.08389358  5.70067218]
 [-8.80258525 -5.07389013]
 [-1.68452735  5.22511143]
 [-1.44683075  4.51471432]
 [-3.36067232  3.22371079]]
[0 1 0 0 0]


In [None]:
## ADDED for Discrete Data
# Create a sample data set with two binary features (x1, x2) and a binary label y.
# The generator is seeded so that Restart & Run All reproduces the same data
# (and therefore the same fitted probabilities) on every run.
SEED = 24
N_SAMPLES = 1000
rng = np.random.default_rng(SEED)

x1 = rng.integers(0, 2, size=N_SAMPLES)   # upper bound is exclusive -> values in {0, 1}
x2 = rng.integers(0, 2, size=N_SAMPLES)
y = rng.integers(0, 2, size=N_SAMPLES)

# Stack the two features column-wise into the (N_SAMPLES, 2) feature array
X = np.vstack((x1, x2)).T
print(X[:4])
print(y[:4])

print()
# Combined NumPy array of both features plus the label, one row per sample
Xy = np.vstack((x1, x2, y)).T
print(Xy[:4])

[[1 0]
 [1 1]
 [1 1]
 [0 0]]
[0 0 0 0]

[[1 0 0]
 [1 1 0]
 [1 1 0]
 [0 0 0]]


In [None]:
'''
Steps to be performed-

1. Split Data
2. Each column is discrete --> likelihood for each column/each class
3. Probability function ==> prior * likelihood(s)
4. Calculate prior
5. predict function
6. fit function

'''

'\n\n1. Split Data\n2. Each column is a univariate gussian --> Mean and Std per coln\n3. We need a Guassian Density function\n4. Probability function ==> prior * likelihood(s)\n5. Calculate prior\n6. predict function\n7. fit function\n\n'

In [None]:
class NaiveBayes:
  """Naive Bayes classifier for two binary features and a binary label.

  The features are treated as conditionally independent given the class, so
  the score of a class is ``prior * P(x1 | class) * P(x2 | class)``.
  Likelihoods are relative frequencies in the training split, optionally
  Laplace-smoothed with strength ``tau``.

  Parameters
  ----------
  X : array of shape (n_samples, 2); a feature value of 1 is treated as 1 and
      any other value as 0 (see ``predict``).
  y : array of shape (n_samples,) with labels 0 / 1.
  tau : Laplace smoothing strength; 0 (the default) disables smoothing.
  """

  def __init__(self, X, y, tau = 0):
    self.X = X
    self.y = y
    self.tau = tau    # For Laplace Smoothing

  def dataSplit(self):
    """Return a shuffled 70/30 train/test split of ``self.X`` and ``self.y``.

    The split uses a fixed ``random_state`` so it is the same on every call.
    """
    X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.3, random_state=24, shuffle=True)
    return X_train, X_test, y_train, y_test

  def probabilityFunction(self, X, prior, dist1, dist2):
    """Return the unnormalised class score ``prior * dist1 * dist2``.

    ``X`` (the sample) is accepted for interface compatibility but is not used.
    """
    return prior * dist1 * dist2

  def _smoothedLikelihood(self, matches, class_rows):
    """Return the Laplace-smoothed estimate of P(feature = value | class).

    ``matches`` holds the training entries of one feature, within one class,
    that equal the value of interest; ``class_rows`` holds all training rows
    of that class.  The estimate is

        (len(matches) + tau) / (len(class_rows) + n_values * tau)

    where ``n_values`` is the number of distinct values a feature can take
    (2 for binary features) -- not the number of features.

    Returns 0.0 when the denominator is zero, i.e. the class has no training
    rows and ``tau == 0``; the prior of such a class is 0 as well, so its
    score is 0 either way.
    """
    n_values = 2   # each feature is binary: it takes the values 0 and 1
    denominator = len(class_rows) + n_values * self.tau
    if denominator == 0:
      return 0.0
    return (len(matches) + self.tau) / denominator

  def fit(self):
    """Split the data and estimate class priors and per-feature likelihoods.

    Attribute naming used below:
      * ``X<c>_train``            -- training rows of class ``c``
      * ``X<f><c>_train``         -- column ``f`` of the class-``c`` rows
      * ``X<f><v>y<c>_train``     -- entries of column ``f`` equal to ``v`` in class ``c``
      * ``X<f><v>y<c>_likelihood``-- estimate of P(x_f = v | y = c)
      * ``X<c>_prior``            -- estimate of P(y = c)
    """
    self.X_train, self.X_test, self.y_train, self.y_test = self.dataSplit()

    # Divide the data based on 2 classes of y = 0 & 1
    self.X0_train = self.X_train[self.y_train == 0]
    self.X1_train = self.X_train[self.y_train == 1]

    # For each Class, divide the data into each column (x1 & x2)
    self.X00_train = self.X0_train[:, 0]    # For Class 0, first column (x1)
    self.X10_train = self.X0_train[:, 1]    # For Class 0, second column (x2)
    self.X01_train = self.X1_train[:, 0]    # For Class 1, first column (x1)
    self.X11_train = self.X1_train[:, 1]    # For Class 1, second column (x2)

    # For each class, divide the xi rows into xi=0 and xi=1

    # For Class 0
    self.X00y0_train = self.X00_train[self.X00_train == 0]    #Class 0 x1=0
    self.X01y0_train = self.X00_train[self.X00_train == 1]    #Class 0 x1=1
    self.X10y0_train = self.X10_train[self.X10_train == 0]    #Class 0 x2=0
    self.X11y0_train = self.X10_train[self.X10_train == 1]    #Class 0 x2=1

    # For Class 1
    self.X00y1_train = self.X01_train[self.X01_train == 0]    #Class 1 x1=0
    self.X01y1_train = self.X01_train[self.X01_train == 1]    #Class 1 x1=1
    self.X10y1_train = self.X11_train[self.X11_train == 0]    #Class 1 x2=0
    self.X11y1_train = self.X11_train[self.X11_train == 1]    #Class 1 x2=1

    # calculating priors on y
    n_train = len(self.X_train)
    self.X0_prior = len(self.X0_train)/n_train
    self.X1_prior = len(self.X1_train)/n_train

    # For Class 0 - calculating likelihood functions for x1 and x2
    self.X00y0_likelihood = self._smoothedLikelihood(self.X00y0_train, self.X0_train)
    self.X01y0_likelihood = self._smoothedLikelihood(self.X01y0_train, self.X0_train)
    self.X10y0_likelihood = self._smoothedLikelihood(self.X10y0_train, self.X0_train)
    self.X11y0_likelihood = self._smoothedLikelihood(self.X11y0_train, self.X0_train)

    # For Class 1 - calculating likelihood functions for x1 and x2
    self.X00y1_likelihood = self._smoothedLikelihood(self.X00y1_train, self.X1_train)
    self.X01y1_likelihood = self._smoothedLikelihood(self.X01y1_train, self.X1_train)
    self.X10y1_likelihood = self._smoothedLikelihood(self.X10y1_train, self.X1_train)
    self.X11y1_likelihood = self._smoothedLikelihood(self.X11y1_train, self.X1_train)

  def predict(self):
    """Classify every test sample, print the result, and keep the predictions.

    For each sample in ``self.X_test`` this prints the posterior of each class
    as a percentage, followed by the predicted and the true label.  The
    predicted labels are stored in ``self.y_pred`` (an integer array aligned
    with ``self.y_test``).  Returns ``None``.

    ``fit`` must have been called first.
    """
    predictions = []
    for sample, target in zip(self.X_test, self.y_test):

      # Likelihood of the observed x1 under each class (anything but 1 counts as 0)
      if sample[0] == 1:
        x1_given_y0 = self.X01y0_likelihood #x1=1|y=0
        x1_given_y1 = self.X01y1_likelihood #x1=1|y=1
      else:
        x1_given_y0 = self.X00y0_likelihood #x1=0|y=0
        x1_given_y1 = self.X00y1_likelihood #x1=0|y=1

      # Likelihood of the observed x2 under each class
      if sample[1] == 1:
        x2_given_y0 = self.X11y0_likelihood #x2=1|y=0
        x2_given_y1 = self.X11y1_likelihood #x2=1|y=1
      else:
        x2_given_y0 = self.X10y0_likelihood #x2=0|y=0
        x2_given_y1 = self.X10y1_likelihood #x2=0|y=1

      # Unnormalised scores: prior * likelihoods = joint probability P(x, y)
      py0 = self.probabilityFunction(sample, self.X0_prior, x1_given_y0, x2_given_y0)
      py1 = self.probabilityFunction(sample, self.X1_prior, x1_given_y1, x2_given_y1)

      # Decide on the raw scores so the decision does not depend on the
      # normalisation below; ties go to class 0.
      predicted = np.argmax([py0*100, py1*100])

      # Turn the joint scores into the posterior P(y | x) that the printed
      # label promises.  If both scores are 0 the posterior is undefined and
      # the zeros are printed as they are.
      evidence = py0 + py1
      if evidence > 0:
        py0, py1 = py0/evidence, py1/evidence

      print('P(y=0|%s) = %.3f' % (sample, py0*100))
      print('P(y=1|%s) = %.3f' % (sample, py1*100))
      print('Model predicted class {} and the true label was {} \n'. format(predicted, target))

      predictions.append(predicted)

    self.y_pred = np.array(predictions, dtype=int)



Without Laplace Smoothing

In [None]:
# Baseline model: tau = 0 (the constructor default) leaves the frequency
# estimates unsmoothed.
clf = NaiveBayes(X, y, tau=0)
clf.fit()

0


In [None]:
# Print the per-class scores and the predicted vs. true label for every test sample
clf.predict()

P(y=0|[0 1] = 12.871
P(y=1|[0 1] = 12.775
Model predicted class 0 and the true label was 1 

P(y=0|[0 1] = 12.871
P(y=1|[0 1] = 12.775
Model predicted class 0 and the true label was 0 

P(y=0|[1 0] = 11.156
P(y=1|[1 0] = 13.203
Model predicted class 1 and the true label was 0 

P(y=0|[1 0] = 11.156
P(y=1|[1 0] = 13.203
Model predicted class 1 and the true label was 1 

P(y=0|[0 1] = 12.871
P(y=1|[0 1] = 12.775
Model predicted class 0 and the true label was 1 

P(y=0|[0 1] = 12.871
P(y=1|[0 1] = 12.775
Model predicted class 0 and the true label was 0 

P(y=0|[1 0] = 11.156
P(y=1|[1 0] = 13.203
Model predicted class 1 and the true label was 1 

P(y=0|[1 0] = 11.156
P(y=1|[1 0] = 13.203
Model predicted class 1 and the true label was 1 

P(y=0|[0 0] = 12.272
P(y=1|[0 0] = 13.797
Model predicted class 1 and the true label was 0 

P(y=0|[1 1] = 11.701
P(y=1|[1 1] = 12.225
Model predicted class 1 and the true label was 1 

P(y=0|[0 1] = 12.871
P(y=1|[0 1] = 12.775
Model predicted class 0 and 

With Laplace Smoothing

In [None]:
# Same data, now with Laplace smoothing enabled (tau = 1).
# Note: this rebinds `clf`, replacing the unsmoothed model from above.
clf = NaiveBayes(X, y, tau=1)
clf.fit()
clf.predict()

1
P(y=0|[0 1] = 12.866
P(y=1|[0 1] = 12.776
Model predicted class 0 and the true label was 1 

P(y=0|[0 1] = 12.866
P(y=1|[0 1] = 12.776
Model predicted class 0 and the true label was 0 

P(y=0|[1 0] = 11.161
P(y=1|[1 0] = 13.202
Model predicted class 1 and the true label was 0 

P(y=0|[1 0] = 11.161
P(y=1|[1 0] = 13.202
Model predicted class 1 and the true label was 1 

P(y=0|[0 1] = 12.866
P(y=1|[0 1] = 12.776
Model predicted class 0 and the true label was 1 

P(y=0|[0 1] = 12.866
P(y=1|[0 1] = 12.776
Model predicted class 0 and the true label was 0 

P(y=0|[1 0] = 11.161
P(y=1|[1 0] = 13.202
Model predicted class 1 and the true label was 1 

P(y=0|[1 0] = 11.161
P(y=1|[1 0] = 13.202
Model predicted class 1 and the true label was 1 

P(y=0|[0 0] = 12.271
P(y=1|[0 0] = 13.792
Model predicted class 1 and the true label was 0 

P(y=0|[1 1] = 11.703
P(y=1|[1 1] = 12.229
Model predicted class 1 and the true label was 1 

P(y=0|[0 1] = 12.866
P(y=1|[0 1] = 12.776
Model predicted class 0 an