In [9]:
import numpy as np
from sklearn.datasets import make_blobs
from scipy.stats import norm
from dataclasses import dataclass
from sklearn.model_selection import train_test_split
import pandas as pd
from scipy.stats import laplace

In [16]:
#Built in class
X, y = make_blobs(n_samples=10000,n_features=2,centers=2, random_state=1)
X

array([[-3.08389358,  5.70067218],
       [-8.80258525, -5.07389013],
       [-1.68452735,  5.22511143],
       ...,
       [-8.65168502, -5.58805662],
       [-1.41968841,  3.76555241],
       [-9.9077506 , -3.42556702]])

In [17]:
#Built in class
y

array([0, 1, 0, ..., 1, 0, 1])

In [18]:
#Built in class
@dataclass
class NaiveBayes:
  X:np.array
  y:np.array

  def __post_init__(self):
    self.X_train, self.X_test, self.y_train, self.y_test = self.data_split()
    self.fit()

  def data_split(self):
    X_train,X_test,y_train,y_test = train_test_split(self.X,self.y,
                                                     test_size = 0.3,shuffle=True)
    return X_train,X_test,y_train,y_test

  def fit_distribution(self,data):
    mean =np.mean(data)
    std = np.std(data)
    dist = norm(mean,std)
    return dist

## shouldn't be hard coded - should work for d columns: make it a recursive function
## also add laplace smoothing
  def get_probability(self, X, dist_col1, dist_col2, prior):
    return prior * dist_col1.pdf(X[0]) * dist_col2.pdf(X[1])

  def fit(self):
    X1_train = self.X_train[self.y_train == 1]
    X0_train = self.X_train[self.y_train == 0]

    self.prior1 = X1_train.shape[0] / self.X_train.shape[0]
    self.prior0 = X0_train.shape[0] / self.X_train.shape[0]

    self.dist_X01 = self.fit_distribution(X1_train[:,0])
    self.dist_X11 = self.fit_distribution(X1_train[:,1])

    self.dist_X00 = self.fit_distribution(X0_train[:,0])
    self.dist_X10 = self.fit_distribution(X0_train[:,1])

  def predict(self):
    for i in range(self.X_test.shape[0]):
      x = self.X_test[i]
      y=self.y_test[i]
      prob_1 = self.get_probability(x, self.dist_X01, self.dist_X11, self.prior1)
      prob_0 = self.get_probability(x, self.dist_X00, self.dist_X10, self.prior0)
      j = np.argmax([prob_0, prob_1])
      print('P(y=0 | %s) = %.3f'%(x,prob_0))
      print('P(y=1 | %s) = %.3f'%(x,prob_1))
      print("Model predicted {} and class was {}".format(j,y))

In [19]:
#Built in class
nb = NaiveBayes(X,y)

In [None]:
#Built in class
nb.predict()

In [21]:
#Assignment
@dataclass
class NaiveBayes:
    X: np.array
    y: np.array

    def __post_init__(self):
        self.X_train, self.X_test, self.y_train, self.y_test = self.data_split()
        self.fit()

    def data_split(self):
        X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.3, shuffle=True)
        return X_train, X_test, y_train, y_test

    def fit_distribution(self, data):
        return laplace(data.mean(), data.std())

    def get_probability(self, X, values, prior):
        prob = prior

        for i, val in enumerate(values):
            prob *= (values[i] == X[i]).sum() / len(values)

        return prob

    def fit(self):
        X1_train = self.X_train[self.y_train == 1]
        X0_train = self.X_train[self.y_train == 0]

        self.prior1 = (X1_train.shape[0] + 1) / (self.X_train.shape[0] + 2)  # Laplace smoothing
        self.prior0 = (X0_train.shape[0] + 1) / (self.X_train.shape[0] + 2)  # Laplace smoothing

        # Calculate the probabilities using Laplace smoothed distributions
        self.dist_X01 = self.fit_distribution(X1_train[:, 0])
        self.dist_X11 = self.fit_distribution(X1_train[:, 1])

        self.dist_X00 = self.fit_distribution(X0_train[:, 0])
        self.dist_X10 = self.fit_distribution(X0_train[:, 1])

    def predict(self):
        for i in range(self.X_test.shape[0]):
            x = self.X_test[i]
            y = self.y_test[i]
            prob_0 = self.get_probability(x, [self.dist_X00, self.dist_X10], self.prior0)
            prob_1 = self.get_probability(x, [self.dist_X01, self.dist_X11], self.prior1)
            j = 1 if prob_1 > prob_0 else 0
            print('P(y=0 | {}, {}) = {:.3f}'.format(x[0], x[1], prob_0))
            print('P(y=1 | {}, {}) = {:.3f}'.format(x[0], x[1], prob_1))
            print("Model predicted {} and actual class was {}".format(j, y))

# Example usage with a discrete dataset
np.random.seed(1)
n = 1000
d = 2

data = {
    'x1': np.random.randint(2, size=n),
    'x2': np.random.randint(2, size=n),
    'y': np.random.randint(2, size=n)
}

df = pd.DataFrame(data)  # Create a Pandas DataFrame

X = np.array(df[['x1', 'x2']])  # Extract the features as a NumPy array
y = np.array(df['y'])

nb = NaiveBayes(X, y)
nb.predict()


P(y=0 | 0, 0) = 0.000
P(y=1 | 0, 0) = 0.000
Model predicted 0 and actual class was 1
P(y=0 | 0, 0) = 0.000
P(y=1 | 0, 0) = 0.000
Model predicted 0 and actual class was 1
P(y=0 | 0, 1) = 0.000
P(y=1 | 0, 1) = 0.000
Model predicted 0 and actual class was 0
P(y=0 | 1, 1) = 0.000
P(y=1 | 1, 1) = 0.000
Model predicted 0 and actual class was 1
P(y=0 | 1, 1) = 0.000
P(y=1 | 1, 1) = 0.000
Model predicted 0 and actual class was 1
P(y=0 | 1, 1) = 0.000
P(y=1 | 1, 1) = 0.000
Model predicted 0 and actual class was 1
P(y=0 | 0, 0) = 0.000
P(y=1 | 0, 0) = 0.000
Model predicted 0 and actual class was 1
P(y=0 | 0, 1) = 0.000
P(y=1 | 0, 1) = 0.000
Model predicted 0 and actual class was 1
P(y=0 | 0, 1) = 0.000
P(y=1 | 0, 1) = 0.000
Model predicted 0 and actual class was 1
P(y=0 | 0, 1) = 0.000
P(y=1 | 0, 1) = 0.000
Model predicted 0 and actual class was 0
P(y=0 | 0, 0) = 0.000
P(y=1 | 0, 0) = 0.000
Model predicted 0 and actual class was 0
P(y=0 | 1, 0) = 0.000
P(y=1 | 1, 0) = 0.000
Model predicted 0 and