In [2]:
import numpy as np

In [76]:
class GNBC():
    def __init__(self, min_var=0.1):
        self.min_var = min_var
    
    def get_classes(self, y):
        class_dict = {}
        for num in y:
            if num not in class_dict.keys():
                class_dict.update({num:1})
            else:
                class_dict[num] += 1
        class_counts = [(num, class_dict[num]) for num in class_dict.keys()]
        class_counts.sort(key=lambda y: y[0])
        counts = [t[1] for t in class_counts]
    
        return len(class_dict.keys()), np.array(counts) #return num_classes, class_counts
    
    def compute_sigmas(self, X, y):
        rows = []
        for c in range(self.n_classes):
            Nc = self.class_counts[c]
            row = np.array([np.var(X[:, i][y == c]) + self.min_var for i in range(self.n_features)])
            rows.append(row)

        res = np.stack(rows)
        return res
            
    def compute_mus(self, X, y):
        rows = []
        for c in range(self.n_classes):
            Nc = self.class_counts[c]
            row = np.array([np.mean(X[:, i][y == c]) for i in range(self.n_features)])
            rows.append(row)

        res = np.stack(rows)
        return res
        
    
    def fit(self, X, y, output_classes):
        # X is N by d np array
        # y is N array
        N, d = X.shape
        self.X = X
        self.n_features = d
        self.n_classes, self.class_counts = self.get_classes(y)
        self.pis = self.class_counts / N
        self.classes = output_classes
        self.mus = self.compute_mus(X, y)
        self.sigmas = self.compute_sigmas(X, y)
        self.predict = np.vectorize(self.predict1)
        return self
    
    def predict1(self, x): # vectorized
        probs = []
        # x is an attribute, j, is the index of that column, c is the class index
        normal_prob = lambda x, j, c: (1/np.sqrt(2*np.pi*self.sigmas[c, j]))*\
                                        np.exp(-(x-self.mus[c, j])**2/(2*self.sigmas[c, j]))
        for c in range(self.n_classes):
            prob = np.prod([normal_prob(x[j], j, c) for j in range(self.n_features)]) * self.pis[c]
            probs.append(prob)
        
        prediction = np.argmax(np.array(probs))
        return prediction    
    
    def score(self, X_test, y_test):
        guesses = [self.predict1(X_test[i, :]) for i in range(X_test.shape[0])]
        N = len(guesses)
        return sum([guesses[i] == y_test[i] for i in range(N)]) / N

In [78]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True, as_frame=False)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = GNBC().fit(X_train, y_train, ['setosa', 'versicolor', 'virginica'])
accuracy = model.score(X_test, y_test)
print(accuracy)

1.0


In [81]:
import time
X, y = load_iris(return_X_y=True, as_frame=False)

y = y == 0

print(y)

start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = GNBC().fit(X_train, y_train, ['setosa', 'versicolor', 'virginica'])
accuracy = model.score(X_test, y_test)
print(accuracy)
end = time.time()
print(end - start)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False]
1.0
0.005996227264404297


In [85]:
from sklearn.naive_bayes import GaussianNB

X, y = load_iris(return_X_y=True, as_frame=False)

y = y == 0

print(y)

start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = GaussianNB().fit(X_train, y_train)
accuracy = model.score(X_test, y_test)
print(accuracy)
end = time.time()
print(end - start)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False]
1.0
0.014980077743530273


In [88]:
from sklearn.datasets import load_digits
X, y = load_digits(return_X_y=True, as_frame=False)

print(y)
y = y == 4

print(y)

start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = GNBC().fit(X_train, y_train, [])
accuracy = model.score(X_test, y_test)
print(accuracy)
end = time.time()
print(end - start)

[0 1 2 ... 8 9 8]
[False False False ... False False False]
0.9851851851851852
0.35398173332214355


In [89]:
from sklearn.naive_bayes import GaussianNB

X, y = load_digits(return_X_y=True, as_frame=False)

y = y == 4

print(y)

start = time.time()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = GaussianNB().fit(X_train, y_train)
accuracy = model.score(X_test, y_test)
print(accuracy)
end = time.time()
print(end - start)

[False False False ... False False False]
0.8203703703703704
0.0030028820037841797
