In [1]:
import numpy as np
from math import pi
# Notebook-wide print settings for NumPy arrays (precision=3, suppress=True).
np.set_printoptions(precision=3, suppress=True)

In [2]:
class GaussianBinaryNaiveBayes:
    """Gaussian naive Bayes classifier.

    Every feature is modelled, separately for each class, as an independent
    normal distribution. Prediction picks the class with the largest
    ``log prior + sum of per-feature Gaussian log densities``.

    The class was written for two-class problems, but nothing in the
    implementation limits the number of distinct labels.
    """

    # Per-class feature means / variances, shape (n_classes, n_kept_features).
    means, variances = None, None
    # Log of the empirical class frequencies, aligned with ``labels``.
    log_prior = None
    # Number of columns of the matrix passed to ``fit`` (before dropping any).
    num_feat = None
    # Sorted unique labels seen in ``fit``.
    labels = None
    # ``np.where`` tuple holding the indices of the all-zero training columns.
    dropped_columns = None
    # Column mean / std learned by ``fit(standardize=True)``; None otherwise.
    feature_shift, feature_scale = None, None

    def __init__(self):
        pass

    def _prepare(self, X):
        """Apply the column dropping and scaling learned in ``fit`` to ``X``."""
        X = np.asarray(X, dtype=float)
        if X.ndim != 2 or X.shape[1] != self.num_feat:
            raise ValueError(
                "X must be 2-D with {} columns, got shape {}".format(self.num_feat, X.shape)
            )
        keep = np.ones(self.num_feat, dtype=bool)
        keep[self.dropped_columns] = False
        X_ = X[:, keep]
        if self.feature_shift is not None:
            X_ = (X_ - self.feature_shift) / self.feature_scale
        return X_

    def fit(self, X, y, standardize=False, var_smoothing=1e-9):
        """Estimate per-class means, variances and log priors.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
        y : array-like, shape (n_samples,)
            Class labels; any values ``np.unique`` can sort (negative or
            non-contiguous integers included).
        standardize : bool
            If True, centre and scale each kept column with the training mean
            and standard deviation. The same transform is re-applied in
            ``predict``.
        var_smoothing : float
            Fraction of the largest overall feature variance added to every
            per-class variance. Keeps variances strictly positive so that a
            feature that is constant within one class cannot produce
            ``log(0)`` or a division by zero. Pass 0 to disable.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or its row count differs from ``y``.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        if X.ndim != 2 or X.shape[0] == 0 or X.shape[0] != y.shape[0]:
            raise ValueError("X must be a non-empty 2-D array with one row per entry of y")

        self.num_feat = X.shape[1]
        self.labels, class_counts = np.unique(y, return_counts=True)

        # Drop columns that are zero for every sample. Testing the entries
        # (rather than the column sum) keeps columns whose values cancel out.
        nonzero_columns = X.any(axis=0)
        self.dropped_columns = np.where(~nonzero_columns)
        X_ = X[:, nonzero_columns]

        self.feature_shift, self.feature_scale = None, None
        if standardize:  # Mean center and normalize data
            self.feature_shift = X_.mean(axis=0)
            scale = X_.std(axis=0)
            scale[scale == 0] = 1.0  # constant column: centre only, avoid 0/0
            self.feature_scale = scale
            X_ = (X_ - self.feature_shift) / self.feature_scale

        n_classes, n_kept = self.labels.size, X_.shape[1]
        self.means = np.empty((n_classes, n_kept))
        self.variances = np.empty((n_classes, n_kept))
        for index, label in enumerate(self.labels):
            class_rows = X_[y == label]
            self.means[index] = class_rows.mean(axis=0)
            self.variances[index] = class_rows.var(axis=0)

        # Additive smoothing, scaled by the largest overall variance; falls
        # back to a scale of 1 when every kept column is constant.
        largest_variance = X_.var(axis=0).max() if X_.size else 0.0
        self.variances += var_smoothing * (largest_variance if largest_variance > 0 else 1.0)

        self.log_prior = np.log(class_counts / y.shape[0])

    def predict(self, X):
        """Return the most probable label for every row of ``X``.

        ``X`` must have the same number of columns as the training matrix.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        ValueError
            If ``X`` is not 2-D or has a different number of columns.
        """
        if self.means is None:
            raise RuntimeError("fit must be called before predict")
        X_ = self._prepare(X)

        scores = np.empty((self.labels.size, X_.shape[0]))
        for index in range(self.labels.size):
            variance = self.variances[index]
            log_norm = -0.5 * np.sum(np.log(2 * np.pi * variance))
            value = (X_ - self.means[index]) ** 2 / variance
            # Features that evaluate to NaN for any row of this batch (e.g.
            # NaN inputs) are left out of the quadratic term for the batch.
            value = value[:, ~np.isnan(value).any(axis=0)]
            scores[index] = self.log_prior[index] + log_norm - 0.5 * value.sum(axis=1)
        return self.labels[np.argmax(scores, axis=0)]

# MNIST Data

Load MNIST train data

In [3]:
# Load the MNIST training CSV. The builtin `float` replaces the `np.float`
# alias, which was deprecated in NumPy 1.20 and removed in NumPy 1.24.
mnist_data = np.genfromtxt('../Data/MNIST_HW1/train.csv', delimiter=',', dtype=float)
# After the transpose the last column is read as the label and the remaining
# columns as features (assumes the file stores one sample per column - TODO confirm).
mnist_data = mnist_data.T
X = mnist_data[:, :-1]
y = mnist_data[:, -1].astype(int)
# Select data corresponding to 0 and 1 digits
rows_0_1 = (y == 0) | (y == 1)
# Select data corresponding to 3 and 5 digits
rows_3_5 = (y == 3) | (y == 5)
X_0_1, y_0_1 = X[rows_0_1], y[rows_0_1]
X_3_5, y_3_5 = X[rows_3_5], y[rows_3_5]

Fit to model

In [4]:
# One classifier per binary task: digits 0 vs 1, and digits 3 vs 5.
nbc_0_1 = GaussianBinaryNaiveBayes()
nbc_3_5 = GaussianBinaryNaiveBayes()
nbc_0_1.fit(X=X_0_1, y=y_0_1)
nbc_3_5.fit(X=X_3_5, y=y_3_5)

Load MNIST test data

In [5]:
# Load the MNIST test CSV. The builtin `float` replaces the `np.float` alias,
# which was deprecated in NumPy 1.20 and removed in NumPy 1.24.
mnist_test_data = np.genfromtxt('../Data/MNIST_HW1/test.csv', delimiter=',', dtype=float)
# After the transpose the last column is read as the label and the remaining
# columns as features (assumes the file stores one sample per column - TODO confirm).
mnist_test_data = mnist_test_data.T
X_test = mnist_test_data[:, :-1]
y_test = mnist_test_data[:, -1].astype(int)
# Select data corresponding to 0 and 1 digits
rows_0_1 = (y_test == 0) | (y_test == 1)
# Select data corresponding to 3 and 5 digits
rows_3_5 = (y_test == 3) | (y_test == 5)
X_test_0_1, y_test_0_1 = X_test[rows_0_1], y_test[rows_0_1]
X_test_3_5, y_test_3_5 = X_test[rows_3_5], y_test[rows_3_5]

Get predictions

In [6]:
# Predicted digit labels for the held-out rows of each binary task.
y_pred_0_1, y_pred_3_5 = nbc_0_1.predict(X_test_0_1), nbc_3_5.predict(X_test_3_5)



# Wine Data

The following function takes a boolean array and returns a boolean mask of the same shape in which only a random subset of `sample_size` of the original `True` entries remains `True`.

In [7]:
def sample_mask_from(bool_array, sample_size, rng=None):
    """Randomly keep ``sample_size`` of the True entries of a boolean mask.

    Parameters
    ----------
    bool_array : array-like of bool
        Candidate mask; it is not modified.
    sample_size : int
        Number of True entries to keep. If it exceeds the number of True
        entries, all of them are kept.
    rng : object with a ``shuffle`` method, optional
        Random source, e.g. ``np.random.RandomState(seed)``. Defaults to the
        global ``np.random`` state, which matches the unseeded behaviour of
        calling the function without this argument.

    Returns
    -------
    numpy.ndarray of bool
        Same shape as ``bool_array``; True only at the sampled positions.

    Raises
    ------
    ValueError
        If ``sample_size`` is negative (a negative slice bound would silently
        select the wrong number of entries).
    """
    if sample_size < 0:
        raise ValueError("sample_size must be non-negative")

    candidates = np.asarray(bool_array, dtype=bool)
    # One flag per candidate: the first `sample_size` are marked, then the
    # flags are shuffled so the marked positions are random.
    picked = np.zeros(int(candidates.sum()), dtype=bool)
    picked[:sample_size] = True
    (np.random if rng is None else rng).shuffle(picked)

    sample_rows_mask = candidates.copy()
    sample_rows_mask[candidates] = picked
    return sample_rows_mask

Load wine data

In [8]:
wine_data = np.genfromtxt('../Data/wine.data', delimiter=',')
# Keep only the rows whose first column (used as the class label) is 1 or 2.
wine_data = wine_data[(wine_data[:, 0] == 1) | (wine_data[:, 0] == 2)]
rows_1 = wine_data[:, 0] == 1
rows_2 = wine_data[:, 0] == 2

# Split 1: sample 5 rows per class for training; all remaining rows form the test set.
five_sample_rows_mask = sample_mask_from(rows_1, 5) | sample_mask_from(rows_2, 5)
train_set_1, test_set_1 = wine_data[five_sample_rows_mask], wine_data[~five_sample_rows_mask]

# Split 2: same procedure with 50 rows per class.
fifty_sample_rows_mask = sample_mask_from(rows_1, 50) | sample_mask_from(rows_2, 50)
train_set_2, test_set_2 = wine_data[fifty_sample_rows_mask], wine_data[~fifty_sample_rows_mask]

Fit data

In [9]:
# Column 0 holds the class label; the remaining columns are the features.
nbc_w_1 = GaussianBinaryNaiveBayes()
nbc_w_2 = GaussianBinaryNaiveBayes()
nbc_w_1.fit(X=train_set_1[:, 1:], y=train_set_1[:, 0].astype(int))
nbc_w_2.fit(X=train_set_2[:, 1:], y=train_set_2[:, 0].astype(int))

In [10]:
# Predict labels for the rows left out of each training sample.
w_pred_lab_1 = nbc_w_1.predict(test_set_1[:, 1:])
w_pred_lab_2 = nbc_w_2.predict(test_set_2[:, 1:])

Evaluate model performance

In [11]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd

  if 'order' in inspect.getargspec(np.copy)[0]:


Wine Data:

    For five samples

        Confusion Matrix

In [12]:
# Confusion matrix of the model trained on five samples per class, as a labelled table.
pd.DataFrame(
    data=confusion_matrix(y_true=test_set_1[:, 0], y_pred=w_pred_lab_1),
    index=['Class 1', 'Class 2'],
    columns=['Class 1', 'Class 2'],
)

Unnamed: 0,Class 1,Class 2
Class 1,47,7
Class 2,9,57


        Accuracy Score

In [13]:
# "{:.1f}" always prints fixed-point notation. The previous "{:.3}" spec keeps
# three significant digits and renders 100.0 as "1e+02".
print("Accuracy is: {:.1f}%".format(accuracy_score(test_set_1[:, 0], w_pred_lab_1, normalize=True) * 100))

Accuracy is: 86.7%


        Other metrics

In [14]:
# Display names passed to classification_report for the two wine classes.
target_names = ['Class 1', 'Class 2']
print(
    classification_report(
        y_true=test_set_1[:, 0],
        y_pred=w_pred_lab_1,
        target_names=target_names,
    )
)

             precision    recall  f1-score   support

    Class 1       0.84      0.87      0.85        54
    Class 2       0.89      0.86      0.88        66

avg / total       0.87      0.87      0.87       120



    For fifty samples

        Confusion matrix

In [15]:
# Confusion matrix of the model trained on fifty samples per class, as a labelled table.
pd.DataFrame(
    data=confusion_matrix(y_true=test_set_2[:, 0], y_pred=w_pred_lab_2),
    index=['Class 1', 'Class 2'],
    columns=['Class 1', 'Class 2'],
)

Unnamed: 0,Class 1,Class 2
Class 1,9,0
Class 2,0,21


        Accuracy Score

In [16]:
# "{:.1f}" always prints fixed-point notation. The previous "{:.3}" spec keeps
# three significant digits and rendered a perfect score as "1e+02%".
print("Accuracy is: {:.1f}%".format(accuracy_score(test_set_2[:, 0], w_pred_lab_2, normalize=True) * 100))

Accuracy is: 1e+02%


        Other Metrics

In [17]:
# Per-class precision / recall / F1 for the model trained on fifty samples per class.
print(
    classification_report(
        y_true=test_set_2[:, 0],
        y_pred=w_pred_lab_2,
        target_names=target_names,
    )
)

             precision    recall  f1-score   support

    Class 1       1.00      1.00      1.00         9
    Class 2       1.00      1.00      1.00        21

avg / total       1.00      1.00      1.00        30



K-fold cross-validation

In [18]:
from sklearn.model_selection import StratifiedKFold

In [19]:
num_folds = 10
X = wine_data[:, 1:]
y = wine_data[:, 0].astype(int)
# sklearn.model_selection.StratifiedKFold takes the fold count as `n_splits`
# and receives the data in `split(X, y)`; the legacy constructor
# `StratifiedKFold(y, n_folds=...)` no longer exists.
# The splits are materialised so `skf` stays a plain iterable of
# (train_index, test_index) pairs, as the legacy object was.
skf = list(StratifiedKFold(n_splits=num_folds, shuffle=True).split(X, y))

In [20]:
# Train a fresh classifier on every fold and record its held-out accuracy.
# NOTE: this loop rebinds X_test / y_test, which earlier cells used for MNIST.
accuracy_scores = []
for train_index, test_index in skf:
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    clf = GaussianBinaryNaiveBayes()
    clf.fit(X_train, y_train.astype(int))
    y_pred = clf.predict(X_test)
    fold_accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(fold_accuracy)
del fold_accuracy

Average accuracy from cross validation

In [21]:
# Mean of the per-fold accuracies, shown as a percentage.
print("Accuracy is: {:.4} %".format(100 * np.mean(accuracy_scores)))

Accuracy is: 98.46 %
