# 1. Load libraries

In [None]:
## packages 
import numpy as np
from scipy.sparse import coo_matrix # for sparse matrix
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score # for evaluating results

# Location of the prepared data set. read_data() builds each file path by
# plain string concatenation (path + file name), so `path` must end with a
# path separator.
path = 'ex6DataPrepared/'
# Feature files: read_data() parses each line as space-separated integers and
# uses the first two as 1-based row/column indices.
train_data_fn = 'train-features.txt'
test_data_fn = 'test-features.txt'
# Label files: one integer class label per line.
train_label_fn = 'train-labels.txt'
test_label_fn = 'test-labels.txt'

# 2. Define read_data function

In [None]:
nwords = 2500  # number of columns (distinct words) in the feature matrix


def read_data(data_fn, label_fn, type):
    """Load a label file and a sparse feature file into dense NumPy arrays.

    The label file holds one integer class label per line.  Each line of the
    feature file holds whitespace-separated integers ``row col value``: a
    1-based observation index, a 1-based word index and the value stored for
    that pair.  Blank lines in either file are ignored.

    Parameters
    ----------
    data_fn : str
        Feature file name; appended to the module-level ``path``.
    label_fn : str
        Label file name; appended to the module-level ``path``.
    type : str
        ``'Multinomial'`` stores ``value`` in the matrix; ``'Bernoulli'``
        stores 1 for every listed (row, col) pair.  The name shadows the
        builtin ``type`` but is kept so keyword callers keep working.

    Returns
    -------
    tuple of numpy.ndarray
        ``(data, label)`` where ``data`` is a float array of shape
        ``(len(label), nwords)`` and ``label`` is a 1-D integer array.

    Raises
    ------
    ValueError
        If ``type`` is neither ``'Multinomial'`` nor ``'Bernoulli'``.
    """
    # Validate once, up front, so a bad mode is reported even when the
    # feature file is empty (a check inside the line loop never runs then).
    if type not in ('Multinomial', 'Bernoulli'):
        raise ValueError("Please use the correct method")
    binary = type == 'Bernoulli'

    # int() tolerates surrounding whitespace, including the trailing newline.
    with open(path + label_fn) as f:
        label = [int(line) for line in f if line.strip()]

    data = np.zeros((len(label), nwords))

    with open(path + data_fn) as f:
        for line in f:
            # split() without arguments copes with repeated spaces and tabs.
            fields = line.split()
            if not fields:
                continue
            # Observations and words are numbered from 1 in the file, while
            # array indices start at 0, hence the "- 1".
            row = int(fields[0]) - 1
            col = int(fields[1]) - 1
            data[row, col] = 1 if binary else int(fields[2])

    return data, np.array(label, dtype=int)

# 3. Multinomial Naive Bayes

In [None]:
# Load both splits in 'Multinomial' mode, i.e. keeping the stored values
# rather than 0/1 presence flags.
train_data, train_label = read_data(
    train_data_fn, train_label_fn, 'Multinomial')
test_data, test_label = read_data(
    test_data_fn, test_label_fn, 'Multinomial')

## 1. Using scikit-learn

In [None]:
# Reference model: scikit-learn's multinomial naive Bayes with alpha=1.
# fit() returns the estimator itself, so construction and fitting can be
# chained.
clf = MultinomialNB(alpha=1).fit(train_data, train_label)
y_pred = clf.predict(test_data)

# Report the number of training rows and the test accuracy in percent.
print(
    'Training size = %d, accuracy = %.2f%%'
    % (train_data.shape[0], 100 * accuracy_score(test_label, y_pred))
)

Training size = 700, accuracy = 98.08%


## 2. Implementing it from scratch

In [None]:
# Number of training rows per label value (index k holds the count of label k).
y_freq = np.bincount(train_label)

# Label values that actually occur in the training set.
(label,) = np.nonzero(y_freq)

# Cumulative counts [0, n_0, n_0 + n_1, ...].  They delimit each class's rows
# only when the training rows are grouped by ascending label.
y_index = np.hstack(([0], np.cumsum(y_freq)))

print(y_freq, label, y_index, sep='\n')

[350 350]
[0 1]
[  0 350 700]


In [None]:
# Additive smoothing constant used when estimating the per-class word
# probabilities; same value as the alpha=1 given to MultinomialNB above.
smoothing = 1

In [None]:
V = nwords  # vocabulary size (number of feature columns)
print(V)

# Prior P(c) of every class in `label`.  Despite the variable name these are
# plain probabilities; the logarithm is taken later, when classes are scored.
# y_freq is indexed by label VALUE, so counts are looked up through `label`
# rather than by position (the two differ when some label value is unused).
class_log_prior = y_freq[label] / len(train_label)
print(class_log_prior)

# feature_prob[k, w] = log10 of the smoothed estimate of P(word w | class k):
#     (count of w in class k + smoothing) / (all counts in class k + smoothing * V)
feature_prob = np.zeros((len(label), V))

for ii, class_value in enumerate(label):
    # Pick the class's rows with a boolean mask instead of contiguous
    # offsets, so the result does not rely on the training rows being
    # sorted by label.
    class_rows = train_data[train_label == class_value]
    word_counts = np.sum(class_rows, axis=0)
    feature_prob[ii] = np.log10(
        (word_counts + smoothing) / (np.sum(class_rows) + smoothing * V))

2500
[0.5 0.5]


In [None]:
# class_prob[i, k] is the (unnormalised) log10 score of class k for test row i:
#     log10 P(class k) + sum_w test_data[i, w] * log10 P(w | class k)
class_prob = np.zeros((len(test_data), len(label)))

for ii in range(len(label)):
    # feature_prob has one row per POSITION in `label`, so it is indexed by
    # ii (like class_log_prior), not by the label value.  Broadcasting the
    # row across test_data avoids materialising a tiled copy of it.
    weighted = test_data * feature_prob[ii]
    class_prob[:, ii] = np.log10(class_log_prior[ii]) + np.sum(weighted, axis=1)

In [None]:
# For every test row pick the label whose score is largest; the cast keeps
# the same dtype as test_label.
y_hat = label[np.argmax(class_prob, axis=1)].astype(test_label.dtype)

In [None]:
# Show the predicted label of every test row.
print(y_hat)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1]


In [None]:
# Fraction of test labels predicted correctly by the from-scratch model.  The
# recorded output (0.9807...) equals the 98.08% reported for scikit-learn's
# MultinomialNB above.
print(accuracy_score(test_label, y_hat))

0.9807692307692307
