In [154]:
import numpy as np
import math
import csv

fname = './data/train.csv'

def load_data(fname):
    with open(fname, newline='') as csvfile:
        reader = csv.reader(csvfile)
        data = list(reader)

    num_examples = len(data)
    header = data[0]
    feats = np.array([row[1:] for row in data[1:]], dtype=int)
    label_chars = [row[0] for row in data[1:]]
    char2int = dict(zip(sorted(set(label_chars)), range(len(set(label_chars)))))
    labels = np.array([char2int[c] for c in label_chars])
    
    return feats, labels

feats, labels = load_data(fname)
classes = sorted(char2int.values())
num_classes = len(classes)
num_feats = feats.shape[1]
num_feat_levels = 16
feat_range = range(num_feat_levels)


In [183]:
fname.lower().split('.')[-1]

'csv'

# Gaussian Naive Bayes

In [129]:
means = np.zeros([num_classes, num_feats])
stds = np.ones([num_classes, num_feats])

for c in classes:
    c_feats = feats[np.where(labels == c)]
    means[c] = (np.mean(c_feats, axis=0))
    stds[c] = (np.std(c_feats, axis=0))

In [130]:
def gaussprob(means, stds, feat_vals, y):
    m = means[y]
    s = stds[y]
    p = np.exp(-0.5*np.power((feat_vals - m)/s, 2)) * (1/(s*math.sqrt(2*math.pi)))
    return np.sum(np.log(p))

predict = np.ndarray(labels.shape)
for i in range(num_examples-1):
    predict[i] = np.argmax([gaussprob(means, stds, feats[i], c) for c in range(num_classes)])

In [131]:
sum(labels==predict)/len(labels)

0.6530789245446661

# Multinomial Naive Bayes (counting)

In [173]:

freq = np.ndarray((num_classes, num_feat_levels, num_feats))
prior_freq = {}
z = {}

for i, c in enumerate(classes):
    c_feats = feats[np.where(labels==c)]
    num_in_class = c_feats.shape[0]
    prior_freq[c] = num_in_class
    z = num_in_class + num_feat_levels*alpha
    for f in feat_range:
        p = (np.count_nonzero(c_feats==f, axis=0) + alpha) #/(num_in_class + 15*alpha)
        freq[i, f, :] = np.log(p) - np.log(z)
    
# log_freq = np.log(freq)

In [174]:
prior_freq

{0: 606,
 1: 572,
 2: 553,
 3: 603,
 4: 571,
 5: 580,
 6: 575,
 7: 547,
 8: 564,
 9: 556,
 10: 542,
 11: 567,
 12: 607,
 13: 584,
 14: 560,
 15: 587,
 16: 594,
 17: 580,
 18: 553,
 19: 589,
 20: 597,
 21: 573,
 22: 576,
 23: 582,
 24: 623,
 25: 548}

In [175]:
freq

array([[[-6.43294009, -3.43720782, -6.43294009, ..., -6.43294009,
         -5.73979291, -6.43294009],
        [-2.74406064, -2.99895289, -6.43294009, ..., -6.43294009,
         -1.93313042, -5.3343278 ],
        [-1.45620635, -3.38841766, -2.84942115, ..., -5.3343278 ,
         -1.15482543, -5.04664573],
        ..., 
        [-6.43294009, -4.130355  , -6.43294009, ..., -6.43294009,
         -6.43294009, -6.43294009],
        [-6.43294009, -4.64118062, -6.43294009, ..., -6.43294009,
         -6.43294009, -6.43294009],
        [-6.43294009, -4.48702994, -6.43294009, ..., -6.43294009,
         -6.43294009, -6.43294009]],

       [[-5.68357977, -3.15785112, -6.37672695, ..., -6.37672695,
         -6.37672695, -6.37672695],
        [-2.73914079, -3.08089008, -3.66867675, ..., -6.37672695,
         -6.37672695, -6.37672695],
        [-2.02001812, -3.38099467, -2.73914079, ..., -6.37672695,
         -4.99043259, -6.37672695],
        ..., 
        [-6.37672695, -4.29728541, -6.37672695, ...,

In [176]:
def prob(feat_vals, y):
    p = 0
    for i,f in enumerate(feat_vals):
        p += log_freq[y, f, i]
    return p

In [177]:
def mnb(feats, classes):
    p = np.zeros((feats.shape[0], len(classes)))
    for r in range(p.shape[0]):
        for i, c in enumerate(classes):
            p[r,c] = prob(feats[r], i)
    
    predict = np.argmax(p, axis=1)
    return predict

In [178]:
predict = mnb(feats, classes)

In [179]:
sum(labels==predict)/len(labels)

0.75201814664086997

In [167]:
val_feats, val_labels = load_data('./data/validation.csv')

In [168]:
val_predict = mnb(val_feats, classes)

In [169]:
sum(val_labels==val_predict)/len(val_labels)

0.71999999999999997

# Sklearn results

In [102]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(feats, labels).predict(feats)
print((labels == y_pred).sum() / feats.shape[0])

0.654746814331


In [171]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB(alpha=1.0, fit_prior=True)
y_pred = mnb.fit(feats, labels).predict(feats)
print((labels == y_pred).sum() / feats.shape[0])

val_pred = mnb.predict(val_feats)
print((val_labels == val_pred).sum() / val_feats.shape[0])

0.55233838148
0.5405


In [181]:
mnb

<function __main__.mnb>