### Naive Bayes

$$ p(\theta|y) \propto p(y|\theta)p(\theta) $$

In [1]:
import copy
import cPickle as pickle
import glob
import time

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB

# Wall-clock reference point used to report the notebook's total run time.
start = time.time()

# Document text -> TF-IDF features. min_df=0.01 drops terms that occur in
# fewer than 1% of the documents; max_df=1.0 applies no upper cut-off.
dv = TfidfVectorizer(min_df=0.01, max_df=1.0)

# Sender and receiver lines -> token-count vectors (turned into integer
# labels inside load_data).
sv = CountVectorizer()
rv = CountVectorizer()

def load_data(pattern, train=False):
    """Read every file matching ``data/<pattern>`` and vectorize its contents.

    The first three lines of each file are read as, in order, the receiver,
    the sender and the document text. Encoding is done by the module-level
    vectorizers ``dv`` (documents), ``sv`` (senders) and ``rv`` (receivers).

    Parameters
    ----------
    pattern : str
        Glob pattern, relative to the ``data/`` directory.
    train : bool, optional
        When True, the three vectorizers are fitted on the loaded files
        before transforming. When False they must already have been fitted
        by an earlier ``train=True`` call.

    Returns
    -------
    tuple
        ``(X, s, r)`` where ``X`` is the dense document feature matrix and
        ``s`` / ``r`` hold, per file, the column index of the largest entry
        in the sender / receiver count vector.
    """
    docs = []
    receivers = []
    senders = []
    # glob.glob yields paths in arbitrary, platform-dependent order; sorting
    # makes the row order of the returned arrays reproducible across runs.
    for path in sorted(glob.glob("data/" + pattern)):
        with open(path) as f:
            # Line order inside each file: receiver, sender, document.
            receivers.append(f.readline())
            senders.append(f.readline())
            docs.append(f.readline())

    if train:
        dv.fit(docs)
        sv.fit(senders)
        rv.fit(receivers)

    D = dv.transform(docs)
    S = sv.transform(senders)
    R = rv.transform(receivers)

    # NOTE(review): np.argmax returns 0 for an all-zero row, so a sender or
    # receiver line containing no token from the fitted vocabulary gets
    # label 0 -- the same label as the first vocabulary entry. Confirm this
    # collision is acceptable for the data at hand.
    return D.toarray(), np.argmax(S.toarray(), axis=1), np.argmax(R.toarray(), axis=1)

In [2]:
# Files matching data/0* are the training split; loading them with
# train=True also fits the three vectorizers.
train_X, train_s, train_r = load_data("0*", train=True)
# Files matching data/1* are the validation split, encoded with the
# vectorizers fitted on the training files.
val_X, val_s, val_r = load_data("1*", train=False)

In [3]:
# Shape of the dense training feature matrix.
print(train_X.shape)

# For the sender labels, then the receiver labels: the vector's shape
# followed by the fraction of validation labels that are non-zero.
for labels in (val_s, val_r):
    print(labels.shape)
    print(1 - (labels == 0).mean())

(13575, 2638)
(13589,)
0.932739715947
(13589,)
0.98807859298


In [4]:
# Sender classifier: multinomial naive Bayes on the document features, with
# near-zero smoothing (alpha) and a uniform class prior (fit_prior=False).
clf_s = MultinomialNB(fit_prior=False, alpha=0.0001)
pred_s = clf_s.fit(train_X, train_s).predict(val_X)

# Validation F1 under each averaging scheme, in the order micro, macro, weighted.
for averaging in ('micro', 'macro', 'weighted'):
    print(f1_score(val_s, pred_s, average=averaging))

0.899624696446
0.772191692349
0.894157648378


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [149]:
# Receiver classifier: same model and hyper-parameters as the sender
# classifier, trained on the receiver labels instead.
clf_r = MultinomialNB(fit_prior=False, alpha=0.0001)
pred_r = clf_r.fit(train_X, train_r).predict(val_X)

# Validation F1 under each averaging scheme, in the order micro, macro, weighted.
for averaging in ('micro', 'macro', 'weighted'):
    print(f1_score(val_r, pred_r, average=averaging))

0.77311617359
0.372496276425
0.796814074067


In [113]:
# Seconds elapsed since `start` was recorded in the setup cell.
elapsed = time.time() - start
print("Took %f s" % elapsed)

Took 153.973458 s


In [5]:
# Weight-normalised variant of the sender classifier: every class's row of
# feature log-probabilities is divided by the absolute value of its row sum.
# feature_log_prob_ is read directly rather than through the coef_ alias,
# which scikit-learn deprecated and later removed, and which drops a row when
# there are exactly two classes.
log_prob_s = clf_s.feature_log_prob_
norm_coef = log_prob_s / np.abs(log_prob_s.sum(axis=1))[:, None]

# Deep-copy the fitted model: a plain `clf_s2 = clf_s` only aliases it, so
# overriding feature_log_prob_ would silently overwrite the baseline clf_s.
clf_s2 = copy.deepcopy(clf_s)
clf_s2.feature_log_prob_ = norm_coef
pred_s = clf_s2.predict(val_X)

# Validation F1 under each averaging scheme, in the order micro, macro, weighted.
for averaging in ('micro', 'macro', 'weighted'):
    print(f1_score(val_s, pred_s, average=averaging))

0.894988593716
0.805184970071
0.869086541886


In [150]:
# Weight-normalised variant of the receiver classifier: every class's row of
# feature log-probabilities is divided by the absolute value of its row sum.
# feature_log_prob_ is read directly rather than through the coef_ alias,
# which scikit-learn deprecated and later removed, and which drops a row when
# there are exactly two classes.
log_prob_r = clf_r.feature_log_prob_
norm_coef = log_prob_r / np.abs(log_prob_r.sum(axis=1))[:, None]

# Deep-copy the fitted model: a plain `clf_r2 = clf_r` only aliases it, so
# overriding feature_log_prob_ would silently overwrite the baseline clf_r.
clf_r2 = copy.deepcopy(clf_r)
clf_r2.feature_log_prob_ = norm_coef
pred_r = clf_r2.predict(val_X)

# Validation F1 under each averaging scheme, in the order micro, macro, weighted.
for averaging in ('micro', 'macro', 'weighted'):
    print(f1_score(val_r, pred_r, average=averaging))

0.810225749923
0.373862197017
0.802757838902
