In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns;
sns.set(style="ticks", color_codes=True)

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

# dataset

We will use the 20 newsgroups dataset. Example usage of this data can be found [here](http://scikit-learn.org/stable/datasets/index.html#the-20-newsgroups-text-dataset).

In [2]:
def get_vocab():
    """
    Load the vocabulary from ``./simple-vocab.txt``.

    Every line of the file is treated as one entry after stripping
    surrounding whitespace.

    Returns:
        A NumPy array of the unique entries, as produced by ``np.unique``
        (sorted, duplicates removed).
    """
    with open('./simple-vocab.txt') as vocab_file:
        words = [line.strip() for line in vocab_file]
    return np.unique(words)
# Fixed vocabulary: one feature per unique vocabulary entry.
vocab = get_vocab()
num_features = len(vocab)

# Newsgroups used as the class labels for this experiment.
categories = [
    'rec.autos',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.med',
    'sci.space',
]
num_cats = len(categories)

# Fetch the train split first, then the test split, restricted to `categories`.
docs_train, docs_test = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ('train', 'test')
)

### feature extraction

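See Wikipedia for an explanation of [tf-idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf). Note that the vectorizer below is configured with `binary=True`, `use_idf=False` and `norm=None`, so the resulting features are binary word-presence indicators rather than tf-idf weights.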

In [3]:
# Build the document-term features over the fixed vocabulary.
# Per the scikit-learn documentation, binary=True together with idf and
# normalisation switched off (use_idf=False, norm=None) yields 0/1 outputs,
# i.e. word-presence indicators; prob_c_given relies on the entries being
# exactly 0 or 1.
vectorizer = TfidfVectorizer(
    stop_words='english', 
    vocabulary=vocab,
    binary=True, 
    use_idf=False, 
    norm=None
)
# Dense arrays are used so the estimators can index rows and columns directly.
vectors_train = vectorizer.fit_transform(docs_train.data).toarray()
vectors_test = vectorizer.transform(docs_test.data).toarray()
# Sanity check: (number of documents, number of vocabulary features).
print('train: {}'.format(vectors_train.shape))
print('test: {}'.format(vectors_test.shape))

train: (2978, 1027)
test: (1982, 1027)


In [4]:
# See https://youtu.be/oAihxFkRHu8?t=29m34s
# for an explanation of N_c and N_c_j.
#
def make_N_c():
    """
    Count the training documents that belong to each class.

    Returns:
        A NumPy array with one entry per class label in ``range(num_cats)``;
        entry ``c`` is the number of training documents whose target is ``c``.
    """
    return np.array([
        np.count_nonzero(docs_train.target == label)
        for label in range(num_cats)
    ])

# Per-class training document counts, computed once; pi_estimator and
# theta_estimator both read this module-level array.
N_c = make_N_c()

def pi_estimator():
    """
    Estimate the class priors.

    Each prior is the number of training documents in that class (``N_c``)
    divided by the total number of training documents.

    Returns:
        A NumPy array with one prior per class.
    """
    total_docs = float(len(vectors_train))
    return N_c / total_docs

def theta_estimator(alpha=0.0):
    """
    Estimate the per-class, per-feature parameters ``theta[c, j]``.

    For class ``c`` and feature ``j`` the estimate is the sum of feature ``j``
    over the training documents of class ``c`` divided by ``N_c[c]``. With
    binary features this is the fraction of class-``c`` documents that
    contain word ``j``. Each class therefore has ``num_features`` parameters.

    Args:
        alpha: Optional additive smoothing strength. The default ``0.0``
            gives the plain maximum-likelihood estimate (the original
            behaviour). A positive value computes
            ``(count + alpha) / (N_c + 2 * alpha)``, which keeps estimates
            away from exactly 0 and 1 (``alpha=1`` is Laplace smoothing).

    Returns:
        A NumPy array of shape ``(num_cats, num_features)``.
    """
    acc = []
    for class_lbl in range(num_cats):
        # Build the class mask once per class instead of once per feature,
        # and sum all feature columns in a single vectorised call.
        class_rows = vectors_train[docs_train.target == class_lbl]
        feat_counts = class_rows[:, :num_features].sum(axis=0)
        acc.append((feat_counts + alpha) / (float(N_c[class_lbl]) + 2.0 * alpha))
    return np.array(acc)

# Fit the model parameters once; prob_c_given reads both of these
# module-level arrays.
pi_hat = pi_estimator()
theta_hat = theta_estimator()

def prob_c_given(x_str):
    """
    Posterior probability of every class given the raw document ``x_str``.

    Implements the Bernoulli naive Bayes rule from
    https://youtu.be/oAihxFkRHu8?t=29m44s :

        p(c | x) is proportional to
        pi[c] * prod_j theta[c, j]**(x_j == 1) * (1 - theta[c, j])**(x_j == 0)

    The product is evaluated as a sum of logarithms and normalised after
    subtracting the largest log-score. Multiplying on the order of a thousand
    probabilities directly can underflow to 0 for every class, which would
    make the result fall back to the arbitrary "class 0" guess.

    Args:
        x_str: The document text; it is vectorised with the module-level
            ``vectorizer``.

    Returns:
        A NumPy array of length ``num_cats`` that sums to 1. If every class
        has probability exactly zero (a present word has ``theta == 0``, or an
        absent word has ``theta == 1``, in every class), a one-hot guess for
        class 0 is returned.
    """
    # Vectorise the document once; it is the same for every class.
    x = vectorizer.transform([x_str]).toarray()[0][:num_features]
    present = x == 1
    absent = x == 0

    theta = theta_hat[:num_cats, :num_features]
    # log(0) = -inf is the intended encoding of "impossible"; silence the warning.
    with np.errstate(divide='ignore'):
        log_theta = np.log(theta)
        log_not_theta = np.log(1 - theta)
        log_prior = np.log(pi_hat[:num_cats])

    # np.where selects terms instead of multiplying by 0/1, so a -inf in an
    # unselected position can never produce 0 * inf = nan.
    log_joint = (
        log_prior
        + np.where(present, log_theta, 0.0).sum(axis=1)
        + np.where(absent, log_not_theta, 0.0).sum(axis=1)
    )

    top = log_joint.max()
    if np.isneginf(top):
        # Every class is impossible under the model: keep the original fallback.
        guess = np.zeros((num_cats,))
        guess[0] = 1
        return guess

    # Shift by the maximum so the largest term is exp(0) = 1 (no underflow).
    weights = np.exp(log_joint - top)
    return weights / weights.sum()

def show_test_idx(idx):
    """
    Print the predicted and true newsgroup for one test document.

    Args:
        idx: Index into ``docs_test.data`` / ``docs_test.target``.

    Prints the predicted class name, the target class name, and then the
    raw document text.
    """
    document = docs_test.data[idx]
    names = docs_test.target_names
    predicted_label = np.argmax(prob_c_given(document))
    print('pred: {}'.format(names[predicted_label]))
    print('target: {}'.format(names[docs_test.target[idx]]))
    print('\n** doc **')
    print(document)

def test_accuracy():
    """
    Classify every test document and print the accuracy as a percentage.

    A running accuracy line is printed after every 100 documents, followed
    by the final accuracy over the whole test split.
    """
    total = len(docs_test.target)
    hits = 0
    # `position` is the 1-based count of documents processed so far.
    for position in range(1, total + 1):
        doc_idx = position - 1
        guess = np.argmax(prob_c_given(docs_test.data[doc_idx]))
        if guess == docs_test.target[doc_idx]:
            hits += 1
        if not position % 100:
            print('Accuracy (0-1): {}'.format((hits / float(position)) * 100))
    print('Accuracy Final (0-1): {}'.format((hits / float(total)) * 100))
        
def test_accuracy_random():
    """
    Baseline: predict a uniformly random class for every test document.

    Prints a running accuracy percentage after every 100 documents and the
    final accuracy over the whole test split. The draws come from NumPy's
    global random state, which is not seeded here, so results vary per run.
    """
    total = len(docs_test.target)
    uniform = [1/float(num_cats)]*num_cats
    hits = 0
    # `position` is the 1-based count of documents processed so far.
    for position in range(1, total + 1):
        one_hot_draw = np.random.multinomial(1, uniform, size=1)[0]
        guess = np.argmax(one_hot_draw)
        if guess == docs_test.target[position - 1]:
            hits += 1
        if not position % 100:
            print('Accuracy (0-1): {}'.format((hits / float(position)) * 100))
    print('Accuracy Final (0-1): {}'.format((hits / float(total)) * 100))

# Evaluate the classifier on the whole test split; a running accuracy
# percentage is printed every 100 documents, then the final value.
test_accuracy()

Accuracy (0-1): 61.0
Accuracy (0-1): 68.5
Accuracy (0-1): 69.66666666666667
Accuracy (0-1): 70.5
Accuracy (0-1): 70.6
Accuracy (0-1): 71.5
Accuracy (0-1): 72.85714285714285
Accuracy (0-1): 72.125
Accuracy (0-1): 72.0
Accuracy (0-1): 72.1
Accuracy (0-1): 72.18181818181819
Accuracy (0-1): 72.25
Accuracy (0-1): 72.23076923076923
Accuracy (0-1): 71.78571428571429
Accuracy (0-1): 72.06666666666666
Accuracy (0-1): 71.9375
Accuracy (0-1): 71.6470588235294
Accuracy (0-1): 71.05555555555556
Accuracy (0-1): 71.0


KeyboardInterrupt: 

# model