In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns;
sns.set(style="ticks", color_codes=True)

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import torch
import pyro
import pyro.distributions as dist
from torch.nn.functional import softplus

# dataset

We will use the 20 newsgroups dataset. Example usage of this data can be found [here](http://scikit-learn.org/stable/datasets/index.html#the-20-newsgroups-text-dataset).

In [2]:
# The six newsgroups that serve as class labels.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
    'rec.sport.baseball',
    'rec.sport.hockey',
]

# Passed to the vectorizer as max_features (vocabulary size cap).
num_features = 5000

# Load the predefined train and test splits, restricted to `categories`.
docs_train, docs_test = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ('train', 'test')
)

### feature extraction

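See Wikipedia for an explanation of [tf-idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf). Note that the vectorizer below is configured with `binary=True`, `use_idf=False` and `norm=None`, so the extracted features are 0/1 word-presence indicators rather than tf-idf weights.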

In [3]:
# binary=True together with use_idf=False and norm=None makes every entry a
# 0/1 "word present in document" indicator instead of a tf-idf weight.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=num_features,
    binary=True,
    use_idf=False,
    norm=None,
)

# The vocabulary is learned on the training split and reused for the test split.
vectors_train = vectorizer.fit_transform(docs_train.data).todense()
vectors_test = vectorizer.transform(docs_test.data).todense()

print('train: {shape}'.format(shape=vectors_train.shape))
print('test: {shape}'.format(shape=vectors_test.shape))

train: (3231, 5000)
test: (2149, 5000)


In [29]:
# See https://youtu.be/oAihxFkRHu8?t=29m34s
# for an explanation of N_c and N_c_j.
#
# N_c[c] is the number of training documents whose target label is c.
N_c = np.array([
    vectors_train[docs_train.target == class_lbl, :].shape[0]
    for class_lbl in range(6)
])

def pi_estimator():
    """
    Estimate the class priors.

    Returns an array with one entry per class: the number of training
    documents in that class (N_c) divided by the total number of
    training documents.
    """
    total_docs = float(len(vectors_train))
    return N_c / total_docs

def theta_estimator(alpha=0.0):
    """
    Estimate the per-class Bernoulli parameters.

    theta[c, j] is the fraction of training documents of class c in which
    feature j is present, so each class gets `num_features` parameters.

    Parameters
    ----------
    alpha : float, optional
        Additive (Laplace) smoothing strength. The estimate is
        (N_c_j + alpha) / (N_c + 2 * alpha). The default of 0.0 gives the
        plain maximum-likelihood estimate, which can be exactly 0 or 1 for
        words that never / always occur in a class.

    Returns
    -------
    numpy.ndarray of shape (number of classes, num_features)
    """
    feature_block = vectors_train[:, :num_features]
    acc = []
    for class_lbl in range(len(N_c)):
        in_class = docs_train.target == class_lbl
        # A single column-wise sum per class replaces one masked scalar sum
        # per (class, feature) pair; the counts are identical.
        present_counts = np.asarray(
            feature_block[in_class].sum(axis=0), dtype=float
        ).ravel()
        acc.append((present_counts + alpha) / (float(N_c[class_lbl]) + 2.0 * alpha))
    return np.array(acc)

# Fitted parameters: class priors (pi_hat) and per-class feature
# probabilities (theta_hat), both read by prob_c_given.
pi_hat, theta_hat = pi_estimator(), theta_estimator()

def prob_c_given(x_str):
    """
    Posterior class probabilities for one raw document.

    https://youtu.be/oAihxFkRHu8?t=29m44s

    For every class c the unnormalized posterior is
        pi_hat[c] * prod_j theta_hat[c, j]^[x_j == 1] * (1 - theta_hat[c, j])^[x_j == 0]
    and the result is normalized over classes.

    The computation is carried out in log space: multiplying thousands of
    probabilities directly can underflow to 0.0 for long documents, which
    would wrongly make every class look impossible.

    Parameters
    ----------
    x_str : str
        Raw document text; it is vectorized with the fitted `vectorizer`.

    Returns
    -------
    numpy.ndarray
        One probability per class, summing to 1. If every class has zero
        probability under the model, all mass is put on class 0 (the same
        fallback as before).
    """
    # Vectorize once per call instead of once per class.
    x = np.asarray(vectorizer.transform([x_str]).todense()).ravel()[:num_features]
    present = x == 1
    absent = x == 0

    theta = np.asarray(theta_hat, dtype=float)[:, :num_features]
    # log(0) = -inf is the intended encoding of "impossible", so silence the
    # divide-by-zero warning rather than treating it as an error.
    with np.errstate(divide='ignore'):
        log_theta = np.log(theta)
        log_one_minus_theta = np.log1p(-theta)
        log_prior = np.log(np.asarray(pi_hat, dtype=float))

    # np.where selects instead of multiplying, so a -inf belonging to a term
    # that does not apply (e.g. log(theta) for an absent word) never turns
    # into nan via 0 * -inf.
    log_likelihood = (
        np.where(present, log_theta, 0.0)
        + np.where(absent, log_one_minus_theta, 0.0)
    ).sum(axis=1)
    log_posterior = log_prior + log_likelihood

    peak = log_posterior.max()
    if peak == -np.inf:
        # Every class has probability exactly zero: keep the original fallback.
        fallback = np.zeros(len(log_posterior))
        fallback[0] = 1.
        return fallback

    # Subtracting the maximum before exponentiating keeps the largest term at
    # exp(0) = 1, so the normalizing sum cannot underflow to zero.
    weights = np.exp(log_posterior - peak)
    return weights / weights.sum()

def show_test_idx(idx):
    """
    Print the predicted and the true newsgroup name for test document
    `idx`, followed by the document text itself.
    """
    doc = docs_test.data[idx]
    names = docs_test.target_names
    predicted = np.argmax(prob_c_given(doc))
    print('pred: {}'.format(names[predicted]))
    print('target: {}'.format(names[docs_test.target[idx]]))
    print('\n** doc **')
    print(doc)

def _report_accuracy(predict, targets, report_every=100):
    """
    Shared evaluation loop: score `predict(i)` against `targets[i]` for every
    index, printing the running accuracy (in percent) every `report_every`
    documents and the final accuracy at the end.
    """
    total = len(targets)
    correct = 0
    for seen, target in enumerate(targets, start=1):
        if predict(seen - 1) == target:
            correct += 1
        if seen % report_every == 0:
            print('Accuracy (0-1): {}'.format((correct / float(seen)) * 100))
    print('Accuracy Final (0-1): {}'.format((correct / float(total)) * 100))


def test_accuracy():
    """
    Evaluate the naive Bayes classifier on the test split, printing running
    and final accuracy in percent.
    """
    _report_accuracy(
        lambda i: np.argmax(prob_c_given(docs_test.data[i])),
        docs_test.target,
    )


def test_accuracy_random(seed=None):
    """
    Baseline: predict a uniformly random class for every test document and
    print running and final accuracy in percent.

    Parameters
    ----------
    seed : int, optional
        When given, a dedicated RandomState seeded with it is used so the
        baseline is reproducible. When None (default), numpy's global
        random state is used, exactly as before.
    """
    num_classes = len(docs_test.target_names)
    uniform = [1.0 / num_classes] * num_classes
    rng = np.random if seed is None else np.random.RandomState(seed)
    _report_accuracy(
        # One multinomial draw yields a one-hot vector; argmax is its class.
        lambda i: np.argmax(rng.multinomial(1, uniform, size=1)[0]),
        docs_test.target,
    )
test_accuracy_random()

Accuracy (0-1): 14.000000000000002
Accuracy (0-1): 17.0
Accuracy (0-1): 16.666666666666664
Accuracy (0-1): 15.5
Accuracy (0-1): 16.6
Accuracy (0-1): 15.833333333333332
Accuracy (0-1): 16.857142857142858
Accuracy (0-1): 16.75
Accuracy (0-1): 16.555555555555557
Accuracy (0-1): 16.400000000000002
Accuracy (0-1): 16.363636363636363
Accuracy (0-1): 16.166666666666664
Accuracy (0-1): 16.307692307692307
Accuracy (0-1): 16.785714285714285
Accuracy (0-1): 16.53333333333333
Accuracy (0-1): 16.5
Accuracy (0-1): 16.58823529411765
Accuracy (0-1): 16.38888888888889
Accuracy (0-1): 16.0
Accuracy (0-1): 15.7
Accuracy (0-1): 16.095238095238095
Accuracy Final (0-1): 15.960912052117262


# model

In [24]:
# Draw one class index uniformly at random: a single multinomial trial gives
# a one-hot vector, and the position of its 1 is the sampled class.
np.random.multinomial(1, [1/6.]*6, size=1)[0].argmax()

1