In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns;
sns.set(style="ticks", color_codes=True)

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import torch
from torch.nn.functional import softplus
import pyro
import pyro.distributions as dist
import pyro.poutine as poutine
from pyro.distributions.util import log_sum_exp
from torch.distributions import constraints
from pyro.infer import EmpiricalMarginal, SVI, Trace_ELBO
from pyro.infer.abstract_infer import TracePredictive
from pyro.infer.mcmc import MCMC, NUTS
from pyro.optim import Adam

# dataset

we will use the 20 newsgroups dataset. example usage of this data can be found [here](http://scikit-learn.org/stable/datasets/index.html#the-20-newsgroups-text-dataset).

In [2]:
def get_vocab():
    with open('./simple-vocab.txt')as f:
        content = f.readlines()
    content = [x.strip() for x in content]
    return np.unique(content)
vocab = get_vocab()
num_features = len(vocab)
categories = [
    'rec.autos',
    'rec.sport.baseball', 
    'rec.sport.hockey',
    'sci.med', 
    'sci.space'
]
num_cats = len(categories)
docs_train = fetch_20newsgroups(subset='train', categories=categories)
docs_test = fetch_20newsgroups(subset='test', categories=categories)

### feature extraction

see wikipedia for an explaination of [tf-idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)

In [3]:
vectorizer = TfidfVectorizer(
    stop_words='english', 
    vocabulary=vocab,
    binary=True, 
    use_idf=False, 
    norm=None
)
vectors_train = vectorizer.fit_transform(docs_train.data).toarray()
vectors_test = vectorizer.transform(docs_test.data).toarray()
print('train: {}'.format(vectors_train.shape))
print('test: {}'.format(vectors_test.shape))

train: (2978, 1027)
test: (1982, 1027)


# model

In [7]:
def label_counts(dataY):
    counts = np.histogram(dataY.numpy(), bins=num_cats)[0]
    return softplus(torch.FloatTensor(counts), threshold=0)

def model(dataX, dataY):
    cat_prior = pyro.sample(
        'cat_prior',
        dist.Dirichlet(label_counts(dataY))
    )
    cat = pyro.sample(
        'cat', 
        dist.Categorical(cat_prior)
    )
    # use the generated cat sample to pick out target indices that have
    # that class
    c_idxs = torch.nonzero(dataY == cat.item()).squeeze()
    # count docs with class c
    N_c = c_idxs.size(0)
    # corresponding data with class c
    dataX_c = torch.index_select(dataX, 0, c_idxs)
    counts_true = torch.sum(dataX_c, 0)
    counts_false = (torch.ones(num_features)*N_c) - counts_true
    c0 = softplus(counts_true, threshold=0)
    c1 = softplus(counts_false, threshold=0)

    with pyro.iarange('likelihood_loop', num_features):
        beta = pyro.sample(
            'beta',
            dist.Beta(c0, c1).expand_by([N_c]).independent(1)
        )
        print(beta.size())
        print(dataX_c.size())
        pyro.sample(
            'bern',
            dist.Bernoulli(beta).independent(1),
            obs=dataX_c
        )
    
        
def guide(dataY, dataX):
    cat_prior = pyro.sample(
        'cat_prior',
        dist.Dirichlet(label_counts(dataY))
    )
    cat = pyro.sample(
        'cat', 
        dist.Categorical(cat_prior).expand_by([len(dataY)]).independent(1)
    )
    with pyro.iarange('likelihood_loop', num_features):
        c0 = pyro.param('c0', torch.ones(len(dataY), num_features), constraint=constraints.positive)
        c1 = pyro.param('c1', torch.ones(len(dataY), num_features), constraint=constraints.positive)
        return pyro.sample(
            'beta',
            dist.Beta(c0, c1).independent(1)
        )

In [8]:
"""
trainx_t = torch.Tensor(vectors_train)
trainy_t = torch.LongTensor(docs_train.target)
model(trainx_t, trainy_t)

"""

torch.Size([600, 1027])
torch.Size([600, 1027])




In [5]:


"""
pyro.clear_param_store()
trainx_t = torch.Tensor(vectors_train)
trainy_t = torch.LongTensor(docs_train.target)
posterior = pyro.infer.Importance(model, guide=guide, num_samples=10)
marginal = pyro.infer.EmpiricalMarginal(posterior.run(trainy_t, trainx_t), sites='beta')
"""

"""
trainx_t = torch.Tensor(vectors_train)
trainy_t = torch.LongTensor(docs_train.target)
nuts_kernel = NUTS(conditioned_model, adapt_step_size=True)
posterior = MCMC(nuts_kernel, num_samples=100, warmup_steps=10).run(trainy_t, trainx_t)
"""
"""
pyro.clear_param_store()

trainx_t = torch.Tensor(vectors_train)
trainy_t = torch.LongTensor(docs_train.target)

adam_params = {"lr": 0.0005, "betas": (0.90, 0.999)}
optimizer = Adam(adam_params)
svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

for step in range(4):
    svi.step(trainy_t, trainx_t)
    if step % 2 == 0:
        print('opt step')
"""
"""

trainx_t = torch.Tensor(vectors_train)
trainy_t = torch.LongTensor(docs_train.target)
model(trainx_t, trainy_t).shape
"""

trainx_t = torch.Tensor(vectors_train)
trainy_t = torch.LongTensor(docs_train.target)
posterior = pyro.infer.Importance(model, num_samples=10)
marginal = pyro.infer.EmpiricalMarginal(posterior.run(trainx_t, trainy_t), sites='beta')

In [6]:
marginal()

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 594 and 600 in dimension 1 at /Users/soumith/minicondabuild3/conda-bld/pytorch_1524590658547/work/aten/src/TH/generic/THTensorMath.c:3586