In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns;
sns.set(style="ticks", color_codes=True)

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import torch
import pyro
import pyro.distributions as dist
from torch.nn.functional import softplus

# dataset

We will use the 20 newsgroups dataset. Example usage of this data can be found [here](http://scikit-learn.org/stable/datasets/index.html#the-20-newsgroups-text-dataset).

In [2]:
def get_vocab():
    """Load the vocabulary file and return its distinct entries.

    Reads ``./simple-vocab.txt`` line by line, trims surrounding whitespace
    from every line, and returns the result of ``np.unique`` over those
    entries (a sorted NumPy array without duplicates).
    """
    with open('./simple-vocab.txt') as vocab_file:
        words = [line.strip() for line in vocab_file]
    return np.unique(words)
# Vocabulary used as the fixed feature set; its length is the feature count.
vocab = get_vocab()
num_features = len(vocab)

# Newsgroups kept for this experiment.
categories = ['rec.autos', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.med', 'sci.space']
num_cats = len(categories)

# Fetch the train split first, then the test split, restricted to `categories`.
docs_train, docs_test = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ('train', 'test')
)

### feature extraction

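See Wikipedia for an explanation of [tf-idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf). Note that the vectorizer below is configured with `binary=True`, `use_idf=False` and `norm=None`, so it produces plain 0/1 word-presence indicators rather than weighted tf-idf scores.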

In [3]:
# Vectorizer restricted to the fixed vocabulary. With binary=True,
# use_idf=False and norm=None the output should be 0/1 word-presence
# indicators (see the TfidfVectorizer documentation).
tfidf_options = dict(
    stop_words='english',
    vocabulary=vocab,
    binary=True,
    use_idf=False,
    norm=None,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the training documents, then reuse the fitted vectorizer for the
# test documents; both sparse results are converted to dense arrays.
train_sparse = vectorizer.fit_transform(docs_train.data)
vectors_train = train_sparse.toarray()
test_sparse = vectorizer.transform(docs_test.data)
vectors_test = test_sparse.toarray()

print('train: {}'.format(vectors_train.shape))
print('test: {}'.format(vectors_test.shape))

train: (2978, 1027)
test: (1982, 1027)


# model

In [78]:

def model(dataX, dataY):
    """Generative model over per-document categories and word indicators.

    Sample sites, in order:

    * ``cat_prior`` - a Dirichlet draw with six unit concentrations.
    * ``cat`` - one category index per row of ``dataX``, drawn from a
      Categorical whose probabilities are ``cat_prior`` repeated per row.
    * ``beta_j`` / ``bern_j`` for every column ``j`` of ``dataX`` - a
      Beta(1, 1) draw per row, used as the Bernoulli probability under which
      ``dataX[:, j]`` is observed.

    Rows whose sampled category differs from ``dataY`` have their Bernoulli
    probability replaced by ``1e-5``.

    Args:
        dataX: 2-D tensor; rows are documents, columns are word indicators.
        dataY: 1-D tensor of labels, compared element-wise with ``cat``.

    NOTE(review): the number of categories is hard-coded to 6 here and in
    ``model_guide``, while ``categories`` defined earlier in this notebook
    lists five entries - confirm which is intended and change both together.
    """
    num_docs = dataX.size(0)
    num_words = dataX.size(1)

    cat_prior = pyro.sample(
        'cat_prior',
        dist.Dirichlet(torch.tensor([1.,1.,1.,1.,1.,1.]))
    )
    with pyro.iarange('data_loop', num_docs):
        cat_params = torch.ones(num_docs, 6) * cat_prior
        cat = pyro.sample(
            'cat',
            dist.Categorical(cat_params).independent(1)
        )

        # Rows where the sampled category disagrees with the given label.
        incorrect = dataY != cat

        # The Beta parameters do not depend on j, so build them once.
        c0 = torch.ones(num_docs)
        c1 = torch.ones(num_docs)

        for j in pyro.irange('likelihood_loop', num_words):
            beta = pyro.sample(
                'beta_{}'.format(j),
                dist.Beta(c0, c1).independent(1)
            )
            # Mask on a copy: the tensor returned by pyro.sample is the value
            # recorded for the 'beta_j' site, so writing into it in place
            # would also overwrite what later readers of that site (e.g. an
            # EmpiricalMarginal over 'beta_0') get back.
            masked_beta = beta.clone()
            masked_beta[incorrect] = 1e-5

            pyro.sample(
                'bern_{}'.format(j),
                dist.Bernoulli(masked_beta),
                obs=dataX[:, j]
            )

def model_guide(dataX, dataY):
    """Guide (proposal) mirroring the latent sample sites of ``model``.

    Draws ``cat_prior`` from a six-way unit Dirichlet, one ``cat`` index per
    row of ``dataX``, and a ``beta_j`` vector from Beta(1, 1) for every
    column ``j`` of ``dataX``. ``dataY`` is accepted so the signature matches
    ``model`` but is not read here.
    """
    n_docs = dataX.size(0)
    n_words = dataX.size(1)

    cat_prior = pyro.sample('cat_prior', dist.Dirichlet(torch.ones(6)))

    with pyro.iarange('data_loop', n_docs):
        # Repeat the shared category probabilities for every document.
        per_doc_probs = cat_prior * torch.ones(n_docs, 6)
        pyro.sample('cat', dist.Categorical(per_doc_probs).independent(1))

        alpha = torch.ones(n_docs)
        beta_param = torch.ones(n_docs)
        for j in pyro.irange('likelihood_loop', n_words):
            site_name = 'beta_{}'.format(j)
            pyro.sample(site_name, dist.Beta(alpha, beta_param).independent(1))

# Training features and labels as tensors for the model and guide.
trainx_t = torch.Tensor(vectors_train)
trainy_t = torch.LongTensor(docs_train.target)

# Importance sampling with `model_guide` as the proposal (10 samples),
# followed by the empirical marginal of the 'beta_0' site.
posterior = pyro.infer.Importance(model, guide=model_guide, num_samples=10)
posterior_run = posterior.run(trainx_t, trainy_t)
marginal = pyro.infer.EmpiricalMarginal(posterior_run, sites='beta_0')

In [99]:
# Call the empirical marginal of 'beta_0' to obtain a value from it, then
# display only the entries greater than 0.1.
marg = marginal()
marg[marg.gt(0.1)]

tensor([ 0.4922,  0.8754,  0.9402,  0.3103,  0.5383,  0.2120,  0.7439,
         0.3543,  0.3455,  0.1926,  0.4914,  0.9535,  0.1431,  0.5067,
         0.6839,  0.9996,  0.1553,  0.2333,  0.2196,  0.7430,  0.7499,
         0.6841,  0.9919,  0.3628,  0.4696,  0.4430,  0.4359,  0.9497,
         0.1085,  0.2938,  0.1932,  0.4472,  0.2991,  0.6354,  0.5311,
         0.8593,  0.1791,  0.9864,  0.6151,  0.2553,  0.2473,  0.8793,
         0.8775,  0.5577,  0.2837,  0.2720,  0.2106,  0.2489,  0.5726,
         0.4703,  0.5805,  0.9666,  0.6550,  0.2012,  0.4226,  0.6954,
         0.2568,  0.1103,  0.8746,  0.1211,  0.3882,  0.4766,  0.4576,
         0.3650,  0.1887,  0.5484,  0.4850,  0.9744,  0.5916,  0.4435,
         0.2864,  0.7947,  0.4252,  0.3274,  0.6445,  0.9197,  0.1949,
         0.2391,  0.8835,  0.3420,  0.5024,  0.8837,  0.2677,  0.9965,
         0.5585,  0.1598,  0.1901,  0.3519,  0.5072,  0.7683,  0.1106,
         0.8750,  0.8467,  0.9937,  0.1069,  0.4592,  0.6370,  0.2578,
      

In [7]:
# Display the label array of the test split for inspection.
docs_test.target

array([3, 0, 2, ..., 0, 5, 3])

In [100]:
# Convert into a new name instead of rebinding `marg`: the cell can then be
# re-run without failing on a second `.numpy()` call, and `marg` keeps
# referring to the tensor produced by the marginal.
marg_np = marg.numpy()

# Count the positions where the sampled value equals the training label.
# NOTE(review): `marg` comes from the marginal of site 'beta_0', i.e. values
# drawn from a Beta distribution, while the labels are integer class ids, so
# this count is expected to be 0. Presumably the 'cat' site was meant to be
# compared with the labels - confirm before relying on this number.
cnt = int(np.count_nonzero(marg_np == docs_train.target))
print(cnt)

0


In [9]:
# Scratch draft of an alternative model/guide pair, kept as a bare string
# literal so none of it executes; the cell only echoes the string.
# The draft refers to `_feature_prob_lookup`, which is not defined in the
# visible part of this notebook.
"""
def model2(data):
    with pyro.iarange('data_loop', data.size(0)):
        cat_params = torch.tensor(np.ones((data.size(0), 6))*(1/6.))
        print(cat_params)
        cat = pyro.sample(
            'cat', 
            dist.Categorical(cat_params).independent(1)
        )
        
        probs = _feature_prob_lookup[cat.numpy(), :]
        probs = torch.tensor(probs, dtype=torch.float32)
        probs = softplus(torch.tensor(probs), threshold=0)

        return pyro.sample(
            'bern',
            dist.Bernoulli(probs).independent(1),
            obs=data
        )
    
def model2_guide(data):
    with pyro.iarange('data_loop', data.size(0)):
        return pyro.sample(
            'cat', 
            dist.Categorical(torch.tensor(np.ones((data.size(0), 6))*(1/6.))).independent(1)
        )
        """
# Disabled driver lines for the draft above; `vectors` is not defined in the
# visible part of this notebook.
# datX = torch.tensor(vectors, dtype=torch.uint8)
# posterior = pyro.infer.Importance(model2, model2_guide, num_samples=100)
# posterior.run(torch.Tensor(vectors[0:100,:]))
# model2(torch.Tensor(vectors[0:2,:]))

"\ndef model2(data):\n    with pyro.iarange('data_loop', data.size(0)):\n        cat_params = torch.tensor(np.ones((data.size(0), 6))*(1/6.))\n        print(cat_params)\n        cat = pyro.sample(\n            'cat', \n            dist.Categorical(cat_params).independent(1)\n        )\n        \n        probs = _feature_prob_lookup[cat.numpy(), :]\n        probs = torch.tensor(probs, dtype=torch.float32)\n        probs = softplus(torch.tensor(probs), threshold=0)\n\n        return pyro.sample(\n            'bern',\n            dist.Bernoulli(probs).independent(1),\n            obs=data\n        )\n    \ndef model2_guide(data):\n    with pyro.iarange('data_loop', data.size(0)):\n        return pyro.sample(\n            'cat', \n            dist.Categorical(torch.tensor(np.ones((data.size(0), 6))*(1/6.))).independent(1)\n        )\n        "

In [10]:
# Scratch snippet kept as an inert string literal (nothing here executes):
# it would sample ten category indices from a uniform six-way Categorical and
# use them to index `_feature_prob_lookup`, which is not defined in the
# visible part of this notebook.
"""
cat = pyro.sample(
    'cat', 
    dist.Categorical(torch.tensor(np.ones((10, 6))*(1/6.))).independent(1)
)
print(cat.numpy())
_feature_prob_lookup[cat.numpy(), :]
"""

"\ncat = pyro.sample(\n    'cat', \n    dist.Categorical(torch.tensor(np.ones((10, 6))*(1/6.))).independent(1)\n)\nprint(cat.numpy())\n_feature_prob_lookup[cat.numpy(), :]\n"

In [11]:
# Scratch snippet kept as an inert string literal (nothing here executes):
# it would draw one six-way Dirichlet sample and broadcast it against a
# (2, 6) tensor of ones.
"""
dirsample = pyro.sample(
    'dir', 
    dist.Dirichlet(torch.tensor([1.,1.,1.,1.,1.,1.]))
)
torch.ones(2,6) * dirsample
"""

"\ndirsample = pyro.sample(\n    'dir', \n    dist.Dirichlet(torch.tensor([1.,1.,1.,1.,1.,1.]))\n)\ntorch.ones(2,6) * dirsample\n"

In [12]:
# Sanity check: draw a length-10 sample from Beta distributions whose two
# parameters are all ones, and display it.
pyro.sample('bern_prior', dist.Beta(torch.ones(10), torch.ones(10)))

tensor([ 0.1381,  0.1600,  0.5901,  0.2140,  0.8927,  0.2165,  0.8071,
         0.7297,  0.6904,  0.4891])