In [1]:
import cPickle as pickle
import lda
import numpy as np
import os
from serial_lda_gibbs import LdaSampler
from multicore_lda_gibbs import MulticoreLdaSampler
import time

# Path of the local pickle cache holding the Reuters (X, vocab, titles) tuple;
# written by dump_reuters_dataset() and read back by load_reuters_dataset().
pickle_filepath = 'baseline_data.pickle'

In [2]:
def load_reuters_dataset():
    """Return the Reuters dataset as ``(X, vocab, titles)``, using a pickle cache.

    If the cache file at ``pickle_filepath`` does not exist yet it is created
    first via :func:`dump_reuters_dataset`; the tuple is then read back from it.

    Returns
    -------
    tuple
        ``(X, vocab, titles)`` exactly as stored by :func:`dump_reuters_dataset`.
    """
    if not os.path.exists(pickle_filepath):
        dump_reuters_dataset()
    # Pickle streams are binary data: open with 'rb' so the bytes are never
    # subjected to newline translation or text decoding.
    # NOTE: unpickling can execute arbitrary code. This is acceptable here only
    # because the file is a local cache produced by dump_reuters_dataset();
    # do not point pickle_filepath at a file from an untrusted source.
    with open(pickle_filepath, 'rb') as rfile:
        X, vocab, titles = pickle.load(rfile)
    return X, vocab, titles


def dump_reuters_dataset():
    """Fetch the Reuters dataset from the ``lda`` package and cache it on disk.

    The document-term matrix, vocabulary and titles returned by
    ``lda.datasets`` are pickled together as one ``(X, vocab, titles)`` tuple
    into ``pickle_filepath``, overwriting any existing file.
    """
    X = lda.datasets.load_reuters()
    vocab = lda.datasets.load_reuters_vocab()
    titles = lda.datasets.load_reuters_titles()

    # Binary mode matches the reader above and allows the compact binary
    # pickle protocol, which is far smaller/faster for array payloads than the
    # default ASCII protocol.
    with open(pickle_filepath, 'wb') as wfile:
        pickle.dump((X, vocab, titles), wfile, pickle.HIGHEST_PROTOCOL)


In [3]:
def serial_gibbs(X, k, iters=50, log=True):
    """Run the serial LDA Gibbs sampler on ``X`` and print the elapsed time.

    Parameters
    ----------
    X
        Data handed unchanged to ``LdaSampler.run``.
    k
        Argument for the ``LdaSampler`` constructor
        (presumably the number of topics -- TODO confirm against LdaSampler).
    iters : int
        Forwarded to ``run`` as ``maxiter``; also the count shown in the
        summary line.
    log : bool
        When true, print the iteration index and ``sampler.loglikelihood()``
        for every value yielded by ``run``.

    Returns
    -------
    LdaSampler
        The sampler instance after ``run`` has been exhausted.
    """
    sampler = LdaSampler(k)
    start = time.time()
    # run() is a generator and must be fully consumed to do the work; the
    # value it yields is not needed here.
    for it, _phi in enumerate(sampler.run(X, maxiter=iters)):
        if log:
            # Single-argument print(...) yields identical output under the
            # Python 2 print statement and the Python 3 print function.
            print("Iteration %d" % it)
            print("Likelihood %s" % (sampler.loglikelihood(),))
    elapsed = time.time() - start
    print('Completed %d iterations in %.3f seconds (serial)' % (iters, elapsed))
    return sampler


def multicore_gibbs(X, k, p, iters=50, log=True):
    """Run the multicore LDA Gibbs sampler on ``X`` and print the elapsed time.

    Parameters
    ----------
    X
        Data handed unchanged to ``MulticoreLdaSampler.run``.
    k
        First argument for the ``MulticoreLdaSampler`` constructor
        (presumably the number of topics -- TODO confirm).
    p
        Second constructor argument, echoed as ``P=`` in the summary line
        (presumably the degree of parallelism -- TODO confirm).
    iters : int
        Forwarded to ``run`` as ``maxiter``; also the count shown in the
        summary line.
    log : bool
        When true, print the iteration index and ``sampler.loglikelihood()``
        for every value yielded by ``run``.

    Returns
    -------
    MulticoreLdaSampler
        The sampler instance after ``run`` has been exhausted.
    """
    sampler = MulticoreLdaSampler(k, p)
    start = time.time()
    # run() is a generator and must be fully consumed to do the work; the
    # value it yields is not needed here.
    for it, _phi in enumerate(sampler.run(X, maxiter=iters)):
        if log:
            # Single-argument print(...) yields identical output under the
            # Python 2 print statement and the Python 3 print function.
            print("Iteration %d" % it)
            print("Likelihood %s" % (sampler.loglikelihood(),))
    elapsed = time.time() - start
    print('Completed %d iterations in %.3f seconds (P=%d)' % (iters, elapsed, p))
    return sampler

In [None]:
#https://archive.ics.uci.edu/ml/datasets/Bag+of+Words
def load(dataset):
    """Load a corpus by name.

    Parameters
    ----------
    dataset : str
        ``'reuters'``, ``'nips'`` or ``'nytimes'``.

    Returns
    -------
    For ``'reuters'``: whatever ``load_reuters_dataset()`` returns, i.e. the
    ``(X, vocab, titles)`` tuple.
    For ``'nips'`` / ``'nytimes'``: only the dense document-term matrix ``X``
    (float ``numpy`` array, shape ``(n_documents, n_words)``) built from the
    corresponding ``docword.*.txt`` file in the current working directory.
    Note the two branches return different shapes of result; callers must
    index the reuters result themselves.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the known names.
    """
    # File names as distributed by the UCI "Bag of Words" collection.
    files = {
        'nips': 'docword.nips.txt',
        'nytimes': 'docword.nytimes.txt',
    }
    if dataset == 'reuters':
        return load_reuters_dataset()
    if dataset not in files:
        raise ValueError('Dataset %s not found' % dataset)

    with open(files[dataset], 'r') as rfile:
        # Stream the file instead of materialising every line: only the dense
        # matrix itself needs to live in memory.
        lines = iter(rfile)
        n_documents = int(next(lines))
        n_words = int(next(lines))
        next(lines)  # third header line (number of non-zero entries) is unused
        X = np.zeros((n_documents, n_words))
        for line in lines:
            fields = line.split()
            if not fields:
                # tolerate blank / trailing empty lines
                continue
            doc, word, count = map(int, fields)
            # ids in the file are 1-based; the matrix is 0-based
            X[doc - 1, word - 1] = count
    return X


In [None]:
# Benchmark cell: run the serial sampler once, then the multicore sampler for
# several values of p, on the Reuters data, recording timing and likelihood.
# To try the NIPS corpus instead: serial_gibbs(load('nips'), 10, 16)
times = {}
likelihoods = {}
data = load_reuters_dataset()[0]  # element 0 of (X, vocab, titles) is X
k = 10
iters = 50

# Key 0 holds the serial baseline; the remaining keys are the values of p.
sampler = serial_gibbs(data, k, iters=iters, log=False)
times[0] = np.mean(sampler.sample_times)
likelihoods[0] = sampler.loglikelihood()
for p in [1, 2, 4, 8, 16]:
    sampler = multicore_gibbs(data, k, p, iters=iters, log=False)
    # The multicore sampler exposes two timing series; their means are summed
    # (presumably per-iteration sampling + merge/update cost -- TODO confirm).
    times[p] = np.mean(sampler.sample_times) + np.mean(sampler.update_times)
    likelihoods[p] = sampler.loglikelihood()


Sampled in 3.151 seconds
Sampled in 3.230 seconds
Sampled in 3.187 seconds
Sampled in 3.118 seconds
Sampled in 3.005 seconds
Sampled in 3.075 seconds