In [3]:
from sklearn.datasets import fetch_20newsgroups
from time import time

In [4]:
# Model-size configuration.
# n_docs is not fixed here: it is set from the size of the loaded corpus when
# the dataset is read. Kept as a commented-out knob for subsampling experiments.
#n_docs = 2000
n_words = 2000   # vocabulary cap; passed to CountVectorizer as max_features
n_topics = 20    # number of latent topics sampled by the Gibbs sampler

In [5]:
# Load the 20 Newsgroups corpus through scikit-learn and time the call.
# `remove` asks the loader to strip headers, footers and quoted replies from
# each post; shuffle/random_state fix the document order.
print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data  # whole corpus is used; slice here (e.g. [:k]) to subsample
n_docs = len(data_samples)   # number of documents actually loaded
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 1.278s.


Vectorize documents and get Count Matrix
----------------------------------------------

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Use tf (raw term count) features for LDA.
# The vectorizer is configured with max_df=0.95 / min_df=2 document-frequency
# cut-offs, the built-in English stop-word list, and a vocabulary capped at
# n_words features. `tf` is the resulting document-by-term count matrix.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_words,
                                stop_words='english')
t0 = time()  # restart the timer used for the "done in" message below
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

Extracting tf features for LDA...
done in 1.553s.


In [9]:
import numpy as np
from scipy.sparse import find

In [10]:
# Expand the sparse count matrix into one flat token list per document:
# find() yields (row, column, value) triples for the non-zero entries, i.e.
# (document index, word id, occurrence count).
(I, J, K) = find(tf)

docs = [[] for _ in range(n_docs)]
for r, c, n in zip(I, J, K) :
    # word id c occurs n times in document r -> append it n times
    docs[r].extend([c] * int(n))

Initialize Counts
-------------------

In [11]:
# Seed NumPy's global RNG so the random initial assignment below (and any
# later draws from np.random made after this cell) is reproducible when the
# notebook is re-run from a fresh kernel.
np.random.seed(1)

# Count tables maintained by the sampler. Re-running this cell resets them.
n_dz = np.zeros((n_docs, n_topics))   # n_dz[d, z]: tokens of document d assigned to topic z
n_d = np.zeros((n_docs))              # n_d[d]: number of tokens in document d
n_zt = np.zeros((n_topics, n_words))  # n_zt[z, w]: times word id w is assigned to topic z
n_z = np.zeros((n_topics))            # n_z[z]: total tokens assigned to topic z
# topic_assign[d][i]: current topic of the i-th token of document d
topic_assign = [[0 for _ in range(len(doc))] for doc in docs]

# Give every token a uniformly random starting topic and record it in all counts.
for d in range(n_docs) :
    for i, w in enumerate(docs[d]):
        z = np.random.randint(n_topics)
        topic_assign[d][i] = z
        n_dz[d, z] += 1
        n_d[d] += 1
        n_zt[z, w] += 1
        n_z[z] += 1

Initialize Parameters
------------------------

In [12]:
# Smoothing constants used by the sampler: alpha is added to the
# document-topic counts, beta to the topic-word counts.
alpha = beta = 0.01

# Placeholders for the estimated distributions; they are replaced by the
# values returned from run_gibbs().
theta = np.zeros(shape=(n_docs, n_topics))
phi = np.zeros(shape=(n_topics, n_words))

In [13]:
from tqdm import tqdm_notebook

In [14]:
def assign_topic(d, w, i) :
    """Resample the topic of one token and update the global count tables.

    Parameters
    ----------
    d : int
        Index of the document that contains the token.
    w : int
        Word id of the token (column index into ``n_zt``).
    i : int
        Position of the token inside ``docs[d]`` / ``topic_assign[d]``.

    Side effects
    ------------
    Mutates the module-level ``n_dz``, ``n_d``, ``n_zt``, ``n_z`` and
    ``topic_assign`` in place and draws one sample from NumPy's global RNG.
    Returns ``None``.
    """
    zc = topic_assign[d][i]

    # Take the token out of every count so the conditional below is computed
    # from all *other* tokens only.
    n_dz[d, zc] -= 1
    n_d[d] -= 1
    n_zt[zc, w] -= 1
    n_z[zc] -= 1

    # Per-topic weight: smoothed share of word w in each topic ...
    a = (n_zt[:, w] + beta) / (n_z + beta * n_words)
    # ... times the smoothed share of each topic in document d.
    b = (n_dz[d, :] + alpha) / (n_d[d] + n_topics * alpha)

    pz = a*b
    # ndarray.sum() instead of the builtin sum(): this runs once per token per
    # sweep, and the builtin iterates the array element by element in Python.
    pz /= pz.sum()

    # Passing the integer lets NumPy sample from arange(n_topics) directly
    # instead of converting a range object to an array on every call.
    z = np.random.choice(n_topics, p=pz)

    # Put the token back under its newly sampled topic.
    n_dz[d, z] += 1
    n_d[d] += 1
    n_zt[z, w] += 1
    n_z[z] += 1
    topic_assign[d][i] = z

In [15]:
def run_gibbs(n_iters):
    """Run ``n_iters`` full sweeps of the sampler and return the estimates.

    Each sweep calls ``assign_topic`` once for every token of every document
    in ``docs``, in document order. After the last sweep the smoothed,
    row-normalised count tables are returned as ``(phi, theta)``:
    ``phi`` has one row per topic (built from ``n_zt`` and ``beta``) and
    ``theta`` one row per document (built from ``n_dz`` and ``alpha``).
    """
    for _ in tqdm_notebook(range(n_iters)):
        for doc_id in range(n_docs):
            for pos, word_id in enumerate(docs[doc_id]):
                assign_topic(doc_id, word_id, pos)

    # Row totals kept as column vectors so the division broadcasts per row.
    topic_totals = n_zt.sum(axis=1, keepdims=True)
    doc_totals = n_dz.sum(axis=1, keepdims=True)

    topic_word = (n_zt + beta) / (topic_totals + beta*n_words)
    doc_topic = (n_dz + alpha) / (doc_totals + alpha*n_topics)
    return topic_word, doc_topic

In [16]:
def get_top_words(word_list, phi, n_top=10) :
    """Print the highest-weighted words of every topic, one list per line.

    Parameters
    ----------
    word_list : sequence of str
        Vocabulary; ``word_list[j]`` is the word for column ``j`` of ``phi``.
    phi : 2-D array-like
        One row per topic, one column per word.
    n_top : int, optional
        How many words to print per topic (default 10). Values larger than
        the number of columns print every word; values <= 0 print empty lists.

    Words within a line are ordered by increasing weight, so the strongest
    word of the topic is printed last.
    """
    phi = np.asarray(phi)
    n_keep = max(0, min(n_top, phi.shape[1]))
    order = np.argsort(phi, axis=1)
    # Explicit start index: a plain [-n_keep:] would return everything for 0.
    max_args = order[:, order.shape[1] - n_keep:]
    # Iterate over the rows of the matrix that was passed in, not over a
    # global topic count that may not match it.
    for topic_cols in max_args :
        print([word_list[i] for i in topic_cols])

In [17]:
# Run 100 sweeps of the sampler and keep the resulting topic-word (phi) and
# document-topic (theta) estimates.
phi, theta = run_gibbs(100)

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back only on old
# releases. tolist() yields plain Python strings so the printed lists stay
# free of NumPy scalar reprs.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    word_list = tf_vectorizer.get_feature_names_out().tolist()
else:
    word_list = tf_vectorizer.get_feature_names()
get_top_words(word_list, phi)


['13', '16', '20', '14', '11', '12', '15', '25', '10', '00']
['armenia', 'turkey', 'government', 'world', 'jews', 'war', 'armenians', 'turkish', 'people', 'armenian']
['0t', '1t', '1d9', '145', 'pl', 'a86', 'b8f', 'g9v', 'max', 'ax']
['rights', 'think', 'don', 'israel', 'state', 'law', 'right', 'government', 'gun', 'people']
['work', 've', 'time', 'does', 'know', 'good', 'just', 'like', 'use', 'don']
['say', 'time', 'did', 'like', 'didn', 'don', 'know', 'just', 'said', 'people']
['mac', 'windows', 'drives', 'hard', 'use', 'bit', 'disk', 'card', 'scsi', 'drive']
['like', 'stephanopoulos', 'people', 'just', 'going', 'don', 'president', 'think', 'mr', 'know']
['bike', 'time', 've', 'know', 'new', 'good', 'don', 'car', 'just', 'like']
['mv', '34u', 'c_', 'bh', 'lk', 'chz', 'ah', 'w7', 'cx', 'db']
['display', 'problem', 'application', 'windows', 'set', 'program', 'widget', 'using', 'use', 'window']
['just', 'don', 'bible', 'say', 'believe', 'think', 'does', 'people', 'jesus', 'god']
['ripe