In [1]:
from sklearn.datasets import fetch_20newsgroups
from time import time

In [2]:
# Corpus and model sizes shared by the cells below.
n_samples, n_features = 2000, 1000  # documents kept from the corpus; vocabulary cap (max_features)
n_topics = 10                       # number of topics the sampler assigns tokens to
n_top_words = 20                    # presumably words to list per topic -- no visible cell reads it

In [3]:
# Fetch the 20 newsgroups corpus with headers, footers and quoted replies
# stripped, shuffled under a fixed random_state, and keep only the first
# n_samples documents. The elapsed wall-clock time is reported afterwards.
print("Loading dataset...")
t0 = time()
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
data_samples = dataset.data[:n_samples]
print("done in %0.3fs." % (time() - t0))

Loading dataset...
done in 1.213s.


Vectorize Documents and Build the Count Matrix
----------------------------------------------

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
# Vocabulary rules: drop English stop words, terms present in more than 95%
# of the documents, and terms present in fewer than 2 documents; keep at most
# n_features terms.
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_df=0.95,
    min_df=2,
    max_features=n_features,
)
# Only the fit/transform step is timed, not the construction above.
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

Extracting tf features for LDA...
done in 0.308s.


In [6]:
import numpy as np
from scipy.sparse import find

In [7]:
# Expand the sparse count matrix into one token list per document.
# find() yields parallel arrays of row indices, column indices and values for
# the nonzero entries; each entry adds `count` copies of the term id to the
# list of the document it belongs to.
I, J, K = find(tf)

docs = [[] for _ in range(n_samples)]
for doc_id, term_id, count in zip(I, J, K):
    docs[doc_id].extend([term_id] * count)

Initialize Counts
-------------------

In [8]:
# Seed NumPy's global generator so the random initial assignment below -- and
# the np.random draws made later by the sampler -- repeat on every fresh
# Restart & Run All. The value mirrors the random_state used to load the data.
np.random.seed(1)

# Count tables kept up to date by the Gibbs sampler.
n_dz = np.zeros((n_samples, n_topics))    # n_dz[d, z]: tokens of document d assigned to topic z
n_d = np.zeros((n_samples))               # n_d[d]: tokens in document d
n_zt = np.zeros((n_topics, n_features))   # n_zt[z, t]: occurrences of term t assigned to topic z
n_z = np.zeros((n_topics))                # n_z[z]: tokens assigned to topic z
# topic_assign[d][i]: topic currently held by the i-th token of document d.
topic_assign = [[0] * len(doc) for doc in docs]

# Give every token a uniformly random starting topic and record it in all
# four tables so they are mutually consistent before sampling begins.
for d, doc in enumerate(docs):
    for i, w in enumerate(doc):
        z = np.random.randint(n_topics)
        topic_assign[d][i] = z
        n_dz[d, z] += 1
        n_d[d] += 1
        n_zt[z, w] += 1
        n_z[z] += 1

Initialize Parameters
------------------------

In [9]:
# Smoothing constants: alpha is added to the document-topic counts and beta
# to the topic-term counts wherever probabilities are formed from them.
alpha = beta = 0.01

# Zero-filled placeholders for the document-topic (theta) and topic-term (phi)
# estimates; the run cell rebinds both names to the values run_gibbs returns.
theta = np.zeros(shape=(n_samples, n_topics))
phi = np.zeros(shape=(n_topics, n_features))

In [10]:
from tqdm import tqdm_notebook

In [11]:
def assign_topic(d, w, i):
    """Resample the topic of the i-th token of document d, whose term id is w.

    The token is first removed from the module-level count tables
    (n_dz, n_d, n_zt, n_z). A topic z is then drawn with probability
    proportional to

        (n_zt[z, w] + beta) / (n_z[z] + beta * n_features)
        * (n_dz[d, z] + alpha) / (n_d[d] + n_topics * alpha)

    and the token is added back under the drawn topic, which is also stored
    in topic_assign[d][i]. Returns None; every effect is an in-place update
    of those global structures.
    """
    old_topic = topic_assign[d][i]

    # Take the token out of every table so the draw below is based on the
    # counts of all *other* tokens.
    n_dz[d, old_topic] -= 1
    n_d[d] -= 1
    n_zt[old_topic, w] -= 1
    n_z[old_topic] -= 1

    # Per-topic weight of the term, and per-topic weight within this document.
    term_weight = (n_zt[:, w] + beta) / (n_z + beta * n_features)
    doc_weight = (n_dz[d, :] + alpha) / (n_d[d] + n_topics * alpha)

    # Normalise the product into a probability vector over topics.
    weights = term_weight * doc_weight
    probs = weights / sum(weights)

    new_topic = np.random.choice(range(n_topics), p=probs)

    # Put the token back under its freshly drawn topic.
    topic_assign[d][i] = new_topic
    n_dz[d, new_topic] += 1
    n_d[d] += 1
    n_zt[new_topic, w] += 1
    n_z[new_topic] += 1

In [12]:
def run_gibbs(n_iters):
    """Run n_iters full sweeps of the sampler and return (phi, theta).

    Each sweep visits every token of every document in order and resamples
    its topic through assign_topic, which updates the global count tables.
    After the last sweep the smoothed, row-normalised estimates are built
    from those tables:

    phi   -- (n_zt + beta) divided by each topic's total count plus
             beta * n_features; one row per topic.
    theta -- (n_dz + alpha) divided by each document's total count plus
             alpha * n_topics; one row per document.
    """
    for _ in tqdm_notebook(range(n_iters)):
        for doc_id in range(n_samples):
            tokens = docs[doc_id]
            for position, term_id in enumerate(tokens):
                assign_topic(doc_id, term_id, position)

    # Row totals kept as column vectors so the divisions broadcast per row.
    topic_totals = n_zt.sum(axis=1, keepdims=True)
    doc_totals = n_dz.sum(axis=1, keepdims=True)

    phi = (n_zt + beta) / (topic_totals + beta * n_features)
    theta = (n_dz + alpha) / (doc_totals + alpha * n_topics)
    return phi, theta

In [13]:
def get_top_words(word_list, phi, n_words=10):
    """Print the highest-weighted words of every topic, one list per line.

    Parameters
    ----------
    word_list : sequence of str
        Vocabulary; word_list[j] is the word for column j of phi.
    phi : 2-D array-like
        Topic-term weights, one row per topic.
    n_words : int, optional
        How many words to print per topic (default 10). Values larger than
        the number of columns print every word; 0 prints empty lists.

    Words within a line are in ascending weight order, so the strongest word
    of a topic is the last one printed. Returns None.
    """
    phi = np.asarray(phi)
    # Compute an explicit start index: a plain `-n_words:` slice would select
    # every column when n_words is 0.
    start = max(phi.shape[1] - n_words, 0)
    top_ids = np.argsort(phi, axis=1)[:, start:]
    # Iterate over the rows actually present in phi rather than a global
    # topic count, so the function works for any topic-term matrix.
    for topic_ids in top_ids:
        print([word_list[j] for j in topic_ids])

In [14]:
# Run 100 sweeps of the sampler, then print the top words of each topic.
phi, theta = run_gibbs(100)
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; use its
# replacement get_feature_names_out() when available and fall back to the old
# method on releases that predate it.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    word_list = list(tf_vectorizer.get_feature_names_out())
else:
    word_list = tf_vectorizer.get_feature_names()
get_top_words(word_list, phi)


['way', 'good', 'does', 'know', 'think', 'like', 'don', 'just', 'people', 'god']
['don', 'going', 'time', 'did', 'know', 'just', 'went', 'said', 'didn', 'people']
['pub', 'software', 'windows', 'ftp', 'send', 'file', 'mail', 'graphics', 'com', 'edu']
['section', 'gun', 'states', 'public', 'state', 'use', 'key', 'law', 'israel', 'government']
['insurance', 'oil', 'price', 'bike', 'year', '10', '000', '00', 'new', 'car']
['say', 'good', 'way', 'time', 'know', 'like', 'just', 'think', 'don', 'people']
['does', 've', 'problem', 'don', 'need', 'time', 'just', 'know', 'like', 'use']
['lunar', 'surface', 'probe', 'science', 'moon', 'clipper', 'chip', 'key', 'earth', 'space']
['bios', 'rom', '16', 'controller', 'card', 'scsi', 'hard', 'drives', 'disk', 'drive']
['12', 'play', '13', '18', 'game', '20', 'team', '11', '10', '55']
