In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib qt

In [2]:
from sklearn.datasets import fetch_20newsgroups
from time import time
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

import numpy as np
from scipy.sparse import find

import pickle

In [3]:
# Vocabulary size: passed as max_features to the CountVectorizer below and used
# as the word dimension of the topic-word count matrix n_zt.
n_words = 2000

In [4]:
print("Loading dataset...")
t0 = time()

# Training split with headers, footers and quoted text removed.
dataset = fetch_20newsgroups(subset='train', shuffle=False, random_state=1,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data
n_docs = len(data_samples)
print(n_docs)

# Test split, fetched with the same options as the training split.
dataset_test = fetch_20newsgroups(subset='test', shuffle=False, random_state=1,
                                  remove=('headers', 'footers', 'quotes'))
data_test = dataset_test.data
n_docs_test = len(data_test)
print(n_docs_test)

print("done in %0.3fs." % (time() - t0,))

Loading dataset...
11314
7532
done in 1.918s.


Vectorize documents and get Count Matrix
----------------------------------------------

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Build raw term-count (tf) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_features=n_words,
    min_df=2,
    max_df=0.95,
)
t0 = time()
# Fit the vocabulary on the training documents only, then reuse it for the test set.
tf = tf_vectorizer.fit_transform(data_samples)
tf_test = tf_vectorizer.transform(data_test)
print("done in %0.3fs." % (time() - t0,))

Extracting tf features for LDA...
done in 2.197s.


In [7]:
import numpy as np
from scipy.sparse import find

In [8]:
# find() yields parallel arrays of row indices, column indices and nonzero values
# of the training count matrix.
I, J, K = find(tf)

# docs[r] is the list of word ids of document r, each id repeated once per occurrence.
docs = [[] for _ in range(n_docs)]
for r, c, n in zip(I, J, K) :
    docs[r].extend([c] * int(n))

In [9]:
# Same expansion as for the training set, applied to the test count matrix.
I, J, K = find(tf_test)

# docs_test[r] is the list of word ids of test document r, repeated per occurrence.
docs_test = [[] for _ in range(n_docs_test)]
for r, c, n in zip(I, J, K) :
    docs_test[r].extend([c] * int(n))

Initialize Parameters
------------------------

In [10]:
# Smoothing constant added to every document-topic count (n_dz) when the sampler
# forms its weights and when theta is computed.
# NOTE(review): presumably the symmetric Dirichlet prior on theta - confirm.
alpha = 0.1
# Smoothing constant added to every topic-word count (n_zt) when the sampler
# forms its weights and when phi is computed.
# NOTE(review): presumably the symmetric Dirichlet prior on phi - confirm.
beta = 0.01

In [11]:
from tqdm import tqdm_notebook

In [12]:
def get_top_words(word_list, phi, nums=False, tnum=10) :
    """Return the `tnum` highest-scoring words of every topic (row) of `phi`.

    Parameters
    ----------
    word_list : sequence
        Vocabulary; `word_list[i]` is the word for column `i` of `phi`.
    phi : 2-D array
        One row per topic, one column per vocabulary entry.
    nums : bool, default False
        If True, return the column indices instead of the words.
    tnum : int, default 10
        Number of entries to keep per topic.

    Returns
    -------
    list of lists
        One list per row of `phi`, ordered by ascending score, so the
        highest-scoring word is the last element.
    """
    max_args = np.argsort(phi, axis=1)[:, -tnum:]
    # Iterate over the rows of phi itself rather than a global topic count, so
    # the function works for any phi and does not depend on notebook state.
    return [[i if nums else word_list[i] for i in row] for row in max_args]

# Vocabulary in column order of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back for older installations.
if hasattr(tf_vectorizer, "get_feature_names_out") :
    word_list = list(tf_vectorizer.get_feature_names_out())
else :
    word_list = tf_vectorizer.get_feature_names()

In [13]:
def assign_topic(d, w, i) :
    """Resample the topic of the i-th token of document `d`, whose word id is `w`.

    Operates on the notebook-level state: the count arrays `n_dz`, `n_d`,
    `n_zt`, `n_z` and the per-token assignments `topic_assign` are updated
    in place. Nothing is returned.
    """
    old_topic = topic_assign[d][i]

    # Take the token's current assignment out of every count.
    n_z[old_topic] -= 1
    n_zt[old_topic, w] -= 1
    n_d[d] -= 1
    n_dz[d, old_topic] -= 1

    # Per-topic weight: smoothed word-in-topic term times smoothed topic-in-document term.
    word_term = (n_zt[:, w] + beta) / (n_z + beta * n_words)
    doc_term = (n_dz[d, :] + alpha) / (n_d[d] + n_topics * alpha)
    pz = word_term * doc_term
    pz /= sum(pz)

    new_topic = np.random.choice(range(n_topics), p=pz)

    # Put the token back under the freshly drawn topic.
    topic_assign[d][i] = new_topic
    n_dz[d, new_topic] += 1
    n_d[d] += 1
    n_zt[new_topic, w] += 1
    n_z[new_topic] += 1

In [14]:
def run_gibbs(n_iters) :
    """Run `n_iters` full sweeps of the sampler over the training documents.

    After every sweep, `phi` and `theta` are recomputed from the notebook-level
    count arrays, the top 50 words per topic are recorded, and the negative
    count-weighted mean of log(theta @ phi) over the training matrix `tf` is
    stored (the plotting cells exponentiate it to get perplexity).

    Returns
    -------
    (phi, theta, perps, top_words)
        `phi` and `theta` from the last sweep, plus the per-sweep lists of
        scores and top-word lists.
    """
    perps = []
    top_words = []
    for _sweep in tqdm_notebook(range(n_iters)) :
        # One sweep: resample every token of every document.
        for doc_id in range(n_docs) :
            for pos, word_id in enumerate(docs[doc_id]) :
                assign_topic(doc_id, word_id, pos)

        topic_totals = n_zt.sum(1)[:, None]
        doc_totals = n_dz.sum(1)[:, None]
        phi = (n_zt + beta) / (topic_totals + beta*n_words)
        theta = (n_dz + alpha) / (doc_totals + alpha*n_topics)
        top_words.append(get_top_words(word_list, phi, tnum=50))

        # Weight each log-probability by the word's count in the document.
        log_probs = np.log(np.matmul(theta, phi))
        neg_log_lik = -tf.multiply(log_probs).sum()
        perps.append(neg_log_lik / tf.sum())

    return phi, theta, perps, top_words

In [None]:
n_topics = 10

# Seed NumPy's global RNG so the random initialisation below and the draws made
# inside assign_topic (np.random.choice) are reproducible across kernel restarts.
np.random.seed(1)

# Count arrays read and updated by assign_topic / run_gibbs.
n_dz = np.zeros((n_docs, n_topics))     # tokens of document d assigned to topic z
n_d = np.zeros((n_docs))                # tokens in document d
n_zt = np.zeros((n_topics, n_words))    # occurrences of word t assigned to topic z
n_z = np.zeros((n_topics))              # tokens assigned to topic z
topic_assign = [[0 for _ in range(len(doc))] for doc in docs]

# Give every token a uniformly random starting topic and record it in the counts.
for d in range(n_docs) :
    for i, w in enumerate(docs[d]):
        z = np.random.randint(n_topics)
        topic_assign[d][i] = z
        n_dz[d, z] += 1
        n_d[d] += 1
        n_zt[z, w] += 1
        n_z[z] += 1

phi, theta, perps, top_words = run_gibbs(300)

In [None]:
# Cache the sampler output; the context manager guarantees the file is flushed
# and closed even if pickling fails.
with open("lda-gibbs-10-300.p", "wb") as fh :
    pickle.dump([phi, theta, perps, top_words], fh)

In [15]:
# Reload the cached sampler output. pickle can execute arbitrary code while
# loading, so only use this on a file produced by this notebook.
with open("lda-gibbs-10-300.p", "rb") as fh :
    phi, theta, perps, top_words = pickle.load(fh)

In [32]:
# perps holds one log-scale score per sweep; exponentiate to get perplexity.
plt.plot(np.exp(np.asarray(perps)), label="Training Set")
plt.ylabel("Perplexity", fontsize=20)
plt.xlabel("Epochs", fontsize=20)
plt.yticks(fontsize=18)
plt.xticks(fontsize=18)
plt.legend(fontsize=18)
plt.tight_layout()
# Save before show() so the written file contains the finished figure.
plt.savefig("perpgibbs.pdf")
plt.show()

In [48]:
selected_topic = -3

# Take the sizes from the recorded history instead of hard-coding 300 sweeps and
# 10 topics, so this cell keeps working when the sampler settings change.
n_recorded = len(top_words)
n_recorded_topics = len(top_words[0]) if top_words else 0

# topic_words_list[t][s] is the top-word list of topic t after sweep s.
topic_words_list = [
    [top_words[s][t] for s in range(n_recorded)]
    for t in range(n_recorded_topics)
]

# dict_words[word][s] is the position of `word` in the selected topic's top-word
# list after sweep s, or -5 when the word was not in that list.
dict_words = {}
for i in range(n_recorded) :
    for j, word in enumerate(topic_words_list[selected_topic][i]) :
        if word not in dict_words :
            dict_words[word] = [-5] * n_recorded
        dict_words[word][i] = j

In [49]:
fig = plt.figure(figsize=(10, 10))
for word, ranks in dict_words.items() :
    plt.plot(ranks, label=word)
    # Put the label just past the last plotted point. The x position comes from
    # the series itself rather than from a loop index left over by another cell.
    plt.text(len(ranks), ranks[-1], word)
plt.ylabel(r"Rank of the word (according to $\beta_{kv}$)", fontsize=20)
plt.xlabel("Epochs", fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.tight_layout()
plt.savefig("gibbs-electronic.pdf")
plt.show()

In [19]:
n_topics = 10
# Compute the top words once; calling get_top_words inside the loop re-sorted
# all of phi for every printed topic.
final_top_words = get_top_words(word_list, phi, tnum=10)
for i in range(n_topics) :
    print(final_top_words[i])

['dos', 'know', 'thanks', 'disk', 'does', 'like', 'scsi', 'use', 'card', 'drive']
['new', 'research', 'information', 'nasa', 'chip', 'encryption', 'use', 'data', 'space', 'key']
['game', 'car', 'year', 'time', 'know', 'think', 'good', 'don', 'like', 'just']
['did', 'jews', 'president', 'turkish', 'armenians', 'israel', 'mr', 'armenian', 'said', 'people']
['believe', 'know', 'say', 'just', 'jesus', 'don', 'think', 'does', 'people', 'god']
['1t', '34u', '1d9', '145', 'pl', 'a86', 'b8f', 'g9v', 'max', 'ax']
['14', '16', '17', '12', '20', '25', '15', '11', '10', '00']
['version', 'files', 'available', 'program', 'db', 'image', 'file', 'windows', 'window', 'use']
['internet', 'address', 'list', 'email', 'information', 'send', 'mail', 'file', 'com', 'edu']
['state', 'just', 'like', 'make', 'right', 'gun', 'think', 'government', 'don', 'people']


In [None]:
# Quick look at perplexity per sweep (exp of the stored log-scale scores).
plt.plot(np.exp(np.asarray(perps)))
plt.ylabel("Perplexity", fontsize=20)
plt.xlabel("Iteration", fontsize=20)
plt.show()

In [33]:
def coherence(top_words) :
    """Pairwise tf-idf coherence scores for each topic's word list.

    Parameters
    ----------
    top_words : sequence of sequences of int
        For every topic, the column indices (into the notebook-level count
        matrix `tf`) of the words to score.

    Returns
    -------
    list of 2-D arrays
        One square array per topic. For i < j, entry [i, j] is
        log((x_i . x_j + 1e-6) / sum(x_i)), where x_i is the tf-idf column of
        the i-th listed word. Entry [j, i] mirrors [i, j] and the diagonal is 0.
    """
    # Keep tf-idf sparse (CSC for cheap column selection) instead of densifying
    # the whole document-term matrix.
    tfidf = TfidfTransformer().fit_transform(tf).tocsc()
    tc = []
    for words in tqdm_notebook(top_words) :
        cols = np.asarray(words, dtype=int)
        sub = tfidf[:, cols]
        # All pairwise column dot products in one sparse product.
        gram = np.asarray(sub.T.dot(sub).todense())
        col_sums = np.asarray(sub.sum(axis=0)).ravel()
        # Row i is normalised by the column sum of word i (the earlier word of the pair).
        scores = np.log((gram + 0.000001) / col_sums[:, None])
        # Keep only pairs with i < j and mirror them, leaving the diagonal at 0.
        upper = np.triu(scores, k=1)
        tc.append(upper + upper.T)
    return tc

# Column indices of the 1000 highest-scoring words of each topic (best word last),
# followed by their pairwise coherence matrices.
tp = get_top_words(word_list, phi, tnum=1000, nums=True)
c = coherence(top_words=tp)

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=999), HTML(value='')))




HBox(children=(IntProgress(value=0, max=999), HTML(value='')))

HBox(children=(IntProgress(value=0, max=999), HTML(value='')))

HBox(children=(IntProgress(value=0, max=999), HTML(value='')))

HBox(children=(IntProgress(value=0, max=999), HTML(value='')))

HBox(children=(IntProgress(value=0, max=999), HTML(value='')))

HBox(children=(IntProgress(value=0, max=999), HTML(value='')))

HBox(children=(IntProgress(value=0, max=999), HTML(value='')))

HBox(children=(IntProgress(value=0, max=999), HTML(value='')))

HBox(children=(IntProgress(value=0, max=999), HTML(value='')))




In [47]:
from scipy.interpolate import interp1d

# Mean pairwise coherence of the top-j words of every topic, for j = 2..99.
# The word lists are ordered best-last, so the trailing j x j corner of each
# coherence matrix holds the top-j words; it has j*(j-1) off-diagonal entries.
top_n_values = range(2, 100)
for i, topic_scores in enumerate(c) :
    cscores = [topic_scores[-j:, -j:].sum() / (j * (j - 1)) for j in top_n_values]
    plt.plot(top_n_values, cscores, label=i)
plt.ylabel("Coherence Score", fontsize=20)
# The x axis is the number of top words averaged over, not training epochs.
plt.xlabel("Number of top words", fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.legend(loc='upper right', ncol=4, fontsize=12)
plt.tight_layout()
plt.savefig("cohbyntop_gibbs.pdf")
plt.show()

In [43]:
# Sort each topic's word probabilities in ascending order and renormalise the rows,
# so the trailing columns line up with the best-last word order used for `c`.
lsort = np.sort(phi, axis=1)
lsort = lsort / lsort.sum(axis=1)[:, None]
for i in range(10) :
    # Column sums of the 20 x 20 top-word corner of the coherence matrix, divided by 20.
    corner = c[i][-20:, -20:]
    coh = corner.sum(axis=0) / 20
    plt.scatter(lsort[i, -20:], coh, s=10, label=i)

plt.xlim(0.0, 0.1)
plt.xlabel(r"Probability of word in topic $\beta_{kv}$", fontsize=20)
plt.ylabel("Coherence Score", fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.legend(title="k", loc='lower right', ncol=3, fontsize=12)
plt.tight_layout()
plt.savefig("cohvsprobfull_gibbs.pdf")
plt.show()

In [46]:
# Sort each topic's word probabilities in ascending order and renormalise the rows.
lsort = np.sort(phi, axis=1)
lsort = lsort / lsort.sum(axis=1)[:, None]
for i in range(10) :
    # Rows: the last 4 (highest-scoring) words; columns: the 46 words before them.
    # Each column is summed over the 4 rows and divided by 4.
    block = c[i][-4:, -50:-4]
    coh = block.sum(axis=0) / 4
    plt.scatter(lsort[i, -50:-4], coh, label=i, s=10)

plt.xlim(0.0, 0.021)

plt.xlabel(r"Probability of word in topic $\beta_{kv}$", fontsize=15)
plt.ylabel("Coherence Score", fontsize=20)
plt.xticks(rotation=90, fontsize=18)
plt.yticks(fontsize=18)
plt.legend(title="k", loc='lower right', ncol=2, fontsize=12)
plt.tight_layout()
plt.savefig("cohvsprob3_gibbs.pdf")
plt.show()