In [1]:
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import itertools
import json

nltk.download('brown')
nltk.download('stopwords')
stopwords = stopwords.words('english')

np.set_printoptions(suppress=True)

[nltk_data] Downloading package brown to /Users/r0g06z5/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/r0g06z5/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# config path
config_path = 'config.json'

# path for stored distributions
path_word_topic_dist = 'word_topic_distributions.json'
path_doc_topic_dist = 'doc_topic_distributions.json'

In [3]:
def load_config(config_path):
    with open(config_path) as f:
        config = json.load(f)
    return config

config = load_config(config_path)
n_docs = config['n_docs']
n_topics = config['n_topics']
seed = config['seed']

In [4]:
np.random.seed(seed)
brown = nltk.corpus.brown
docs = np.random.choice(brown.fileids(), n_docs, replace=False)

print('Processing sentences..\n')
processed_docs = {}
for doc in docs:
    processed_sents = []
    sents = brown.sents(doc)
    for sent in sents:
        processed_sents.append([word.lower() for word in sent if word.isalnum() and word not in stopwords])
    processed_docs[doc] = processed_sents

processed_sents = list(itertools.chain(*list(processed_docs.values())))
final_tokens = list(set(itertools.chain(*processed_sents)))
n_tokens = len(final_tokens)

token2int = dict(zip(final_tokens, range(n_tokens)))
int2token = {v:k for k,v in token2int.items()}

doc2int = dict(zip(docs, range(n_docs)))
int2doc = {v:k for k,v in doc2int.items()}

print('Number of Documents:', n_docs) 
print('Number of Tokens:', n_tokens)    

Processing sentences..

Number of Documents: 15
Number of Tokens: 6078


### Generative Model of LDA

After inferring the distributions $\beta_{WT}$ (word-topic distribution) and $\theta_{DT}$ (doc-topic distribution), we can use these to sample lot of words, therby generating documents. The approach is as follows:

Consider building a new document similar to document $D$:
1. Sample a topic from $\theta_{D}$. Let this be $T$.
2. Sample a word $W$ from $\beta_{T}$.
3. Repeat 1 and 2, for however many words you need in the new document.

If you want to make a new document using all documents, uniformly sample a document $D$ and repeat the above steps.

In [5]:
def load_distributions(path_word_topic_dist, path_doc_topic_dist):
    with open(path_word_topic_dist) as f:
        beta = json.load(f)
    with open(path_doc_topic_dist) as f:
        theta = json.load(f)
    return beta, theta

In [6]:
def generative_model(processed_docs, beta, theta):
    generated_docs = {}
    for doc, sents in processed_docs.items():
        n_words = len(list(itertools.chain(*sents)))
        generated_doc = []

        # sampling topics from document multinomial distribution
        topic_counts = np.random.multinomial(n_words, theta[doc])

        for topic_idx, count in enumerate(topic_counts):
            # sampling words from topic multinomial distribution
            count_dist = np.random.multinomial(count, beta[str(topic_idx)])
            word_idxs = np.where(count_dist>0)[0]
            counts = count_dist[word_idxs]

            # build the document by populating words
            word_idxs = word_idxs.reshape(-1,1) 
            word_idxs = list(map(lambda x, y: list(x) * y, word_idxs, counts))
            word_idxs = list(itertools.chain(*word_idxs))            
            generated_doc += word_idxs

        # check generated doc length equal to original doc length
        assert len(generated_doc) == n_words

        # convert word indices to words
        generated_doc = list(map(lambda x: int2token[x], generated_doc))

        # shuffle words in document
        np.random.shuffle(generated_doc)
        generated_docs[doc] = generated_doc
    return generated_docs

In [7]:
print('Loading distributions..')
beta, theta = load_distributions(path_word_topic_dist, path_doc_topic_dist)

print('Generating documents with probabilistic model..')
generated_docs = generative_model(processed_docs, beta, theta)

Loading distributions..
Generating documents with probabilistic model..


### Applications of LDA

**Compute document similarities** 

Compute **Kullback-Leibler Divergence (KL Divergence)** between their topic distributions. KL Divergence between distributions $p$ and $q$ is given by:

$$
D[p(x): q(x)]=\sum_{x} p(x) \log \frac{p(x)}{q(x)}
$$

As KL Divergence is asymmetric, as $D(p, q)$ $\neq$ $D(q, p)$, Symmetrized KL Divergence is given by:

$$
\frac{1}{2}[D(p, q)+D(q, p)]
$$

In [8]:
def KL_divergence(p, q):
    return np.sum(p * np.log(p/q))

def sym_KL_divergence(p, q):
    return (KL_divergence(p, q) + KL_divergence(q, p)) / 2

def get_document_similarities(theta):
    doc_similarity = np.zeros((n_docs, n_docs))
    for doc1, theta_doc1 in theta.items():
        doc_idx1 = doc2int[doc1]
        for doc2, theta_doc2 in theta.items():
            doc_idx2 = doc2int[doc2]
            doc_similarity[doc_idx1, doc_idx2] = sym_KL_divergence(np.array(theta_doc1), np.array(theta_doc2))
    return doc_similarity

def get_most_similar_documents(doc_similarity, INFTY_SIM):
    np.fill_diagonal(doc_similarity, INFTY_SIM)
    idxs = np.dstack(np.unravel_index(np.argsort(doc_similarity.ravel()), doc_similarity.shape))[0][0]
    idxs = list(map(lambda x: int2doc[x], idxs))
    return idxs 

In [9]:
print('Document similarities with KL Divergence..')
doc_similarity = get_document_similarities(theta)

INFTY_SIM = 1000
idxs = get_most_similar_documents(doc_similarity, INFTY_SIM)
print(f'Most similar documents are "{idxs[0]}" and "{idxs[1]}"')

Document similarities with KL Divergence..
Most similar documents are "cc14" and "ck21"


### Applications of LDA

Information Retrieval - **Similarity with respect to a query**

Given a query $q$, we want to calculate the conditional probability given that it's coming from a document $d_{i}$, and do so for all documents:

$$
\begin{aligned}
&p\left(q \mid d_{i}\right)=\prod_{w_{k} \in q} p\left(w_{k} \mid d_{i}\right) \\
&=\prod_{w_{k} \in q_{j=1}} \sum_{j=1}^{T} P\left(w_{k} \mid z=j\right) P\left(z=j \mid d_{i}\right)
\end{aligned}
$$

In [10]:
def get_query_doc_similarity(n_query, beta, theta, seed):
    np.random.seed(seed)
    query = np.random.choice(final_tokens, n_query, replace=True)
    print('Given query:', query)
    query_idxs = list(map(lambda x: token2int[x], query))

    cond_probs = []
    for doc in docs:
        theta_doc = theta[doc]
        cond_prob = 1
        for word_idx in query_idxs:
            word_prob = 0
            for topic_idx in range(n_topics):
                p1 = beta[str(topic_idx)][word_idx]
                p2 = theta_doc[topic_idx]
                word_prob += p1 * p2
            cond_prob *= word_prob
        cond_probs.append(cond_prob)
    
    # normalize probabilities
    cond_probs = np.array(cond_probs) / sum(cond_probs)
    return cond_probs

In [11]:
n_query = 10
cond_probs = get_query_doc_similarity(n_query, beta, theta, seed)
most_similar_doc_idx = np.where(cond_probs == np.max(cond_probs))[0][0]
most_similar_doc = int2doc[most_similar_doc_idx]

print(f'\nMost similar doc to given query is "{most_similar_doc}"')

Given query: ['invitation' 'check' 'infinite' 'strasny' 'jack' 'overseas' 'van'
 'rolled' 'follow' 'permit']

Most similar doc to given query is "cf15"


### Applications of LDA

**Similarity between two words**

Given two words $w_{1}$ and $w_{2}$, compute similarity between them:
$$
p\left(w_{2} \mid w_{1}\right)=\sum_{j=1}^{T} p\left(w_{2} \mid z=j\right) p\left(z=j \mid w_{i}\right)
$$

This can also be used to find most similar words, given a word $w$

In [12]:
def get_cond_prob(word_idx1, word_idx2):
    p = 0
    for topic_idx in range(n_topics):
        p1 = beta[str(topic_idx)][word_idx1] 
        p2 = beta[str(topic_idx)][word_idx2] 
        p += p2 * p1
    return p

def find_most_similar_words(word1, topN):
    word_idx1 = token2int[word1]
    cond_prob = []
    for word2 in final_tokens:
        word_idx2 = token2int[word2]
        prob = get_cond_prob(word_idx1, word_idx2)
        cond_prob.append(prob)
    cond_prob = np.array(cond_prob) / sum(cond_prob)
    most_similar_word_idxs = np.argsort(cond_prob)[::-1][:topN]
    return list(map(lambda x: int2token[x], most_similar_word_idxs))

In [13]:
topN = 15

word = np.random.choice(final_tokens, 1)[0]    
most_similar_words = find_most_similar_words(word, topN)
print(f'Most similar words to "{word}":')
print(most_similar_words)

Most similar words to "mountains":
['notice', 'feeble', 'testimony', 'labor', 'overdriving', 'africa', 'universally', 'judge', 'unblinkingly', 'imbruing', 'stunning', 'explode', 'fellowship', 'calamity', 'fibers']
