# LDA (Latent Dirichlet Allocation)

* a generative model i.e. the model can randomly generate the observed data
* The goal is to learn 2 probability distributions:
    1. Probability of documents belong to various topics ($\theta$) -- topic-document distribution
    2. Probability of words belonging to various topics ($\phi$) -- word-topic distribution
     


**References**
1. [Latent Dirichlet Allocaiton Under the Hood - Andrew Brooks](http://brooksandrew.github.io/simpleblog/articles/latent-dirichlet-allocation-under-the-hood/)

In [1]:
rawdocs = ['eat turkey on turkey day holiday',
          'i like to eat cake on holiday',
          'turkey trot race on thanksgiving holiday',
          'snail race the turtle',
          'time travel space race',
          'movie on thanksgiving',
          'movie at air and space museum is cool movie',
          'aspiring movie star']


In [2]:
docs = [x.split(' ') for x in rawdocs]
print(docs)

[['eat', 'turkey', 'on', 'turkey', 'day', 'holiday'], ['i', 'like', 'to', 'eat', 'cake', 'on', 'holiday'], ['turkey', 'trot', 'race', 'on', 'thanksgiving', 'holiday'], ['snail', 'race', 'the', 'turtle'], ['time', 'travel', 'space', 'race'], ['movie', 'on', 'thanksgiving'], ['movie', 'at', 'air', 'and', 'space', 'museum', 'is', 'cool', 'movie'], ['aspiring', 'movie', 'star']]


In [3]:
## PARAMETERS
num_topics = 2 # number of topics
alpha = 1 # hyperparameter. single value indicates symmetric dirichlet prior. higher=>scatters document clusters
eta = .001 # hyperparameter
iterations = 3 # iterations for collapsed gibbs sampling.  This should be a lot higher than 3 in practice.


In [4]:
import numpy as np
## Assign WordIDs to each unique word

# vocab = np.unique(unlist(docs))
vocab = list(np.unique([word for doc in docs for word in doc]))


# ## Replace words in documents with wordIDs
transformed_docs = []
for doc in docs:
    transformed_docs.append([])
    for i in doc:
        transformed_docs[-1].append(vocab.index(i))
        
print(transformed_docs)

[[7, 25, 14, 25, 6, 8], [9, 11, 22, 7, 4, 14, 8], [25, 24, 15, 14, 19, 8], [16, 15, 20, 26], [21, 23, 17, 15], [12, 14, 19], [12, 3, 0, 1, 17, 13, 10, 5, 12], [2, 12, 18]]


In [5]:
from random import randint

## 1. Initialize word topic count matrix
wt = np.zeros((len(vocab), num_topics), dtype=np.int)

## Initialize topic assignment list. This is a three way matrix where 
## row index corresponds to a document
## column index corresponds to a word
## value will indicate the topic to which the word is assigned to in a given document
ta = np.ndarray((len(transformed_docs), len(vocab)), dtype=np.int8)
ta[:,:] = -1

## We randomly assign topics to words in a given document

for doc_index, doc in enumerate(transformed_docs):
    
    for word_index in doc:
        
        # randomly sample a topic between 0 and num_topics
        topic_index = randint(0, num_topics-1)
        
        # randomly assign a word that appears in a document a random topic
        ta[doc_index, word_index] = topic_index
        
        # keep track of how often a word is assigned to a given topic
        wt[word_index, topic_index] += 1

# print(wt)
# print(ta)

In [6]:
# Count how many words of a document belongs to a given topic
dt = np.zeros((len(transformed_docs), num_topics))

for doc_index, doc in enumerate(transformed_docs):
    
    for word_index in doc:
        
        # extract topic assigned to the word
        topic_index = ta[doc_index, word_index]
        
        # count how many words in the doc are assigned to that topic
        dt[doc_index, topic_index] += 1
        
print(dt)

[[3. 3.]
 [3. 4.]
 [2. 4.]
 [2. 2.]
 [1. 3.]
 [2. 1.]
 [2. 7.]
 [1. 2.]]


$$P(i, j) = \frac{wt[i, j]}{\sum_k{wt[i,k]}}$$

In [7]:
num_epoch = 10
vocab_size = len(vocab)

for i in range(num_epoch): # number of iterations
    
    
    for doc_index, doc in enumerate(transformed_docs): # for each doc
    
        for word_index in doc: # for each word token in the doc
            
            # current topic assigned to the word
            t0 = ta[doc_index, word_index]
            
            # we don't want to include token w in our document-topic count matrix when sampling for token w
            dt[doc_index, t0] -= 1
            
            # we don't want to include token w in our word topic count matrix when sampling for token w 
            wt[word_index, t0] -= 1
            
            
            ## UPDATE TOPIC ASSIGNMENT FOR EACH WORD
            ## -- COLLAPSED GIBBS SAMPLING MAGIC.  Where the magic happens.
            denom_a = np.sum(dt[doc_index, :]) + (num_topics * alpha)
            denom_b = np.sum(wt, axis=0) + (vocab_size * eta)
            pz = (wt[:, word_index] + eta) / denom_b * (dt[d, :] + alpha) / denom_a
            t1 = 
            t1 = sample(1:K, 1, prob=pz/sum(pz))
            
#           # denom_a <- sum(dt[d,]) + K * alpha # number of tokens in document + number topics * alpha
            # denom_b <- rowSums(wt) + length(vocab) * eta # number of tokens in each topic + # of words in vocab * eta
            #  p_z <- (wt[,wid] + eta) / denom_b * (dt[d,] + alpha) / denom_a # calculating probability word belongs to each topic
            # t1 <- sample(1:K, 1, prob=p_z/sum(p_z)) # draw topic for word n from multinomial using probabilities calculated above

            
            

SyntaxError: invalid syntax (<ipython-input-7-a2c58f8cfd74>, line 26)

In [19]:
x = np.ndarray((3,4))
x[:,:] = -1
x

array([[-1., -1., -1., -1.],
       [-1., -1., -1., -1.],
       [-1., -1., -1., -1.]])