In [79]:
import numpy as np
import matplotlib.pyplot as plt
import os
import re
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer as WNL
import scipy
from IPython.display import clear_output as clr
%matplotlib inline

In [3]:
# Defining data directory
data_dir = os.getcwd() + '/../'
# Check in contents of current dir are same as expected
os.listdir(data_dir)

['.ipynb_checkpoints',
 'LDA',
 'stackoverflow-data-idf.json',
 'stackoverflow-test.json',
 'TF-IDF',
 'train_corpus',
 'Unigram']

In [4]:
data = pd.read_csv(data_dir + 'train_corpus')['text']

In [5]:
data.head()

0    serialize private struct can it be do i have p...
1    how do i prevent float right content from over...
2    gradle command line i m try to run shell scrip...
3    loop variable as parameter in asynchronous fun...
4    canot get href value hi i need to valid href b...
Name: text, dtype: object

In [30]:
class vocab:
    
    def __init__(self, corpus):
        """
        Corpus : list of docs
        """
        self.dictionary = {}
        self.vocab_size = 0
        self.make_vocab(corpus)
        self.inv_dictionary = {v: k for k, v in self.dictionary.items()}
    
    def make_vocab(self,corpus):
        
        count = 0
        for doc in corpus:
            for word in doc.strip().split():
                if(word in self.dictionary.keys()):
                    pass
                else:
                    self.dictionary[word] = count
                    count += 1
        
        self.vocab_size = count
        assert(len(self.dictionary.keys()) == count), "Wrong Number of words added"
        
    def get_index(self,word):
        if(word in self.dictionary.keys()):
            return self.dictionary[word]
        else:
            return -1
    
    def get_word(self,index):
        if(index in self.inv_dictionary.keys()):
            return self.inv_dictionary[index]
        else:
            return " "

In [31]:
myDict = vocab(list(data.iloc[0:500].values))

In [115]:
class LDA:
    
    def __init__(self,vocab, K):
        """
        vocab : a dictionary containing all the words and corresponding indices
        k : smoothing parameter for beta matrix
        """
        self.vocab = vocab
        self.V = vocab.vocab_size
        self.K = K
        self.alpha = np.zeros((self.K, 1)) + np.random.rand()
        self.beta = np.random.rand(self.K, self.V)
        self.gamma = np.random.rand(self.K , 1)
        self.digamma = scipy.special.digamma
        self.eps = 10e-8
        
        
    def train(self, corpus, epochs = 1):
        
        M = len(corpus)
        
        for epoch in range(epochs):
            beta = np.zeros((self.K, self.V))
            grad_alpha = 0
            hesn_alpha = 0
            print("Initialized")
            count = 0
            for doc in corpus:
                clr(wait = True)
                count += 1
                print("epoch : ", epoch, " doc init: ", count)
                words = doc.strip().split(" ")
                N = len(words)
                phi = np.zeros((N, self.K))
                gamma = np.zeros((self.K, 1))


                for j, word in enumerate(words):

                    index = self.vocab.get_index(word)
                    beta_v = np.log(self.beta[:,index].reshape(self.K, 1))
                    phi_n = self.calc_phi(beta_v)
                    phi[j, :] = phi_n.reshape(-1)
                    
                
                print("epoch : ", epoch, "made phi", count)

                for i in range(self.K):
                    gamma[i] = self.alpha[i] + np.sum(phi[:, i])


                print("epoch : ", epoch, "made gamma", count)
                

                for i in range(self.K):
                    for j in range(self.V):
                        val = 0
                        for k, word in enumerate(words):
                            index = self.vocab.get_index(word)
                            if(j == index):
                                val += phi[k, i]

                        beta[i,j] = val
                        
                print("epoch : ", epoch, "made beta", count)

                beta = beta/(np.sum(beta,  axis = 1).reshape(-1,1) + self.eps)

                grad_alpha += self.digamma(np.sum(self.alpha)) - self.digamma(self.alpha)
                grad_alpha += self.digamma(gamma) - self.digamma(np.sum(gamma))

            # update params
            self.beta = beta
            self.gamma = gamma
            self.alpha = self.alpha + 0.0001*grad_alpha.reshape(-1,1)
            

    def calc_phi(self, beta_v):
        phi = np.exp(self.digamma(self.gamma) - self.digamma(np.sum(self.gamma))).reshape(self.K,1)
        phi = beta_v*phi
        phi = phi/(np.sum(phi, axis = 0) + self.eps)
        return phi

In [125]:
model = LDA(myDict , 5)

In [None]:
model.train(list(data.iloc[0:200].values))

epoch :  0  doc init:  15
epoch :  0 made phi 15
epoch :  0 made gamma 15


In [123]:
model.beta.mean()

0.0001453065963037513