# In [1]:
import csv
import heapq
import numpy as np
from scipy.special import digamma
import matplotlib.pyplot as plt

# Load the count matrix from CSV.  The first row is discarded without being
# parsed; every remaining row is converted to a list of integers.
fname = 'DATA/lda.csv'
with open(fname, 'r') as fobj:  # the handle is closed even if parsing fails
    itere = csv.reader(fobj)
    next(itere, None)  # drop the first row; the default avoids StopIteration on an empty file
    A = np.asarray([[int(x) for x in row] for row in itere])

# How many topics to fit.
K = 3

# Random starting factors, each row drawn from a symmetric Dirichlet with
# concentration 1:
#   item_topic_factors -> one row per row of A, K columns
#   topic_word_factors -> K rows, one column per column of A
num_items = A.shape[0]
num_words = A.shape[1]
item_topic_factors = np.random.dirichlet(np.ones(K), size=num_items)
topic_word_factors = np.random.dirichlet(np.ones(num_words), size=K)

# First update pass, starting from the random initial factors.

# Map both factor matrices through exp(digamma(x) - digamma(1)).
item_topic_factors_new = np.exp(digamma(item_topic_factors) - digamma(1))
topic_word_factors_new = np.exp(digamma(topic_word_factors) - digamma(1))

# LAMBDA[i, j, :] is the elementwise product of row i of the item-topic
# weights with column j of the topic-word weights, rescaled so that the K
# entries sum to 1.  Broadcasting builds all (i, j) pairs at once.
LAMBDA = (item_topic_factors_new[:, np.newaxis, :]
          * topic_word_factors_new.T[np.newaxis, :, :])
LAMBDA = LAMBDA / LAMBDA.sum(axis=2)[:, :, np.newaxis]

# Weight every LAMBDA[i, j, :] by the observed count A[i, j].
weighted = A[:, :, np.newaxis] * LAMBDA

# updating factors by variational inference
# update item-topic factors: sum over the word axis -> shape (items, K)
item_topic_factors[...] = weighted.sum(axis=1)
# update topic-word factors: sum over the item axis, transposed -> (K, words)
topic_word_factors[...] = weighted.sum(axis=0).T


def variational_inference(item_topic_factors, topic_word_factors, counts=None):
    """Run one update pass over both factor matrices.

    For every (item i, word j) pair the K-vector
    ``item_topic_factors[i, :] * topic_word_factors[:, j]`` is normalised to
    sum to 1, weighted by ``counts[i, j]``, and the weighted values are then
    summed over words to give the new item-topic factors and over items to
    give the new topic-word factors.

    NOTE: both input arrays are overwritten in place and the very same
    objects are returned.  A caller that needs the previous values (for
    example to test for convergence) must copy them before calling.

    Args:
        item_topic_factors: array indexed as ``[item, topic]``.
        topic_word_factors: array indexed as ``[topic, word]``.
        counts: array indexed as ``[item, word]``.  Defaults to the
            module-level matrix ``A`` when omitted.

    Returns:
        The tuple ``(item_topic_factors, topic_word_factors)`` holding the
        updated values.
    """
    if counts is None:
        counts = A  # fall back to the module-level data matrix
    counts = np.asarray(counts)

    # lam[i, j, :] = normalised product of item row i and word column j.
    lam = (item_topic_factors[:, np.newaxis, :]
           * topic_word_factors.T[np.newaxis, :, :])
    lam = lam / lam.sum(axis=2)[:, :, np.newaxis]

    # Weight each lam[i, j, :] by how often word j occurs in item i.
    weighted = counts[:, :, np.newaxis] * lam

    # update item-topic factors: sum over the word axis -> (items, topics)
    item_topic_factors[...] = weighted.sum(axis=1)
    # update topic-word factors: sum over the item axis -> (topics, words)
    topic_word_factors[...] = weighted.sum(axis=0).T

    return (item_topic_factors, topic_word_factors)

# Repeat the update until neither factor matrix changes any more.
#
# variational_inference() may overwrite the arrays it is given, so the
# previous values are snapshotted before each call; comparing the returned
# arrays against the live inputs would compare an array with itself and
# stop after a single pass.  Both matrices are checked, with a floating-point
# tolerance, and the iteration count is capped so the loop always terminates.
MAX_ITERATIONS = 1000
for _ in range(MAX_ITERATIONS):
    previous_item_topic = item_topic_factors.copy()
    previous_topic_word = topic_word_factors.copy()
    item_topic_factors_, topic_word_factors_ = variational_inference(item_topic_factors, topic_word_factors)
    item_topic_factors = item_topic_factors_
    topic_word_factors = topic_word_factors_
    if (np.allclose(item_topic_factors, previous_item_topic)
            and np.allclose(topic_word_factors, previous_topic_word)):
        break

# Round each factor to the nearest integer, then divide by the grand total
# of the whole rounded matrix (not per row).
temp1 = np.around(item_topic_factors)
item_topic_factors = temp1/temp1.sum()
temp2 = np.around(topic_word_factors)
topic_word_factors = temp2/temp2.sum()

# plot topics
# plt.subplot(311)
# plt.plot(range(15), topic_word_factors[0])
# plt.subplot(312)
# plt.plot(range(15), topic_word_factors[1])
# plt.subplot(313)
# plt.plot(range(15), topic_word_factors[2])
#
# plt.show()
# Print the n highest-weighted vocabulary words for every topic (one line
# per row of topic_word_factors, numbered from 1).
n = 4
vocab = ['Goal', 'RM','FCB','CR7','ball','GDP','CPI','Euro','APR','Tax','PP','PSOE','vote','poll','Presi']
for topic_no, word_weights in enumerate(topic_word_factors, 1):
    # Rank the column indices by their weight and keep the n best.
    top_indices = heapq.nlargest(n, range(len(word_weights)), key=word_weights.take)
    top_words = [vocab[idx] for idx in top_indices]
    # A single pre-formatted string prints identically on Python 2 and 3.
    print("Top %d Words for Topic %d :  %s" % (n, topic_no, top_words))


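# Sample output from one run (the factors are randomly initialised, so
# results vary between runs):
# Top 4 Words for Topic 1 :  ['PSOE', 'Presi', 'Euro', 'poll']
# Top 4 Words for Topic 2 :  ['PP', 'GDP', 'Tax', 'RM']
# Top 4 Words for Topic 3 :  ['Goal', 'CPI', 'APR', 'FCB']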
