In [49]:
import numpy as np
import scipy as sp
import pandas as pd
import random

In [50]:
f= open('voc', 'r')
voc = f.read().strip().split("\n")

In [51]:
def parse_vocabulary(vocab):
    """Parse the vocabulary set into two dictionaries, word with its index and index with its
word. When input is null, output should be null."""
    type_to_index = {}
    index_to_type = {}
    for word in set(vocab):
        index_to_type[len(index_to_type)] = word
        type_to_index[word] = len(type_to_index)        
    return (type_to_index, index_to_type)

In [52]:
# true data
type_to_index = parse_vocabulary(voc)[0]
index_to_type = parse_vocabulary(voc)[1]

In [53]:
ff = open('train', 'r')
train = ff.read().strip().split("\n")
fff = open('test', 'r')
test = fff.read().strip().split("\n")

In [54]:
def parse_data(corpus):
    """Parse the corpus into a list of documents. Find out the index of each word in each document and count its number of appearance in each document. When input is null, output should be null."""
    word_ids = []
    word_cts = []     
    for document_line in corpus:
        document_word_dict = {}
        for token in document_line.split():
            if token not in type_to_index:
                continue                
            type_id = type_to_index[token]
            if type_id not in document_word_dict:
                document_word_dict[type_id] = 0
            document_word_dict[type_id] += 1

        word_ids.append(np.array(list(document_word_dict.keys())))
        word_cts.append(np.array(list(document_word_dict.values()))[np.newaxis, :])

    return (word_ids, word_cts)

In [55]:
def compute_dirichlet_expectation(dirichlet_parameter):
    """Calculate the expectation of dirichlet parameter. When input is null, output should be a warning that the input is null."""
    if not np.array(dirichlet_parameter).size:
        return ("The dirichlet_parameter is null.")
    if (len(dirichlet_parameter.shape) == 1):
        return (sp.special.psi(dirichlet_parameter)-sp.special.psi(np.sum(dirichlet_parameter)))
    return (sp.special.psi(dirichlet_parameter) - sp.special.psi(np.sum(dirichlet_parameter, 1))[:, np.newaxis])

### Tests of function parse_vocabulary, parse_data, and compute_dirichlet_expectation, for both common and edge cases. Test results are consistent with the expectations which are described in docstrings.###

In [56]:
voc_test = ["year","state","new","percent","peopl","report","million","govern","presid"]
parse_vocabulary(voc_test)

({'govern': 6,
  'million': 4,
  'new': 8,
  'peopl': 1,
  'percent': 7,
  'presid': 2,
  'report': 3,
  'state': 0,
  'year': 5},
 {0: 'state',
  1: 'peopl',
  2: 'presid',
  3: 'report',
  4: 'million',
  5: 'year',
  6: 'govern',
  7: 'percent',
  8: 'new'})

In [57]:
voc_test = []
parse_vocabulary(voc_test)

({}, {})

In [58]:
corpus_test = ["mauric adult adult peopl year h last year resolv polic polic polic polic polic polic polic motiv appear cri christian christian three troubl pride farley farley friday lock bureau gun gun gun church marino marino","work posit reach reach peopl year thought million million million presid govern year year first percent state appear abl time billion told resid complet day develop made made made made wednesday wednesday","work offer shoot thought million far year year polic polic polic polic polic polic polic polic polic cash appear abl sophist told told woman woman arrest retain found day monday citi administr prepar fame art gun"]
parse_data(corpus_test)

([array([5953, 2309, 1478, 6337, 2763, 4942, 5037, 1936,  849, 4954, 2140,
         5658, 5471, 5672, 1351, 3309,  637, 4339, 3638, 2746, 4924,  253]),
  array([1478, 4339, 1737, 5514, 4428, 3790,  591, 2577, 3991,  420, 2383,
         3292, 1054, 5471, 2596, 3654,  166, 5265, 5913, 2926,  754, 1395,
         3190]),
  array([3872, 5573, 3654, 1737, 4942, 2383, 2128, 4945, 5710, 5460, 1109,
           86, 3288, 3292, 5471, 4768, 4835, 2596,  582,  166, 1478, 5037,
         3569, 1780, 4169])],
 [array([[1, 1, 1, 1, 1, 7, 3, 1, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 2]]),
  array([[1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 1, 2, 2, 1, 1, 1]]),
  array([[1, 1, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1,
          2, 1, 1]])])

In [59]:
corpus_test = []
parse_data(corpus_test)

([], [])

In [60]:
number_of_topics = 100
number_of_types = len(type_to_index)
eta = np.random.gamma(100., 1./100., (number_of_topics,number_of_types))
compute_dirichlet_expectation(eta)

array([[-9.52796673, -9.60434618, -9.42120346, ..., -9.44474476,
        -9.24287867, -9.51159128],
       [-9.240929  , -9.49762594, -9.34999158, ..., -9.08952047,
        -9.37434663, -9.16787778],
       [-9.11738392, -9.49026457, -9.13080326, ..., -9.86859296,
        -9.20836237, -9.19514557],
       ..., 
       [-9.80143616, -9.20949025, -9.42926963, ..., -9.26464589,
        -9.17977571, -9.41576343],
       [-9.12376768, -9.7927182 , -9.6159151 , ..., -9.43533907,
        -9.31465568, -9.15600508],
       [-9.4725741 , -9.17515834, -9.43266649, ..., -9.49540765,
        -9.58461425, -9.53051858]])

In [61]:
eta_test = []
compute_dirichlet_expectation(eta_test)

'The dirichlet_parameter is null.'

In [62]:
# Initialization
parsed_corpus = parse_data(train)
test_corpus = parse_data(test)
number_of_documents = len(parsed_corpus[0])

alpha_alpha = np.zeros(number_of_topics) + 1/number_of_topics
alpha_beta = np.zeros(number_of_types) + 1/number_of_types
gamma = np.zeros((number_of_documents, number_of_topics)) + alpha_alpha[np.newaxis, :] + 1.0 * number_of_types / number_of_topics       

hyper_parameter_optimize_interval=1

In [63]:
def e_step(corpus=None,local_parameter_iteration=50,number_of_topics=number_of_topics,
local_parameter_converge_threshold=1e-6):
    """E step. When input is None, output should be document_log_likelihood, phi_sufficient_statistics, and gamma. Otherwise, it should be words_log_likelihood, gamma_values."""
    if corpus == None:
        word_ids = parsed_corpus[0]
        word_cts = parsed_corpus[1]
    else:
        word_ids = corpus[0]
        word_cts = corpus[1]
    # Initialization 
    number_of_documents = len(word_ids)
    document_log_likelihood = 0
    words_log_likelihood = 0
    phi_sufficient_statistics = np.zeros((number_of_topics, number_of_types))
    gamma_values = np.zeros((number_of_documents, number_of_topics)) + alpha_alpha[np.newaxis, :] + 1.0 * number_of_types / number_of_topics
    E_log_eta = compute_dirichlet_expectation(eta)
    if parsed_corpus != None:
        E_log_prob_eta = E_log_eta - sp.misc.logsumexp(E_log_eta, axis=1)[:, np.newaxis]

    # iterate over all documents
    for doc_id in np.random.permutation(number_of_documents):
        # compute the total number of words
        total_word_count = np.sum(word_cts[doc_id])
        # initialize gamma for this document
        gamma_values[doc_id, :] = alpha_alpha + 1.0 * total_word_count / number_of_topics

        term_ids = word_ids[doc_id]
        term_counts = word_cts[doc_id]

        # update phi and gamma until gamma converges
        for gamma_iteration in range(local_parameter_iteration):
            log_phi = E_log_eta[:, term_ids].T + np.tile(sp.special.psi(gamma_values[doc_id, :]), (word_ids[doc_id].shape[0], 1))
            log_phi -= sp.misc.logsumexp(log_phi, axis=1)[:, np.newaxis]
            gamma_update = alpha_alpha + np.array(np.sum(np.exp(log_phi + np.log(np.repeat(term_counts, number_of_topics, axis=0).T)), axis=0))
            mean_change = np.mean(abs(gamma_update - gamma_values[doc_id, :]))
            gamma_values[doc_id, :] = gamma_update
            if mean_change <= local_parameter_converge_threshold:
                break

        # compute the alpha, gamma, and phi terms
        document_log_likelihood += sp.special.gammaln(np.sum(alpha_alpha)) - np.sum(sp.special.gammaln(alpha_alpha))
        document_log_likelihood += np.sum(sp.special.gammaln(gamma_values[doc_id, :])) - sp.special.gammaln(np.sum(gamma_values[doc_id, :]))
        document_log_likelihood -= np.sum(np.dot(term_counts, np.exp(log_phi) * log_phi))

# compute the p(w_{dn} | z_{dn}, \eta) terms, which will be cancelled during M-step
        words_log_likelihood += np.sum(np.exp(log_phi.T + np.log(term_counts)) * E_log_prob_eta[:, term_ids])      
        phi_sufficient_statistics[:, term_ids] += np.exp(log_phi + np.log(term_counts.transpose())).T
        
    if corpus == None:
        gamma = gamma_values
        return (document_log_likelihood, phi_sufficient_statistics, gamma)
    else:
        return (words_log_likelihood, gamma_values)

In [64]:
def m_step(phi_sufficient_statistics):
    """M step. When input is null, output should be a warning that the input is null."""
    if not np.array(phi_sufficient_statistics).size:
        return ("The input is null.")
    # compute the beta and the eta terms
    topic_log_likelihood = number_of_topics * (sp.special.gammaln(np.sum(alpha_beta)) - np.sum(sp.special.gammaln(alpha_beta)))
    topic_log_likelihood += np.sum(np.sum(sp.special.gammaln(eta), axis=1) - sp.special.gammaln(np.sum(eta, axis=1)))

    eta_temp = phi_sufficient_statistics + alpha_beta

    # compute the sufficient statistics for alpha and update
    alpha_sufficient_statistics = sp.special.psi(gamma) - sp.special.psi(np.sum(gamma, axis=1)[:, np.newaxis])
    alpha_sufficient_statistics = np.sum(alpha_sufficient_statistics, axis=0)  

    return (topic_log_likelihood, alpha_sufficient_statistics, eta_temp)

### Compute likelihood and sufficient statistics. Also serve as a testing when input of e_step is null. ###

In [65]:
document_log_likelihood, phi_sufficient_statistics, gamma = e_step()

### Implement M step to fit the model and test function m_step when input is null.###

In [66]:
topic_log_likelihood, alpha_sufficient_statistics, eta = m_step(phi_sufficient_statistics)
phi_sufficient_statistics_test = []
m_step(phi_sufficient_statistics_test)

'The input is null.'

In [67]:
def optimize_hyperparameters(alpha_sufficient_statistics, hyper_parameter_iteration=100, hyper_parameter_decay_factor=0.9, hyper_parameter_maximum_decay=10,alpha=alpha_alpha, hyper_parameter_converge_threshold=1e-6):
    """Optimize hyperparameter alpha. Since the function's input is the result of m_step function, it is unnecessary and difficult to test it. Just use it."""
    alpha_update = alpha        
    decay = 0
    for alpha_iteration in range(hyper_parameter_iteration):
        alpha_gradient = number_of_documents * (sp.special.psi(np.sum(alpha)) - sp.special.psi(alpha)) + alpha_sufficient_statistics
        alpha_hessian = -number_of_documents * sp.special.polygamma(1,alpha)

        sum_g_h = np.sum(alpha_gradient / alpha_hessian)
        sum_1_h = 1.0 / alpha_hessian
        z = number_of_documents * sp.special.polygamma(1, np.sum(alpha))
        c = sum_g_h / (1.0 / z + sum_1_h)

        # update the alpha vector
        while True:
            singular_hessian = False
            step_size = np.power(hyper_parameter_decay_factor, decay) * (alpha_gradient - c) / alpha_hessian               
            if np.any(alpha <= step_size):
                singular_hessian = True
            else: alpha_update = alpha - step_size

            if singular_hessian:
                decay += 1
                if decay > hyper_parameter_maximum_decay:
                    break
            else: break

        # check the alpha converge criteria
        mean_change = np.mean(abs(alpha_update - alpha))
        alpha = alpha_update
        if mean_change <= hyper_parameter_converge_threshold:
            break

    return (alpha)

In [68]:
alpha_alpha = optimize_hyperparameters(alpha_sufficient_statistics)
words_log_likelihood, corpus_gamma_values = e_step(corpus = test_corpus)

### For every topic, find out the top 20 words that are most likely to appear in this topic. ###

In [69]:
E_log_eta = compute_dirichlet_expectation(eta)
topic = []
for topic_index in range(number_of_topics):
    temp_list = []
    beta_probability = np.exp(E_log_eta[topic_index, :] - sp.misc.logsumexp(E_log_eta[topic_index, :]))
    i = 0
    for type_index in reversed(np.argsort(beta_probability)):
        i += 1
        if i <= 20:
            temp_list.append(index_to_type[type_index])
        else: break
    topic.append(temp_list)

index = random.sample(range(number_of_topics),10)
df = pd.DataFrame({"topic0":topic[index[9]],"topic1":topic[index[0]],"topic2":topic[index[1]],"topic3":topic[index[2]],"topic4":topic[index[3]],"topic5":topic[index[4]],"topic6":topic[index[5]],"topic7":topic[index[6]],"topic8":topic[index[7]],"topic9":topic[index[8]]})

In [70]:
df

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9
0,cent,govern,year,polic,south,year,chrysler,year,mandela,year
1,new,school,percent,two,two,new,year,state,govern,percent
2,futur,new,peopl,hospit,year,peopl,compani,last,say,billion
3,nation,compani,forc,citi,korea,bush,presid,peopl,children,report
4,york,state,state,state,africa,nomin,unit,presid,report,bank
5,year,year,unit,day,countri,white,shell,new,relea,govern
6,rate,peopl,billion,custom,state,news,travel,say,presid,tax
7,market,million,bush,report,million,state,million,hou,vote,state
8,higher,offici,countri,offici,last,issu,plant,time,nation,million
9,million,presid,million,school,african,presid,motor,make,year,last


### Change the number of topics and compare the results.###

**The reason why the following code is not wrapped into a function is that many global variables are used and there will be conflicts if putting it in a function.**

In [75]:
number_of_topics = 10
alpha_alpha = np.zeros(number_of_topics) + 1/number_of_topics
alpha_beta = np.zeros(number_of_types) + 1/number_of_types
eta = np.random.gamma(100., 1./100., (number_of_topics,number_of_types))
gamma = np.zeros((number_of_documents, number_of_topics)) + alpha_alpha[np.newaxis, :] + 1.0 * number_of_types / number_of_topics 
document_log_likelihood, phi_sufficient_statistics, gamma = e_step(number_of_topics=number_of_topics)
topic_log_likelihood, alpha_sufficient_statistics, eta = m_step(phi_sufficient_statistics)
alpha_alpha = optimize_hyperparameters(alpha_sufficient_statistics,alpha=alpha_alpha)
words_log_likelihood, corpus_gamma_values = e_step(corpus = test_corpus,
number_of_topics=number_of_topics)
E_log_eta = compute_dirichlet_expectation(eta)
topic = []
for topic_index in range(number_of_topics):
    temp_list = []
    beta_probability = np.exp(E_log_eta[topic_index, :] - sp.misc.logsumexp(E_log_eta[topic_index, :]))
    i = 0
    for type_index in reversed(np.argsort(beta_probability)):
        i += 1
        if i <= 20:
            temp_list.append(index_to_type[type_index])
        else: break
    topic.append(temp_list)

index = random.sample(range(number_of_topics),10)
df = pd.DataFrame({"topic0":topic[index[9]],"topic1":topic[index[0]],"topic2":topic[index[1]],"topic3":topic[index[2]],"topic4":topic[index[3]],"topic5":topic[index[4]],"topic6":topic[index[5]],"topic7":topic[index[6]],"topic8":topic[index[7]],"topic9":topic[index[8]]})
df

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9
0,year,state,two,year,govern,percent,state,year,million,year
1,percent,peopl,million,million,year,peopl,new,percent,year,report
2,last,million,presid,peopl,state,year,year,state,percent,state
3,new,time,state,new,new,parti,polic,report,new,peopl
4,state,report,say,two,percent,million,nation,new,compani,offici
5,govern,govern,year,report,say,govern,peopl,two,presid,hou
6,member,two,new,soviet,presid,new,presid,nation,last,presid
7,nation,year,offici,polic,offici,say,two,last,time,new
8,offici,presid,feder,week,bush,report,last,say,price,time
9,month,nation,compani,govern,report,offici,day,peopl,first,soviet


**If we just have 10 topics, we can see that words in different topics are similar. Because test corpus is not that large and there are not so many key words, the corpus is not fully divided into different topics and different topics will overlap with each other. So when the number of topics is small, those key words will appear repeatedly and the distinctions of topics are not obvious.**