### Gibbs 采样原版

In [8]:
import numpy as np
import scipy.io as sio
import sys
import logging
import time

# Module-level random generator with a fixed seed (666); the sampling
# functions in this file draw from it via the global name `rng`.
rng = np.random.RandomState(seed=666)

In [3]:
# Compute the log-likelihood
def loglikelihood(doc_topic_matrix, topic_word_matrix, doc_word_matrix, alpha, beta):
    """Return the log-likelihood of the observed (word, document) pairs.

    Args:
        doc_topic_matrix: array of shape (num_doc, K) holding topic counts
            per document.
        topic_word_matrix: array of shape (num_voc, K) holding topic counts
            per word.
        doc_word_matrix: matrix of shape (num_voc, num_doc). Only the
            positions returned by its ``nonzero()`` are used; the stored
            values themselves are not read.
        alpha: smoothing constant added to the document-topic counts.
        beta: smoothing constant added to the topic-word counts.

    Returns:
        A tuple ``(logll, theta, phi)`` where ``theta`` has shape
        (num_doc, K) with rows summing to 1, ``phi`` has shape (K, num_voc)
        with rows summing to 1, and ``logll`` is the sum over all nonzero
        (word, doc) positions of ``log(theta[doc, :] . phi[:, word])``.
        ``logll`` is 0.0 when the matrix has no nonzero entries.
    """
    word_list, doc_list = doc_word_matrix.nonzero()

    # The builtin ``float`` replaces the ``np.float`` alias, which no longer
    # exists in current NumPy releases.
    theta = np.asarray(doc_topic_matrix, dtype=float) + alpha
    theta /= theta.sum(axis=1, keepdims=True)

    phi = np.asarray(topic_word_matrix, dtype=float).T + beta
    phi /= phi.sum(axis=1, keepdims=True)

    # Evaluate the per-pair dot products in vectorised chunks: this avoids a
    # Python-level loop while keeping the temporary (chunk, K) arrays bounded.
    chunk_size = 65536
    logll = np.float64(0.0)
    for start in range(0, len(word_list), chunk_size):
        docs = doc_list[start:start + chunk_size]
        words = word_list[start:start + chunk_size]
        pair_prob = np.einsum('ik,ki->i', theta[docs], phi[:, words])
        logll += np.log(pair_prob).sum()

    return logll, theta, phi

In [4]:
def baseline_lda(doc_word_matrix, K, iters, alpha, beta):
    """Run a plain Gibbs sampler for LDA, logging timing and log-likelihood.

    Every nonzero position of ``doc_word_matrix`` is treated as one token
    with a single topic assignment. Each sweep resamples every assignment
    from its conditional distribution given all the other assignments.

    Relies on the module-level names ``rng`` (random generator), ``logger``
    and ``loglikelihood``.

    Args:
        doc_word_matrix: matrix of shape (num_voc, num_doc) exposing
            ``shape`` and ``nonzero()``.
        K: number of topics.
        iters: number of full sweeps over all tokens.
        alpha: smoothing constant for document-topic counts.
        beta: smoothing constant for topic-word counts.

    Returns:
        None. Progress is reported through ``logger`` only.
    """
    num_voc, num_doc = doc_word_matrix.shape
    # The builtin ``int`` replaces the ``np.int`` alias, which no longer
    # exists in current NumPy releases.
    doc_topic_matrix = np.zeros([num_doc, K], dtype=int)
    topic_word_matrix = np.zeros([num_voc, K], dtype=int)
    topic_sum = np.zeros(K)
    Vbeta = num_voc * beta  # loop-invariant part of the denominator

    # Random initial topic for every nonzero (word, doc) position.
    word_list, doc_list = doc_word_matrix.nonzero()
    doc_word_assigned_topic = np.zeros(doc_word_matrix.shape, dtype=int)
    for word_indx, doc_indx in zip(word_list, doc_list):
        assigned_topic = rng.randint(K)
        doc_word_assigned_topic[word_indx, doc_indx] = assigned_topic
        doc_topic_matrix[doc_indx, assigned_topic] += 1
        topic_word_matrix[word_indx, assigned_topic] += 1
        topic_sum[assigned_topic] += 1

    st = time.time()
    logger.info('start training with baseline_lda k=%d max_iter=%d', K, iters)

    for i in range(iters):
        for word_indx, doc_indx in zip(word_list, doc_list):
            assigned_topic = doc_word_assigned_topic[word_indx, doc_indx]

            # Take the current token out of the counts before resampling it.
            doc_topic_matrix[doc_indx, assigned_topic] -= 1
            topic_word_matrix[word_indx, assigned_topic] -= 1
            topic_sum[assigned_topic] -= 1

            conditional_probability = (
                (topic_word_matrix[word_indx] + beta)
                / (topic_sum + Vbeta)
                * (doc_topic_matrix[doc_indx] + alpha)
            )
            conditional_probability = conditional_probability / conditional_probability.sum()
            # Draw a scalar directly; wrapping a 1-element array in int() is
            # deprecated in recent NumPy.
            new_assigned_topic = int(rng.choice(K, p=conditional_probability))

            doc_word_assigned_topic[word_indx, doc_indx] = new_assigned_topic
            doc_topic_matrix[doc_indx, new_assigned_topic] += 1
            topic_word_matrix[word_indx, new_assigned_topic] += 1
            topic_sum[new_assigned_topic] += 1

        iter_time = time.time() - st
        st = time.time()

        ll, theta, phi = loglikelihood(doc_topic_matrix, topic_word_matrix, doc_word_matrix, alpha, beta)
        ll_time = time.time() - st
        st = time.time()
        logger.info('iter %d sampling_time %f loglikelihood_time %f ll %f', i, iter_time, ll_time, ll)

### sparse lda

In [2]:
def _pick_topic(weights, u):
    """Return the first index at which the running total of ``weights`` reaches ``u``.

    If rounding leaves ``u`` slightly above the total mass, the last index
    with positive weight is returned (or the last index if none is positive),
    so a valid topic is always produced.
    """
    idx = int(np.searchsorted(np.cumsum(weights), u, side='left'))
    if idx >= len(weights):
        positive = np.flatnonzero(weights > 0)
        idx = int(positive[-1]) if positive.size else len(weights) - 1
    return idx


def sparse_lda(doc_word_matrix, K, iters, alpha, beta):
    """Run a Gibbs sampler for LDA using a three-bucket split of the conditional.

    The unnormalised conditional weight of topic ``t`` for a token is split as

        alpha * beta / (n_t + V*beta)            -> bucket "s" (``ssum``)
        n_td * beta / (n_t + V*beta)             -> bucket "r" (``rsum``)
        (alpha + n_td) * n_wt / (n_t + V*beta)   -> bucket "q" (``qsum``)

    where ``n_t`` is ``topic_sum[t]``, ``n_td`` the document-topic count and
    ``n_wt`` the word-topic count. A uniform draw over the total mass selects
    a bucket, then a topic inside that bucket.

    Relies on the module-level names ``rng`` (random generator), ``logger``
    and ``loglikelihood``.

    Args:
        doc_word_matrix: matrix of shape (num_voc, num_doc) exposing
            ``shape``, ``nonzero()`` and ``getcol()``.
        K: number of topics.
        iters: number of full sweeps over all documents.
        alpha: smoothing constant for document-topic counts.
        beta: smoothing constant for topic-word counts.

    Returns:
        None. Progress is reported through ``logger`` only.
    """
    num_voc, num_doc = doc_word_matrix.shape
    # The builtin ``int`` replaces the ``np.int`` alias, which no longer
    # exists in current NumPy releases.
    doc_topic_matrix = np.zeros((num_doc, K), dtype=int)
    topic_word_matrix = np.zeros((num_voc, K), dtype=int)
    topic_sum = np.zeros(K)
    Vbeta = num_voc * beta

    # random word topic assignment initialization
    word_list, doc_list = doc_word_matrix.nonzero()
    doc_word_assigned_topic = np.zeros(doc_word_matrix.shape, dtype=int)
    for word_indx, doc_indx in zip(word_list, doc_list):
        assigned_topic = rng.randint(K)
        doc_word_assigned_topic[word_indx, doc_indx] = assigned_topic
        doc_topic_matrix[doc_indx, assigned_topic] += 1
        topic_word_matrix[word_indx, assigned_topic] += 1
        topic_sum[assigned_topic] += 1

    st = time.time()
    logger.info('start training with sparse_lda k=%d max_iter=%d', K, iters)

    for i in range(iters):
        for doc_indx in range(num_doc):
            # Rebuild the cached quantities from the counts for this document.
            # This costs O(K), like the per-document work it replaces, and
            # keeps rounding error from accumulating across documents.
            inv_denominator = 1.0 / (topic_sum + Vbeta)
            ssum = alpha * beta * inv_denominator.sum()
            rsum = beta * (doc_topic_matrix[doc_indx] * inv_denominator).sum()
            q1 = (alpha + doc_topic_matrix[doc_indx]) * inv_denominator

            current_doc = doc_word_matrix.getcol(doc_indx)
            has_word_indices = current_doc.nonzero()[0]

            for word_indx in has_word_indices:
                assigned_topic = doc_word_assigned_topic[word_indx, doc_indx]
                # remove chosen word-topic pair
                doc_topic_matrix[doc_indx, assigned_topic] -= 1
                topic_word_matrix[word_indx, assigned_topic] -= 1
                topic_sum[assigned_topic] -= 1

                # Update the bucket sums for the topic that lost a token.
                # Before the decrement its denominator was `denominator + 1`
                # and its document count was `nt_d + 1`.
                denominator = topic_sum[assigned_topic] + Vbeta
                nt_d = doc_topic_matrix[doc_indx, assigned_topic]
                ssum += alpha * beta * (1 / denominator - 1 / (denominator + 1))
                rsum += beta * (nt_d / denominator - (nt_d + 1) / (denominator + 1))
                q1[assigned_topic] = (alpha + nt_d) / denominator

                p = topic_word_matrix[word_indx] * q1
                qsum = p.sum()

                U = rng.rand() * (ssum + rsum + qsum)

                if U < ssum:
                    # Smoothing-only bucket; weights carry the alpha*beta
                    # factor so they are on the same scale as U.
                    new_assigned_topic = _pick_topic(
                        alpha * beta / (topic_sum + Vbeta), U)
                elif U < (ssum + rsum):
                    # Document-topic bucket.
                    new_assigned_topic = _pick_topic(
                        beta * doc_topic_matrix[doc_indx] / (topic_sum + Vbeta),
                        U - ssum)
                else:
                    # Topic-word bucket.
                    new_assigned_topic = _pick_topic(p, U - (ssum + rsum))

                # Update the bucket sums for the topic that gains the token,
                # using that topic's own denominator and document count.
                denominator = topic_sum[new_assigned_topic] + Vbeta
                nt_d = doc_topic_matrix[doc_indx, new_assigned_topic]
                ssum += alpha * beta * (1 / (denominator + 1) - 1 / denominator)
                rsum += beta * ((nt_d + 1) / (denominator + 1) - nt_d / denominator)
                q1[new_assigned_topic] = (alpha + nt_d + 1) / (denominator + 1)

                doc_word_assigned_topic[word_indx, doc_indx] = new_assigned_topic
                doc_topic_matrix[doc_indx, new_assigned_topic] += 1
                topic_word_matrix[word_indx, new_assigned_topic] += 1
                topic_sum[new_assigned_topic] += 1

        iter_time = time.time() - st
        st = time.time()

        ll, theta, phi = loglikelihood(doc_topic_matrix, topic_word_matrix, doc_word_matrix, alpha, beta)
        ll_time = time.time() - st
        st = time.time()
        logger.info('iter %d sampling_time %f loglikelihood_time %f ll %f', i, iter_time, ll_time, ll)

### spark lda