In [4]:
# Import packages
import pandas as pd
import numpy as np
import datetime
from collections import defaultdict
import nltk
import random


In [7]:
stemmer = nltk.stem.snowball.SnowballStemmer("english")
data_dir = "/Users/yangsong/Desktop/Projects/gitrepo_songyang0716/Topic_Modeling/reviews_small.txt"
np.random.seed(666)

In [8]:
# read review texts
reviews = []
f = open(data_dir, "r")
for review in f:
    reviews.append(review)
random.shuffle(reviews)

In [9]:
# process text
# tokenize, lower, remove stop words, stem, then only keep alphabets in the string
clean_reviews = []
for review in reviews:
    s = nltk.word_tokenize(review)
    s = [word.lower() for word in s]
    s = [word for word in s if not word in set(
        nltk.corpus.stopwords.words('english'))]
    s = [stemmer.stem(word) for word in s if word.isalpha()]
    clean_reviews.append(" ".join(s))

In [10]:
len(clean_reviews)

147

In [11]:
reviews[0]

"This is the second time I've used APlus for plumbing issues. Juan and Miguel once again resolved our problem with ease and efficiency. I was happy to see them waiting for me once I got home from work at 4:07p for my 4p-6 appointment window.\n"

In [12]:
clean_reviews[0]

'second time use aplus plumb issu juan miguel resolv problem eas effici happi see wait got home work appoint window'

In [138]:
min(6, 20)

6

In [140]:
# extract all the unique biterms from the reviews
# BTM directly models the word cooccurrence patterns based on biterms
# A biterm denotes an unordered unique word-pair co-occurring in a short context, each context in our example is a review
biterms = []
unique_words = set()
for clean_review in clean_reviews:
    clean_review = clean_review.split()
    review_length = len(clean_review)
    cur_review_biterms = set()
    for i in range(review_length):
        unique_words.add(clean_review[i])
        # we use a interval of 5, if two words are disance to each other less than 5 positions, than count as a biterms
        for j in range(i+1, min(i+6, len(clean_review))):
            cur_review_biterms.add((clean_review[i], clean_review[j]))
    biterms.extend(list(cur_review_biterms))

In [141]:
len(biterms)

24674

In [177]:
def BTM(biterms, unique_words, num_of_topics, num_of_iterations):
    ####################################################################################
    ### num_of_topics: number of topics to generate                                  ###
    ### number_of_iterations: collapsed gibbs sampling iterations                    ###
    ####################################################################################

    # constant we set for the LD prior (topic distributions in a document)
    DL_ALPHA = 50 / num_of_topics
    # constant we set for the LD prior (word distribution in a topic)
    DL_BETA = 0.01
    # Number of total biterms
    N_BITERMS = len(biterms)

    # Assign a random topic for each biterm
    n_z = np.random.randint(0, num_of_topics, N_BITERMS)
    # Words count over topics
    # Key is word, value is an array of topic counts, use the index to indicate the topic 1 to k
    n_wz = defaultdict(lambda: np.zeros(num_of_topics))
    for index, (w1, w2) in enumerate(biterms):
        n_wz[w1][n_z[index]] += 1
        n_wz[w2][n_z[index]] += 1

    # unlike to LDA model, in the biterm model, each bigram is coming from a specific topic
    # biterm_topic = np.zeros((N_BITERMS, num_of_topics))
    for iteration in range(num_of_iterations):
        print (iteration)
        for index, (w1, w2) in enumerate(biterms):
            cur_topic = n_z[index]
            # give a -1 class to the current biterm, means we ignore the current biterm
            # n_z[index] = -1
            n_wz[w1][cur_topic] -= 1
            n_wz[w2][cur_topic] -= 1

            # nz = np.unique(n_z, return_counts=True)[1][1:]
            nz = np.bincount(n_z, minlength=num_of_topics)
            nz[cur_topic] -= 1
            n_w1z = n_wz[w1]
            n_w2z = n_wz[w2]
#             print(n_w1z)
#             print(n_w2z)
#             print(nz)
            z_posterior = (nz + DL_ALPHA) * (n_w1z + DL_BETA) * (n_w2z + DL_BETA) / np.sum(
                (2 * nz + len(unique_words) * DL_BETA) * (2 * nz + len(unique_words) * DL_BETA))
            topic_prob = z_posterior / np.sum(z_posterior)
            topic_selection = np.argmax(
                np.random.multinomial(n=1, pvals=topic_prob, size=1))

            n_z[index] = topic_selection
            n_wz[w1][topic_selection] += 1
            n_wz[w2][topic_selection] += 1

    # return the topic assignment for each biterm
    return n_z 

In [178]:
review = BTM(biterms, unique_words, 3, 50)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
