In [1]:
# Import packages
import pandas as pd
import numpy as np
import datetime
from collections import defaultdict
import nltk
import random


In [2]:
stemmer = nltk.stem.snowball.SnowballStemmer("english")
data_dir = "/Users/yangsong/Desktop/Projects/gitrepo_songyang0716/Topic_Modeling/reviews_small.txt"
np.random.seed(666)

In [3]:
# read review texts
reviews = []
f = open(data_dir, "r")
for review in f:
    reviews.append(review)
random.shuffle(reviews)

In [4]:
# process text
# tokenize, lower, remove stop words, stem, then only keep alphabets in the string
clean_reviews = []
for review in reviews:
    s = nltk.word_tokenize(review)
    s = [word.lower() for word in s]
    s = [word for word in s if not word in set(
        nltk.corpus.stopwords.words('english'))]
    s = [stemmer.stem(word) for word in s if word.isalpha()]
    clean_reviews.append(" ".join(s))

In [5]:
len(clean_reviews)

147

In [6]:
reviews[0]

"We came in with a large group and ordered a lot of things. There were shrimp. Oysters. There was calamari (yum!). There were prawns and scallops. There was a chicken piccatta. And honestly, some of it all runs together because we're quite the share everything crowd. But I do remember there being a serious cioppino on the menu, and I tried a bit of the sauce. Spicy, rich, and full of seafood. This is a project, but a tasty one.\n"

In [7]:
clean_reviews[0]

'came larg group order lot thing shrimp oyster calamari yum prawn scallop chicken piccatta honest run togeth quit share everyth crowd rememb serious cioppino menu tri bit sauc spici rich full seafood project tasti one'

In [8]:
min(6, 20)

6

In [9]:
# extract all the unique biterms from the reviews
# BTM directly models the word cooccurrence patterns based on biterms
# A biterm denotes an unordered unique word-pair co-occurring in a short context, each context in our example is a review
biterms = []
unique_words = set()
for clean_review in clean_reviews:
    clean_review = clean_review.split()
    review_length = len(clean_review)
    cur_review_biterms = set()
    for i in range(review_length):
        unique_words.add(clean_review[i])
        # we use a interval of 5, if two words are disance to each other less than 5 positions, than count as a biterms
        for j in range(i+1, min(i+6, len(clean_review))):
            cur_review_biterms.add((clean_review[i], clean_review[j]))
    biterms.extend(list(cur_review_biterms))

In [10]:
len(biterms)

24674

In [11]:
def BTM(biterms, unique_words, num_of_topics, num_of_iterations):
    ####################################################################################
    ### num_of_topics: number of topics to generate                                  ###
    ### number_of_iterations: collapsed gibbs sampling iterations                    ###
    ####################################################################################

    # constant we set for the LD prior (topic distributions in a document)
    DL_ALPHA = 1
    # constant we set for the LD prior (word distribution in a topic)
    DL_BETA = 0.01
    # Number of total biterms
    N_BITERMS = len(biterms)

    # Assign a random topic for each biterm
    n_z = np.random.randint(0, num_of_topics, N_BITERMS)
    n_topics = np.bincount(n_z, minlength=num_of_topics)

    # Words count over topics
    # Key is word, value is an array of topic counts, use the index to indicate the topic 1 to k
    n_wz = defaultdict(lambda: np.zeros(num_of_topics))
    for index, (w1, w2) in enumerate(biterms):
        n_wz[w1][n_z[index]] += 1
        n_wz[w2][n_z[index]] += 1

    # unlike to LDA model, in the biterm model, each bigram is coming from a specific topic
    # biterm_topic = np.zeros((N_BITERMS, num_of_topics))
    for iteration in range(num_of_iterations):
        for index, (w1, w2) in enumerate(biterms):
            #             cur_topic = n_z[index]
            n_wz[w1][n_z[index]] -= 1
            n_wz[w2][n_z[index]] -= 1

            n_topics[n_z[index]] -= 1
            n_w1z = n_wz[w1]
            n_w2z = n_wz[w2]

            z_posterior = np.zeros(num_of_topics)
#             z_posterior = (n_topics + DL_ALPHA) * (n_w1z + DL_BETA) * (n_w2z + DL_BETA) / np.sum(
#                 (2 * n_topics + len(unique_words) * DL_BETA) * (2 * n_topics + len(unique_words) * DL_BETA))
            for z in range(num_of_topics):
                z_posterior[z] = (n_topics[z] + DL_ALPHA) * (n_w1z[z] + DL_BETA) * (n_w2z[z] + DL_BETA) / np.sum(
                    (2 * n_topics[z] + len(unique_words) * DL_BETA) * (2 * n_topics[z] + len(unique_words) * DL_BETA))

            topic_prob = z_posterior / np.sum(z_posterior)
            topic_selection = np.argmax(
                np.random.multinomial(n=1, pvals=topic_prob, size=1))
            n_z[index] = topic_selection
            n_topics[topic_selection] += 1
            n_wz[w1][topic_selection] += 1
            n_wz[w2][topic_selection] += 1

   # return the topic assignment for each biterm and the topic distribution of each bigram
    return n_z, n_wz

In [17]:
n_z, n_wz = BTM(biterms, unique_words, 3, 30)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29


In [24]:
# The topic distribution of the whole copus is
DL_ALPHA = 1
num_of_topics = 3
topic_distribution = (np.bincount(n_z, minlength=num_of_topics) + DL_ALPHA) / (len(biterms) + num_of_topics * DL_ALPHA)

In [26]:
topic_distribution

array([0.32062244, 0.46099607, 0.21838149])

In [27]:
np.sum(topic_distribution)

1.0

In [32]:
n_wz

defaultdict(<function __main__.BTM.<locals>.<lambda>()>,
            {'togeth': array([ 0., 19., 11.]),
             'quit': array([ 0.,  0., 20.]),
             'thing': array([26.,  9., 35.]),
             'oyster': array([ 0., 41., 91.]),
             'project': array([46.,  0.,  8.]),
             'tasti': array([ 0.,  8., 25.]),
             'rememb': array([ 0., 17., 12.]),
             'serious': array([ 5.,  0., 10.]),
             'tri': array([ 0., 32., 83.]),
             'spici': array([ 0.,  0., 30.]),
             'calamari': array([ 0.,  0., 20.]),
             'run': array([18., 40., 28.]),
             'crowd': array([14., 30.,  4.]),
             'piccatta': array([ 0.,  0., 10.]),
             'rich': array([ 0.,  0., 10.]),
             'bit': array([ 0., 22., 61.]),
             'everyth': array([46., 49., 38.]),
             'sauc': array([ 0.,  0., 46.]),
             'honest': array([36.,  0., 31.]),
             'share': array([ 6., 11., 13.]),
             'ci

In [71]:
n_wz_values = np.array([topic_freq for key, topic_freq in n_wz.items()])
n_wz_keys = np.array([key for key, topic_freq in n_wz.items()])

In [72]:
DL_BETA = 0.01
wz = (n_wz_values + DL_BETA) / (np.sum(n_wz_values, axis=0) + len(unique_words) * DL_BETA)


In [77]:
for i in range(num_of_topics):
    print("for topic {}, the top words are: ".format(i))
    print(n_wz_keys[np.argsort(wz[:,i])[-10:][::-1]])

for topic 0, the top words are: 
['time' 'call' 'work' 'need' 'plumb' 'fix' 'job' 'came' 'juan' 'would']
for topic 1, the top words are: 
['dog' 'servic' 'time' 'cat' 'great' 'elain' 'use' 'know' 'would' 'care']
for topic 2, the top words are: 
['dog' 'want' 'groom' 'seafood' 'also' 'oyster' 'get' 'order' 'clam' 'tri']


In [80]:
wz[:,2][np.argsort(wz[:,2])[-10:][::-1]]

array([0.01260468, 0.01065851, 0.00982444, 0.00963909, 0.00945374,
       0.00843432, 0.00815629, 0.00806362, 0.00787827, 0.00769292])