In [4]:
# Import packages
import pandas as pd
import numpy as np
import datetime
from collections import defaultdict
import nltk
import random


In [7]:
# English Snowball stemmer used by the text-cleaning step.
stemmer = nltk.stem.snowball.SnowballStemmer("english")
# NOTE(review): despite the name this is the path of the reviews *file*, and it
# is an absolute, machine-specific location -- adjust before running elsewhere.
data_dir = "/Users/yangsong/Desktop/Projects/gitrepo_songyang0716/Topic_Modeling/reviews_small.txt"
# Seed both random number generators. np.random.seed only affects numpy; the
# reviews are shuffled with the stdlib `random` module, which has its own state
# and would otherwise produce a different order on every run.
np.random.seed(666)
random.seed(666)

In [8]:
# Read the review texts: one review per line, trailing newlines are kept.
# The context manager guarantees the file handle is closed once the lines
# have been read (a bare open() would leave it open for the kernel's lifetime).
with open(data_dir, "r") as f:
    reviews = list(f)
# Shuffle the reviews in place using the stdlib `random` module.
random.shuffle(reviews)

In [9]:
# Process the text of every review:
# tokenize, lowercase, drop English stop words, keep purely alphabetic tokens,
# stem them, and join the result back into one space-separated string.

# Build the stop-word set once; rebuilding it for every token re-reads the
# corpus list and re-creates the set thousands of times for no benefit.
stop_words = set(nltk.corpus.stopwords.words('english'))

clean_reviews = []
for review in reviews:
    tokens = [word.lower() for word in nltk.word_tokenize(review)]
    tokens = [word for word in tokens if word not in stop_words]
    # the isalpha() filter is applied to the token before it is stemmed
    tokens = [stemmer.stem(word) for word in tokens if word.isalpha()]
    clean_reviews.append(" ".join(tokens))

In [10]:
# Sanity check: number of cleaned reviews (one entry per raw review).
len(clean_reviews)

147

In [11]:
# Inspect the first raw review (its position is the result of the shuffle).
reviews[0]

"This is the second time I've used APlus for plumbing issues. Juan and Miguel once again resolved our problem with ease and efficiency. I was happy to see them waiting for me once I got home from work at 4:07p for my 4p-6 appointment window.\n"

In [12]:
# Inspect the cleaned form of the same review for comparison.
clean_reviews[0]

'second time use aplus plumb issu juan miguel resolv problem eas effici happi see wait got home work appoint window'

In [76]:
# Build the biterm list and the vocabulary from the cleaned review strings.
# Pairs are de-duplicated inside a review but not across reviews.
biterms = []
unique_words = set()
for review_text in clean_reviews:
    tokens = review_text.split()
    # every token of the review joins the vocabulary
    unique_words.update(tokens)
    # pair each token with every token that follows it in the same review
    review_pairs = {
        (left, right)
        for pos, left in enumerate(tokens)
        for right in tokens[pos + 1:]
    }
    biterms.extend(review_pairs)

In [77]:
# Total number of biterms collected over all reviews.
len(biterms)

104242

In [82]:
# Number of topics. It was not defined by any cell of this notebook, so this
# cell raised NameError on a fresh kernel; 3 matches the three columns of the
# n_wz array shown in the next cell's output.
num_of_topics = 3
# Integer count matrix with one row per vocabulary word and one column per
# topic, initialised to zero.
n_wz = np.zeros((len(unique_words), num_of_topics), dtype=int)

In [83]:
# Display the zero-initialised word-by-topic count matrix.
n_wz

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       ...,
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [None]:
"""
(C) YANG SONG - 2019
Implementation of the collapsed Gibbs sampler for
Biterm Topic Models, as described in
Biterm Topic Model for Short Texts (Yan,  Guo, Lan, Cheng)
"""

# Import packages
import pandas as pd
import numpy as np
import datetime
from collections import defaultdict
import nltk
import random

# English Snowball stemmer used by the text-cleaning step.
stemmer = nltk.stem.snowball.SnowballStemmer("english")
# NOTE(review): despite the name this is the path of the reviews *file*, and it
# is an absolute, machine-specific location -- adjust before running elsewhere.
data_dir = "/Users/yangsong/Desktop/Projects/Topic_Modeling/reviews_small.txt"
# Seed both random number generators. np.random.seed only affects numpy; the
# reviews are shuffled with the stdlib `random` module, which has its own state
# and would otherwise produce a different order on every run.
np.random.seed(666)
random.seed(666)

# Read the review texts: one review per line, trailing newlines are kept.
# The context manager guarantees the file handle is closed once the lines
# have been read (a bare open() would leave it open until interpreter exit).
with open(data_dir, "r") as f:
    reviews = list(f)
# Shuffle the reviews in place using the stdlib `random` module.
random.shuffle(reviews)

# Process the text of every review:
# tokenize, lowercase, drop English stop words, keep purely alphabetic tokens
# and stem them. Each cleaned review is stored as a list of tokens.

# Build the stop-word set once; rebuilding it for every token re-reads the
# corpus list and re-creates the set thousands of times for no benefit.
stop_words = set(nltk.corpus.stopwords.words('english'))

clean_reviews = []
for review in reviews:
    tokens = [word.lower() for word in nltk.word_tokenize(review)]
    tokens = [word for word in tokens if word not in stop_words]
    # the isalpha() filter is applied to the token before it is stemmed
    tokens = [stemmer.stem(word) for word in tokens if word.isalpha()]
    clean_reviews.append(tokens)

# Extract the unique biterms from each review.
# BTM directly models word co-occurrence patterns based on biterms.
# A biterm denotes an unordered unique word-pair co-occurring in a short
# context; each context in our example is a review.
# Here every pair is stored as a tuple in order of appearance and is
# de-duplicated within a review, not across reviews.
biterms = []
unique_words = set()
for clean_review in clean_reviews:
    # The preprocessing step appends token *lists*, and a list has no
    # .split(), so calling it unconditionally raised AttributeError.
    # Accept either a token list or a whitespace-joined string.
    if isinstance(clean_review, str):
        tokens = clean_review.split()
    else:
        tokens = list(clean_review)
    review_length = len(tokens)
    cur_review_biterms = set()
    for i in range(review_length):
        unique_words.add(tokens[i])
        # pair token i with every token that follows it in the review
        for j in range(i + 1, review_length):
            cur_review_biterms.add((tokens[i], tokens[j]))
    biterms.extend(cur_review_biterms)

# bigrams only
# biterms = [biterm for review in clean_reviews for biterm in zip(review.split(" ")[:-1], review.split("")[1:])]
# biterms = set(biterms)


def BTM(reviews, biterms, unique_words, num_of_topics, num_of_iterations):
    ####################################################################################
    ### reviews: contains a list of reviews, and each review is a list of words      ###
    ### num_of_topics: number of topics to generate                                  ###
    ### number_of_iterations: collapsed gibbs sampling iterations                    ###
    ####################################################################################

    # constant we set for the LD prior (topic distributions in a document)
    DL_ALPHA = 50 / num_of_topics
    # constant we set for the LD prior (word distribution in a topic)
    DL_BETA = 0.01
    # Number of total biterms
    N_BITERMS = len(biterms)

    # Assign a random topic for each biterm
    n_z = np.random.randint(0, num_of_topics, N_BITERMS)
    # Words count over topics
    # Key is word, value is an array of topic counts, use the index to indicate the topic 1 to k
    n_wz = defaultdict(lambda: np.zeros(num_of_topics))
    for index, (w1, w2) in enumerate(biterms):
        n_wz[w1][n_z[index]] += 1
        n_wz[w2][n_z[index]] += 1

    # unlike to LDA model, in the biterm model, each bigram is coming from a specific topic
    # biterm_topic = np.zeros((N_BITERMS, num_of_topics))
    for iteration in range(num_of_iterations):
        for index, (w1, w2) in enumerate(biterms):
            # give a -1 class to the current biterm, means we ignore the current biterm
            n_z[index] = -1
            nz = np.unique(n_z, return_counts=True)[1][1:]
            n_w1z = n_wz[w1]
            n_w2z = n_wz[w2]

            z_posterior = (nz + DL_ALPHA) * (n_wz[w1] + DL_BETA) * (n_wz[w2] + DL_BETA) / np.sum(
                (2 * nz + len(unique_words) * DL_BETA) * (2 * nz + len(unique_words) * DL_BETA))