In [1]:
# Import packages
import pandas as pd
import numpy as np
import datetime
from collections import defaultdict
import nltk
import random

In [3]:
stemmer = nltk.stem.snowball.SnowballStemmer("english")
# NOTE(review): absolute, machine-specific path. Point this at your local copy
# of the reviews file (one review per line) before running.
data_dir = "/Users/yangsong/Desktop/Projects/gitrepo_songyang0716/Topic_Modeling/reviews_small.txt"

# Seed BOTH random sources used by this notebook. np.random.seed only affects
# NumPy's global generator; random.shuffle below draws from the stdlib
# `random` module, which has to be seeded separately for a reproducible order.
SEED = 666
np.random.seed(SEED)
random.seed(SEED)

# read review texts: one review per line, trailing newline kept.
# The context manager guarantees the file handle is closed after reading.
with open(data_dir, "r") as f:
    reviews = list(f)
random.shuffle(reviews)

In [4]:
# process text
# tokenize, lowercase, drop stop words, keep purely alphabetic tokens, then stem

# Build the stop-word set once. Rebuilding it for every token re-reads the
# corpus list each time and dominates the running time of this cell.
stop_words = set(nltk.corpus.stopwords.words('english'))

clean_reviews = []
for review in reviews:
    tokens = (token.lower() for token in nltk.word_tokenize(review))
    clean_reviews.append([
        stemmer.stem(token)
        for token in tokens
        if token not in stop_words and token.isalpha()
    ])

In [82]:
# Vocabulary: every distinct stemmed token that appears in any cleaned review.
unique_words = {token for tokens in clean_reviews for token in tokens}

In [98]:

def plsa(clean_reviews, num_of_topics, num_of_iterations, num_of_unique_words):
    """Fit a pLSA topic model to tokenized documents with the EM algorithm.

    Parameters
    ----------
    clean_reviews : list of list of str
        Tokenized documents. A document may be empty; it then contributes
        nothing to the word distributions and keeps its initial topic mixture.
    num_of_topics : int
        Number of topics to fit.
    num_of_iterations : int
        Number of EM iterations to run (0 returns the random initialisation).
    num_of_unique_words : int
        Number of vocabulary rows to allocate. Must be at least the number of
        distinct tokens in ``clean_reviews``; surplus rows never receive counts.

    Returns
    -------
    pwz : numpy.ndarray, shape (num_of_unique_words, num_of_topics)
        P(word | topic); each column sums to 1.
    pzd : numpy.ndarray, shape (num_of_topics, number of documents)
        P(topic | document); each column sums to 1.
    index_word : dict of int -> str
        Maps a row index of ``pwz`` back to its token. Indices are assigned in
        order of first appearance in ``clean_reviews``.

    Raises
    ------
    IndexError
        If ``num_of_unique_words`` is smaller than the number of distinct
        tokens found in ``clean_reviews``.

    Notes
    -----
    Initial values are drawn from NumPy's global random generator, so call
    ``np.random.seed`` beforehand for reproducible results.
    """
    # Vocabulary: word <-> row index, numbered by first appearance.
    word_index = {}
    index_word = {}
    for review in clean_reviews:
        for word in review:
            if word not in word_index:
                position = len(word_index)
                word_index[word] = position
                index_word[position] = word

    n_doc = len(clean_reviews)

    # ndw[d, w]: number of times word w occurs in document d.
    ndw = np.zeros((n_doc, num_of_unique_words))
    for doc_id, review in enumerate(clean_reviews):
        for word in review:
            ndw[doc_id, word_index[word]] += 1

    # Random column-normalised starting points. The draw order (words first,
    # then documents) is kept so a given seed yields the same initialisation.
    pwz = np.random.rand(num_of_unique_words, num_of_topics)
    pwz /= pwz.sum(axis=0, keepdims=True)
    pzd = np.random.rand(num_of_topics, n_doc)
    pzd /= pzd.sum(axis=0, keepdims=True)

    counts_wd = ndw.T  # (words, documents) view of the count matrix

    for _ in range(num_of_iterations):
        # E-step, folded into the M-step. The posterior is
        #   P(z | w, d) = pwz[w, z] * pzd[z, d] / pwd[w, d]
        # so both M-step sums factor through ratio[w, d] = n(d, w) / pwd[w, d]
        # and the (topics x words x documents) tensor is never materialised.
        pwd = pwz @ pzd
        # Where pwd is 0 the count is 0 as well; treat 0/0 as 0 instead of NaN.
        ratio = np.divide(counts_wd, pwd, out=np.zeros_like(pwd), where=pwd > 0)

        # M-step: both updates use the parameters from the previous iteration,
        # so compute the two unnormalised results before overwriting either.
        new_pwz = pwz * (ratio @ pzd.T)
        new_pzd = pzd * (pwz.T @ ratio)

        # Normalise each column. A column with zero mass (e.g. an empty
        # document) keeps its previous distribution rather than becoming NaN
        # and contaminating every other estimate on the next iteration.
        word_mass = new_pwz.sum(axis=0, keepdims=True)
        np.divide(new_pwz, word_mass, out=pwz, where=word_mass > 0)
        doc_mass = new_pzd.sum(axis=0, keepdims=True)
        np.divide(new_pzd, doc_mass, out=pzd, where=doc_mass > 0)

    return pwz, pzd, index_word

In [121]:
# Model settings: number of topics to fit, and number of EM iterations to run.
num_of_topics, num_of_iterations = 3, 100

In [None]:
# Fit the topic model; the vocabulary size is the number of distinct tokens.
pwz, pzd, index_word = plsa(
    clean_reviews=clean_reviews,
    num_of_topics=num_of_topics,
    num_of_iterations=num_of_iterations,
    num_of_unique_words=len(unique_words),
)

In [None]:
# For every topic, print its ten most probable words, most probable first.
for topic in range(num_of_topics):
    print("Topic:", topic + 1)
    # Ascending argsort, reversed, then truncated: the highest-probability rows.
    best_rows = pwz[:, topic].argsort()[::-1][:10]
    print(" ".join(index_word[row] for row in best_rows))