## Libraries

In [1]:
import nltk
from nltk.corpus import stopwords
#nltk.download('stopwords')
import re
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import tomotopy as tp
import pyLDAvis
import pandas as pd
import tmplot

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


## Data loading

In [2]:
# Relative path to the hotel-review CSV used throughout this notebook.
reviews_csv = '../sampleData/tripadvisorReviews/hotel_reviews.csv'
df = pd.read_csv(reviews_csv)

In [3]:
# Peek at the first five rows to confirm the columns loaded as expected.
df.head(n=5)

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


## Text pre-processing

In [4]:
# Stemming function applied to every token (and to the stop-word list below so
# that both are compared in the same, stemmed form).
porter_stemmer = nltk.PorterStemmer().stem

# The NLTK stop-word list is an external resource; on a fresh environment the
# lookup raises LookupError, so fetch it on demand instead of failing the cell.
try:
    raw_stopwords = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    raw_stopwords = stopwords.words('english')

english_stops = {porter_stemmer(w) for w in raw_stopwords}

# Keep only tokens made of two or more lowercase ASCII letters.
pat = re.compile('^[a-z]{2,}$')

# Corpus that tokenizes + stems incoming documents and drops any token that is
# a (stemmed) stop word or does not match `pat`.
# NOTE(review): assumes tomotopy applies the `stopwords` callable to tokens
# after stemming — confirm against the tomotopy Corpus docs.
corpus = tp.utils.Corpus(
    tokenizer=tp.utils.SimpleTokenizer(porter_stemmer),
    stopwords=lambda x: x in english_stops or not pat.match(x)
)

In [5]:
# Pull the raw review texts out of the frame as a plain Python list.
reviews = df['Review'].tolist()
# Feed lower-cased texts to the corpus lazily; the cell displays the value
# returned by `process`.
lowercased_reviews = (text.lower() for text in reviews)
corpus.process(lowercased_reviews)

20491

## Model setup

In [6]:
# Fixed seed so the sampled topics (and the log-likelihood trace) can be
# reproduced on a re-run instead of changing every time the kernel restarts.
# NOTE(review): multi-threaded sampling may still introduce run-to-run
# variation — confirm in the tomotopy docs whether `workers` must be pinned too.
RANDOM_SEED = 42

# 9-topic LDA over the pre-processed corpus. `min_df=5` and `rm_top=40` prune
# the vocabulary (rare words and the 40 most frequent words, respectively).
mdl = tp.LDAModel(min_df=5, rm_top=40, k=9, seed=RANDOM_SEED, corpus=corpus)
# Zero-iteration train call: presumably finalises the vocabulary so the
# statistics printed below are available before real training starts.
mdl.train(0)

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))
print('Removed Top words: ', *mdl.removed_top_words)

Num docs:20491, Num Vocabs:9545, Total Words:1445015
Removed Top words:  hotel room stay great good staff night locat nice day time clean servic restaur beach walk breakfast place food like resort pool help bed realli love area friendli peopl excel book bar want small recommend littl got view bathroom look


## Model training

In [7]:
# Training schedule: 1000 iterations in total, reporting every 20.
total_iterations = 1000
report_every = 20
progress_line = 'Iteration: {:04}, LL per word: {:.4}'

# Report the current log-likelihood per word, then advance by one chunk.
for completed in range(0, total_iterations, report_every):
    print(progress_line.format(completed, mdl.ll_per_word))
    mdl.train(report_every)

# Final report after the last chunk, followed by the model summary.
print(progress_line.format(total_iterations, mdl.ll_per_word))
mdl.summary()

Iteration: 0000, LL per word: -10.18
Iteration: 0020, LL per word: -8.537
Iteration: 0040, LL per word: -8.34
Iteration: 0060, LL per word: -8.275
Iteration: 0080, LL per word: -8.24
Iteration: 0100, LL per word: -8.217
Iteration: 0120, LL per word: -8.201
Iteration: 0140, LL per word: -8.19
Iteration: 0160, LL per word: -8.185
Iteration: 0180, LL per word: -8.177
Iteration: 0200, LL per word: -8.171
Iteration: 0220, LL per word: -8.169
Iteration: 0240, LL per word: -8.168
Iteration: 0260, LL per word: -8.165
Iteration: 0280, LL per word: -8.163
Iteration: 0300, LL per word: -8.16
Iteration: 0320, LL per word: -8.159
Iteration: 0340, LL per word: -8.158
Iteration: 0360, LL per word: -8.157
Iteration: 0380, LL per word: -8.153
Iteration: 0400, LL per word: -8.152
Iteration: 0420, LL per word: -8.148
Iteration: 0440, LL per word: -8.146
Iteration: 0460, LL per word: -8.144
Iteration: 0480, LL per word: -8.14
Iteration: 0500, LL per word: -8.139
Iteration: 0520, LL per word: -8.138
Iterat

## Visualization

## pyLDAvis

In [8]:
# One row per topic: the topic's distribution over the vocabulary.
per_topic_word_dists = [mdl.get_topic_word_dist(topic_id) for topic_id in range(mdl.k)]
topic_term_dists = np.stack(per_topic_word_dists)

# One row per document: the document's distribution over topics,
# rescaled so that every row sums to 1.
per_doc_topic_dists = [document.get_topic_dist() for document in mdl.docs]
doc_topic_dists = np.stack(per_doc_topic_dists)
row_totals = doc_topic_dists.sum(axis=1, keepdims=True)
doc_topic_dists /= row_totals

# Number of words in each document.
doc_lengths = np.array([len(document.words) for document in mdl.docs])

# Vocabulary actually used by the model and the matching frequencies.
vocab = [*mdl.used_vocabs]
term_frequency = mdl.used_vocab_freq

In [9]:
# Build the pyLDAvis data structure from the arrays computed above.
# Topics are numbered from 0 and kept in the model's own order
# (no re-sorting) so the ids line up with the tomotopy topic ids.
prepared_data = pyLDAvis.prepare(
    topic_term_dists=topic_term_dists,
    doc_topic_dists=doc_topic_dists,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency,
    sort_topics=False,
    start_index=0,
)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [10]:
# Write the visualisation to an HTML file in the current working directory.
ldavis_output = 'ldavis.html'
pyLDAvis.save_html(prepared_data, ldavis_output)