## Libraries

In [2]:
import nltk
from nltk.corpus import stopwords
import re
import tomotopy as tp
import numpy as np
import pandas as pd
import pyLDAvis
import tmplot
import warnings

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


## Data loading

In [3]:
# Load the hotel reviews CSV; the path is relative, so it resolves against the
# directory the notebook kernel was started in.
df = pd.read_csv(filepath_or_buffer='../sampleData/tripadvisorReviews/hotel_reviews.csv')

In [4]:
# Preview the first five rows to check which columns were read.
df.head(n=5)

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


## Text pre-processing

In [5]:
# Stemming function shared by the tokenizer and the stop-word list, so both sides
# of the stop-word comparison are in stemmed form.
porter_stemmer = nltk.PorterStemmer().stem
english_stops = {porter_stemmer(word) for word in stopwords.words('english')}

# A token is kept only if it consists of two or more lowercase ASCII letters.
pat = re.compile('^[a-z]{2,}$')

# The stop-word predicate rejects a token when it is a (stemmed) English stop word
# or when it does not match the pattern above.
# NOTE(review): presumably tomotopy applies this filter to the stemmed tokens - confirm.
corpus = tp.utils.Corpus(
    tokenizer=tp.utils.SimpleTokenizer(stemmer=porter_stemmer),
    stopwords=lambda token: token in english_stops or pat.match(token) is None,
)

In [6]:
# Keep the raw review texts in a plain list; later cells reuse it to show documents.
reviews = df['Review'].to_list()
# Lower-case each review lazily while feeding it to the corpus; the value returned
# by process() is displayed as the cell output.
corpus.process(review.lower() for review in reviews)

20491

## Model setup

In [7]:
# Fixed seed so that the sampled topics can be reproduced after Restart & Run All.
# NOTE(review): results may still vary with the number of worker threads used by
# train() - confirm against the tomotopy documentation.
RANDOM_SEED = 42

# LDA with 10 topics. Per the tomotopy docs, min_df=5 discards words that occur in
# fewer than 5 documents and rm_top=40 discards the 40 most frequent words (they
# are listed by the second print below).
mdl = tp.LDAModel(min_df=5, rm_top=40, k=10, seed=RANDOM_SEED, corpus=corpus)
# Zero training iterations: called before reading the document / vocabulary
# statistics printed below.
# NOTE(review): presumably this finalizes the vocabulary - confirm.
mdl.train(0)

print('Num docs:{}, Num Vocabs:{}, Total Words:{}'.format(
    len(mdl.docs), len(mdl.used_vocabs), mdl.num_words
))
print('Removed Top words: ', *mdl.removed_top_words)

Num docs:20491, Num Vocabs:9545, Total Words:1445015
Removed Top words:  hotel room stay great good staff night locat nice day time clean servic restaur beach walk breakfast place food like resort pool help bed realli love area friendli peopl excel book bar want small recommend littl got view bathroom look


## Model training

In [8]:
# Train in fixed-size chunks and log the per-word log-likelihood before each chunk,
# then once more after the final chunk.
progress_line = 'Iteration: {:04}, LL per word: {:.4}'
total_iterations = 1000
chunk_size = 20

for i in range(0, total_iterations, chunk_size):
    print(progress_line.format(i, mdl.ll_per_word))
    mdl.train(chunk_size)
print(progress_line.format(total_iterations, mdl.ll_per_word))

# Print tomotopy's built-in model summary.
mdl.summary()

Iteration: 0000, LL per word: -10.32
Iteration: 0020, LL per word: -8.543
Iteration: 0040, LL per word: -8.351
Iteration: 0060, LL per word: -8.282
Iteration: 0080, LL per word: -8.249
Iteration: 0100, LL per word: -8.226
Iteration: 0120, LL per word: -8.213
Iteration: 0140, LL per word: -8.204
Iteration: 0160, LL per word: -8.193
Iteration: 0180, LL per word: -8.186
Iteration: 0200, LL per word: -8.182
Iteration: 0220, LL per word: -8.18
Iteration: 0240, LL per word: -8.176
Iteration: 0260, LL per word: -8.174
Iteration: 0280, LL per word: -8.172
Iteration: 0300, LL per word: -8.169
Iteration: 0320, LL per word: -8.165
Iteration: 0340, LL per word: -8.164
Iteration: 0360, LL per word: -8.161
Iteration: 0380, LL per word: -8.162
Iteration: 0400, LL per word: -8.159
Iteration: 0420, LL per word: -8.158
Iteration: 0440, LL per word: -8.156
Iteration: 0460, LL per word: -8.158
Iteration: 0480, LL per word: -8.158
Iteration: 0500, LL per word: -8.157
Iteration: 0520, LL per word: -8.157
It

## Visualization

### pyLDAvis

In [9]:
# Vocabulary actually used by the model and the matching per-term frequencies.
vocab = [term for term in mdl.used_vocabs]
term_frequency = mdl.used_vocab_freq

# Number of tokens in each document held by the model.
doc_lengths = np.array([len(document.words) for document in mdl.docs])

# One row per topic: distribution over the vocabulary.
topic_term_dists = np.stack(
    [mdl.get_topic_word_dist(topic_id) for topic_id in range(mdl.k)]
)

# One row per document: distribution over topics, rescaled so every row sums to 1.
doc_topic_dists = np.stack([document.get_topic_dist() for document in mdl.docs])
doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1, keepdims=True)

In [10]:
# Build the pyLDAvis data structure from the matrices computed above.
prepared_data = pyLDAvis.prepare(
    topic_term_dists=topic_term_dists,
    doc_topic_dists=doc_topic_dists,
    doc_lengths=doc_lengths,
    vocab=vocab,
    term_frequency=term_frequency,
    # Number topics from 0 so the labels match the topic indices used when
    # building topic_term_dists (range(mdl.k)).
    start_index=0,
    # Keep the topic order as given; re-sorting would break the correspondence
    # between pyLDAvis topic labels and the model's topic indices.
    sort_topics=False,
)

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [13]:
# Render the interactive topic visualization inline in the notebook.
pyLDAvis.display(data=prepared_data)

In [12]:
# Also write the visualization to an HTML file (path relative to the working directory).
pyLDAvis.save_html(data=prepared_data, fileobj='ldavis.html')

## Inspect document-topic associations and topic-word associations

In [17]:
# Show the most representative raw reviews for each topic.
# The raw texts are passed separately from the model, so they must line up
# one-to-one with the documents the model holds; fail loudly if any review was
# dropped during preprocessing.
# NOTE(review): assumes tmplot matches texts to model documents by position - confirm.
assert len(reviews) == len(mdl.docs), (
    'reviews and model documents are out of sync: {} vs {}'.format(
        len(reviews), len(mdl.docs)
    )
)
tmplot.get_top_docs(docs=reviews, model=mdl)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9
0,"central major sights, room good size clean bat...",great location great hosts hotel europa close ...,excellent hotel milano hotel milano located he...,felt like visiting family welcoming husband st...,hotel toledano aware place booking phone webpa...,great business trip hotel stay nikko time trav...,"pleased, just returned trip new orleans stayin...","nice, romantic great romantic fling right ober...",dark dingy hotel great location chinatown room...,"blast, grand palladium punta cana sept 20th 27..."
1,japan located opposite jr shinagawa train stat...,loved stay morandi spent lovely days morandi a...,beautiful resort just got staying 4 days wyndh...,special hotel husband daughters stayed hotel n...,dishonest terrible service hotel beautiful ser...,good value money central location visiting sin...,great location location hotel perfect right mi...,"beautiful hotel, partner spent week interconti...",disappointment stayed worst room hotel- view i...,highly recommend hotel stayed bavaro princess ...
2,excellent value visited hotel late march reall...,enjoyable stay great choice visiting florence ...,best place stay pr embassy suites 6th 8th 2006...,total elegance plus week experience elegant we...,horrible experience customer service live 20 m...,"lovely hotel hidden midst hong kong, usually s...",great location nice hotel family 5 stayed june...,friendly clean rejuvenating excellent multi-st...,standard room bathroom like 2 star hotel don`t...,grand flamenco good bad just returned 11/01/04...
3,excellent budget hotel just got enjoyable nigh...,great hotel lovely staff convenient attraction...,great hotel near airport reviewers written nic...,sofitel gem hotel great experience staying lon...,"rude young manager, terrible service man hotel...",great location experience real hong kong staye...,"great deal, noisy terrific deal free parking i...",lovely setting good service stayed 4 nights al...,"renovation, putting lipstick pig, currently pu...",loved loved loved just got couple weeks ago oc...
4,great value good location small basic clean ho...,beautiful florence hotel berchielli excellent ...,tips el conquistador family traveled el conqui...,outstanding hotel visit london twice year ches...,"nasty n't waste time money site late, pay did ...","loved hotel, great, great hotel booked flight ...",great boutique hotel great boutique hotel loca...,serene hotel location lovely hotel set vast ex...,"cool place great hotel modern, room comfortabl...",loved iberostar punta cana took apple vacation...
