### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import csv
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.pipeline import Pipeline
import pyLDAvis.sklearn
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

### Define useful functions

In [2]:
#stemmer = SnowballStemmer('english')

def text_process(review_sentence):
        return [word.lower() for word in review_sentence.split() if word.lower() not in stopwords.words('english')]
    
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))                        

    print()

### Import dataset

In [3]:
df_negative_sentences = pd.read_csv('./datasets/df_negative_sentences.csv',lineterminator='\n')

### Remove empty reviews

In [4]:
df_negative_sentences = df_negative_sentences[~pd.isnull(df_negative_sentences['review_sentence'])]

## Modeling

### Create model pipeline: Vectorization (BoW), LDA

In [5]:
NUM_TOPICS=5
N_TOP_WORDS=20

In [8]:
tf_vectorizer = CountVectorizer(analyzer=text_process)
tf = tf_vectorizer.fit_transform(df_negative_sentences['review_sentence'][0:50000])

In [9]:
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')

In [10]:
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='online', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [11]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names,N_TOP_WORDS)


Topics in LDA model:
Topic #0:
hotel staff room desk front work old clean need also got great never price morning wifi dated rude reception size
Topic #1:
breakfast really 2 service better much needs lobby bad poor area enough food new room place expensive space wasnt felt
Topic #2:
room small didnt rooms bed could bathroom shower water coffee little noisy bit noise parking hot air everything loud nice
Topic #3:
hotel time would stay us get nothing check next told wasnt good location said view long go back booked also
Topic #4:
room like night one door day floor dirty elevator bathroom first could elevators toilet stayed 3 bar towels open smell



In [None]:
##############################################################

In [51]:
doc_topic = lda.transform(tf)

In [56]:
df_negative_sentences['review_sentence'][0]

'Freight line running nearby  bit noisy at night at the front so better noise insulation on the windows would have been a bit better'

In [55]:
doc_topic[0]

array([0.24403696, 0.00400094, 0.00400014, 0.47689241, 0.00400005,
       0.06159399, 0.00400022, 0.19347351, 0.004     , 0.00400179])

In [57]:
for n in range(doc_topic.shape[0]):
    topic_most_pr = doc_topic[n].argmax()
    print("doc: {} topic: {}\n".format(n,topic_most_pr))

doc: 0 topic: 3

doc: 1 topic: 7

doc: 2 topic: 1

doc: 3 topic: 3

doc: 4 topic: 3

doc: 5 topic: 3

doc: 6 topic: 3

doc: 7 topic: 1

doc: 8 topic: 8

doc: 9 topic: 2

doc: 10 topic: 6

doc: 11 topic: 8

doc: 12 topic: 4

doc: 13 topic: 2

doc: 14 topic: 1

doc: 15 topic: 9

doc: 16 topic: 5

doc: 17 topic: 6

doc: 18 topic: 3

doc: 19 topic: 5

doc: 20 topic: 3

doc: 21 topic: 3

doc: 22 topic: 2

doc: 23 topic: 3

doc: 24 topic: 3

doc: 25 topic: 2

doc: 26 topic: 3

doc: 27 topic: 3

doc: 28 topic: 3

doc: 29 topic: 3

doc: 30 topic: 1

doc: 31 topic: 2

doc: 32 topic: 1

doc: 33 topic: 2

doc: 34 topic: 3

doc: 35 topic: 1

doc: 36 topic: 5

doc: 37 topic: 9

doc: 38 topic: 3

doc: 39 topic: 5

doc: 40 topic: 0

doc: 41 topic: 9

doc: 42 topic: 3

doc: 43 topic: 5

doc: 44 topic: 3

doc: 45 topic: 3

doc: 46 topic: 2

doc: 47 topic: 3

doc: 48 topic: 2

doc: 49 topic: 3

doc: 50 topic: 1

doc: 51 topic: 9

doc: 52 topic: 3

doc: 53 topic: 5

doc: 54 topic: 9

doc: 55 topic: 6

do

doc: 2771 topic: 3

doc: 2772 topic: 3

doc: 2773 topic: 3

doc: 2774 topic: 3

doc: 2775 topic: 3

doc: 2776 topic: 3

doc: 2777 topic: 8

doc: 2778 topic: 3

doc: 2779 topic: 3

doc: 2780 topic: 1

doc: 2781 topic: 9

doc: 2782 topic: 8

doc: 2783 topic: 3

doc: 2784 topic: 3

doc: 2785 topic: 1

doc: 2786 topic: 3

doc: 2787 topic: 3

doc: 2788 topic: 3

doc: 2789 topic: 8

doc: 2790 topic: 9

doc: 2791 topic: 5

doc: 2792 topic: 3

doc: 2793 topic: 3

doc: 2794 topic: 2

doc: 2795 topic: 3

doc: 2796 topic: 2

doc: 2797 topic: 1

doc: 2798 topic: 3

doc: 2799 topic: 4

doc: 2800 topic: 9

doc: 2801 topic: 3

doc: 2802 topic: 4

doc: 2803 topic: 4

doc: 2804 topic: 3

doc: 2805 topic: 8

doc: 2806 topic: 9

doc: 2807 topic: 7

doc: 2808 topic: 3

doc: 2809 topic: 3

doc: 2810 topic: 9

doc: 2811 topic: 2

doc: 2812 topic: 9

doc: 2813 topic: 5

doc: 2814 topic: 4

doc: 2815 topic: 5

doc: 2816 topic: 3

doc: 2817 topic: 6

doc: 2818 topic: 2

doc: 2819 topic: 3

doc: 2820 topic: 7



doc: 5292 topic: 3

doc: 5293 topic: 3

doc: 5294 topic: 5

doc: 5295 topic: 9

doc: 5296 topic: 3

doc: 5297 topic: 9

doc: 5298 topic: 3

doc: 5299 topic: 3

doc: 5300 topic: 2

doc: 5301 topic: 3

doc: 5302 topic: 3

doc: 5303 topic: 3

doc: 5304 topic: 4

doc: 5305 topic: 9

doc: 5306 topic: 3

doc: 5307 topic: 3

doc: 5308 topic: 3

doc: 5309 topic: 1

doc: 5310 topic: 6

doc: 5311 topic: 3

doc: 5312 topic: 5

doc: 5313 topic: 5

doc: 5314 topic: 3

doc: 5315 topic: 3

doc: 5316 topic: 1

doc: 5317 topic: 4

doc: 5318 topic: 2

doc: 5319 topic: 8

doc: 5320 topic: 3

doc: 5321 topic: 3

doc: 5322 topic: 1

doc: 5323 topic: 3

doc: 5324 topic: 5

doc: 5325 topic: 2

doc: 5326 topic: 0

doc: 5327 topic: 5

doc: 5328 topic: 5

doc: 5329 topic: 3

doc: 5330 topic: 5

doc: 5331 topic: 2

doc: 5332 topic: 4

doc: 5333 topic: 3

doc: 5334 topic: 6

doc: 5335 topic: 3

doc: 5336 topic: 4

doc: 5337 topic: 1

doc: 5338 topic: 3

doc: 5339 topic: 3

doc: 5340 topic: 2

doc: 5341 topic: 3



doc: 8305 topic: 3

doc: 8306 topic: 2

doc: 8307 topic: 3

doc: 8308 topic: 5

doc: 8309 topic: 3

doc: 8310 topic: 9

doc: 8311 topic: 2

doc: 8312 topic: 2

doc: 8313 topic: 3

doc: 8314 topic: 3

doc: 8315 topic: 2

doc: 8316 topic: 9

doc: 8317 topic: 4

doc: 8318 topic: 2

doc: 8319 topic: 3

doc: 8320 topic: 5

doc: 8321 topic: 9

doc: 8322 topic: 3

doc: 8323 topic: 2

doc: 8324 topic: 9

doc: 8325 topic: 9

doc: 8326 topic: 3

doc: 8327 topic: 3

doc: 8328 topic: 8

doc: 8329 topic: 3

doc: 8330 topic: 3

doc: 8331 topic: 3

doc: 8332 topic: 3

doc: 8333 topic: 9

doc: 8334 topic: 9

doc: 8335 topic: 3

doc: 8336 topic: 3

doc: 8337 topic: 3

doc: 8338 topic: 7

doc: 8339 topic: 7

doc: 8340 topic: 3

doc: 8341 topic: 4

doc: 8342 topic: 3

doc: 8343 topic: 2

doc: 8344 topic: 3

doc: 8345 topic: 5

doc: 8346 topic: 3

doc: 8347 topic: 3

doc: 8348 topic: 3

doc: 8349 topic: 8

doc: 8350 topic: 8

doc: 8351 topic: 5

doc: 8352 topic: 6

doc: 8353 topic: 9

doc: 8354 topic: 3



doc: 11184 topic: 5

doc: 11185 topic: 3

doc: 11186 topic: 3

doc: 11187 topic: 2

doc: 11188 topic: 6

doc: 11189 topic: 3

doc: 11190 topic: 3

doc: 11191 topic: 5

doc: 11192 topic: 2

doc: 11193 topic: 3

doc: 11194 topic: 7

doc: 11195 topic: 3

doc: 11196 topic: 3

doc: 11197 topic: 3

doc: 11198 topic: 9

doc: 11199 topic: 9

doc: 11200 topic: 5

doc: 11201 topic: 2

doc: 11202 topic: 4

doc: 11203 topic: 3

doc: 11204 topic: 3

doc: 11205 topic: 6

doc: 11206 topic: 3

doc: 11207 topic: 7

doc: 11208 topic: 3

doc: 11209 topic: 5

doc: 11210 topic: 2

doc: 11211 topic: 3

doc: 11212 topic: 9

doc: 11213 topic: 3

doc: 11214 topic: 1

doc: 11215 topic: 3

doc: 11216 topic: 2

doc: 11217 topic: 3

doc: 11218 topic: 3

doc: 11219 topic: 5

doc: 11220 topic: 3

doc: 11221 topic: 9

doc: 11222 topic: 3

doc: 11223 topic: 5

doc: 11224 topic: 5

doc: 11225 topic: 9

doc: 11226 topic: 8

doc: 11227 topic: 0

doc: 11228 topic: 3

doc: 11229 topic: 7

doc: 11230 topic: 3

doc: 11231 to

doc: 14124 topic: 9

doc: 14125 topic: 3

doc: 14126 topic: 0

doc: 14127 topic: 3

doc: 14128 topic: 3

doc: 14129 topic: 7

doc: 14130 topic: 8

doc: 14131 topic: 0

doc: 14132 topic: 3

doc: 14133 topic: 6

doc: 14134 topic: 9

doc: 14135 topic: 3

doc: 14136 topic: 3

doc: 14137 topic: 7

doc: 14138 topic: 3

doc: 14139 topic: 1

doc: 14140 topic: 3

doc: 14141 topic: 3

doc: 14142 topic: 6

doc: 14143 topic: 1

doc: 14144 topic: 9

doc: 14145 topic: 5

doc: 14146 topic: 5

doc: 14147 topic: 3

doc: 14148 topic: 1

doc: 14149 topic: 3

doc: 14150 topic: 4

doc: 14151 topic: 7

doc: 14152 topic: 7

doc: 14153 topic: 5

doc: 14154 topic: 3

doc: 14155 topic: 3

doc: 14156 topic: 8

doc: 14157 topic: 3

doc: 14158 topic: 5

doc: 14159 topic: 1

doc: 14160 topic: 0

doc: 14161 topic: 3

doc: 14162 topic: 5

doc: 14163 topic: 3

doc: 14164 topic: 3

doc: 14165 topic: 3

doc: 14166 topic: 2

doc: 14167 topic: 7

doc: 14168 topic: 2

doc: 14169 topic: 2

doc: 14170 topic: 2

doc: 14171 to

doc: 16822 topic: 3

doc: 16823 topic: 5

doc: 16824 topic: 1

doc: 16825 topic: 3

doc: 16826 topic: 3

doc: 16827 topic: 5

doc: 16828 topic: 3

doc: 16829 topic: 4

doc: 16830 topic: 6

doc: 16831 topic: 3

doc: 16832 topic: 3

doc: 16833 topic: 3

doc: 16834 topic: 3

doc: 16835 topic: 9

doc: 16836 topic: 2

doc: 16837 topic: 3

doc: 16838 topic: 3

doc: 16839 topic: 3

doc: 16840 topic: 3

doc: 16841 topic: 3

doc: 16842 topic: 8

doc: 16843 topic: 3

doc: 16844 topic: 3

doc: 16845 topic: 7

doc: 16846 topic: 3

doc: 16847 topic: 9

doc: 16848 topic: 1

doc: 16849 topic: 3

doc: 16850 topic: 5

doc: 16851 topic: 2

doc: 16852 topic: 3

doc: 16853 topic: 6

doc: 16854 topic: 1

doc: 16855 topic: 9

doc: 16856 topic: 2

doc: 16857 topic: 9

doc: 16858 topic: 3

doc: 16859 topic: 3

doc: 16860 topic: 3

doc: 16861 topic: 3

doc: 16862 topic: 3

doc: 16863 topic: 1

doc: 16864 topic: 9

doc: 16865 topic: 5

doc: 16866 topic: 3

doc: 16867 topic: 5

doc: 16868 topic: 3

doc: 16869 to

doc: 19656 topic: 5

doc: 19657 topic: 3

doc: 19658 topic: 9

doc: 19659 topic: 2

doc: 19660 topic: 8

doc: 19661 topic: 3

doc: 19662 topic: 2

doc: 19663 topic: 3

doc: 19664 topic: 2

doc: 19665 topic: 3

doc: 19666 topic: 3

doc: 19667 topic: 3

doc: 19668 topic: 9

doc: 19669 topic: 9

doc: 19670 topic: 3

doc: 19671 topic: 6

doc: 19672 topic: 6

doc: 19673 topic: 3

doc: 19674 topic: 3

doc: 19675 topic: 9

doc: 19676 topic: 0

doc: 19677 topic: 4

doc: 19678 topic: 9

doc: 19679 topic: 3

doc: 19680 topic: 2

doc: 19681 topic: 5

doc: 19682 topic: 3

doc: 19683 topic: 3

doc: 19684 topic: 9

doc: 19685 topic: 9

doc: 19686 topic: 3

doc: 19687 topic: 3

doc: 19688 topic: 1

doc: 19689 topic: 1

doc: 19690 topic: 1

doc: 19691 topic: 4

doc: 19692 topic: 8

doc: 19693 topic: 5

doc: 19694 topic: 3

doc: 19695 topic: 3

doc: 19696 topic: 3

doc: 19697 topic: 3

doc: 19698 topic: 3

doc: 19699 topic: 3

doc: 19700 topic: 2

doc: 19701 topic: 9

doc: 19702 topic: 2

doc: 19703 to