# Import the Dataset

In [1]:
import pandas as pd

from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models
import pandas as pd
import gensim
import pyLDAvis.gensim

In [2]:
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK 'stopwords' corpus so the English list can be read below.
nltk.download('stopwords')
# NOTE: this rebinds the name `stopwords` from the object imported from
# nltk.corpus to the value returned by `.words('english')`. A later cell
# re-imports `stopwords` before calling `.words(...)` again.
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to /Users/priya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Download the NLTK 'stopwords' corpus (repeats the download from the previous cell).
import nltk; nltk.download('stopwords')

# spaCy is used further down for lemmatization.
# NOTE(review): the comment originally here read "Run in terminal or command prompt",
# but no shell command follows it. A later cell mentions
# `python3 -m spacy download en`, which is presumably what was meant - confirm.
import spacy

[nltk_data] Downloading package stopwords to /Users/priya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Regex
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy


# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
# Only records at ERROR level or above are emitted, formatted as
# "<timestamp> : <level> : <message>".
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

# Hide DeprecationWarning messages so they do not clutter the cell outputs.
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [5]:
# Location of the hotel-review CSV. This is a machine-specific absolute path;
# adjust it when running the notebook elsewhere.
REVIEWS_CSV = '/Users/priya/tripadvisor_hotel_reviews.csv'

# Read the reviews as UTF-8 text and preview the first rows.
df = pd.read_csv(REVIEWS_CSV, encoding='utf-8')
df.head()

Unnamed: 0,Review,Rating
0,nice hotel expensive parking got good deal sta...,4
1,ok nothing special charge diamond member hilto...,2
2,nice rooms not 4* experience hotel monaco seat...,3
3,"unique, great stay, wonderful time hotel monac...",5
4,"great stay great stay, went seahawk game aweso...",5


# Data Preprocessing

In [6]:
# Token pattern: whole words built only from word characters that are not digits.
pattern = r'\b[^\d\W]+\b'
tokenizer = RegexpTokenizer(pattern)
# English stop-word list from the `stop_words` package.
en_stop = get_stop_words('en')
lemmatizer = WordNetLemmatizer()
# NOTE(review): none of `tokenizer`, `en_stop` or `lemmatizer` is referenced again
# in the visible part of this notebook; the cells below use gensim and spaCy instead.

In [7]:
# NLTK Stop words
# Re-import `stopwords`: an earlier cell rebound that name to the word list itself.
from nltk.corpus import stopwords
# `stop_words` is the collection consulted by remove_stopwords() further down.
stop_words = stopwords.words('english')
# Optional corpus-specific additions (currently disabled):
#stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [8]:
# Convert the review column to a plain Python list of strings
data = df.Review.values.tolist()

# Remove Emails (disabled)
#data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]

# Collapse every run of whitespace (including new lines) into a single space.
# The pattern is a raw string: '\s' inside a normal string literal is an
# invalid escape sequence and triggers a warning on current Python versions.
data = [re.sub(r'\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("'", "", sent) for sent in data]

In [9]:
def remove_accented_chars(text):
    """Remove accented characters from text, e.g. "café" -> "cafe".

    Uses the third-party ``unidecode`` package when it is installed. Otherwise
    falls back to the standard library: the text is decomposed (NFKD) and the
    combining marks are dropped, which strips accents but leaves other
    non-ASCII characters untouched.

    Args:
        text: The string to clean.

    Returns:
        The string with accents removed.
    """
    try:
        # Imported lazily: the notebook never imports ``unidecode`` at the top,
        # so the original body failed with NameError when called.
        import unidecode
    except ImportError:
        import unicodedata
        decomposed = unicodedata.normalize('NFKD', text)
        return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return unidecode.unidecode(text)

In [10]:
def expand_contractions(text):
    """Expand shortened words in ``text``, e.g. "don't" -> "do not".

    NOTE(review): relies on a module-level ``cont`` object that is not defined
    in the visible part of this notebook - presumably a contraction expander
    created elsewhere; confirm before calling.
    """
    # ``expand_texts`` takes a batch, so wrap the single string in a list and
    # unwrap the single result afterwards.
    expanded_batch = list(cont.expand_texts([text], precise=True))
    return expanded_batch[0]

In [11]:
def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences``.

    Each item is converted with ``str()`` and passed to gensim's
    ``simple_preprocess``. ``deacc=True`` strips accent marks from the tokens;
    punctuation is already discarded by the tokenization itself.

    Yields:
        One list of tokens per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )

# Materialize the generator so the token lists can be indexed and reused below.
data_words = list(sent_to_words(data))

# Spot-check: tokens of the second review.
print(data_words[1])

['ok', 'nothing', 'special', 'charge', 'diamond', 'member', 'hilton', 'decided', 'chain', 'shot', 'th', 'anniversary', 'seattle', 'start', 'booked', 'suite', 'paid', 'extra', 'website', 'description', 'not', 'suite', 'bedroom', 'bathroom', 'standard', 'hotel', 'room', 'took', 'printed', 'reservation', 'desk', 'showed', 'said', 'things', 'like', 'tv', 'couch', 'ect', 'desk', 'clerk', 'told', 'oh', 'mixed', 'suites', 'description', 'kimpton', 'website', 'sorry', 'free', 'breakfast', 'got', 'kidding', 'embassy', 'suits', 'sitting', 'room', 'bathroom', 'bedroom', 'unlike', 'kimpton', 'calls', 'suite', 'day', 'stay', 'offer', 'correct', 'false', 'advertising', 'send', 'kimpton', 'preferred', 'guest', 'website', 'email', 'asking', 'failure', 'provide', 'suite', 'advertised', 'website', 'reservation', 'description', 'furnished', 'hard', 'copy', 'reservation', 'printout', 'website', 'desk', 'manager', 'duty', 'did', 'not', 'reply', 'solution', 'send', 'email', 'trip', 'guest', 'survey', 'did',

In [12]:
# Build the bigram and trigram models
# The bigram model is trained directly on the tokenized reviews.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
# The trigram model is trained on the bigram-transformed reviews.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram [same Output can be received by just using the above code as well]
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example: apply the bigram model first, then the trigram model, to the first review.
print(trigram_mod[bigram_mod[data_words[0]]])

['nice', 'hotel', 'expensive', 'parking', 'got', 'good', 'deal', 'stay', 'hotel', 'anniversary', 'arrived', 'late', 'evening', 'took', 'advice', 'previous', 'reviews', 'did', 'valet_parking', 'check', 'quick', 'easy', 'little', 'disappointed', 'non_existent', 'view', 'room', 'room', 'clean', 'nice', 'size', 'bed', 'comfortable', 'woke', 'stiff', 'neck', 'high', 'pillows', 'not', 'soundproof', 'like', 'heard', 'music', 'room', 'night', 'morning', 'loud', 'bangs', 'doors', 'opening_closing', 'hear', 'people', 'talking', 'hallway', 'maybe', 'just', 'noisy', 'neighbors', 'aveda_bath_products', 'nice', 'did', 'not', 'goldfish', 'stay', 'nice', 'touch', 'taken', 'advantage', 'staying', 'longer', 'location', 'great', 'walking_distance', 'shopping', 'overall', 'nice', 'experience', 'having', 'pay', 'parking', 'night']


In [13]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Re-tokenize each document and drop every token found in ``stop_words``.

    Args:
        texts: Iterable of documents. Each one is converted with ``str()`` and
            tokenized by ``simple_preprocess`` before filtering.

    Returns:
        A list holding, for every document, the list of tokens that survived.
    """
    # Build the lookup set once; testing membership against the module-level
    # ``stop_words`` list would rescan that list for every single token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Run every token list in ``texts`` through the fitted bigram model."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Run every token list through the bigram model, then the trigram model."""
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize documents, keeping only tokens whose POS tag is allowed.

    Tag reference: https://spacy.io/api/annotation

    Args:
        texts: Iterable of token lists. Each list is joined with single spaces
            and processed by the module-level spaCy pipeline ``nlp``, which
            must be loaded before this function is called.
        allowed_postags: Collection of ``token.pos_`` values to keep. The
            default is an immutable tuple rather than a shared mutable list.

    Returns:
        A list with one list of lemmas per input document.
    """
    joined_docs = (" ".join(sent) for sent in texts)
    # nlp.pipe streams the documents through the pipeline in batches instead
    # of invoking the pipeline once per document.
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [14]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize the spaCy English model with the parser and NER components
# disabled (they are not needed for lemmatization, so skipping them is faster).
# Install with: python3 -m spacy download en_core_web_sm
try:
    # Full package name: the 'en' shortcut no longer exists in spaCy 3.
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
except OSError:
    # Legacy spaCy 2.x installs may only provide the 'en' shortcut link.
    nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Spot-check: lemmas of the first review.
print(data_lemmatized[:1])

[['nice', 'hotel', 'expensive', 'parking', 'get', 'good', 'deal', 'stay', 'hotel', 'anniversary', 'arrive', 'late', 'evening', 'take', 'advice', 'previous', 'review', 'valet_parke', 'check', 'quick', 'easy', 'little', 'disappointed', 'view', 'room', 'room', 'clean', 'nice', 'size', 'bed', 'comfortable', 'wake', 'stiff', 'neck', 'high', 'pillow', 'soundproof', 'hear', 'music', 'room', 'night', 'morning', 'loud', 'bang', 'door', 'opening_close', 'hear', 'people', 'talk', 'maybe', 'noisy', 'neighbor', 'product', 'nice', 'goldfish', 'stay', 'nice', 'touch', 'take', 'advantage', 'stay', 'long', 'location', 'great', 'walking_distance', 'shopping', 'overall', 'nice', 'experience', 'pay', 'parking', 'night']]


In [15]:
# Create Dictionary
# Built from the lemmatized reviews; passed to the LDA model below as `id2word`.
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
# `texts` is just another name for the lemmatized token lists (no copy is made).
texts = data_lemmatized

# Term Document Frequency
# Each review becomes whatever `doc2bow` returns for its token list.
corpus = [id2word.doc2bow(text) for text in texts]

# View
#print(corpus[:1])

# Run the LDA Model

In [16]:
# Build LDA model
# Fits a 2-topic model on the bag-of-words corpus built above.
# `random_state` is fixed at 100 so the same seed is used on every run.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=2, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=200,
                                           passes=20,
                                           alpha='auto',
                                           per_word_topics=True)

In [17]:
# Print the top 10 keywords of each of the 2 topics
pprint(lda_model.print_topics(num_topics=2, num_words=10))
# Apply the model to the whole corpus; `doc_lda` is not used again in the visible cells.
doc_lda = lda_model[corpus]

[(0,
  '0.012*"food" + 0.012*"room" + 0.011*"time" + 0.011*"day" + 0.011*"good" + '
  '0.011*"resort" + 0.010*"pool" + 0.010*"go" + 0.009*"get" + 0.009*"people"'),
 (1,
  '0.049*"hotel" + 0.042*"room" + 0.026*"stay" + 0.017*"great" + 0.015*"good" '
  '+ 0.013*"staff" + 0.011*"night" + 0.010*"location" + 0.010*"breakfast" + '
  '0.008*"nice"')]


# Print the Dominant Topic of Each Word
## Many words carry weight in more than one topic, so each word is assigned only the single topic in which its weight is highest

In [18]:
# Build one two-column table (word, weight) per topic, labelling the weight
# column with the topic id. num_words covers the whole vocabulary.
topic_tables = []
for topic_num, topic_words in lda_model.show_topics(formatted = False, num_topics = 2, num_words=len(lda_model.id2word)):
    topic_table = pd.DataFrame(topic_words)
    topic_table.columns = ['word', topic_num]
    topic_tables.append(topic_table)

# Start from the first topic's table and left-join every further topic onto it
# by word, giving one row per word and one weight column per topic.
temp1 = topic_tables[0]
for temp2 in topic_tables[1:]:
    temp1 = pd.merge(temp1, temp2, on = "word", how = 'left')
temp1

Unnamed: 0,word,0,1
0,food,1.215058e-02,1.246198e-03
1,room,1.199549e-02,4.227054e-02
2,time,1.104725e-02,4.244115e-03
3,day,1.082985e-02,4.205441e-03
4,good,1.067747e-02,1.501901e-02
...,...,...,...
31304,marscarpone,6.265603e-07,5.269287e-07
31305,objectional,6.265593e-07,5.268964e-07
31306,andalucia,6.265593e-07,5.268965e-07
31307,onstreet,6.265591e-07,5.268959e-07


In [19]:
# Extract, for every word, the topic in which it has the highest weight.
# Work on a copy so the merged table `temp1` is left untouched and this cell
# gives the same result when it is run again.
words = temp1.copy()

# Every column other than the word itself (and a previously added result
# column) holds one topic's weights, so this works for any number of topics
# rather than assuming exactly two.
topic_cols = [col for col in words.columns if col not in ('word', 'main_topic')]

# argmax yields the position of the first largest weight in each row, i.e. the
# 0-based index of the winning topic column. Missing weights count as 0.
main_topic = words[topic_cols].fillna(0).values.argmax(axis=1).tolist()
words['main_topic'] = main_topic

# If all you want is the word and its main topic save it into this dataframe
word_main_topic = words[['word', 'main_topic']]
word_main_topic


Unnamed: 0,word,main_topic
0,food,0
1,room,1
2,time,0
3,day,0
4,good,1
...,...,...
31304,marscarpone,0
31305,objectional,0
31306,andalucia,0
31307,onstreet,0


# Get Dominant Topic for each Sentence

In [20]:
from collections import Counter
import json

# No longer used by this cell; kept in case other cells rely on these names.
from scipy import stats
import numpy as np

# One-off lookup table word -> dominant topic, so each sentence needs only
# dictionary lookups instead of a scan over the whole vocabulary frame.
# Assumes each word occurs once in `word_main_topic` - TODO confirm.
word_to_topic = dict(zip(word_main_topic['word'].tolist(),
                         word_main_topic['main_topic'].tolist()))


def dominant_topics_by_sentence(review, word_to_topic):
    """Map each sentence of ``review`` to its most frequent word topic.

    Args:
        review: Raw review text. Commas and full stops both end a sentence.
        word_to_topic: Mapping from a word to its dominant topic id.

    Returns:
        A dict ``{sentence: str(topic)}``. Sentences with no known word are
        left out; a repeated sentence keeps the result of its last occurrence.
    """
    sentence_topics = {}
    for sentence in review.replace(',', '.').split('.'):
        # Each distinct known word of the sentence contributes one vote.
        votes = Counter(
            word_to_topic[token]
            for token in set(sentence.split(' '))
            if token in word_to_topic
        )
        if not votes:
            continue
        # Most frequent topic wins; ties go to the smallest topic id.
        dominant = min(votes, key=lambda topic: (-votes[topic], topic))
        sentence_topics[sentence] = str(dominant)
    return sentence_topics


# Review number -> {sentence: dominant topic}
dict2 = {}
for review_no in range(len(df)):
    dict2[review_no] = dominant_topics_by_sentence(df.loc[review_no, 'Review'], word_to_topic)

with open('dominantTopic_sentences1.json','w') as file:
    json.dump(dict2,file)