### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import csv
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.pipeline import Pipeline
import pyLDAvis.sklearn
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer,PorterStemmer
from gensim.models import WordEmbeddingSimilarityIndex
from gensim.matutils import softcossim,cossim
import gensim.downloader as api
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix
from sklearn.metrics import confusion_matrix,classification_report

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [2]:
# Load the FastText model
#fasttext_model300 = api.load('fasttext-wiki-news-subwords-300')

### Define useful functions

In [244]:
STOPWORDS = stopwords.words('english')

STOPWORDS = STOPWORDS + ['good','better','could','would','didnt','money','night','need','like','nothing','one',
                         'day','get','time','stay','thing','u','horrible','great','well','ask','never','ever',
                         'recommend','place','back','disgusting','terrible','worst','ok','price','pay','paid',
                         'even','use','bad','picture','anything','everything','really','think','bit','sure',
                         'went','quite','turn','around','dont','feel','work','wasnt','much','complaint','told',
                         'hour','line','also','big','small','option','cant','per','cost','extra','said','took',
                         'leave','hotel','area','old','new','service','say','want','stayed','worth','time',
                         'look','little','way','basic','see','overall','rate']

wn_lemmatizer = WordNetLemmatizer()
p_stemmer = PorterStemmer()

def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [wn_lemmatizer.lemmatize(p_stemmer.stem(t)) for t in tokenized_text if t not in STOPWORDS]
    return cleaned_text

### Import datasets

In [258]:
df_negative_sentences = pd.read_csv('./datasets/df_negative_sentences.csv',lineterminator='\n')

In [259]:
df_positive_sentences = pd.read_csv('./datasets/df_positive_sentences.csv',lineterminator='\n')

### Remove empty reviews

In [260]:
df_negative_sentences = df_negative_sentences[~pd.isnull(df_negative_sentences['review_sentence'])]

In [261]:
df_positive_sentences = df_positive_sentences[~pd.isnull(df_positive_sentences['review_sentence'])]

### Cosine similarity between topics and reviews

In [262]:
noise_topic = 'Noise noisy loud quiet party scream yell voice music thin wall hear talk' + 'Smell smelly smoke odor cigarette stink' + 'Ac heat hot cold warm chilly thermostat cool air conditioning vent ventilation fan adjust heater temperature'
staff_topic = 'Staff rude unfriendly friendly polite impolite front desk manager maid reception valet clerk reception housekeep waiter waitress' + 'Check in out checkin checkout communication experience bag early late reservation booking'
breakfast_topic = 'Breakfast food egg bacon sausage toast waffle fruit omelette omelet cheese milk pastry coffee tea juice silverware plasticware cup plastic included selection taste'
facilities_topic = 'Facility elevator lift work stair floor disability wheelchair pool jacuzzi gym vending machine spa sauna towel renovation bar restaurant lounge pet friendly dinner lunch pit property' + 'WiFi wi fi internet slow connection signal free fast spotty' + 'Park lot car valet street driveway'
#parking_topic = 'Park lot car valet street driveway'
#smell_topic = 'Smell smelly smoke odor cigarette stink'
#ac_heat_topic = 'Ac heat hot cold warm chilly thermostat cool air conditioning vent ventilation fan adjust heater temperature'
#wifi_topic = 'WiFi wi fi internet slow connection signal free fast spotty'
location_topic = 'Location surrounding far traffic highway walk street road neighborhood sketchy attraction center city town downtown nearby near walk transport subway park view safe dangerous drive'
#check_in_out_topic = 'Check in out checkin checkout communication experience bag early late reservation booking'
bathroom_topic = 'Bathroom stain shower tub bathtub curtain pressure sink water toiletry toilet mirror shampoo conditioner towel soap ply paper hair hand face wash vent ventilation fan window'
room_amenities_topic = 'Room tiny small big large stain curtain shade drape light view window tv balcony service work remote wall fridge refrigerator safe machine coffee tea kettle amenity microwave card door'
bed_topic = 'Bed stain sheet linen blanket cover pillow hard soft mattress outlet plug bug bedbug king double queen frame'

# smell_topic, wifi_topic, check_in_out_topic, parking_topic, ac_heat_topic
topics = [noise_topic,staff_topic,breakfast_topic,facilities_topic,location_topic,bathroom_topic,room_amenities_topic,bed_topic]
n_topics = len(topics)

In [263]:
# For gensim we need to tokenize the data and filter out stopwords
tokenized_neg_data = []
for text in df_negative_sentences['review_sentence']:
    tokenized_neg_data.append(clean_text(text))

tokenized_pos_data = []
for text in df_positive_sentences['review_sentence']:
    tokenized_pos_data.append(clean_text(text))
    
tokenized_topics = []    
for text in topics:
    tokenized_topics.append(clean_text(text))
    
tokenized_neg_data_and_topics = tokenized_neg_data + tokenized_topics
tokenized_pos_data_and_topics = tokenized_pos_data + tokenized_topics

In [264]:
# Build a Dictionary - association word to numeric id
neg_dictionary = corpora.Dictionary(tokenized_neg_data_and_topics)
pos_dictionary = corpora.Dictionary(tokenized_pos_data_and_topics)

In [265]:
# Transform the collection of texts to a numerical form
corpus_neg_data = [neg_dictionary.doc2bow(text) for text in tokenized_neg_data]
corpus_pos_data = [pos_dictionary.doc2bow(text) for text in tokenized_pos_data]
corpus_neg_topics = [neg_dictionary.doc2bow(text) for text in tokenized_topics]
corpus_pos_topics = [pos_dictionary.doc2bow(text) for text in tokenized_topics]

In [266]:
# Build similarity matrix with word embeddings from FastText
#termsim_index = WordEmbeddingSimilarityIndex(fasttext_model300)
#similarity_matrix = SparseTermSimilarityMatrix(termsim_index, dictionary)

In [267]:
# Compute cosine similarity between each sentence and each topic
neg_data_topics = []
for review_item in corpus_neg_data:
    review_item_topics = []
    for topic in corpus_neg_topics:
        review_item_topics.append(cossim(review_item,topic))
        #review_item_topics.append(similarity_matrix.inner_product(review_item,topic,normalized=True))
    neg_data_topics.append(review_item_topics)    
    
pos_data_topics = []
for review_item in corpus_pos_data:
    review_item_topics = []
    for topic in corpus_pos_topics:
        review_item_topics.append(cossim(review_item,topic))
        #review_item_topics.append(similarity_matrix.inner_product(review_item,topic,normalized=True))
    pos_data_topics.append(review_item_topics) 

In [268]:
neg_data_closest_topic = []
for review_item_topic_list in neg_data_topics:
    closest_topic_cossim_value = max(review_item_topic_list)
    closest_topic = np.argmax(review_item_topic_list)
    if closest_topic_cossim_value>0.05:
        neg_data_closest_topic.append(closest_topic)
    else:
        neg_data_closest_topic.append(-1)
        
pos_data_closest_topic = []
for review_item_topic_list in pos_data_topics:
    closest_topic_cossim_value = max(review_item_topic_list)
    closest_topic = np.argmax(review_item_topic_list)
    if closest_topic_cossim_value>0.05:
        pos_data_closest_topic.append(closest_topic)
    else:
        pos_data_closest_topic.append(-1)

### Assign topic with highest cosine similarity

In [269]:
df_negative_sentences['review_topic'] = neg_data_closest_topic

In [270]:
df_positive_sentences['review_topic'] = pos_data_closest_topic

In [271]:
df_negative_sentences['review_topic'].value_counts()

-1    69979
 6    55249
 1    20479
 2    18823
 5    18801
 7    16571
 3    15278
 0    11879
 4    11129
Name: review_topic, dtype: int64

### Construct pivot table

In [272]:
df_negative_sentences_by_topic = df_negative_sentences.groupby(['hotel_url','review_topic']).size().reset_index()
df_negative_sentences_by_topic.rename({0:'review_topic_count'},axis=1,inplace=True)

In [273]:
df_positive_sentences_by_topic = df_positive_sentences.groupby(['hotel_url','review_topic']).size().reset_index()
df_positive_sentences_by_topic.rename({0:'review_topic_count'},axis=1,inplace=True)

In [274]:
df_negative_sentences_by_topic_pt = df_negative_sentences_by_topic.pivot_table(values='review_topic_count',index='hotel_url',columns='review_topic').reset_index()
df_negative_sentences_by_topic_pt.fillna(0,inplace=True)

In [275]:
df_positive_sentences_by_topic_pt = df_positive_sentences_by_topic.pivot_table(values='review_topic_count',index='hotel_url',columns='review_topic').reset_index()
df_positive_sentences_by_topic_pt.fillna(0,inplace=True)

### Normalize each count by total number of (tagged) negative sentences per hotel

In [276]:
df_negative_sentences_count_by_hotel = df_negative_sentences.groupby('hotel_url').count().reset_index()[['hotel_url','review_topic']]
df_negative_sentences_count_by_hotel.rename({'review_topic':'sentences_count'},axis=1,inplace=True)

In [277]:
df_positive_sentences_count_by_hotel = df_positive_sentences.groupby('hotel_url').count().reset_index()[['hotel_url','review_topic']]
df_positive_sentences_count_by_hotel.rename({'review_topic':'sentences_count'},axis=1,inplace=True)

In [278]:
df_negative_sentences_by_topic_pt = df_negative_sentences_by_topic_pt.merge(df_negative_sentences_count_by_hotel,on='hotel_url')

In [279]:
df_positive_sentences_by_topic_pt = df_positive_sentences_by_topic_pt.merge(df_positive_sentences_count_by_hotel,on='hotel_url')

In [280]:
# obtaining number of tagged negative sentences for normalization
#df_negative_sentences_by_topic_pt['sentences_count']=df_negative_sentences_by_topic_pt['sentences_count']-df_negative_sentences_by_topic_pt[-1]
#df_negative_sentences_by_topic_pt = df_negative_sentences_by_topic_pt[df_negative_sentences_by_topic_pt['sentences_count']!=0.0]

#df_positive_sentences_by_topic_pt['sentences_count']=df_positive_sentences_by_topic_pt['sentences_count']-df_positive_sentences_by_topic_pt[-1]
#df_positive_sentences_by_topic_pt = df_positive_sentences_by_topic_pt[df_positive_sentences_by_topic_pt['sentences_count']!=0.0]

In [281]:
df_negative_sentences_by_topic_pt[[str(n)+'_pc' for n in range(-1,n_topics)]] = df_negative_sentences_by_topic_pt[[n for n in range(-1,n_topics)]].div(df_negative_sentences_by_topic_pt.sentences_count, axis=0)

In [282]:
df_positive_sentences_by_topic_pt[[str(n)+'_pc' for n in range(-1,n_topics)]] = df_positive_sentences_by_topic_pt[[n for n in range(-1,n_topics)]].div(df_positive_sentences_by_topic_pt.sentences_count, axis=0)

### Write out to csv

In [283]:
df_negative_sentences_by_topic_pt.to_csv('./datasets/df_negative_sentences_by_topic_pt.csv',index=False)

In [284]:
df_positive_sentences_by_topic_pt.to_csv('./datasets/df_positive_sentences_by_topic_pt.csv',index=False)

### Validate topic clustering against manually-annotated entries

In [285]:
df_negative_sentences_annotated = pd.read_csv('./datasets/df_negative_sentences_annotated.csv',lineterminator='\n')

In [286]:
df_negative_sentences_annotated['review_topic_annotated']=df_negative_sentences_annotated['review_topic_annotated'].replace(4,3)
df_negative_sentences_annotated['review_topic_annotated']=df_negative_sentences_annotated['review_topic_annotated'].replace(5,0)
df_negative_sentences_annotated['review_topic_annotated']=df_negative_sentences_annotated['review_topic_annotated'].replace(6,0)
df_negative_sentences_annotated['review_topic_annotated']=df_negative_sentences_annotated['review_topic_annotated'].replace(7,3)
df_negative_sentences_annotated['review_topic_annotated']=df_negative_sentences_annotated['review_topic_annotated'].replace(9,1)

df_negative_sentences_annotated['review_topic_annotated']=df_negative_sentences_annotated['review_topic_annotated'].replace(8,4)
df_negative_sentences_annotated['review_topic_annotated']=df_negative_sentences_annotated['review_topic_annotated'].replace(10,5)
df_negative_sentences_annotated['review_topic_annotated']=df_negative_sentences_annotated['review_topic_annotated'].replace(11,6)
df_negative_sentences_annotated['review_topic_annotated']=df_negative_sentences_annotated['review_topic_annotated'].replace(12,7)

In [287]:
df_negative_sentences_annotated = df_negative_sentences_annotated[['review_date','review_sentence','review_topic_annotated']]

In [288]:
df_negative_sentences_topic_validation = df_negative_sentences.merge(df_negative_sentences_annotated,on=['review_date','review_sentence'])
df_negative_sentences_topic_validation = df_negative_sentences_topic_validation[~pd.isnull(df_negative_sentences_topic_validation['review_topic_annotated'])]
df_negative_sentences_topic_validation['review_topic_annotated'] = df_negative_sentences_topic_validation['review_topic_annotated'].apply(lambda x:int(x))

In [289]:
# classification report on manually-annotated data
print(classification_report(df_negative_sentences_topic_validation['review_topic_annotated'],df_negative_sentences_topic_validation['review_topic']))

              precision    recall  f1-score   support

          -1       0.90      0.88      0.89       152
           0       0.74      0.45      0.56        31
           1       0.70      0.61      0.65        23
           2       0.81      0.96      0.88        26
           3       0.84      0.54      0.66        48
           4       0.58      0.58      0.58        19
           5       0.85      0.92      0.88        25
           6       0.52      0.81      0.64        42
           7       0.77      1.00      0.87        23

    accuracy                           0.78       389
   macro avg       0.75      0.75      0.73       389
weighted avg       0.80      0.78      0.78       389



## Modeling

In [255]:
NUM_TOPICS = 8

In [256]:
# Build the LDA model
lda_model = models.LdaMulticore(corpus=corpus_neg_data, num_topics=NUM_TOPICS, id2word=neg_dictionary)

In [257]:
print("LDA Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 15))

LDA Model:
Topic #0: 0.112*"room" + 0.064*"bathroom" + 0.053*"clean" + 0.030*"door" + 0.025*"need" + 0.020*"toilet" + 0.016*"shower" + 0.015*"dirti" + 0.014*"light" + 0.014*"broken" + 0.010*"updat" + 0.009*"sink" + 0.008*"mold" + 0.007*"floor" + 0.007*"open"
Topic #1: 0.045*"room" + 0.035*"staff" + 0.021*"smoke" + 0.019*"coffe" + 0.017*"tv" + 0.017*"facil" + 0.016*"chang" + 0.014*"wait" + 0.013*"check" + 0.012*"time" + 0.011*"renov" + 0.010*"rude" + 0.010*"minut" + 0.009*"cigarett" + 0.009*"lobbi"
Topic #2: 0.069*"room" + 0.021*"nice" + 0.018*"wifi" + 0.013*"staff" + 0.012*"experi" + 0.011*"expect" + 0.010*"pictur" + 0.010*"move" + 0.009*"refriger" + 0.008*"filthi" + 0.008*"luggag" + 0.008*"book" + 0.007*"issu" + 0.006*"3" + 0.006*"absolut"
Topic #3: 0.121*"breakfast" + 0.027*"air" + 0.024*"room" + 0.019*"food" + 0.016*"poor" + 0.016*"condit" + 0.014*"includ" + 0.012*"coffe" + 0.010*"high" + 0.009*"expens" + 0.008*"noisi" + 0.008*"condition" + 0.008*"free" + 0.007*"aw" + 0.007*"limit"


In [64]:
#text = "The staff was very unfriendly"
#bow_trial = neg_dictionary.doc2bow(clean_text(text))
corpus_neg_data_lda_output = lda_model[corpus_neg_data]

In [None]:
neg_data_closest_topic = []
for review_item in corpus_neg_data_lda_output:
    review_item_sorted = sorted(review_item,key=lambda x:-x[1])
    if review_item_sorted[0][1]>0.05:
        neg_data_closest_topic.append(review_item_sorted[0][0])
    else:
        neg_data_closest_topic.append(-1)