### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import csv
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.pipeline import Pipeline
import pyLDAvis.sklearn
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer,PorterStemmer
from gensim.matutils import softcossim,cossim

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


### Define useful functions

In [2]:
STOPWORDS = stopwords.words('english')
wn_lemmatizer = WordNetLemmatizer()
p_stemmer = PorterStemmer()

def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [wn_lemmatizer.lemmatize(p_stemmer.stem(t)) for t in tokenized_text if t not in STOPWORDS]
    return cleaned_text

### Import dataset

In [149]:
df_negative_sentences = pd.read_csv('./datasets/df_negative_sentences.csv',lineterminator='\n')

### Remove empty reviews

In [150]:
df_negative_sentences = df_negative_sentences[~pd.isnull(df_negative_sentences['review_sentence'])]

### Cosine similarity between topics and reviews

In [151]:
noise_topic = 'Noise noisy loud quiet party scream yell voice music thin wall hear talk'
staff_topic = 'Staff rude unfriendly friendly polite impolite front desk manager maid reception valet clerk reception'
breakfast_topic = 'Breakfast food egg bacon sausage toast waffle fruit omelette omelet cheese coffee tea juice silverware plasticware cup plastic included selection taste'
facilities_topic = 'Facility elevator lift work stairs disability wheelchair pool gym vending machine spa sauna towel renovation bar restaurant pet friendly dinner lunch'
parking_topic = 'Park lot car valet street'
smell_topic = 'Smell smelly smoke odor cigarette'
ac_heat_topic = 'Ac heat hot cold warm chilly thermostat cool air conditioning vent ventilation fan adjust heater'
wifi_topic = 'WiFi wi fi internet slow connection signal free fast spotty'
location_topic = 'Location far traffic highway walk street road neighborhood neighbourhood sketchy attraction center city town downtown nearby near walk transport subway park view safe dangerous'
check_in_out_topic = 'Check in out checkin checkout communication experience bag early late reservation booking'
bathroom_topic = 'Bathroom stain shower tub bathtub curtain pressure sink water toiletry toilet mirror shampoo conditioner towel soap ply paper hair hand face wash vent ventilation fan window'
room_amenities_topic = 'Room tiny small big large stain curtain shade drape light view window tv balcony service work remote wall fridge refrigerator safe machine coffee tea amenity microwave card door'
bed_topic = 'Bed stain sheet linen cover pillow hard soft mattress outlet plug bug bedbug king double queen frame'

topics = [noise_topic,staff_topic,breakfast_topic,facilities_topic,parking_topic,smell_topic,ac_heat_topic,wifi_topic,location_topic,check_in_out_topic,bathroom_topic,room_amenities_topic,bed_topic]
n_topics = len(topics)

In [152]:
# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = []
for text in df_negative_sentences['review_sentence']:
    tokenized_data.append(clean_text(text))
    
tokenized_topics = []    
for text in topics:
    tokenized_topics.append(clean_text(text))
    
tokenized_data_and_topics = tokenized_data + tokenized_topics

In [153]:
# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data_and_topics)

In [154]:
# Transform the collection of texts to a numerical form
corpus_data = [dictionary.doc2bow(text) for text in tokenized_data]
corpus_topics = [dictionary.doc2bow(text) for text in tokenized_topics]

In [155]:
# Compute cosine similarity between each sentence and each topic
data_topics = []
for review_item in corpus_data:
    review_item_topics = []
    for topic in corpus_topics:
        review_item_topics.append(cossim(review_item,topic))
    data_topics.append(review_item_topics)    

In [156]:
data_closest_topic = []
for review_item_topic_list in data_topics:
    closest_topic_cossim_value = max(review_item_topic_list)
    closest_topic = np.argmax(review_item_topic_list)
    if closest_topic_cossim_value>0.05:
        data_closest_topic.append(closest_topic)
    else:
        data_closest_topic.append(-1)

### Assign topic with highest cosine similarity

In [157]:
df_negative_sentences['review_topic'] = data_closest_topic

### Construct pivot table

In [158]:
df_negative_sentences_by_topic = df_negative_sentences.groupby(['hotel_url','review_topic']).size().reset_index()
df_negative_sentences_by_topic.rename({0:'review_topic_count'},axis=1,inplace=True)

In [159]:
df_negative_sentences_by_topic_pt = df_negative_sentences_by_topic.pivot_table(values='review_topic_count',index='hotel_url',columns='review_topic').reset_index()
df_negative_sentences_by_topic_pt.fillna(0,inplace=True)

### Normalize each count by total number of negative sentences per hotel

In [160]:
df_negative_sentences_count_by_hotel = df_negative_sentences.groupby('hotel_url').count().reset_index()[['hotel_url','review_topic']]
df_negative_sentences_count_by_hotel.rename({'review_topic':'sentences_count'},axis=1,inplace=True)

In [161]:
df_negative_sentences_by_topic_pt = df_negative_sentences_by_topic_pt.merge(df_negative_sentences_count_by_hotel,on='hotel_url')

In [162]:
df_negative_sentences_by_topic_pt[[n for n in range(-1,n_topics)]] = df_negative_sentences_by_topic_pt[[n for n in range(-1,n_topics)]].div(df_negative_sentences_by_topic_pt.sentences_count, axis=0)

### Write out to csv

In [163]:
df_negative_sentences_by_topic_pt.to_csv('./datasets/df_negative_sentences_by_topic_pt.csv',index=False)

In [164]:
df_negative_sentences_by_topic_pt.head()

Unnamed: 0,hotel_url,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,sentences_count
0,-long-island-islip-courthouse-complex.en-gb,0.311111,0.022222,0.022222,0.155556,0.111111,0.044444,0.0,0.022222,0.044444,0.0,0.066667,0.111111,0.044444,0.044444,45
1,1-brooklyn-bridge.en-gb,0.304,0.024,0.072,0.064,0.072,0.016,0.0,0.032,0.008,0.024,0.072,0.04,0.264,0.008,125
2,1-hotel-central-park.en-gb,0.29771,0.022901,0.030534,0.061069,0.053435,0.030534,0.007634,0.038168,0.007634,0.022901,0.083969,0.091603,0.21374,0.038168,131
3,1000-islands-harbor.en-gb,0.28,0.08,0.053333,0.08,0.16,0.0,0.0,0.026667,0.026667,0.026667,0.026667,0.026667,0.173333,0.04,75
4,11-howard.en-gb,0.288732,0.028169,0.070423,0.007042,0.049296,0.035211,0.007042,0.028169,0.014085,0.014085,0.084507,0.06338,0.246479,0.06338,142


### Validate topic clustering against manually-annotated entries

In [165]:
df_negative_sentences_annotated = pd.read_csv('./datasets/df_negative_sentences_annotated.csv',lineterminator='\n')

In [166]:
df_negative_sentences_annotated = df_negative_sentences_annotated[['review_date','review_sentence','review_topic_annotated']]

In [167]:
df_negative_sentences_topic_validation = df_negative_sentences.merge(df_negative_sentences_annotated,on=['review_date','review_sentence'])
df_negative_sentences_topic_validation = df_negative_sentences_topic_validation[~pd.isnull(df_negative_sentences_topic_validation['review_topic_annotated'])]
df_negative_sentences_topic_validation['review_topic_annotated'] = df_negative_sentences_topic_validation['review_topic_annotated'].apply(lambda x:int(x))

In [168]:
# accuracy on manually-annotated entries

In [169]:
sum(df_negative_sentences_topic_validation['review_topic']==df_negative_sentences_topic_validation['review_topic_annotated'])/len(df_negative_sentences_topic_validation['review_topic'])

0.7746478873239436

## Modeling

In [None]:
def text_process(review_sentence):
        return [word.lower() for word in review_sentence.split() if word.lower() not in stopwords.words('english')]
    
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))                        

    print()

In [6]:
NUM_TOPICS = 10


In [11]:
# Build the LDA model
lda_model = models.LdaMulticore(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [12]:
print("LDA Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))

LDA Model:
Topic #0: 0.054*"room" + 0.015*"clean" + 0.013*"stay" + 0.012*"get" + 0.010*"night" + 0.010*"would" + 0.010*"bed" + 0.010*"hotel" + 0.009*"day" + 0.009*"didnt"
Topic #1: 0.022*"breakfast" + 0.021*"room" + 0.020*"bathroom" + 0.014*"small" + 0.012*"shower" + 0.011*"dirty" + 0.010*"check" + 0.008*"floor" + 0.008*"toilet" + 0.008*"door"
Topic #2: 0.027*"room" + 0.015*"small" + 0.012*"could" + 0.011*"hotel" + 0.010*"bed" + 0.010*"stay" + 0.009*"rooms" + 0.008*"get" + 0.007*"front" + 0.007*"elevators"
Topic #3: 0.032*"room" + 0.020*"staff" + 0.016*"small" + 0.015*"rooms" + 0.010*"one" + 0.009*"bed" + 0.008*"desk" + 0.008*"bathroom" + 0.008*"didnt" + 0.007*"rude"
Topic #4: 0.015*"room" + 0.015*"could" + 0.014*"hotel" + 0.014*"nothing" + 0.010*"would" + 0.010*"price" + 0.010*"dont" + 0.008*"place" + 0.007*"desk" + 0.006*"didnt"
Topic #5: 0.019*"hotel" + 0.017*"room" + 0.013*"night" + 0.012*"didnt" + 0.011*"bathroom" + 0.011*"door" + 0.010*"like" + 0.010*"would" + 0.009*"stay" + 0.00

In [None]:
text = "The staff was very unfriendly"
bow_trial = dictionary.doc2bow(clean_text(text))
lda_model[bow_trial]

In [None]:
###############################################################

### Create model pipeline: Vectorization (BoW), LDA

In [None]:
NUM_TOPICS=5
N_TOP_WORDS=20

In [16]:
tf_vectorizer = CountVectorizer(analyzer=text_process)
tf = tf_vectorizer.fit_transform(df_negative_sentences['review_sentence'][0:50000])

KeyboardInterrupt: 

In [None]:
lda = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')

In [None]:
lda.fit(tf)

In [None]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names,N_TOP_WORDS)

In [None]:
##############################################################

In [None]:
doc_topic = lda.transform(tf)

In [None]:
df_negative_sentences['review_sentence'][0]

In [None]:
doc_topic[0]

In [None]:
for n in range(doc_topic.shape[0]):
    topic_most_pr = doc_topic[n].argmax()
    print("doc: {} topic: {}\n".format(n,topic_most_pr))