# Load all hotel reviews

As before, load the hotel reviews file (`new-york-city.csv`) from the `Data` folder of your Google Drive account.

In [0]:
from google.colab import drive
import os
drive.mount("/content/gdrive")

import pandas as pd

# Location of the tab-separated hotel reviews file inside the mounted Drive.
reviews_path = "/content/gdrive/My Drive/Data/new-york-city.csv"

if not os.path.isfile(reviews_path):
  print("Data folder does not contain 'new-york-city.csv'")
else:
  column_names = ["Hotel Name", "Date of Review", "Review Headline", "Review Text"]
  with open(reviews_path, 'r') as f:
    # Only the first four columns are read; quoting=3 is passed through to the
    # parser unchanged (presumably csv.QUOTE_NONE — confirm against the csv module).
    reviews = pd.read_csv(f, sep="\t", header=None, usecols=[0,1,2,3], quoting=3,
                          names=column_names)

  # Lowercase both free-text columns.
  for text_column in ("Review Headline", "Review Text"):
    reviews[text_column] = reviews[text_column].str.lower()

  print("Reviews file read successfully")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Reviews file read successfully


In [0]:
# Keep only the review text column and skip empty (missing) reviews.
review_text = reviews["Review Text"].dropna()

# Plain Python list of review strings, in their original order.
# Series.iteritems() was removed in pandas 2.0; tolist() produces the same
# values in the same order and works on every pandas version.
documents = review_text.tolist()




---



---



In [0]:
#@title Set topic modeling algorithm arguments

# Number of topics to extract; passed as n_components to both NMF and LDA below.
no_topics = 2 #@param {type:"integer"}

# Number of highest-weighted words printed for each topic.
no_top_words = 6 #@param {type:"integer"}

# Number of highest-weighted documents printed for each topic.
no_top_documents = 3 #@param {type:"integer"}

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import numpy as np

In [0]:
#@title Run NMF

def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
    """Print the strongest words and documents for every topic.

    Each row of ``H`` is treated as one topic's weights over ``feature_names``;
    column ``k`` of ``W`` is treated as every document's weight for topic ``k``.
    For each topic this prints a blank line, a "Topic k: w1, w2, ..." header
    with the ``no_top_words`` highest-weighted features, another blank line,
    and then the ``no_top_documents`` highest-weighted entries of ``documents``.
    """
    for topic_idx, topic in enumerate(H):
        # Feature indices ordered from largest to smallest weight.
        ranked_features = topic.argsort()[::-1][:no_top_words]
        top_words = [feature_names[feature_idx] for feature_idx in ranked_features]

        print()
        print("Topic {}:".format(topic_idx), ", ".join(top_words))
        print()

        # Document indices ordered from largest to smallest weight for this topic.
        document_weights = W[:, topic_idx]
        ranked_documents = np.argsort(document_weights)[::-1]
        for doc_index in ranked_documents[:no_top_documents]:
            print(documents[doc_index])

# NMF is able to use tf-idf
# max_df / min_df drop terms that appear in more than 95% of the documents or in fewer than 2 documents
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its replacement
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Run NMF
# The single `alpha` argument was removed in scikit-learn 1.2 in favour of alpha_W / alpha_H.
# NOTE(review): the new penalties are multiplied internally by n_features (for W) and
# n_samples (for H), so dividing by those sizes should keep the regularization strength
# of the original alpha=.1 -- confirm against the NMF documentation.
n_docs, n_terms = tfidf.shape
nmf_model = NMF(n_components=no_topics, random_state=1, alpha_W=.1 / n_terms, alpha_H=.1 / n_docs,
                l1_ratio=.5, init='nndsvd').fit(tfidf)
nmf_W = nmf_model.transform(tfidf)  # per-document topic weights
nmf_H = nmf_model.components_       # per-topic term weights

print("NMF Topics")
display_topics(nmf_H, nmf_W, tfidf_feature_names, documents, no_top_words, no_top_documents)
print("--------------")



NMF Topics

Topic 0: hotel, great, room, location, staff, clean

we stayed in this hotel for 9 nights during our stay in new york. we got a great deal on the hotel's website. the location was great, very central to everything. the subway was at the end of the block and we had a great view of the empire state building from our hotel room. the rooms were very clean and had everything we needed for our stay.breakfast in the hotel was expensive and not that great. there was a deli called speedy's a few blocks downtown which had a much better breakfast for a lot less.we would definitely recommend this hotel for anyone staying in new york.
we have been to new york 3 times now , and this is the best hotel we have stayed in , friendly helpful staff , really nice rooms and a great location . we will return there next time in new york
a nice clean hotel in a great location.small rooms, big breakfast, great coffee.overall very good, would stay here again

Topic 1: quot, room, desk, night, hotel, 

In [0]:
#@title Run LDA

# LDA is a probabilistic graphical model, so it is fit on raw term counts rather than tf-idf weights

# CountVectorizer converts a set of documents to a matrix of token counts
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

# fit_transform learns the vocabulary dictionary and returns a document-term matrix
tf = tf_vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its replacement
tf_feature_names = tf_vectorizer.get_feature_names_out()

# Run LDA
lda_model = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)
lda_W = lda_model.transform(tf)  # per-document topic weights
lda_H = lda_model.components_    # per-topic term weights

print("LDA Topics")
display_topics(lda_H, lda_W, tf_feature_names, documents, no_top_words, no_top_documents)