#Load all hotel reviews

In [0]:
from google.colab import drive
import os
drive.mount("/content/gdrive")

import pandas as pd

if os.path.isfile("/content/gdrive/My Drive/Data/new-york-city.csv"):
  with open('/content/gdrive/My Drive/Data/new-york-city.csv', 'r') as f:
    reviews = pd.read_csv(f, sep="\t",  header=None, usecols=[0,1,2,3], quoting=3,
                    names = ["Hotel Name", "Date of Review", "Review Headline", "Review Text"])
    reviews["Review Headline"] = reviews["Review Headline"].str.lower() # convert all review headlines to lowercase
    reviews["Review Text"] = reviews["Review Text"].str.lower() # convert all review text to lowercase

    print("Reviews file read successfully")
else:
  print("Data folder does not contain 'new-york-city.csv'")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Reviews file read successfully


In [0]:
import spacy
from spacy import displacy
nlp = spacy.load('en')

import re
from collections import Counter
import time

Search for particular parts of speech in hotel reviews. Change the second line to find a particular part of speech.

In [0]:
maximum_number_of_reviews_to_process = 2000
target_pos = 'ADJ'

column_name = "Review Text"
all_review_text = reviews[column_name].dropna()

noun_counter = Counter()
last_time = -1
for review_number, text in all_review_text.iteritems():
  if review_number % 250 == 0:
    print("processing review ", review_number, 'of', maximum_number_of_reviews_to_process,' reviews so far. ', end = " ")
    if last_time != -1:
      current_time = time.time()
      elapsed_time = current_time - last_time
      reviews_per_minute = 60 * 500 / elapsed_time 
      print("{:.0f} reviews / minute".format(reviews_per_minute))
    else:
      print()
    last_time = time.time()
  doc = nlp(text)
  for token in doc:
    if token.pos_ == target_pos:
      noun_counter[token.text] += 1
  
  if review_number > maximum_number_of_reviews_to_process:
    break

print('Done.')

processing review  0 of 2000  reviews so far.  
processing review  500 of 2000  reviews so far.  927 reviews / minute
processing review  750 of 2000  reviews so far.  1896 reviews / minute
processing review  1000 of 2000  reviews so far.  1568 reviews / minute
processing review  1250 of 2000  reviews so far.  1894 reviews / minute
processing review  1500 of 2000  reviews so far.  1687 reviews / minute
processing review  1750 of 2000  reviews so far.  1738 reviews / minute
processing review  2000 of 2000  reviews so far.  2058 reviews / minute
Done.


Print the most common words that are of a particular part of speech. Change the first line number to change how many words are printed.

In [0]:
number_of_words_to_print = 60  # change this to show a different number of teh most common nouns
nouns_to_delete = ["i","we", "it","you","they"] # this list must have only lowercase words

for noun in nouns_to_delete:
  if noun in noun_counter:
    del noun_counter[noun]

print(len(noun_counter),' words in counter.')
print()
print("MOST COMMON Words")
print("Word\tFrequency:")
for word, frequency in noun_counter.most_common(number_of_words_to_print):
  print("{}\t{:,}".format(word, frequency))

2616  words in counter.

MOST COMMON Words
Word	Frequency:
our	2,089
great	1,618
my	1,448
which	1,026
good	1,010
new	895
nice	833
clean	812
small	804
square	756
that	598
friendly	557
helpful	556
comfortable	544
your	520
other	504
central	457
nyc	448
little	422
front	403
large	375
next	365
first	361
all	358
their	351
excellent	344
more	334
big	319
best	294
few	291
its	280
many	271
free	270
perfect	266
quiet	251
fantastic	249
only	248
better	246
huge	236
spacious	224
double	222
much	212
old	193
last	192
most	190
easy	190
close	182
same	179
lovely	176
wonderful	173
safe	170
available	160
sure	158
second	157
high	156
extra	156
right	155
hot	155
able	151
expensive	144
