<h1>GrEx3: Hotel Customers Have Their Say About Brands</h1><br>
Eric Randall<br>
PREDICT 420: Summer 2017

In [2]:
import pandas as pd
import numpy as np
import glob
import os
import json
import math
import re
import stop_words
from stemming.porter2 import stem
import nltk

<b>Part 1: Providing structure to our json files</b><br>The first part of our task is to create a list of json files in the data directory and loop through all of them to add all the data we need to a temporary dictionary that will be merged into a series at the end of the loop. Try/Except is applied to catch reviews that chose not to fill out all of the fields in the survey.

In [3]:
json_files = glob.glob("data\\*.json")
reviewsmaster = []
for files in json_files:
    with open(files) as input_file:
        jsondat=json.load(input_file)
    for review in jsondat['Reviews']:
        temp_dict = {}
        temp_dict['Author'] = review['Author'].encode("utf-8")
        temp_dict['Date'] = review['Date'].encode("utf-8")
        temp_dict['ReviewID'] = review['ReviewID'].encode("utf-8")
        temp_dict['HotelNameURL'] = jsondat['HotelInfo']['HotelURL'].encode("utf-8")
        temp_dict['HotelID'] = jsondat['HotelInfo']['HotelID'].encode("utf-8")
        try:
            temp_dict['HotelName'] = str(jsondat['HotelInfo']['Name'].encode("utf-8"))
        except:
            temp_dict['HotelName'] = str("")
        try:
            temp_dict['RCleanliness'] = int(review['Ratings']['Cleanliness'].encode("utf-8"))
        except:
            temp_dict['RCleanliness'] = np.nan
        try:
            temp_dict['RLocation'] = int(review['Ratings']['Location'].encode("utf-8"))
        except:
            temp_dict['RLocation'] = np.nan
        try:
            temp_dict['ROverall'] = int(review['Ratings']['Overall'].encode("utf-8"))
        except:
            temp_dict['ROverall'] = np.nan
        try:
            temp_dict['RRooms'] = int(review['Ratings']['Rooms'].encode("utf-8"))
        except:
            temp_dict['RRooms'] = np.nan
        try:
            temp_dict['RService'] = int(review['Ratings']['Service'].encode("utf-8"))
        except:
            temp_dict['RService'] = np.nan
        try:
            temp_dict['RSleepQual'] = int(review['Ratings']['Sleep Quality'].encode("utf-8"))
        except:
            temp_dict['RSleepQual'] = np.nan
        try:
            temp_dict['RValue'] = int(review['Ratings']['Value'].encode("utf-8"))
        except:
            temp_dict['RValue'] = np.nan
        try:
            temp_dict['RCheckIn'] = int(review['Ratings']['Check in / front desk'].encode("utf-8"))
        except:
            temp_dict['RCheckIn'] = np.nan
        try:
            temp_dict['RBusinessService'] = int(review['Ratings']['Business service (e.g., internet access)']
                                                .encode("utf-8"))
        except:
            try:
                temp_dict['RBusinessService'] = int(review['Ratings']['Business service'].encode("utf-8"))
            except:
                temp_dict['RBusinessService'] = np.nan
        reviewsmaster.append(temp_dict)

In [4]:
# Convert from series into DataFrame.  
reviews_df = pd.DataFrame(reviewsmaster)

In [5]:
# Convert these to strings. 
reviews_df['Author'] = reviews_df['Author'].astype(str)
reviews_df['Date'] = reviews_df['Date'].astype(str)
reviews_df['HotelName'] = reviews_df['HotelName'].astype(str)
reviews_df['ReviewID'] = reviews_df['ReviewID'].astype(str)

# -1 is converted to NaN. All the ratings are float64.  
reviews_df = reviews_df.replace(to_replace={
        'RCheckIn': {-1 : np.nan}, 
        'RBusinessService': {-1 : np.nan},
        'RCleanliness': {-1 : np.nan},
        'RLocation': {-1 : np.nan},
        'RRooms': {-1 : np.nan},
        'RService': {-1 : np.nan},
        'RValue': {-1 : np.nan},
        })

#reviews_df.dtypes

In [6]:
# Extract the name of the hotel from the URL because there was more data there than the HotelName 
# field that was often blank.
tempHotelNameFix = []

for row in reviews_df['HotelNameURL']:
    trimmed = row.split('ShowUserReviews-', 1)[1]
    try:
        trimmed = trimmed.split('Reviews')[1]
    except:
        trimmed = trimmed
    trimmed = trimmed.split('.html', 1)[0]
    trim2 = re.split('\d', trimmed)[len(re.split('\d', trimmed)) - 1][1:]
    trim2 = trim2.replace("_"," ")
    tempHotelNameFix.append(trim2)
reviews_df['HotelNameFix'] = tempHotelNameFix

In [7]:
# If HotelName was blank, use name extracted from URL.
reviews_df['HotelName'] = np.where(reviews_df['HotelName'] == '', reviews_df['HotelNameFix'], 
                                   reviews_df['HotelName'])

In [8]:
# Drop URL and fixed name columns because we've used them.  
reviews_df = reviews_df.drop('HotelNameFix', 1)
reviews_df = reviews_df.drop('HotelNameURL', 1)

In [9]:
# 1b: Number of reviews per hotel:  
pd.crosstab(index=reviews_df["HotelName"], columns="count")

col_0,count
HotelName,Unnamed: 1_level_1
A Victory Inn & Suites Phoenix North,32
BEST WESTERN Airport Inn,45
BEST WESTERN Loyal Inn,113
BEST WESTERN Market Center,54
BEST WESTERN PLUS Executive Inn,137
BEST WESTERN PLUS InnSuites Phoenix Hotel & Suites,60
BEST WESTERN PLUS Pioneer Square Hotel,233
Balisandy Cottages-Kuta Bali,1
Christopher's Inn,93
Comfort Inn & Suites Seattle,36


In [10]:
pd.crosstab(index=reviews_df["RLocation"], columns="count", dropna=False)

col_0,count
RLocation,Unnamed: 1_level_1
1.0,52
2.0,74
3.0,206
4.0,430
5.0,953


In [11]:
# 1c: Across hotels, calculate and report statistics describing the distribution of the overall rating received 
# by the hotels.  
reviews_df.describe()

Unnamed: 0,RBusinessService,RCheckIn,RCleanliness,RLocation,ROverall,RRooms,RService,RSleepQual,RValue
count,483.0,735.0,2098.0,1715.0,887.0,1981.0,2097.0,805.0,2100.0
mean,3.534161,3.97551,3.978551,4.258309,3.83991,3.582029,3.927039,3.816149,3.854762
std,1.259146,1.173945,1.222511,1.027217,1.218401,1.251246,1.258907,1.279765,1.271729
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,3.0,3.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0
50%,4.0,4.0,4.0,5.0,4.0,4.0,4.0,4.0,4.0
75%,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [12]:
# 1d: Pickle and test pickling.  
reviews_df.to_pickle("reviews.pkl")
#reviews_pickle = pd.read_pickle("reviews.pkl")

<b>Part 2: Text Data for Perceptual Mapping</b><br>Now we must filter out stop-words, punctuation, and create a dictionary for each hotel's descriptors.<br> The end goal is a JSON file and a count of the unique words describing each hotel. 

In [13]:
# Acquire stop-words, Convert to unicode and strip out punctuation.  
remove_words = stop_words.get_stop_words('english')
remove_words_temp = []
for word in remove_words:
    remove_words_temp.append(re.sub(r'[^\w\s]|\d','',word.encode("utf-8")))
remove_words = remove_words_temp

In [14]:
hotWords = {}
json_files = glob.glob("data\\*.json")
for files in json_files:
    with open(files) as input_file:
        jsondat=json.load(input_file)

    review_text = ""
    for content in jsondat['Reviews']:
        review_text += content['Content'].encode("utf-8")
    review_text = review_text.replace("."," ")
    
    # Strip out punctuation and convert the words to all lowercase (to make matching easier to the stop-words).  
    review_text_split = review_text.split(" ")
    rts_temp = []
    for word in review_text_split:
        rts_temp.append(re.sub(r'[^\w\s]|\d','',stem(word.lower())))
    review_text_split = rts_temp

    hotel_descriptors = []
    for rts_words in review_text_split:
        if ((rts_words in remove_words_temp) == False):
            if rts_words != "":
                hotel_descriptors.append(rts_words)
                
    descriptor_count = [[x,hotel_descriptors.count(x)] for x in set(hotel_descriptors)]

    entry2 = {jsondat['HotelInfo']['HotelID']: descriptor_count}
    hotWords.update(entry2)

In [15]:
hotWords

{u'100506': [['hampton', 1],
  ['forget', 1],
  ['skip', 1],
  ['lack', 1],
  ['rod', 1],
  ['catch', 2],
  ['niec', 1],
  ['sleep', 15],
  ['ice', 3],
  ['go', 14],
  ['follow', 1],
  ['chair', 1],
  ['hate', 1],
  ['carpet', 14],
  ['kitchenett', 1],
  ['airfar', 1],
  ['decid', 3],
  ['lean', 1],
  ['secur', 1],
  ['million', 1],
  ['monaco', 1],
  ['tv', 11],
  ['embarras', 1],
  ['elsewher', 1],
  ['disgusting', 1],
  ['stores', 1],
  ['finally', 1],
  ['th', 9],
  ['late', 2],
  ['larg', 1],
  ['pride', 1],
  ['worth', 1],
  ['sent', 1],
  ['winfrey', 1],
  ['sound', 4],
  ['file', 1],
  ['woman', 2],
  ['everi', 5],
  ['risk', 1],
  ['far', 3],
  ['desirable', 2],
  ['checkin', 1],
  ['bedrooms', 1],
  ['account', 1],
  ['choice', 1],
  ['worst', 2],
  ['updates', 1],
  ['fall', 5],
  ['veri', 28],
  ['luxury', 2],
  ['difference', 1],
  ['strang', 1],
  ['minute', 1],
  ['cool', 2],
  ['tri', 5],
  ['disgrac', 1],
  ['hour', 6],
  ['fantast', 1],
  ['level', 2],
  ['joel', 1],


In [16]:
# Export our hotWords to a JSON file and then check that it was successful.  
with open('hotwords.txt', 'w') as outfile:  
    json.dump(hotWords, outfile)
    
with open('hotwords.txt') as json_file:  
    data_check = json.load(json_file)
#data_check   

In [17]:
# Creates a master list of all words to compare to.  
i = 0
hotelswords = []
masterwords = []
while i < len(hotWords):
    #hotelswords = hotWords.values()[i]
    masterwords += zip(*(hotWords.values()[i]))[0]
    i=i+1
    
# Compare each word to the number of times it appears in the list. 1 is the flag, since the 1 is itself.
uniquecollection = []
i = 0
hotelswords = []
finalwordstemp = {}
finalwords = {}
while i < len(hotWords):
    hotelswords = zip(*hotWords.values()[i])[0]
    counter = 0
    for h_words in hotelswords:
        if(masterwords.count(h_words) == 1):
            counter = counter + 1
    finalwordstemp = {hotWords.keys()[i]: counter}
    finalwords.update(finalwordstemp)
    i=i+1

In [18]:
# Displays the number of unique words by hotelID.  
finalwords

{u'100506': 101,
 u'1217974': 5,
 u'150849': 5080,
 u'214680': 503,
 u'240124': 127,
 u'2515575': 225,
 u'287670': 534,
 u'550994': 488,
 u'655424': 1,
 u'677703': 3,
 u'72572': 511,
 u'72579': 222,
 u'72586': 357,
 u'72598': 86,
 u'73393': 240,
 u'73644': 113,
 u'73706': 53,
 u'73712': 48,
 u'73718': 123,
 u'73727': 124,
 u'73739': 151,
 u'73743': 85,
 u'73751': 35,
 u'73757': 62,
 u'73760': 88,
 u'73768': 43}

In [19]:
finalwords

{u'100506': 101,
 u'1217974': 5,
 u'150849': 5080,
 u'214680': 503,
 u'240124': 127,
 u'2515575': 225,
 u'287670': 534,
 u'550994': 488,
 u'655424': 1,
 u'677703': 3,
 u'72572': 511,
 u'72579': 222,
 u'72586': 357,
 u'72598': 86,
 u'73393': 240,
 u'73644': 113,
 u'73706': 53,
 u'73712': 48,
 u'73718': 123,
 u'73727': 124,
 u'73739': 151,
 u'73743': 85,
 u'73751': 35,
 u'73757': 62,
 u'73760': 88,
 u'73768': 43}

In [21]:
dict((a,b) for a,b in finalwords.items() if b<100)

{u'1217974': 5,
 u'655424': 1,
 u'677703': 3,
 u'72598': 86,
 u'73706': 53,
 u'73712': 48,
 u'73743': 85,
 u'73751': 35,
 u'73757': 62,
 u'73760': 88,
 u'73768': 43}