(In order to load the stylesheet of this notebook, execute the last code cell in this notebook)

# Analyzing Hotel Ratings on Tripadvisor

In this homework we will focus on practicing two techniques: web scraping and regression. For the first part, we will get some basic information for each hotel in Boston. Then, we will fit a regression model on this information and try to analyze it.

**Task 1 (30 pts)**

We will scrape the data using Beautiful Soup. For each hotel that our search returns, we will get the information below.

![Information to be scraped](hotel_info.png)

Of course, feel free to collect even more data if you want. 

In [47]:
#!/usr/bin/python
# -*- coding: utf-8 -*-

from BeautifulSoup import BeautifulSoup
import sys
import time
import os
import logging
import argparse
import requests
import codecs
import json
import collections
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.keys import Keys


base_url = "http://www.tripadvisor.com"
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.76 Safari/537.36"

""" STEP 1  """
def get_tourism_page(city, state):
    """ 
        Return the json containing the
        URL of the tourism city page
    """

    # EXAMPLE: http://www.tripadvisor.com/TypeAheadJson?query=boston%20massachusetts&action=API
    #          http://www.tripadvisor.com//TypeAheadJson?query=san%20francisco%20california&type=GEO&action=API
    url = "%s/TypeAheadJson?query=%s%%20%s&action=API" % (base_url, "%20".join(city.split()), state)
    print "URL TO REQUEST:", url
    
    # Given the url, request the HTML page
    headers = { 'User-Agent' : user_agent }
    response = requests.get(url, headers=headers)
    html = response.text.encode('utf-8')

    # Save to file
    #with open('search-page.json', "w") as h:
        #h.write(html)

    # Parse json to get url
    js = json.loads(html)
    results = js['results']
    print "RESULTS: ", results[0]
    urls = results[0]['urls'][0]

    # get tourism page url
    tourism_url = urls['url']
    return tourism_url

""" STEP 2  """
def get_city_page(tourism_url):
    """ 
        Get the URL of the hotels of the city
        using the URL returned by the function
        get_tourism_page()
    """

    url = base_url + tourism_url

    # Given the url, request the HTML page
    headers = { 'User-Agent' : user_agent }
    response = requests.get(url, headers=headers)
    html = response.text.encode('utf-8')
    
    # Save to file	
    #with open('-tourism-page.html', "w") as h:
        #h.write(html)


    # Use BeautifulSoup to extract the url for the list of hotels in 
    # the city and state we are interested in.
    # For exampel in this case we need to  
    #<li class="hotels twoLines">
    #<a href="/Hotels-g60745-Boston_Massachusetts-Hotels.html" data-trk="hotels_nav"
    soup = BeautifulSoup(html)
    li = soup.find("li", {"class": "hotels twoLines"})
    city_url = li.find('a', href = True)
    print "CITY PAGE URL:", city_url['href']
    return city_url['href']


""" STEP 3 """
def get_hotellist_page(city_url, count):
    """ Get the hotel list page given the url returned by
        get_city_page(). Return the html after saving
        it to the datadir 
    """
    print "Hotel page", count
    url = base_url + city_url
    # Sleep 2 sec before starting a new http request
    time.sleep(2)
    # Request page
    headers = { 'User-Agent' : user_agent }
    response = requests.get(url, headers=headers)
    html = response.text.encode('utf-8')
    # Save the 
    #with open('boston-hotelist-' + str(count) + '.html', "w") as h:
        #h.write(html)
    return html

""" STEP 4 """
def parse_hotellist_page(html):
    """ 
    Parse the html pages returned by get_hotellist_page().
    Return the next url page to scrape (a city can have
    more than one page of hotels) if there is, else exit
    the script.
    """
    
    soup = BeautifulSoup(html)
# Extract hotel name, star rating and number of reviews
    hotel_boxes = soup.findAll('div', {'class' :'listing easyClear  p13n_imperfect '})
    for hotel_box in hotel_boxes:
        name = hotel_box.find('div', {'class' :'listing_title'}).find(text=True)
        try:
            rating = hotel_box.find('div', {'class' :'listing_rating'})
            reviews = rating.find('span', {'class' :'more'}).find(text=True)
            stars = hotel_box.find("img", {"class" : "sprite-ratings"})
        except Exception, e:
            log.error("No ratings for this hotel")
            reviews = "N/A"
            stars = 'N/A'
        hotelref = hotel_box.findAll('a', href= True)
        #print "go to ", hotelref[0]['href']," and get traveler ratings"
        ratingfile.write("++NEW HOTEL++ %s\n" % name)
        print '.',
        getTraverlerRating(hotelref[0]['href'])
        
        
        if stars != 'N/A':
            #log.info("Stars: %s" % stars['alt'].split()[0])
            stars = stars['alt'].split()[0]
        if name == "Omni Parker House":
            print "Found Omni Parker House. Scrape reviews"
            print "HOTEL NAME:", name
            print "HOTEL REVIEWS: ", reviews
            print "HOTEL STAR RATING:", stars
            omnihrefs = hotel_box.findAll('a', href= True)
            for omnihref in omnihrefs:
                #print omnihref, "######", omnihref['href']
                if omnihref.find(text = True) == 'Omni Parker House':
                    
                    pg = 0
                    #print "Review url is", omnihref['href']
                    print "page #", pg,
                    ret = scrapeReview(omnihref['href'], pg)
                    #ret = scrapeFaster(omnihref['href'])                    
                    while ret:
                        pg +=1
                        print "page #", pg, 
                        ret = scrapeReview(ret, pg)
                        
                    """
                    print "Review url begin:", omnihref['href'] 
                    scrapeReview(omnihref['href'])
                    """
                    #add this block in main flow to scrape everything
                #return

# # Get next URL page if exists, else exit
    div = soup.find("div", {"class" : "unified pagination standard_pagination"})
    # check if last page
    if div.find('span', {'class' : 'nav next ui_button disabled'}):
        print "We reached last page"
        return None
    # If it is not las page there must be the Next URL
    hrefs = div.findAll('a', href= True)
    for href in hrefs:
        if href.find(text = True) == 'Next':
            print "Next url is", href['href']
            return href['href']

"""Get Traverler's ratings for every hotel"""
def getTraverlerRating(hotelurl):
    """ Download the hotel page at base_url + `hotelurl` and write the
        five traveler-rating counts (Excellent ... Terrible), one per
        line, to the global `ratingfile`.
    """
    response = requests.get(base_url+hotelurl, headers={'User-Agent': user_agent})
    hotelsoup = BeautifulSoup(response.text.encode('utf-8'))

    # Drill down to the <li> rows of the rating histogram
    histogram = hotelsoup.findAll("div",{"class":"with_histogram"})[0]
    rating_column = histogram.findAll("div",{"class":"col rating "})[0]
    rows = rating_column.findAll("li")

    # One entry per histogram row, in page order: the line written to
    # ratingfile and the "for" attribute of the row's <label>
    row_specs = (
        ("Excellent:%s\n", "taplc_prodp13n_hr_sur_review_filter_controls_0_filterRating_5"),
        ("Very good: %s\n", "taplc_prodp13n_hr_sur_review_filter_controls_0_filterRating_4"),
        ("Average:%s\n", "taplc_prodp13n_hr_sur_review_filter_controls_0_filterRating_3"),
        ("Poor:%s\n", "taplc_prodp13n_hr_sur_review_filter_controls_0_filterRating_2"),
        ("Terrible:%s\n", "taplc_prodp13n_hr_sur_review_filter_controls_0_filterRating_1"),
    )
    for position, (line_format, label_for) in enumerate(row_specs):
        label = rows[position].findAll("label",{"for":label_for})[0]
        # The value written is the text of the label's third <span>
        ratingfile.write(line_format % label.findAll("span")[2].find(text=True))
    
    #sys.exit()
        
"""STEP 5: Go through each review"""   
"""
def scrapeFaster(url):
    driver.get(base_url+url)
    
    pagehtml = driver.page_source
    pgsoup = BeautifulSoup(pagehtml)
    try:
        nexturl = driver.find_element_by_link_text("More")
        print "More BUTTON", nexturl
    except NoSuchElementException:
        print "NO LINK"
        return
    nexturl.click() 
    time.sleep(0.2)
    
    print "page loaded"
"""    


def scrapeReview(reviewurl, pgnum):
    #return
    #print base_url+reviewurl
    debugfile.write("\nscrapeReview: url %s," % base_url+reviewurl)
    headers = { 'User-Agent' : user_agent }
    response = requests.get(base_url+reviewurl, headers=headers)
    #print response
    debugfile.write("scrapeReview: response %s\n" % response)
    html = response.text.encode('utf-8')   
    reviewsoup = BeautifulSoup(html) 
    
    revbox = reviewsoup.findAll("div", {"class":"reviewSelector   track_back"})
    olderrevbox = reviewsoup.findAll("div", {"class":"reviewSelector  "})
    oldestbox = reviewsoup.findAll("div", {"class":"reviewSelector  first_aph   track_back"})
    
    #if len(olderrevbox):
        #print "older reviews", len(olderrevbox)
    debugfile.write("scrapeReview: total reviews to be parsed %s\n" % str(len(revbox)+len(olderrevbox)+len(oldestbox)))
    print "(",len(revbox)+len(olderrevbox)+len(oldestbox),")|",
    pg = 1
    revbox += olderrevbox+oldestbox
    
    #click on more button and send expanded cells to getstars2 one-by-one?
    for r in revbox:
        reviews = r.findAll('a', href=True)        
        for rev in reviews:            
            thisrevurl = rev['href']
            #print thisrevurl
            #now make http request for review url and write values to a file
            getStars2(thisrevurl)
    """
    reviews = revbox[0].findAll('a', href=True)
    thisurl = reviews[0]['href']
    nextpageret = getStars(thisurl,0)
    """
    
    #nextpages = reviewsoup.findAll("a", {"class":"pageNum taLnk"})
    """
    print "next review page", nextpageret
    while nextpageret:        
        pg +=1
        print "Next review page #", pg        
        #nextpageret = getStars(nextpageret, pg)
    """
    nextpages = reviewsoup.findAll("a", {"class":"pageNum taLnk"})
    pgnum = min(pgnum,4)
    try:
        #print "\nnext page?", nextpages[pgnum]['href']
        debugfile.write("scrapeReview: next page url %s\n" % nextpages[pgnum]['href'])
        return nextpages[pgnum]['href']
    except IndexError:
        print "Done with all pages", pgnum
        return None

"""STEP 6 : Access individual review, parse ratings and store in a file"""
def getStars2(revurl):
    """ Download the single-review page at base_url + `revurl` and write
        the detailed ratings of its first review to the global
        `omnifile`, one "<review id>:<description>:<value>" line per
        rated item. Progress information goes to the global `debugfile`.

        Returns None. Pages without a review block, without a review in
        it, or without a rating list are skipped.
    """
    full_url = base_url + revurl
    # Format the complete url: "fmt % base_url+revurl" would apply % to
    # base_url alone and append revurl after the trailing comma
    debugfile.write("getStars2: url %s," % full_url)
    response = requests.get(full_url, headers={'User-Agent': user_agent})
    debugfile.write("getStars2: response %s\n" % response)
    reviewsoup = BeautifulSoup(response.text.encode('utf-8'))

    try:
        reviewblock = reviewsoup.findAll("div",{"class":"deckC"})[0]
        review = reviewblock.findAll("div",{"class":"  reviewSelector "})[0]
    except IndexError:
        # Not a review page (or no review on it): nothing to record
        return

    review_id = review['id']
    debugfile.write("getStars2: id: %s\t" % review_id)

    try:
        ratelist = review.findAll("div", {"class":"rating-list"})[0]
    except IndexError:
        # The review carries no rating list
        return

    for answer in ratelist.findAll("li",{"class":"recommend-answer"}):
        sprite = answer.findAll("img")
        description = answer.findAll("div",{"class":"recommend-description"})
        # Only the first character of the sprite's alt text is stored
        omnifile.write("%s:" % review_id)
        omnifile.write("%s:" % description[0].find(text=True))
        omnifile.write("%s\n" % sprite[0]['alt'][0])
"""
def getStars(revurl, pg):
    print base_url+revurl
    driver.get(base_url+revurl)
    print "page #", pg
    #time.sleep(1)
    while True:
        #geturl = base_url+revurl
        #headers = { 'User-Agent' : user_agent }
        #response = requests.get(base_url+revurl, headers=headers)
        #print response
        #html = response.text.encode('utf-8')   
        
        html = driver.page_source
        reviewsoup = BeautifulSoup(html) 
        reviewblock = reviewsoup.findAll("div",{"class":"deckC"})
        reviewlist = reviewblock[0].findAll("div",{"class":"  reviewSelector "})
        #print reviewlist

        revnum = 0
        for review in reviewlist:        
            
            if pg and not revnum:
                revnum += 1
                continue
            
            #print review
            id = review['id']
            print id
            ratelist = review.findAll("div", {"class":"rating-list"})
            #print ratelist
            for i in xrange(len(ratelist)):

                stars = ratelist[i].findAll("li",{"class":"recommend-answer"})
                #inside stars, access sprite and description and write
                #print stars
                #ratedict = collections.defaultdict(list)
                for val in stars:
                    v = val.findAll("img")
                    k = val.findAll("div",{"class":"recommend-description"})
                    #print k,v
                    #print id, ":",k[0].find(text=True),":", v[0]['alt'][0]
                    #ratedict[k[0].find(text=True)] = v[0]['alt'][0]

                    #omnifile.write("%s:" % id)
                    #omnifile.write("%s:" % k[0].find(text=True))
                    #omnifile.write("%s\n" % v[0]['alt'][0])
            revnum += 1
            
        pg += 1
        try:
            nexturl = driver.find_element_by_link_text("Next")
            print "NEXT BUTTON", nexturl
        except NoSuchElementException:
            print "NO LINK"
            break
        nexturl.click() 
        body = driver.find_element_by_tag_name("body")
        body.send_keys(Keys.CONTROL + 't')
        def link_has_gone_stale():
            try:
                # poll the link with an arbitrary call
                nexturl.find_elements_by_id('doesnt-matter') 
                return False
            except StaleElementReferenceException:
                return True
        time.sleep(1)
        wait_for(link_has_gone_stale)
        
        nextpage = reviewsoup.findAll("a",{"class":"pageNum taLnk"})
        print "go to next", nextpage[0]['href']
        revurl = nextpage[0]['href']
    #return nextpage[0]['href']
    #return nextpage[0]['href']
    #omnidict[id] = ratedict
    #print id,":",k,":", v
    #print stars
def wait_for(condition_function):
    start_time = time.time()
    while time.time() < start_time + 10:
        if condition_function():
            return True
        else:
            time.sleep(0.1)
    raise Exception(
        'Timeout waiting for {}'.format(condition_function.__name__)
    )
"""
    
print 'get url'
omnifile = open("omni-scrapte-out.dat","w")
ratingfile = open("travel-rating.dat","w")
debugfile = open("debug.log","w")
tourism_url = get_tourism_page('boston', 'massachusetts')
#Get URL to obtaint the list of hotels in a specific city
city_url = get_city_page(tourism_url)
c=0
#driver = webdriver.Firefox()
#driver.wait = WebDriverWait(driver, 5)
while(True):
    c +=1
    html = get_hotellist_page(city_url,c)
    city_url = parse_hotellist_page(html)
    #if not city_url:
    break
omnifile.close()
ratingfile.close()
debugfile.close()
#driver.quit()

 get url
URL TO REQUEST: http://www.tripadvisor.com/TypeAheadJson?query=boston%20massachusetts&action=API
RESULTS:  {u'lookbackServlet': None, u'name': u'Boston, Massachusetts, United States', u'data_type': u'LOCATION', u'title': u'Destinations', u'url': u'/Tourism-g60745-Boston_Massachusetts-Vacations.html', u'value': 60745, u'coords': u'42.357277,-71.05834', u'urls': [{u'url': u'/Tourism-g60745-Boston_Massachusetts-Vacations.html', u'type': u'GEO', u'name': u'Boston Tourism', u'url_type': u'geo'}], u'scope': u'global', u'type': u'GEO'}
CITY PAGE URL: /Hotels-g60745-Boston_Massachusetts-Hotels.html
Hotel page 1


KeyboardInterrupt: 

**Task 2 (20 pts)**

Now, we will use regression to analyze this information. First, we will fit a linear regression model that predicts the average rating. For example, for the hotel above, the average rating is

$$ \text{AVG_SCORE} = \frac{1*31 + 2*33 + 3*98 + 4*504 + 5*1861}{2527}$$

Use the model to analyze the important factors that decide the $\text{AVG_SCORE}$.

**Task 3 (30 pts)**

Finally, we will use logistic regression to decide if a hotel is _excellent_ or not. We classify a hotel as _excellent_ if more than **60%** of its ratings are 5 stars. This is a binary attribute on which we can fit a logistic regression model. As before, use the model to analyze the data.

-------

In [None]:
# Code for setting the style of the notebook
from IPython.core.display import HTML
def css_styling():
    """Read the notebook stylesheet from ../../theme/custom.css and
    return its content wrapped in an IPython HTML object."""
    # The with-block closes the file as soon as it has been read
    with open("../../theme/custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)
css_styling()