# Scraping Hotel Ratings on Tripadvisor

In this project we will practice web scraping. Let's get some basic information for each hotel in Boston.

On each hotel page, scrape the Traverler ratings.

![Information to be scraped](traveler_ratings.png)

Save the data in "traveler_ratings.csv" in the following format:

hotel_name, rating, count

**To receive credit, you must commit traveler_ratings.csv to Github.**

In [4]:
from bs4 import BeautifulSoup
import sys
import time
import os
import codecs
import json
import requests

base_url = "http://www.tripadvisor.com"

def tourism_page(city, state):


    # EXAMPLE: http://www.tripadvisor.com/Boston

    url = base_url+ "/"+ city
    
    print("URL TO REQUEST: %s \n" % url)
   
    response = requests.get(url);

    html = response.text.encode('utf-8')
   
    
    with open(os.path.join('data/', city + '-search-page.json'), "wb") as h:
        h.write(html)

    soup = BeautifulSoup(html, 'html.parser')
    li = soup.find("link", {"hreflang": "en"})
    return li['href']

In [5]:

def get_boston_page(tourism_url):
    

    url = tourism_url

   
    response = requests.get(url)
   
    html = response.text.encode('utf-8')

    
    with open(os.path.join('data/', 'Boston' + '-tourism-page.html'), "wb") as h:
        h.write(html)


    
    soup = BeautifulSoup(html, 'html.parser')

    li = soup.find("li", {"class": "hotels twoLines"})
    city_url = li.find('a', href = True)
    print("CITY PAGE URL: %s" % city_url['href'])

    return city_url['href']

In [6]:

def get_hotel_page(city_url, count):
    

    url = base_url + city_url
    time.sleep(4)
    
    response = requests.get(url)
    html = response.text.encode('utf-8')
    
    with open(os.path.join('data/', 'Boston' + '-hotelist-' + str(count) + '.html'), "wb") as h:
        h.write(html)
    return html

In [7]:
def get_traveler_rating(hotel_url):
    url = base_url + hotel_url
    hotel_urll.append(url)
    response = requests.get(url)
    html = response.text.encode('utf-8')
    soup = BeautifulSoup(html,'html.parser')
    x = soup.find('label',{'for':'taplc_prodp13n_hr_sur_review_filter_controls_0_filterRating_5'}).findAll('span')[2].find(text=True)
    y = soup.find('label',{'for':'taplc_prodp13n_hr_sur_review_filter_controls_0_filterRating_4'}).findAll('span')[2].find(text=True)
    z = soup.find('label',{'for':'taplc_prodp13n_hr_sur_review_filter_controls_0_filterRating_3'}).findAll('span')[2].find(text=True)
    a = soup.find('label',{'for':'taplc_prodp13n_hr_sur_review_filter_controls_0_filterRating_2'}).findAll('span')[2].find(text=True)
    b = soup.find('label',{'for':'taplc_prodp13n_hr_sur_review_filter_controls_0_filterRating_1'}).findAll('span')[2].find(text=True)
    excellent.append(x)
    very_good.append(y)
    average.append(z)
    poor.append(a)
    terrible.append(b)
    

In [8]:

def parse_hotellist_page(html):
 

    soup = BeautifulSoup(html, 'html.parser')


    
    hotel_info = soup.select('div.listing.easyClear.p13n_imperfect')
    for hotel_in in hotel_info:
        name = hotel_in.find('div', {'class' :'listing_title'}).find(text=True)
        hotel_name.append(name)
        href = hotel_in.find('a',{'class':'property_title'},href=True)
        get_traveler_rating(href['href'])
        try:
            rating = hotel_in.find('div', {'class' :'listing_rating'})
        
            reviews = rating.select('span.more.review_count')[0].find(text=True)
            
            stars = hotel_in.find("img", {"class" : "sprite-ratings"})
            
        except Exception as e:
            print("No ratings for this hotel")
            reviews = "N/A"
            stars = 'N/A'

        if stars != 'N/A':
            
            stars = stars['alt'].split()[0]
        print("HOTEL NAME: %s" % name)
        print("HOTEL REVIEWS: %s" % reviews)
        print("HOTEL STAR RATING: %s \n" % stars)

   
    div = soup.select("div.unified.pagination.standard_pagination")[0]
    # check if last page
    if len(div.select('span.nav.next.ui_button.disabled')) > 0:
        print("We reached last page")
        sys.exit()
   
    hrefers = div.findAll('a', href= True)
    for href in hrefers:
        if href.find(text = True) == 'Next':
            print("Next url is %s" % href['href'])
            return href['href']

In [9]:

current_dir = os.getcwd()
hotel_name = []
excellent = []
very_good = []
average = []
poor = []
terrible = []
hotel_urll = []
# Create data directory if not present
if not os.path.exists(os.path.join(current_dir, 'data/')):
    os.makedirs(os.path.join(current_dir, 'data/'))
    

tourism_url = 'https://www.tripadvisor.com/Tourism-g60745-Boston_Massachusetts-Vacations.html'
#
city_url = get_city_page(tourism_url)
c=0
while(True):
    c+=1
    html = get_hotellist_page(city_url, c)
    city_url = parse_hotellist_page(html)

CITY PAGE URL: /Hotels-g60745-Boston_Massachusetts-Hotels.html
HOTEL NAME: Marriott Vacation Club Pulse at Custom House, Boston
HOTEL REVIEWS: 738 Reviews
HOTEL STAR RATING: 4.5 

HOTEL NAME: Boston Harbor Hotel
HOTEL REVIEWS: 1,725 Reviews
HOTEL STAR RATING: 4.5 

HOTEL NAME: Seaport Boston Hotel
HOTEL REVIEWS: 4,190 Reviews
HOTEL STAR RATING: 4.5 

HOTEL NAME: Four Seasons Hotel Boston
HOTEL REVIEWS: 1,488 Reviews
HOTEL STAR RATING: 4.5 

HOTEL NAME: Lenox Hotel
HOTEL REVIEWS: 3,134 Reviews
HOTEL STAR RATING: 4.5 

HOTEL NAME: Courtyard Boston Copley Square
HOTEL REVIEWS: 1,566 Reviews
HOTEL STAR RATING: 4.5 

HOTEL NAME: InterContinental Boston
HOTEL REVIEWS: 3,313 Reviews
HOTEL STAR RATING: 4.5 

HOTEL NAME: Hotel Commonwealth
HOTEL REVIEWS: 3,983 Reviews
HOTEL STAR RATING: 4.5 

HOTEL NAME: Mandarin Oriental, Boston
HOTEL REVIEWS: 617 Reviews
HOTEL STAR RATING: 4.5 

HOTEL NAME: Residence Inn Boston Back Bay/Fenway
HOTEL REVIEWS: 1,262 Reviews
HOTEL STAR RATING: 4.5 

HOTEL NAME: 

SystemExit: 

To exit: use 'exit', 'quit', or Ctrl-D.


In [10]:
import pandas as pd
from pandas import Series, DataFrame
df = pd.DataFrame()
df['Name']=hotel_name
df['Excellent']=excellent
df['Very_Good']=very_good
df['Average']=average
df['Poor']=poor
df['Terrible']=terrible


In [11]:
df.to_csv('traveler_ratings.csv')
new_df = pd.read_csv('traveler_ratings.csv')


-------

Next, scrape all the reviews of each hotel for the star ratings of the following attributes: Value, Location, Sleep Quality, Rooms, Cleanliness, Service. Note that some reviews may not have attribute ratings and some may only have some of the attributes. **(25 pts)**

![Information to be scraped](attribute_ratings.png)

Save the data in "attribute_ratings.csv" in the following format:

hotel_name, review_id, attribute, star_value

**To receive credit, you must commit attribute_ratings.csv to Github.**

In [48]:
#find the link of first review of each hotel
showreview = []
for i in range(len(hotel_urll)):
    response = requests.get(hotel_urll[i])
    html = response.text.encode('utf-8')
    soup = BeautifulSoup(html,'html.parser')
    hotel_href = soup.find('div', {'class' :'innerBubble'})
    hotel_href = hotel_href.find('a')
    hotel_href = base_url + hotel_href.get('href')
    showreview.append(hotel_href)

In [50]:
#showreview
data = {}
data = {'name':hotel_name,'showreview':showreview}
df = pd.DataFrame.from_dict(data,orient = 'columns')
df.to_csv('showreview_link.csv')

In [49]:
import pandas as pd
from pandas import Series, DataFrame
new_df = pd.read_csv('showreview_link.csv')
new_df

Unnamed: 0.1,Unnamed: 0,name,showreview
0,0,"Marriott Vacation Club Pulse at Custom House, ...",http://www.tripadvisor.com/ShowUserReviews-g60...
1,1,Boston Harbor Hotel,http://www.tripadvisor.com/ShowUserReviews-g60...
2,2,Seaport Boston Hotel,http://www.tripadvisor.com/ShowUserReviews-g60...
3,3,Four Seasons Hotel Boston,http://www.tripadvisor.com/ShowUserReviews-g60...
4,4,Lenox Hotel,http://www.tripadvisor.com/ShowUserReviews-g60...
5,5,Courtyard Boston Copley Square,http://www.tripadvisor.com/ShowUserReviews-g60...
6,6,InterContinental Boston,http://www.tripadvisor.com/ShowUserReviews-g60...
7,7,Hotel Commonwealth,http://www.tripadvisor.com/ShowUserReviews-g60...
8,8,"Mandarin Oriental, Boston",http://www.tripadvisor.com/ShowUserReviews-g60...
9,9,Residence Inn Boston Back Bay/Fenway,http://www.tripadvisor.com/ShowUserReviews-g60...


In [51]:
base_url = "http://www.tripadvisor.com"
response = requests.get(new_df.iloc[43,2])
html = response.text.encode('utf-8')
soup = BeautifulSoup(html,'html.parser')
hotel_href = soup.findAll('div', {'class' :'reviewSelector'})


In [56]:
len(hotel_href)

7

In [7]:
#find value in one review
value = hotel_href[0].findAll('div', {'class' :'recommend-description'})
value
value_onereview = []
for i in range(len(value)):
    a = value[i]
    b = str(a)
    c = b.split('>')
    d = c[1].split('<')
    e = d[0]
    value_onereview.append(e)
value_onereview

['Location', 'Rooms', 'Service']

In [45]:
#find alt in one review
alt = hotel_href[5].findAll('span', {'class' :'rate sprite-rating_ss rating_ss'})
alt_onereview = []
for i in range(len(alt)):
    altt = alt[i]
    altt.find_all('img')
    temp = str(altt.find_all('img'))
    temp.index('alt')
    alt_num = temp[temp.index('alt')+5]
    if temp.index('alt')+6 == '.':
        alt_num = int(alt_num)+0.5
    alt_onereview.append(int(alt_num))
alt_onereview

[5, 5, 5]

In [52]:
import pandas as pd
from pandas import Series, DataFrame
df = pd.DataFrame()
df['review_id'] = []
df['hotel_name'] = []
df['Location'] = []
df['Sleep Quality'] = []
df['Rooms'] = []
df['Service']  = []
df['Value'] = []
df['Cleanliness'] = []
df

Unnamed: 0,review_id,hotel_name,Location,Sleep Quality,Rooms,Service,Value,Cleanliness


In [53]:
#get review information for every review
utl_review_id = []
utl_hotel_name = []
utl_Location = []
utl_Sleep_Quality = []
utl_Rooms = []
utl_Service = []
utl_Value = []
utl_Cleanliness = []
review_id = []
while(True):    
    for j in range(len(hotel_href)):
        review_id.append(hotel_href[j].get('id'))
        #find value in one review
        value = hotel_href[j].findAll('div', {'class' :'recommend-description'})
        #print(value)
        value_onereview = []
        for i in range(len(value)):
            a = value[i]
            b = str(a)
            c = b.split('>')
            d = c[1].split('<')
            e = d[0]
            value_onereview.append(e)
       
        alt = hotel_href[j].findAll('span', {'class' :'rate sprite-rating_ss rating_ss'})
        alt_onereview = []
        for i in range(len(alt)):
            alta = alt[i]
            alta.find_all('img')
            temp = str(alta.find_all('img'))
            temp.index('alt')
            alt_num = temp[temp.index('alt')+5]
            ab = float(alt_num)
            if temp[temp.index('alt')+6] == '.':
                ab = ab + 0.5
            alt_onereview.append(ab)
      
        count = 0
        utl_review_id.append(review_id[j]) 
        utl_hotel_name.append(new_df.iloc[43,1])
        x = True
        y = True
        z = True
        a = True
        b = True
        c = True
        for i in range(len(value_onereview)):
            if ('Location' == value_onereview[i]) & x:
                utl_Location.append(alt_onereview[count])
                count+=1
                x = False
                continue
            if ('Sleep Quality' == value_onereview[i]) & y:
                utl_Sleep_Quality.append(alt_onereview[count])
                count+=1
                y = False
                continue
            if ('Rooms' == value_onereview[i]) & z:
                utl_Rooms.append(alt_onereview[count])
                count+=1
                z = False
                continue
            if ('Service' == value_onereview[i]) & a:
                utl_Service.append(alt_onereview[count])
                count+=1
                a = False
                continue
            if ('Value' == value_onereview[i]) & b:
                utl_Value.append(alt_onereview[count])
                count+=1
                b = False
                continue
            if ('Cleanliness' == value_onereview[i]) & c:
                utl_Cleanliness.append(alt_onereview[count])
                count+=1
                c = False
                continue
        if x:
            utl_Location.append(0)
        if y:
            utl_Sleep_Quality.append(0)
        if z:
            utl_Rooms.append(0)
        if a:
            utl_Service.append(0)
        if b:
            utl_Value.append(0)
        if c:
            utl_Cleanliness.append(0)
    hotel_href_findnext = soup.findAll('a', {'class' :'nav next rndBtn ui_button primary taLnk'},href = True)
    if soup.findAll('span',{'class':'nav next disabled'}):
        break
    for href in hotel_href_findnext:
        if href.find(text = True) == 'Next':
            href['href']
        else:
            break
    if href.find(text = True) != 'Next':
        break
    next_url = base_url + href['href']
    next_url
    response = requests.get(next_url)
    html = response.text.encode('utf-8')
    soup = BeautifulSoup(html,'html.parser')
    hotel_href_next = soup.findAll('div', {'class' :'reviewSelector'})
    hotel_href = hotel_href_next

In [55]:
df['review_id'] = utl_review_id
df['hotel_name'] = utl_hotel_name
df['Location'] = utl_Location
df['Sleep Quality'] = utl_Sleep_Quality
df['Rooms'] = utl_Rooms
df['Service']  = utl_Service
df['Value'] = utl_Value
df['Cleanliness'] = utl_Cleanliness


In [42]:
import pandas as pd
from pandas import Series, DataFrame
name = []
for i in range(82):
    name.append('attribute_ratings_hotel_{0}'.format(i+1) + '.csv')
name
dff = []
for i in range(82):
    df = pd.read_csv(name[i])
    dff.append(df)


In [44]:

for i in range(1,82):
    dff[0] = dff[0].append(dff[i],ignore_index=True)
frames = dff[0]

In [48]:
print(frames.shape)
frames.to_csv('attribute_ratings.csv')

(117383, 9)

In [58]:
new_df = pd.read_csv('attribute_ratings.csv')
new_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,review_id,hotel_name,Location,Sleep Quality,Rooms,Service,Value,Cleanliness
0,0,0,review_433426851,"Marriott Vacation Club Pulse at Custom House, ...",5.0,0.0,5.0,5.0,0.0,0.0
1,1,1,review_433022432,"Marriott Vacation Club Pulse at Custom House, ...",0.0,5.0,0.0,5.0,5.0,0.0
2,2,2,review_431564898,"Marriott Vacation Club Pulse at Custom House, ...",0.0,5.0,0.0,5.0,0.0,5.0
3,3,3,review_428837081,"Marriott Vacation Club Pulse at Custom House, ...",0.0,0.0,0.0,0.0,0.0,0.0
4,4,4,review_428185627,"Marriott Vacation Club Pulse at Custom House, ...",5.0,0.0,5.0,5.0,0.0,0.0
5,5,5,review_427012974,"Marriott Vacation Club Pulse at Custom House, ...",5.0,5.0,0.0,5.0,0.0,0.0
6,6,6,review_426107222,"Marriott Vacation Club Pulse at Custom House, ...",0.0,0.0,0.0,0.0,0.0,0.0
7,7,7,review_433426851,"Marriott Vacation Club Pulse at Custom House, ...",0.0,0.0,5.0,5.0,5.0,0.0
8,8,8,review_433022432,"Marriott Vacation Club Pulse at Custom House, ...",0.0,0.0,5.0,5.0,4.0,0.0
9,9,9,review_431564898,"Marriott Vacation Club Pulse at Custom House, ...",0.0,0.0,0.0,0.0,0.0,0.0
