# TripAdvisor Crawler and Parser
The Python notebook enables one to scrape hotels from TripAdvisor.com, leveraging the power of LXML, DOM and Python requests library. Further, this work happens to include prilimnary ETL work on the extracted hotel fields. This work shall be a predecessor to another ipynb notebook detailing the Data Cleaning and ETL process in full swing.

# Part I: Download and Collect Hotel URLs.
Download Main Webpage and collect the Hotel URLs recursivley from webpages there on..

In [3]:
# import libraries
import os
import requests
import json
import re
import sys
import lxml.html 
from lxml import html
from lxml import etree
import csv
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim

print('All required libraries have been successfully imported.')

All required libraries have been successfully imported.


In [4]:
def download_only_page(url, filename):
    """""
    Input: 
    -----
        url- URL of the webpage to be downloaded to local file system.
        filename- Desired filename to be associated with the downloaded webpage.
    
    Output:
    ------
        return nothing.
    
    Functionality:
    -------------
        Request for the webpage based on URL and download it to a local directory.
    """""
    # Request to download the html page
    r = requests.get(url)
    
    # Save downloaded page as a text file
    with open(filename, mode='wb') as file:     
        file.write(r.content)

    print('TripAdvisor- hotel HTML page downloaded successfully..')   

In [5]:
def download_html_page(url, count_file):
    """""
    Input: 
    -----
        url- URL of the webpage to be downloaded to local file system.
        count_file- file number or file count, used to create a filename to be stored in local directory.
    
    Output:
    ------
        return lxml.etree instance(html) and the updated count aka file number.
    
    Functionality:
    -------------
        - Request for the webpage based on URL and download it to a local directory.
        - Create lxml.etree instance of the webpage downloaded, inorder to help parse it.
    """""
     # Request to download the html page
    r = requests.get(url)
    count_file += 1
    # Save downloaded page as a text file
    filename = 'Project-Dataset/tripadvisor_canada_hotels'+ str(count_file) + '.txt'
    with open(filename, mode='wb') as file:     
        file.write(r.content)

    print('TripAdvisor- hotel HTML page downloaded successfully..')
    
    # Open saved file to parse it.
    with open(filename,'r') as fileread:
        html = etree.HTML(fileread.read())
    
    # Parse the HTML page as a tree structure
    result = etree.tostring(html, pretty_print=True, method="html")
    print('File read successfully..')
    
    return html, count_file

In [8]:
def read_parse_file(filename):
    """""
    Input: 
    -----
        filename- Filename used to store the desired webpage in local directory.
    
    Output:
    ------
        return lxml.etree instance(html) and the updated count/file number.
    
    Functionality:
    -------------
        - Read the desired file from local directory.
        - Create lxml.etree instance of the webpage downloaded inorder to help parse it.
    """""
    # Open saved file to parse it.
    with open(filename,'r') as fileread:
        html = etree.HTML(fileread.read())
    
    # Parse the HTML page as a tree structure
    result = etree.tostring(html, pretty_print=True, method="html")
    print('File read successfully..')
    
    return html

In [7]:
def get_hotel_url(count, html):
    """""
    Input: 
    -----
        count- Number of desired elements(here the 'divs' containing hotel URLs) present in the downloaded page.
        html- lxml.etree instance(html), used to parse the file.
    
    Output:
    ------
        return the extracted full hotel URL(hotel_url) from the base webpage.
    
    Functionality:
    -------------
        - Parse the downloaded file using html.etree and XPATH to obtain hotel URLs embedded in the mainpage.
        - This function considers only exact search results for a given category of hotels.
    """""
    hotel_url = []
    double_count = 0
    num_ad = 2 # to counter ads intbetween
    iter_val = 0
    
    # Parse html page to get the urls for nested webpages.
    for element in range(int(count)):
        iter_val = element + 1
        # Adjust numbering based on webpage structure.
        if iter_val <= 4 :
            iter_val = iter_val
        elif iter_val == int(count):
            iter_val += int(double_count * num_ad) 
        else:
            if (iter_val % 5 == 0) & (iter_val != int(count)):
                double_count += 1
                iter_val += int(double_count * num_ad)
            else:
                iter_val += int(double_count * num_ad)
                
        # Use xpath to retrieve the necessary content.
        XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]/div/div[' + str(iter_val) + ']/div/div[1]/@data-url')

        if str(XPATH)[2:-2] == '':
            XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]/div[' + str(iter_val) + ']/div/div[1]/@data-url')
        
        if str(XPATH)[2:-2] == '':
            XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]/div/div[' + str(iter_val) + ']/div/div/div/div[1]/@data-url')
        
        hotel_url.append('https://www.tripadvisor.ca' + str(XPATH)[2:-2])
    
    return hotel_url

In [8]:
def get_hotel_url_related(count, html):
    """""
    Input: 
    -----
        count- Number of desired elements(here the 'divs' containing hotel URLs) present in the downloaded page.
        html- lxml.etree instance(html), used to parse the file.
    
    Output:
    ------
        return the extracted full hotel URL(hotel_url) from the base webpage.
    
    Functionality:
    -------------
        - Parse the downloaded file using html.etree and XPATH to obtain hotel URLs embedded in the mainpage.
        - This function considers only related search results (if exact results are nil, this will be called)
          for a given category of hotels.
    """""
    hotel_url = []
    double_count = 0
    num_ad = 2 # to counter ads intbetween
    iter_val = 0
    
    # Parse html page to get the urls for nested webpages.
    for element in range(int(count)):
        iter_val = element + 1
        # Adjust numbering based on webpage structure.
        if iter_val <= 4 :
            iter_val = iter_val
        elif iter_val == int(count):
            iter_val += int(double_count * num_ad) 
        else:
            if (iter_val % 5 == 0) & (iter_val != int(count)):
                double_count += 1
                iter_val += int(double_count * num_ad)
            else:
                iter_val += int(double_count * num_ad)
        value = iter_val + 1
    
        # Use xpath to retrieve the necessary content.
        XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_filtered_out_hotels_0"]/div/div[' + str(value) + ']/div/div[1]/@data-url')
        
        if str(XPATH)[2:-2] == '':
            XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_filtered_out_hotels_0"]/div[' + str(value) + ']/div/div[1]/@data-url')

        hotel_url.append('https://www.tripadvisor.ca' + str(XPATH)[2:-2])
    
    return hotel_url

In [9]:
def get_hotel_url_nomatches(count, html):
    """""
    Input: 
    -----
        count- Number of desired elements(here the 'divs' containing hotel URLs) present in the downloaded page.
        html- lxml.etree instance(html), used to parse the file.
    
    Output:
    ------
        return the extracted full hotel URL(hotel_url) from the base webpage.
    
    Functionality:
    -------------
        - Parse the downloaded file using html.etree and XPATH to obtain hotel URLs embedded in the mainpage.
        - This function considers only other search results (if exact results and related results for a page are nil, 
          this will be called) for a given category of hotels.
    """""
    hotel_url = []
    double_count = 0
    num_ad = 2 # to counter ads intbetween
    iter_val = 0
    
    # Parse html page to get the urls for nested webpages.
    for element in range(int(count)):
        iter_val = element + 1
        # Adjust numbering based on webpage structure.
        if iter_val <= 4 :
            iter_val = iter_val
        elif iter_val == int(count):
            iter_val += int(double_count * num_ad) 
        else:
            if (iter_val % 5 == 0) & (iter_val != int(count)):
                double_count += 1
                iter_val += int(double_count * num_ad)
            else:
                iter_val += int(double_count * num_ad)
    
        # Use xpath to retrieve the necessary content.
        XPATH = html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_ab_hotels_sponsored_0"]/div[' + str(iter_val) + ']/div/div[1]/@data-url')
        
        if str(XPATH)[2:-2] == '':
            XPATH =  html.xpath('//*[@id="taplc_hsx_hotel_list_lite_dusty_ab_hotels_sponsored_0"]/div[' + str(iter_val) + ']/div/div/div/div[1]/@data-url')

        hotel_url.append('https://www.tripadvisor.ca' + str(XPATH)[2:-2])
    
    return hotel_url

In [8]:
# Extract related content from the tree using XPath for the MainPage of TripAdvisor hotels canada.
count_file = 0

# Request html page
url = str('https://www.tripadvisor.ca/Hotels-g153339-Canada-Hotels.html')
html, count_file = download_html_page(url, count_file)

# Create xpath to access necessary content
XPATH_MAINPAGE = '//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]//div[@class="prw_rup prw_meta_hsx_responsive_listing ui_section listItem"]'
hotel_main_page = html.xpath(XPATH_MAINPAGE)

# Get count of elements of interest in html page
count = html.xpath('count(//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]//div[@class="prw_rup prw_meta_hsx_responsive_listing ui_section listItem"])')
# print(count)

# Get the parsed html etree
hotel_url_returned = get_hotel_url(count, html)
print('Content Extracted..')

TripAdvisor- hotel HTML page downloaded successfully..
File read successfully..
Content Extracted..


In [9]:
# Convert list to dataframe
hotel_url_df = pd.DataFrame(np.array(hotel_url_returned))
print(hotel_url_df[:30])
print('30 Hotel URLs retrieved..')

#  Write to csv file
hotel_url_df.to_csv('Project-Dataset/final-data/file_mainpage_url.txt', index=False)
print('Written on to file..')

                                                    0
0   https://www.tripadvisor.ca/Hotel_Review-g15499...
1   https://www.tripadvisor.ca/Hotel_Review-g15501...
2   https://www.tripadvisor.ca/Hotel_Review-g15494...
3   https://www.tripadvisor.ca/Hotel_Review-g15503...
4   https://www.tripadvisor.ca/Hotel_Review-g15491...
5   https://www.tripadvisor.ca/Hotel_Review-g15499...
6   https://www.tripadvisor.ca/Hotel_Review-g15494...
7   https://www.tripadvisor.ca/Hotel_Review-g15501...
8   https://www.tripadvisor.ca/Hotel_Review-g15503...
9   https://www.tripadvisor.ca/Hotel_Review-g15491...
10  https://www.tripadvisor.ca/Hotel_Review-g15502...
11  https://www.tripadvisor.ca/Hotel_Review-g15499...
12  https://www.tripadvisor.ca/Hotel_Review-g15494...
13  https://www.tripadvisor.ca/Hotel_Review-g15501...
14  https://www.tripadvisor.ca/Hotel_Review-g10226...
15  https://www.tripadvisor.ca/Hotel_Review-g15503...
16  https://www.tripadvisor.ca/Hotel_Review-g15494...
17  https://www.tripadvisor.

In [10]:
# Count number of pages needs to parsed further.
num_pages = int(str(html.xpath('//*[@id="taplc_hotels_leaf_geo_list_dusty_hotels_resp_0"]/div/div[21]/div/div/@data-numpages'))[2:-2])
base_pages_toscrape = num_pages
print('number of pages to scrape', num_pages)

number of pages to scrape 52


In [11]:
# Get the content of mainpage so as to make recursivepages-call for all other subsequent pages from it.
hotel_pages = []
value = 0
cut = 0

# Get count of elements of interest in html page
count_pages = html.xpath('//*[@id="taplc_main_pagination_bar_dusty_hotels_resp_0"]/div/div/div/span[2]/@onclick')
split_string = str(count_pages).split(',')
# print(split_string)
    
hotel_page = split_string[-1]
hotel_page = hotel_page[2:-5]
group = hotel_page.split('-')

In [12]:
# Create urls of inner webpages to make recursive calls
for iter_val in range(num_pages-1):
    hotel_page = ''
    for iter_val in range(len(group)):
        if iter_val == 2:
            value += 20
            group[2] = 'oa' + str(value)
        hotel_page += str(group[iter_val]) + str('-')
        if value >= 1080:
                cut = 1
                break
    if cut == 1:
        break
    hotel_page = hotel_page[:-1]
    #print(hotel_page)
    
    # Make url for embedded html pages.
    url = str('https://www.tripadvisor.ca') + str(hotel_page) + str('#LEAF_GEO_LIST')
    hotel_pages.append(str(url))
    # Get count of elements of interest in html page
    count = html.xpath('count(//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]//div[@class="prw_rup prw_meta_hsx_responsive_listing ui_section listItem"])')

    # Get the etree instance for parsing html page of interest.
    html, count_file = download_html_page(url, count_file)
    
    # Get the embedded urls in the current html page being parsed.
    hotel_url_returned = get_hotel_url(int(count), html)
    print('url: '+ str(url))
    #print(count)

TripAdvisor- hotel HTML page downloaded successfully..
File read successfully..
url: https://www.tripadvisor.ca/Hotels-g153339-oa20-Canada-Hotels.html#LEAF_GEO_LIST
TripAdvisor- hotel HTML page downloaded successfully..
File read successfully..
url: https://www.tripadvisor.ca/Hotels-g153339-oa40-Canada-Hotels.html#LEAF_GEO_LIST
TripAdvisor- hotel HTML page downloaded successfully..
File read successfully..
url: https://www.tripadvisor.ca/Hotels-g153339-oa60-Canada-Hotels.html#LEAF_GEO_LIST
TripAdvisor- hotel HTML page downloaded successfully..
File read successfully..
url: https://www.tripadvisor.ca/Hotels-g153339-oa80-Canada-Hotels.html#LEAF_GEO_LIST
TripAdvisor- hotel HTML page downloaded successfully..
File read successfully..
url: https://www.tripadvisor.ca/Hotels-g153339-oa100-Canada-Hotels.html#LEAF_GEO_LIST
TripAdvisor- hotel HTML page downloaded successfully..
File read successfully..
url: https://www.tripadvisor.ca/Hotels-g153339-oa120-Canada-Hotels.html#LEAF_GEO_LIST
TripAdvi

TripAdvisor- hotel HTML page downloaded successfully..
File read successfully..
url: https://www.tripadvisor.ca/Hotels-g153339-oa1020-Canada-Hotels.html#LEAF_GEO_LIST


In [13]:
# Write to csv file
hotels_url_df_new = pd.DataFrame(np.array(hotel_pages))
hotels_url_df_new.to_csv('Project-Dataset/file_HotelPage_url.txt', index=False)
print('Written on to file..')

Written on to file..


In [14]:
# Recursively Hit 2-52 pages downloaded to get the hotel urls.
pages_url = []

for iter_val in range(num_pages-2):
    page_number = iter_val + 2
    filename = 'Project-Dataset/tripadvisor_canada_hotels'+ str(page_number) + '.txt'
    print('filename:', filename)
    html = read_parse_file(filename)
    count = html.xpath('count(//*[@id="taplc_broad_geo_tiles_dusty_hotels_resp_0"]/ul/li)')
    print(count)
    for i in range(int(count)):
        i += 1
        call_pages_link = html.xpath('//*[@id="taplc_broad_geo_tiles_dusty_hotels_resp_0"]/ul/li[' + str(i) + ']/a/@href')
        pages_url.append(str(str('https://www.tripadvisor.ca') + str(call_pages_link)[2:-2]))   

filename: Project-Dataset/tripadvisor_canada_hotels2.txt
File read successfully..
20.0
filename: Project-Dataset/tripadvisor_canada_hotels3.txt
File read successfully..
20.0
filename: Project-Dataset/tripadvisor_canada_hotels4.txt
File read successfully..
20.0
filename: Project-Dataset/tripadvisor_canada_hotels5.txt
File read successfully..
20.0
filename: Project-Dataset/tripadvisor_canada_hotels6.txt
File read successfully..
20.0
filename: Project-Dataset/tripadvisor_canada_hotels7.txt
File read successfully..
20.0
filename: Project-Dataset/tripadvisor_canada_hotels8.txt
File read successfully..
20.0
filename: Project-Dataset/tripadvisor_canada_hotels9.txt
File read successfully..
20.0
filename: Project-Dataset/tripadvisor_canada_hotels10.txt
File read successfully..
20.0
filename: Project-Dataset/tripadvisor_canada_hotels11.txt
File read successfully..
20.0
filename: Project-Dataset/tripadvisor_canada_hotels12.txt
File read successfully..
20.0
filename: Project-Dataset/tripadvisor_ca

In [15]:
# Write to csv file
pages_url_df_new = pd.DataFrame(np.array(pages_url))
pages_url_df_new.to_csv('Project-Dataset/file_innerpages_url.txt', index=False)
print('Written on to file..')

Written on to file..


In [16]:
print(len(pages_url_df_new))
pages_url_df_new.head(5)

1000


Unnamed: 0,0
0,https://www.tripadvisor.ca/Hotels-g154918-Jasp...
1,https://www.tripadvisor.ca/Hotels-g181727-Canm...
2,https://www.tripadvisor.ca/Hotels-g155023-Char...
3,https://www.tripadvisor.ca/Hotels-g4041681-Kit...
4,https://www.tripadvisor.ca/Hotels-g154937-Pent...


In [17]:
length = len(pages_url_df_new)
print(length)

1000


In [1]:
# Download hotel html pages.
for iter_val in range(length):
    url = pages_url[iter_val]
    print(count_file)
    html,count_file = download_html_page(url, count_file)

In [2]:
# Initialize values
start_val = base_pages_toscrape + 1
hotel_urls = []
file_num = 1052
sum_urls = 30
max_urls_perpage = 30
cut = 0

# Hit pages to download all the embedded urls
for iter_val in range(length):
    value = 0
    count_file = start_val + iter_val
    # Create filename to parse the html page.
    filename = 'Project-Dataset/tripadvisor_canada_hotels'+ str(count_file) + '.txt'
    # print('filename:', filename)

    # Parse the html page and get etree instance.
    html = read_parse_file(filename)
    
    # Get count of elements of interest in the html page.
    count = html.xpath('count(//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]//div[@class="prw_rup prw_meta_hsx_responsive_listing ui_section listItem"])')
    if count > max_urls_perpage:
        count = max_urls_perpage
    print('count:',count)
    
    # Get number of pages to recursively hit to get hotel URLs.
    num_pages = str(html.xpath('//*[@id="taplc_main_pagination_bar_dusty_hotels_resp_0"]/div/div/div/@data-numpages'))[2:-2]
    if num_pages == '':
        num_pages = 1
    num_pages = int(num_pages)
    
    # Load urls from the list of mainpage URLS we had created previously.
    url = pages_url[iter_val]
    # print('url:',url,"\n")
    
    # Create xpath to get the URL links.
    path = html.xpath('//*[@id="taplc_main_pagination_bar_dusty_hotels_resp_0"]/div/div/div/a/@href')
    split_string = str(path).split(',')
    hotel_page = split_string[-1]
    hotel_page = hotel_page[2:-5]
    group = hotel_page.split('-')
    # print('path',path,"\n")
        
    for i in range(num_pages):
        i += 1
        cut = 0
        hotel_page = ''
        if i == 1:
            url = url
        else:
            for iter_val in range(len(group)):
                if iter_val == 2:
                    value += 30
                    group[2] = 'oa' + str(value)
                hotel_page += str(group[iter_val]) + str('-')
            hotel_page = hotel_page[:-1]
            # print(hotel_page)
     
            # Create URLs for the inner URLs obtained.
            url = str('https://www.tripadvisor.ca') + str(hotel_page)
            hotel_pages.append(str(url))
            
            # Download the inner pages.
            html, file_num = download_html_page(url, file_num)
            
            # Count the number of elements of interest in the html page.
            count = html.xpath('count(//*[@id="taplc_hsx_hotel_list_lite_dusty_hotels_combined_sponsored_0"]//div[@class="prw_rup prw_meta_hsx_responsive_listing ui_section listItem"])')
            print('count:',count)
        
        # If count returns 0 previously try an alternate xpath.
        if count == 0.0:
            count = html.xpath('count(//*[@id="taplc_hsx_hotel_list_lite_dusty_filtered_out_hotels_0"]//div[@class="prw_rup prw_meta_hsx_responsive_listing ui_section listItem"])')
            print('count-related:',count)
            cut = 1
             # If count returns 0 previously try an alternate xpath.
            if count == 0.0:
                count = html.xpath('count(//*[@id="taplc_hsx_hotel_list_lite_dusty_ab_hotels_sponsored_0"]//div[@class="prw_rup prw_meta_hsx_responsive_listing ui_section listItem"])')
                print('count-nomtaches:',count)
                cut = 2
        print('url: '+ str(url) + "\n")
         
        # Call the respective parse functions written for the xpath type element obtained.
        if cut == 0:
            temp = get_hotel_url(int(count), html)
        elif cut == 1:
            temp = get_hotel_url_related(int(count), html)
        elif cut == 2:
            temp = get_hotel_url_nomatches(int(count), html)
        url_list = str(temp)[1:-1]
        url_list = url_list.split(',')
        length_url_list = len(url_list)
         
        # Clean retrieved data to form proper tripadvisor structured URLS and add to list.
        for j in range(length_url_list):
            urls = str(url_list[j])
            
            if ((j == 0) & (str(urls)[1:-1]).startswith('https://www.tripadvisor.ca/Hotel')):
                urls = urls[0:]
                hotel_urls.append(str(urls)[1:-1])
                print('inner urls:',str(urls)[1:-1])
                sum_urls += 1
            elif str(urls)[2:-1] != 'https://www.tripadvisor.ca':
                if j == length_url_list:
                    urls = urls[:-1]
                    
                if (str(urls)[2:-1]).startswith('https://www.tripadvisor.ca/Hotel') | (str(urls)[1:-1]).startswith('https://www.tripadvisor.ca/Hotel'):
                    hotel_urls.append(str(urls)[2:-1])
                    print('inner urls:',str(urls)[2:-1])
                    sum_urls += 1

count_file = file_num

print('\nNumber of Hotels URLs retrieved in total', sum_urls)
print('\nNumber of Hotels URLs retrieved in this pass', len(hotel_urls))


Number of Hotels URLs retrieved in total 8996

Number of Hotels URLs retrieved in this pass 8966


In [22]:
# Convert the URL list to pandas dataframe.
urls_df = pd.DataFrame(np.array(hotel_urls))
urls_df.head()

#  Write to csv file- append the current retrieved URLs with the 30 URLs got already.
with open('Project-Dataset/final-data/file_mainpage_url.txt', 'a') as fd:
    urls_df.to_csv(fd, index=False, header=False)

In [26]:
# Recursively Hit 2-52 page links downloaded to get the hotel urls.
review_ratings = []
hotel_full_urllist = pd.read_csv("Project-Dataset/final-data/file_mainpage_url.txt")
print(len(hotel_full_urllist))
print('Hotel urls CSV file read successfully..')

8996
Hotel urls CSV file read successfully..


In [27]:
hotel_full_urllist.head(10)

Unnamed: 0,0
0,https://www.tripadvisor.ca/Hotel_Review-g15499...
1,https://www.tripadvisor.ca/Hotel_Review-g15501...
2,https://www.tripadvisor.ca/Hotel_Review-g15494...
3,https://www.tripadvisor.ca/Hotel_Review-g15503...
4,https://www.tripadvisor.ca/Hotel_Review-g15491...
5,https://www.tripadvisor.ca/Hotel_Review-g15499...
6,https://www.tripadvisor.ca/Hotel_Review-g15494...
7,https://www.tripadvisor.ca/Hotel_Review-g15501...
8,https://www.tripadvisor.ca/Hotel_Review-g15503...
9,https://www.tripadvisor.ca/Hotel_Review-g15491...


# Part II: Parse the downloaded hotel webpages to extract useful fields from them.
- Parse the hotel webpages sequentially to extract important fields such as user ratings, average rating, hotel name, location, etc,.
- Do basic ETL and cleaning for Null values such as dropping null values that cannot be retrieved through hits, aggregating values for fields for which a value can be obtained though it has null value post hits.
- De-duplicate the final dataframe obatined (both hotel information and reviews dataframe).

In [28]:
def get_overall_rating(html, XPATH_HOTELOVERALL_RATING, XPATH_OVERALLRATING_WORDS):
    """""
    Input: 
    -----
        html- lxml.etree instance(html), used to parse the file.
        XPATH_HOTELOVERALL_RATING- XPATH to be used to extract overall hotel rating.
        XPATH_OVERALLRATING_WORDS- XPATH to be used to extract overall hotel rating in words.
    
    Output:
    ------
        return extracted overall rating(overall_rating) and overallrating(hotel_rating_overall_word)
        in words fields.
    
    Functionality:
    -------------
        - Parse the downloaded file using html.etree and XPATH to obtain overall rating.
    """""
    overall_rating = str(html.xpath(XPATH_HOTELOVERALL_RATING))[2:-2]
    if overall_rating == '':
        overall_rating = str(html.xpath(XPATH_ALT_HOTELOVERALL_RATING))[2:-2]
    hotel_rating_overall_word = str(html.xpath(XPATH_OVERALLRATING_WORDS))[2:-2]
    
    return overall_rating, hotel_rating_overall_word

In [29]:
def get_address(html, XPATH_STADDRESS, XPATH_LOCALITY, XPATH_COUNTRY):
    """""
    Input: 
    -----
        html- lxml.etree instance(html), used to parse the file.
        XPATH_STADDRESS- XPATH to be used to extract street address of the hotel.
        XPATH_LOCALITY- XPATH to be used to extract locality of the hotel.
        XPATH_COUNTRY- XPATH to be used to extract country(location) information.
    
    Output:
    ------
        return street_name, locality_name, country_name, address_full, locality- corresponds
        to extracted streent name, cleaned locality name to obatin lat-lon, country, complete 
        address and original locality name respectively.
    
    Functionality:
    -------------
        - Parse the downloaded file using html.etree and XPATH to obtain address of the hotel.
    """""
    street = str(html.xpath(XPATH_STADDRESS)).split(',')
    street_name = str(street[0])[2:-1]
    locality = str(html.xpath(XPATH_LOCALITY)).split(',')
    
    if len(locality) > 6:
        try:
            locality_name = str(locality[2])[1:-8]
        except:
            locality_name = str(locality[2])[1:-1]
    else:
        try:
            locality_name = str(locality[0]+locality[1])[2:-8]
        except:
            locality_name = str(locality[0])[2:-1]
        
    country = str(html.xpath(XPATH_COUNTRY)).split(',')
    country_name = str(country[0])[2:-1]
    address_full = str(street_name + ' ' + locality_name)
    
    return street_name, locality_name, country_name, address_full, locality

In [30]:
def get_lat_long(address_full, locality):
    """""
    Input: 
    -----
        address_full- Complete address of the hotel, obtained after cleaning.
        locality- Locality of the hotel, obatined after cleaning.
    
    Output:
    ------
        return lat, lon- corresponds to the latitude and longitude of the hotel location.
    
    Functionality:
    -------------
        - Parse the downloaded file using html.etree and XPATH to obtain latitude, longitude
          information of the hotel.
        - If full address throws null, try to obatin just the locality's lat-lon.
        - If locality also throws null, try to obatin lat-lon of the province. 
        
    """""
    try:
        geolocator = Nominatim(user_agent="ITRS")
        location = geolocator.geocode(address_full)
        lat = location.latitude
        lon = location.longitude
        print((lat, lon))
    except:
        try:
            if len(locality) > 6:
                location = geolocator.geocode(str(locality[2])[1:-8])
            else:
                location = geolocator.geocode(str(locality[1])[1:-8])
            lat = location.latitude
            lon = location.longitude
            print((lat, lon))
        except:
            lat = 'nil'
            lon = 'nil'
            print("An exception occurred", (lat, lon))
    
    return lat, lon

In [31]:
def get_price(html, XPATH_PRICE):
    """""
    Input: 
    -----
        html- lxml.etree instance(html), used to parse the file.
        XPATH_PRICE- XPATH to be used to extract price for the hotel.
    
    Output:
    ------
        return price- price for the hotel. Consider the striked price(maximum) if available,
        else consider the best quoted price of all.
    
    Functionality:
    -------------
        - Parse the downloaded file using html.etree and XPATH to obtain price of the hotel.
    """""
    price = str(html.xpath(XPATH_PRICE))[4:-2]
    
    return price

In [32]:
def get_reviews(html, XPATH_REVIEW_NUMBER, XPATH_ALTREVIEW_NUMBER):
    """""
    Input: 
    -----
        html- lxml.etree instance(html), used to parse the file.
        XPATH_REVIEW_NUMBER- XPATH to obtain the total reviews present for the hotel.
        XPATH_ALTREVIEW_NUMBER- Alternate XPATH to obtain the total reviews present for the 
                                hotel.
    
    Output:
    ------
        return reviews- total number of reviews available for the hotel.
    
    Functionality:
    -------------
        - Parse the downloaded file using html.etree and XPATH to obtain total number of
          reviews present for the hotel.
    """""
    # Get total number of user reviews given for the hotel.
    reviews = str(html.xpath(XPATH_REVIEW_NUMBER))[2:-2]
    if reviews == '':
        reviews = str(html.xpath(XPATH_ALTREVIEW_NUMBER))[2:-2]
    reviews = re.findall("[0-9]", reviews)
    reviews = ''.join(reviews)
    
    # If total reviews is nil handle it.
    if reviews == '':
        reviews = 0 # need to handle condition to extract reviews
        user_rating = 'nil' 
        user_profile = 'nil' 
        user_name = 'nil' 
        review_date = 'nil'
        stay_date = 'nil'
        review_title = 'nil'
        review = 'nil'
    elif int(reviews) > 200:
        reviews = 200
    print(reviews)
        
    return reviews

In [3]:
# NOTE: This script will only download the first 5K hotel pages to local directory.
# Split urls to be hit into batches of 5k each.
length_df = int(len(hotel_full_urllist)/2)
print(length_df)
file_no = 0
batch = 'first'

# Run batch 1, from 1-4498
for iter_val in range(length_df):
    url = str(hotel_full_urllist.values[iter_val])[2:-2]
    #print(url)

    # Download the webpage
    file_no += 1
    filename = 'Project-Dataset/final-data/hotel_webpage'+ str(file_no) + '.txt'
    download_only_page(url, filename)
    
print('Batch 1- webpages download completed successfully')
file_num_value = file_no

4498
Batch 1- webpages download completed successfully


In [47]:
# NOTE: This script will only download the next/second 5K hotel pages to local directory.
# Run batch 2, from 4499-rest
batch = 'second'
file_no = length_df

for iter_val in range(length_df):
    file_count = length_df + iter_val
    url = str(hotel_full_urllist.values[file_count])[2:-2]
    #print(url)
    
    # Download the webpage
    file_no += 1
    filename = 'Project-Dataset/final-data/hotel_webpage'+ str(file_no) + '.txt'
    download_only_page(url, filename)
    
print('Batch 2- webpages download completed successfully')

Batch 2- webpages download completed successfully


### Master Script:
The following python piece of code (a script) helps to obtain all the fields mentioned above by recursively hitting inner URLs for obtaining user reviews,other user related information, hotel name, location and other hotel related information.

In [4]:
# This script will recursively hit hotel URLs and obtain all the information required from it.
# Initialize variables.
if batch == 'second':
    file_no = file_num_value - 1
    file_number = review_file_count
    hotel_count = hotel_left_count
else:
    file_no = 0
    file_number = 0
    hotel_count = 1

count = 3
loop_count = length_df
hotel_name = []
hotel_overall_rating = []
hotel_overall_rating_words = []
total_reviews = []
user_ratings = []
hotel_id = []
review_id = []
user_name = []
user_profile = []
user_review = []
user_reviewdate = []
user_staydate = []
hotel_price = []
address = []
lat_lon = []
amenities = []

# XPath for different hotel field to be retrieved.
XPATH_HOTELNAME = '//*[@id="HEADING"]/text()'
XPATH_ALT_HOTELOVERALL_RATING = '//*[@class="rating"]/span/text()'
XPATH_HOTELOVERALL_RATING = '//*[@class="hotels-hotel-review-about-with-photos-Reviews__overallRating--vElGA"]/text()'
XPATH_OVERALLRATING_WORDS = '//*[@class="hotels-hotel-review-about-with-photos-Reviews__ratingLabel--24XY2"]/text()'
XPATH_REVIEW_NUMBER = '//*[@class="hotels-hotel-review-about-with-photos-Reviews__seeAllReviews--3PpLR"]/text()'
XPATH_ALTREVIEW_NUMBER = '//*[@class="reviewCount "]/text()'
XPATH_ADDRESS = '//*[@class="public-business-listing-ContactInfo__nonWebLink--2rxPP"]/span/text()'
XPATH_STADDRESS = '//*[@class="street-address"]/text()'
XPATH_LOCALITY = '//*[@class="locality"]/text()'
XPATH_COUNTRY = '//*[@class="country-name"]/text()'
XPATH_PRICE = '//*[contains(@id, "bor_book_link_")]//div[contains(@class, "bb_price_text")]/text()'
XPATH_AMENITIES = '//*[@class="hotels-hotel-review-about-with-photos-Amenity__name--2IUMR"]/text()'

# Main body of the script where everything goes- extraction of information.
for iter_val in range(loop_count): 
    file_count = file_no + iter_val
    file_no += 1
    url = str(hotel_full_urllist.values[file_count])[2:-2]
    print(url)
    
    filename = 'Project-Dataset/final-data/hotel_webpage'+ str(file_no) + '.txt'
    download_only_page(url, filename)
    html = read_parse_file(filename)
    hotel_id.append(hotel_count)
    
    # Get hotel name
    name = str(html.xpath(XPATH_HOTELNAME))[2:-2]
    hotel_name.append(name)
    print(name)
    
    # Get overall hotel rating in figures and words.
    overall_rating, hotel_rating_overall_word = get_overall_rating(html, XPATH_HOTELOVERALL_RATING, XPATH_OVERALLRATING_WORDS)
    hotel_overall_rating.append(overall_rating.strip())
    hotel_overall_rating_words.append(hotel_rating_overall_word.strip())
    
    # Get address of hotels
    street_name, locality_name, country, address_full, locality = get_address(html, XPATH_STADDRESS, XPATH_LOCALITY, XPATH_COUNTRY)
    address.append(address_full)
    print(address_full)
    
    # Get (Latitude,Longitude) values for hotels
    lat, lon = get_lat_long(address_full, locality)
    lat_lon.append((lat, lon))
    
    # Get amenities
    amenities.append(html.xpath(XPATH_AMENITIES))
    
    # Get price
    price = get_price(html, XPATH_PRICE)
    hotel_price.append(price)
        
    # Get reviews- user profile, user ratings, date/time
    reviews_total =  get_reviews(html, XPATH_REVIEW_NUMBER, XPATH_ALTREVIEW_NUMBER)
    total_reviews.append(reviews_total)
    
    # Calculate the number of pages to parse based on total reviews present.
    if int(reviews_total) > 5:
        num_pages_reviews = int(int(reviews_total)/5)
        num_reviews_perpage = 5
    elif int(reviews_total) <= 5:
        num_pages_reviews = 1
        num_reviews_perpage = int(reviews_total)
    
    for iter_var_outer in range(int(num_pages_reviews)):
        count = 3
        val = 0
        
        # Create URLs to call next-next pages recursively
        url_reviews = url.split('-')
        if (iter_var_outer-1) >= 0:
            val = int(5 + ((iter_var_outer - 1) * 5))
            var = url_reviews[3]
            url_reviews[3] = str(var+'-or'+str(val)) 
            url_review = '-'.join(url_reviews)
            print(url_review)
            
            # Download html page and save it as a text file
            file_number += 1
            filename = 'Project-Dataset/final-data/hotel_reviews_webpage'+ str(file_number) + '.txt'
            download_only_page(url_review, filename)
            
            # Get etree instance to parse its content.
            html = read_parse_file(filename)
    
        for iter_var in range(num_reviews_perpage):
            review_count = iter_val + 1
            if review_count % 5 == 0:
                count = 3
                
            user_rating = str(html.xpath('//*[@id="component_24"]/div[3]/div/div[' + str(count) + ']//span[contains(@class, "ui_bubble_rating")]/@class'))[26:-2]
            if user_rating != '':
                user_rating = int(user_rating)/10
                print(user_rating)
            else:
                user_rating = 'nil'
                
            user_profiles = str(html.xpath('//*[@id="component_24"]/div[3]/div/div[' + str(count) + ']//a[@class="ui_header_link social-member-MemberEventOnObjectBlock__member--23Flv"]/@href'))[2:-2]
            if user_profiles != '':
                user_profiles = 'https://www.tripadvisor.ca' + user_profiles
            else:
                user_profiles = 'nil'
                
            user_names = str(html.xpath('//*[@id="component_24"]/div[3]/div/div[' + str(count) + ']//a[@class="ui_header_link social-member-MemberEventOnObjectBlock__member--23Flv"]/text()'))[2:-2]
            
            review_date = str(html.xpath('//*[@id="component_24"]/div[3]/div/div[' + str(count) + ']//div[@class="social-member-MemberEventOnObjectBlock__event_type--1lSEx"]/span/text()'))[18:-2]
            if review_date.lower() == 'yesterday':
                review_date = 'Mar. 22'
            if review_date == '':
                review_date = 'nil'
                
            
            stay_date = str(html.xpath('//*[@id="component_24"]/div[3]/div/div[' + str(count) + ']//div[@class="hotels-review-list-parts-EventDate__event_date--CRXs4"]/span/text()'))[2:-2]
            if stay_date == '':
                stay_date = 'nil'
            
            review_title = str(html.xpath('//*[@id="component_24"]/div[3]/div/div[' + str(count) + ']//a[@class="hotels-hotel-review-community-content-review-list-parts-ReviewTitle__reviewTitleText--2VUye"]/span/span/text()'))[2:-2]
            if review_title == '':
                review_title = ''
                
            review = str(html.xpath('//*[@id="component_24"]/div[3]/div/div[' + str(count) + ']//q[@class="hotels-hotel-review-community-content-review-list-parts-ExpandableReview__reviewText--2OVqJ"]/span//text()'))[2:-2]
            if review == '':
                review = 'nil'
            
            review_id.append(hotel_count)    
            user_ratings.append(user_rating)
            user_name.append(user_names)
            user_profile.append(user_profiles)
            user_review.append(str(review_title + " " + review))
            user_reviewdate.append(review_date)
            user_staydate.append(stay_date)
            count += 1
    hotel_count += 1
            
review_file_count = file_number
review_left_count = review_count
hotel_left_count = hotel_count
print("Batch- Collection successfully done..\n")

Batch- Collection successfully done..



In [6]:
# For the first batch create dataframe and write the collected information into a fresh file.
if batch != 'second':
    # Create single dataframe of collected hotel information.
    hotel_info_df = pd.DataFrame()
    hotel_info_df['id'] = hotel_id
    hotel_info_df['hotel_name'] = hotel_name
    hotel_info_df['hotel_rating'] = hotel_overall_rating
    hotel_info_df['hotel_experience'] = hotel_overall_rating_words
    hotel_info_df['amenities'] = amenities
    hotel_info_df['address'] = address
    hotel_info_df['country'] = country
    hotel_info_df['locality'] = locality_name
    hotel_info_df['location'] = lat_lon
    hotel_info_df['price'] = hotel_price

    # Write dataframe to csv and json format files.
    hotel_info_df.drop_duplicates(subset=['hotel_name'], inplace=True)
    hotel_info_df.to_json('Project-Dataset/final-data/hotel_info.json')
    hotel_info_df.to_csv('Project-Dataset/final-data/hotel_info.csv', index=False)

print('Hotel Dataframe updated..')

Hotel Dataframe updated..


In [25]:
print(len(hotel_info_df))
hotel_info_df.head(50)

3692


Unnamed: 0,id,hotel_name,hotel_rating,hotel_experience,amenities,address,country,locality,location,price
0,1,Niagara Falls Marriott Fallsview Hotel & Spa,4.0,Very good,"[Pool, Restaurant, Fitness Centre with Gym / W...",6740 Fallsview Blvd Niagara Falls Ontario,Canada,,"(43.0784124, -79.0820118287735)",118.0
1,2,Fairmont Royal York,4.5,Excellent,"[Restaurant, Room service, Fitness Centre with...",100 Front Street W Toronto Ontario,Canada,,"(43.6459092, -79.3813636)",489.0
2,3,Blue Horizon Hotel,4.5,Excellent,"[Pool, Restaurant, Free High Speed Internet (W...",1225 Robson St Vancouver British Columbia,Canada,,"(49.2870098, -123.1285348)",169.0
3,4,Fairmont Le Chateau Frontenac,4.5,Excellent,"[Restaurant, Fitness Centre with Gym / Workout...",1 Rue des Carrieres Quebec City Quebec,Canada,,"(46.81182845, -71.2055490276425)",212.0
4,5,Rimrock Resort Hotel,4.5,Excellent,"[Pool, Room service, Fitness Centre with Gym /...",300 Mountain Avenue Alberta,Canada,,"(51.1615167, -115.5706911)",189.0
5,6,Niagara Falls Marriott on the Falls,4.0,Very good,"[Pool, Restaurant, Fitness Centre with Gym / W...",6755 Fallsview Boulevard Niagara Falls Ontario,Canada,,"(43.0773648, -79.0827579478645)",
6,7,The Burrard,4.0,Very good,"[Free High Speed Internet (WiFi), Laundry Serv...",1100 Burrard St Vancouver British Columbia,Canada,,"(49.27975795, -123.127732331588)",123.0
7,8,One King West Hotel & Residence,4.5,Excellent,"[Restaurant, Fitness Centre with Gym / Workout...",1 King Street West Toronto Ontario,Canada,,"(50.000678, -86.000977)",
8,9,Le Square Phillips Hotel & Suites,4.5,Excellent,"[Pool, Free High Speed Internet (WiFi), Fitnes...",1193 Place Phillips Montreal Quebec,Canada,,"(45.503560325, -73.56732943)",
9,10,Banff Park Lodge Resort and Conference Centre,4.0,Very good,"[Pool, Restaurant, Room service, Fitness Centr...",222 Lynx Street Alberta,Canada,,"(51.1788053, -115.5744519)",


In [32]:
# For the first batch create dataframe and write the collected information into a fresh file
if batch != 'second':
    # Create single dataframe of collected review information.
    review_df = pd.DataFrame()
    review_df['id'] = review_id
    review_df['user_rating'] = user_ratings
    review_df['user_name'] = user_name
    review_df['user_profile'] = user_profile
    review_df['user_review'] = user_review
    review_df['user_reviewdate'] = user_reviewdate
    review_df['user_staydate'] = user_staydate

    # Write dataframe to csv and json format files.
    review_df.drop_duplicates(subset=['user_name', 'id'], inplace=True)
    review_df.to_json('Project-Dataset/final-data/reviews.json')
    review_df.to_csv('Project-Dataset/final-data/reviews.csv', index=False)
    print('review DataFrame updated..')

review DataFrame updated..


In [33]:
print(len(review_df))
review_df.head(200)

175791


Unnamed: 0,id,user_rating,user_name,user_profile,user_review,user_reviewdate,user_staydate
0,1,5,Billy Banda Gizmo Dorothy,https://www.tripadvisor.ca/Profile/Gizmo4ever,great view Great place to stay and you have th...,Mar. 22,June 2018
1,1,4,VisitorfromOntario9,https://www.tripadvisor.ca/Profile/Visitorfrom...,Nice Winter Break Our multi generational famil...,Mar. 19,January 2019
2,1,5,Odyssey624335,https://www.tripadvisor.ca/Profile/Odyssey624335,Perfect View of the Falls A room with a specta...,Mar. 19,March 2019
3,1,5,Melissa F,https://www.tripadvisor.ca/Profile/956melissaf,Great time!! Beautiful hotel. We would stay ag...,Mar. 19,March 2019
4,1,5,2kidswilltravel37,https://www.tripadvisor.ca/Profile/2kidswilltr...,"Peaceful, perfect unobstructed views Having an...",Mar. 15,March 2019
5,1,nil,,nil,nil,nil,nil
20,1,4,marysevigneault,https://www.tripadvisor.ca/Profile/marysevigne...,Wonderful place for a Wow You want a meeting o...,Feb. 2019,February 2019
21,1,4,HappyToTravel,https://www.tripadvisor.ca/Profile/No_Repeat12,Business workshop This was a 3 day 2 night sta...,Feb. 2019,February 2019
22,1,5,Mabdad1,https://www.tripadvisor.ca/Profile/Mabdad1,"Fantastic hotel One night stay in Feb, check i...",Feb. 2019,February 2019
23,1,4,Nate D,https://www.tripadvisor.ca/Profile/nated602,"Great View All the rooms overlook the falls, s...",Feb. 2019,January 2019


In [10]:
# For second batch, create dataframe and append the information to already created files.
if batch == 'second':
     # Create single dataframe of collected hotel information.
    hotel_info_df = pd.DataFrame()
    hotel_info_df['id'] = hotel_id
    hotel_info_df['hotel_name'] = hotel_name
    hotel_info_df['hotel_rating'] = hotel_overall_rating
    hotel_info_df['hotel_experience'] = hotel_overall_rating_words
    hotel_info_df['amenities'] = amenities
    hotel_info_df['address'] = address
    hotel_info_df['country'] = country
    hotel_info_df['locality'] = locality_name
    hotel_info_df['location'] = lat_lon
    hotel_info_df['price'] = hotel_price

    # Drop duplicates by hotel name.
    hotel_info_df.drop_duplicates(subset=['hotel_name'], inplace=True)
    hotel_info_df.drop(['locality'], axis = 1, inplace=True)

    #  Append to hotel-info.csv and hotel-info.json files. 
    with open('Project-Dataset/final-data/hotel_info.json', 'a') as fd_hotel:
        hotel_info_df.to_json(fd_hotel)
    with open('Project-Dataset/final-data/hotel_info.csv', 'a') as fd1_hotel:
        hotel_info_df.to_csv(fd1_hotel, index=False, header=False)
        
    print('Hotel Dataframe updated..')

    # Create single dataframe of collected review information.
    review_df = pd.DataFrame()
    review_df['id'] = review_id
    review_df['user_rating'] = user_ratings
    review_df['user_name'] = user_name
    review_df['user_profile'] = user_profile
    review_df['user_review'] = user_review
    review_df['user_reviewdate'] = user_reviewdate
    review_df['user_staydate'] = user_staydate

    # Drop duplicate reviews by user and for a the same hotel.
    review_df.drop_duplicates(subset=['user_name', 'id'], inplace=True)
    
     #  Append to reviews.csv and reviews.json files.
    with open('Project-Dataset/final-data/reviews.json', 'a') as fd_review:
        review_df.to_json(fd_review)
    with open('Project-Dataset/final-data/reviews.csv', 'a') as fd1_review:
        review_df.to_csv(fd1_review, index=False, header=False)
        
    print('review DataFrame updated..')

Hotel Dataframe updated..
review DataFrame updated..


In [10]:
# After appending batch 2 information onto files, read them and write it back freshly.
if batch == 'second':
    hotel_full_info_df = pd.read_csv("Project-Dataset/final-data/hotel_info.csv")
    # print(len(hotel_full_urllist))
    print('Hotel urls CSV file read successfully..')

    hotel_full_info_df.drop_duplicates(subset=['hotel_name'], inplace=True)
    # hotel_full_info_df.drop(['locality'], axis = 1, inplace=True)
    print(len(hotel_full_info_df))

    #  Write to csv and json files.
    hotel_full_info_df.to_json('Project-Dataset/final-data/hotel_info.json')
    hotel_full_info_df.to_csv('Project-Dataset/final-data/hotel_info.csv', index=False)

    print('Hotel Dataframe updated..')
    hotel_full_info_df.head()

Hotel urls CSV file read successfully..
5985
Hotel Dataframe updated..


In [12]:
# After appending batch 2 information onto files, read them and write it back freshly.
if batch == 'second':
    review_full_info_df = pd.read_csv("Project-Dataset/final-data/reviews.csv")
    # print(len(hotel_full_urllist))
    print('Review urls CSV file read successfully..')

    review_full_info_df.drop_duplicates(subset=['user_name', 'id'], inplace=True)
    review_full_info_df.dropna(inplace=True)
    print(len(review_full_info_df))

    #  Write to csv and json files.
    review_full_info_df.to_json('Project-Dataset/final-data/reviews.json')
    review_full_info_df.to_csv('Project-Dataset/final-data/reviews.csv', index=False)
        
    print('review DataFrame updated..')
    review_full_info_df.head()

Review urls CSV file read successfully..
245320
review DataFrame updated..
