In [None]:
import os
import urllib
from time import sleep

import lxml.html as lh
import pandas as pd
import requests
from IPython.display import display, HTML

In [None]:
def clean_text(text):
    """Normalise a scraped label into a lowercase, underscore-separated token.

    Surrounding whitespace is removed first, then any leading/trailing
    hyphens; the result is lower-cased and every single space character is
    replaced by an underscore (consecutive spaces therefore yield
    consecutive underscores).

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string.
    """
    return text.strip().strip('-').lower().replace(' ', '_')

In [None]:
def get_reviews(i,tree):
    """Collect the reviews of attraction ``i``, following pagination links.

    For every review found in ``tree`` one value is appended to each of the
    module-level lists ``user``, ``user_rating``, ``review_date``, ``review``
    and ``attraction_id``. While the page exposes a next-page link, the next
    page is downloaded (after a 5 second pause) and processed the same way.

    Args:
        i: Attraction identifier recorded next to every review.
        tree: Parsed lxml document of the first review page.

    Returns:
        None. Results are delivered through the module-level lists. If
        anything fails, collection for this attraction stops and one line is
        written to the module-level ``error_file``; reviews recorded before
        the failure are kept.
    """
    request_timeout = 30  # seconds; without it a stalled connection blocks the whole run
    try:
        # Loop instead of recursing: one stack frame per page would hit the
        # interpreter's recursion limit on attractions with many review pages.
        while True:
            container = tree.xpath("//div[@id='REVIEWS']//div[@class='listContainer']")[0]
            for details in container.xpath(".//div[@class='review-container']"):
                user_container = details.xpath(".//div[@class='ui_column is-2']")[0]
                rating_container = details.xpath(".//div[@class='ui_column is-9']")[0]

                # Extract every field BEFORE touching the shared lists, so a
                # malformed review can never leave them with different lengths.
                reviewer = clean_text(user_container.xpath(".//div[@class='member_info']/div/div[2]/span/text()")[0])
                # The last two characters of the span's class attribute are parsed as rating * 10.
                stars = float(clean_text(rating_container.xpath(".//div[@class='rating reviewItemInline']/span/@class")[0])[-2:])/10
                posted = rating_container.xpath(".//div[@class='rating reviewItemInline']/span/@title")[0]
                title = rating_container.xpath(".//div[contains(@class,'quote')]/span/text()")[0]
                body = rating_container.xpath(".//p[@class='partial_entry']/text()")[0]

                user.append(reviewer)
                user_rating.append(stars)
                review_date.append(posted)
                review.append(title+'. '+body)
                attraction_id.append(i)

            next_href = container.xpath(".//div[@class = 'unified ui_pagination ']/a[2]/@href")
            if not next_href:
                break

            sleep(5)  # be polite to the server between page requests
            page_no = container.xpath(".//div[@class = 'unified ui_pagination ']/a[2]/@data-page-number")[0]
            print("log: getting reviews for attraction "+str(i)+"- page "+page_no)
            page = requests.get("https://tripadvisor.ca"+next_href[0], timeout=request_timeout)
            tree = lh.fromstring(page.content)

    # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt still stops the scrape.
    except Exception:
        error_file.write(str.encode("error: reviews for attraction "+str(i)+" could not be extracted\n"))


In [None]:
def extract_info(i,url, maps_key):
    """Scrape one attraction page and record its details and reviews.

    Downloads ``url``, reads the breadcrumb trail (country / province / city),
    geocodes the supplier address through the Google Geocoding API and reads
    the attraction name, rating and price. Exactly one value per field is then
    appended to the module-level lists ``country``, ``province``, ``city``,
    ``location``, ``name``, ``rating``, ``price`` and ``att_id``, after which
    ``get_reviews`` is called on the same page.

    Fallbacks when a field cannot be read: breadcrumbs -> ``"canada"`` /
    ``"nil"`` / ``"nil"``; location and name -> ``"nil"``; rating and price
    -> ``-1.0``.

    Args:
        i: Attraction identifier stored in ``att_id`` and passed to ``get_reviews``.
        url: Address of the attraction page.
        maps_key: API key sent to the geocoding endpoint.

    Returns:
        None. If the page cannot be fetched/parsed or has no breadcrumb list,
        nothing is appended and two lines (message and URL) are written to
        the module-level ``error_file``.
    """
    request_timeout = 30  # seconds; without it a stalled connection blocks the whole run
    try:
        page = requests.get(url, timeout=request_timeout)
        tree = lh.fromstring(page.content)
        geo = tree.xpath("//ul[@class = 'breadcrumbs']")[0]

        # Resolve all three breadcrumb levels before recording any of them;
        # appending one at a time let a partial failure add `country` twice.
        try:
            country_name = clean_text(geo.xpath("./li[1]/a/span/text()")[0])
            province_name = clean_text(geo.xpath("./li[2]/a/span/text()")[0])
            city_name = clean_text(geo.xpath("./li[3]/a/span/text()")[0])
        except Exception:
            country_name, province_name, city_name = "canada", "nil", "nil"

        try:
            address = tree.xpath("//div[@class='supplier']/a/text()")[0]+", "+country_name
            # `params=` lets requests build and URL-encode the query string.
            response = requests.get('https://maps.googleapis.com/maps/api/geocode/json',
                                    params={'address':address,'key':maps_key},
                                    timeout=request_timeout)
            coordinates = response.json()['results'][0]['geometry']['location']
        except Exception:
            coordinates = "nil"

        # Always bind `details`, so the fallbacks below do not depend on a
        # NameError being swallowed when the highlights module is missing.
        detail_nodes = tree.xpath("//div[@class='product_highlights_module']")
        details = detail_nodes[0] if detail_nodes else None

        try:
            attraction_name = clean_text(details.xpath("./h1/text()")[0])
        except Exception:
            attraction_name = "nil"
        try:
            # The last two characters of the span's class attribute are parsed as rating * 10.
            attraction_rating = float(clean_text(details.xpath("./div[@class='rating_and_tag_wrapper']/div/span/@class")[0])[-2:])/10
        except Exception:
            attraction_rating = float(-1)
        try:
            # [2:] drops the first two characters of the price text before parsing
            # (presumably a currency prefix -- verify against the page markup).
            attraction_price = float(clean_text(details.xpath("./div[@class='product_cta_wrapper']//div[@class='price']/span/span/text()")[0])[2:])
        except Exception:
            attraction_price = float(-1)

        # Record the whole row in one go so the parallel lists stay aligned.
        country.append(country_name)
        province.append(province_name)
        city.append(city_name)
        location.append(coordinates)
        name.append(attraction_name)
        rating.append(attraction_rating)
        price.append(attraction_price)
        att_id.append(i)

        print("log: getting reviews for attraction "+str(i))
        get_reviews(i,tree)

    # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt still stops the scrape.
    except Exception:
        error_file.write(str.encode("error: Details of the attraction "+str(i)+" could not be extracted\n"))
        error_file.write(str.encode(url+"\n"))

In [None]:
# Load the attraction URLs and give every attraction a numeric id.
df = pd.read_json('outputs/attractions_cat.json',orient='records')
df['attraction_id'] = df.index
df = df.rename(index=str,columns={"attraction": "url"})

# Parallel accumulators filled by extract_info (one entry per attraction).
att_id = []
country = []
province = []
city = []
location = []
name = []
rating = []
price = []

# Parallel accumulators filled by get_reviews (one entry per review).
attraction_id = []
user = []
review = []
user_rating = []
review_date = []

# The API key is read from the environment instead of being committed with
# the notebook. The key that was previously hard-coded here has been exposed
# in source and should be revoked.
maps_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not maps_key:
    raise RuntimeError("Set the GOOGLE_MAPS_API_KEY environment variable before running this cell")

# `with` closes the error log even if the scrape is interrupted or raises.
with open("outputs/error_log.txt","wb") as error_file:
    for i in range(df.shape[0]):
        print("log: collecting details for attraction "+str(i))
        # The index was converted to strings above, so select the row by
        # position explicitly rather than through an integer label lookup.
        extract_info(i,df['url'].iloc[i],maps_key)

In [None]:
# Print the length of each details list; they must all match for the
# DataFrame below to be constructible.
print("Details dataframe verification:")
for column_values in (country, province, city, name, rating, price, location):
    print(len(column_values))

att_df = pd.DataFrame({'attraction_id':att_id,
                   'name':name,
                   'country':country,
                   'province':province,
                   'city':city,
                   'location':location,
                   'price':price,
                   'rating':rating})
# orient='records' never writes the index, and passing index=True together
# with it raises ValueError on pandas >= 2.1, so the argument is omitted.
att_df.to_json('outputs/attractions_details_batch2.json',orient='records')

In [None]:
# Print the length of each review list; they must all match for the
# DataFrame below to be constructible.
print("Reviews dataframe verification")
for column_values in (attraction_id, user, review, user_rating, review_date):
    print(len(column_values))

att_rev_df = pd.DataFrame({'attraction_id':attraction_id,
                           'user':user,
                           'rating':user_rating,
                           'review':review,
                           'review_date':review_date})

# orient='records' never writes the index, and passing index=True together
# with it raises ValueError on pandas >= 2.1, so the argument is omitted.
att_rev_df.to_json('outputs/attractions_reviews_batch2.json',orient='records')