# Prepare Tourist Site data

In [1]:
# import needed libraries
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd

# Web scraping www.tripadvisor.com.ph to extract tourist sites data.
<b>Disclaimers: </b>

- None of the extracted data is my own; all of it is the property of www.tripadvisor.com.ph
- The scraped data is not used for commercial purposes; it is used purely for personal educational purposes
- The HTML structure, tags, class names, parameters, and other website details this code relies on were working at the time of writing; the website owner/administrators may change them at any time, which may break this code
- Each code block may take up to 2 minutes to run, depending on hardware, software, and network conditions, so please be patient when rerunning the notebook
- The `time` module is imported so the scraper can pause between requests, to avoid overloading the site and getting blocked

## Use Search Url and create BeautifulSoup object

In [2]:
# Request headers sent with every scraping call in this notebook.
# NOTE(review): the Access-Control-* entries are CORS *response* headers; they
# are kept unchanged here but presumably have no effect when sent in a request
# - confirm before removing. The User-Agent mimics a desktop browser.
headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
        "User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.82 Safari/537.36"
    }

# Download the search-result page and parse it into a BeautifulSoup object.
search_url = 'https://www.tripadvisor.com.ph/Attractions-g298449-Activities-a_allAttractions.true-zft11309-Metro_Manila_Luzon.html'
# A timeout keeps the cell from hanging forever if the site never answers.
search_html = requests.get(search_url, headers = headers, timeout = 30)
# Fail loudly on a 4xx/5xx answer instead of silently parsing an error page
# (which would otherwise just produce zero links in the next cell).
search_html.raise_for_status()
search_soup = BeautifulSoup(search_html.text, 'html.parser')

In [3]:
# Extract the tourist-site links from the search-result page.
#
# If the site markup changes, the anchor class below can be rediscovered by
# printing every anchor and inspecting its attributes:
#     for tag in search_soup.find_all('a'): print(tag)

links_class = {'class' : 'FmrIP _R w _Z P0 M0 Gm ddFHE'}
a_tags = search_soup.find_all('a', links_class) # anchors pointing at tourist sites
main_url = 'https://www.tripadvisor.com.ph'

# Build the absolute URL of every tourist site. Anchors without an href are
# skipped: tag.get('href') returns None for them and concatenating None to
# main_url would raise a TypeError.
tour_links_lst = [
    main_url + tag.get('href')
    for tag in a_tags
    if tag.get('href')
]

print('Scraping Complete!')
print('Count of Tourist Site Links : ', len(tour_links_lst) )
print('Sample Links:')
for link in tour_links_lst[0:3] :
    print(link)

Scraping Complete!
Count of Tourist Site Links :  30
Sample Links:
https://www.tripadvisor.com.ph/Attraction_Review-g298573-d310887-Reviews-National_Museum-Manila_Metro_Manila_Luzon.html
https://www.tripadvisor.com.ph/Attraction_Review-g298574-d7396573-Reviews-Art_in_Island-Quezon_City_Metro_Manila_Luzon.html
https://www.tripadvisor.com.ph/Attraction_Review-g298573-d586732-Reviews-Fort_Santiago-Manila_Metro_Manila_Luzon.html


In [5]:
# Visit every tourist-site link and scrape the details of each site.
# tour_main_dict maps the site name to a dict holding the scraped fields
# (name, classification, location, overall rating, reviews).
tour_main_dict = {}
for tour_link in tour_links_lst :
    tour_details_dict = {}
    # A timeout keeps one unresponsive page from hanging the whole loop.
    tour_html = requests.get(tour_link, headers = headers, timeout = 30)
    tour_soup = BeautifulSoup(tour_html.text, 'lxml')

    # Extract attraction name and add to details dictionary.
    # Without the heading (blocked request, changed markup) the page cannot be
    # keyed, so it is reported and skipped instead of crashing on None.
    name_tag = tour_soup.find('h1', {'class' : 'WlYyy cPsXC GeSzT'} )
    if name_tag is None :
        print('Skipped (name not found):', tour_link)
        time.sleep(1)
        continue
    name = name_tag.get_text()
    tour_details_dict['tourist_site_name'] = name

    # Extract tourist site classification and add to details dictionary.
    # The classification is read from the fifth matching div; when the page has
    # fewer matches the value is None rather than the previous site's value.
    type_tag = tour_soup.find_all('div', {'class' : 'WlYyy diXIH dDKKM'})
    tour_classification = type_tag[4].get_text() if len(type_tag) > 4 else None
    tour_details_dict['tourist_site_classification'] = tour_classification

    # Extract tourist site location and add to details dictionary.
    # When the address block is missing, fall back to the place name embedded
    # in the last dash-separated part of the URL.
    loc_tag = tour_soup.find('div', {'class' : 'dIDBU MJ'} )
    if loc_tag is not None :
        tour_loc = loc_tag.get_text().split('Address')[-1]
    else :
        tour_loc = tour_link.split('-')[-1].replace('_', ' ').split('.html')[0]

    tour_details_dict['tourist_site_location'] = tour_loc

    # Extract tourist site other info (overall rating) and add to details dictionary.
    # The last matching div is used (same as before); None when nothing matches,
    # so a rating never leaks over from the previous site.
    other_tag = tour_soup.find_all('div', {'class' : 'WlYyy cPsXC fksET cMKSg'} )
    other_info = other_tag[-1].get_text() + '/5.0' if other_tag else None
    tour_details_dict['tourist_site_other_info_overall_rating'] = other_info

    # Extract reviews and add to details dictionary.
    # Only the 2nd to 4th matching divs are considered; empty ones are ignored
    # and the remaining reviews are numbered from 1.
    reviews_tag = tour_soup.find_all('div', {'class' : 'pIRBV _T KRIav'})
    review_texts = [tag.get_text() for tag in reviews_tag[1:4] if tag.get_text() != '']
    tour_review = '\n'.join(
        'Review #' + str(rev_count) + '\n' + text
        for rev_count, text in enumerate(review_texts, start = 1)
    )
    tour_details_dict['tourist_site_reviews'] = tour_review.lstrip()
    tour_main_dict[name] = tour_details_dict # add all scraped details of each tourist site
    time.sleep(1) # pause between requests to avoid overloading the site

print('Scraping Complete!')
print('Count of Tourist Sites:', len(tour_main_dict))

Scraping Complete!
Count of Hotels: 30


In [33]:
# Turn the scraped dictionary into a table: one row per tourist site,
# labelled by the dictionary key, with one column per scraped field.
df = pd.DataFrame.from_dict(data=tour_main_dict, orient='index')

# Show the column summary, then preview the first row.
df.info()
df.head(1)

<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, National Museum to Philippine Air Force Aerospace Museum
Data columns (total 5 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   tourist_site_name                       30 non-null     object
 1   tourist_site_classification             30 non-null     object
 2   tourist_site_location                   30 non-null     object
 3   tourist_site_other_info_overall_rating  30 non-null     object
 4   tourist_site_reviews                    30 non-null     object
dtypes: object(5)
memory usage: 1.4+ KB


Unnamed: 0,tourist_site_name,tourist_site_classification,tourist_site_location,tourist_site_other_info_overall_rating,tourist_site_reviews
National Museum,National Museum,Speciality Museums,"Padre Burgos Ave Ermita, Manila, Luzon 2004 Ph...",4.0/5.0,Review #1\nWe really loved the paintings at th...


In [34]:
# clean location data
def clean_loc(s) :
    """Reduce a raw location string to a '<name> City' label.

    Parameters
    ----------
    s : str
        Raw location text. When it contains commas, the second-to-last
        comma-separated part is used; otherwise the first space-separated
        word is used.

    Returns
    -------
    str
        The selected part, stripped of surrounding whitespace, with ' City'
        appended unless it already ends with ' City' (so 'Quezon City' is not
        turned into 'Quezon City City').

    Notes
    -----
    For strings without a comma only the first word is kept, so a multi-word
    name such as 'San Juan ...' becomes 'San City' and needs a manual fix.
    """
    if ',' not in s :
        city = s.split(' ')[0]
    else :
        city = s.split(',')[-2]
    city = city.strip()
    # Do not double the suffix when the name already carries it.
    if city.endswith(' City') :
        return city
    return city + ' City'
     
# Labels that clean_loc can produce incorrectly, mapped to the intended city name.
location_fixes = {
    'Quezon City City' : 'Quezon City',
    'San City' : 'San Juan City',
    'Taguig City City' : 'Taguig City',
}

# Clean every location and repair the known bad labels in one assignment.
# Assigning the whole column avoids chained assignment
# (df.col.loc[mask] = value), which may write to a temporary copy and is a
# no-op under pandas Copy-on-Write.
df['tourist_site_location'] = (
    df['tourist_site_location']
    .apply(clean_loc)
    .replace(location_fixes)
)

df.tourist_site_location.value_counts()

Manila City         12
Pasay City           6
Quezon City          3
Taguig City          3
Makati City          3
Mandaluyong City     1
San Juan City        1
Muntinlupa City      1
Name: tourist_site_location, dtype: int64

In [35]:
# The yearly visitor counts and entrance fees come from manual research of
# whatever sources were available; PSA and other review sites do not usually
# provide these figures.
# The sources are listed in tourist_site_visitors_and_price.csv.

visitors_csv = 'tourist_site_visitors_and_price.csv'
import_df = pd.read_csv(visitors_csv, index_col='tourist_site_name')
import_df.head(3)

Unnamed: 0_level_0,approx_visitors_per_year_in_thousands,tourist_site_entance_fee_in_pesos,references
tourist_site_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
National Museum,730,0,https://www.philstar.com/business/2018/06/06/1...
Art in Island,50,500,Art In Island - Home | Facebook
Fort Santiago,37000,0,https://www.pna.gov.ph/articles/1095487


In [36]:
# Drop the source-links column; it is not needed for the merge.
# errors='ignore' keeps the cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising a KeyError.
import_df = import_df.drop(columns = ['references'], errors = 'ignore')
import_df.head(3)

Unnamed: 0_level_0,approx_visitors_per_year_in_thousands,tourist_site_entance_fee_in_pesos
tourist_site_name,Unnamed: 1_level_1,Unnamed: 2_level_1
National Museum,730,0
Art in Island,50,500
Fort Santiago,37000,0


In [37]:
# Join the scraped data with the visitors / entrance-fee table on the site name.
# The index is only set while 'tourist_site_name' is still a column, so the
# cell can be re-run without a KeyError (the previous inplace set_index
# consumed the column on the first run).
if 'tourist_site_name' in df.columns :
    df = df.set_index('tourist_site_name')
# Left join: every scraped site is kept even when it has no row in import_df.
final_df = df.merge(import_df, how = 'left', on = 'tourist_site_name' )
final_df.info()
final_df

<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, National Museum to Philippine Air Force Aerospace Museum
Data columns (total 6 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   tourist_site_classification             30 non-null     object
 1   tourist_site_location                   30 non-null     object
 2   tourist_site_other_info_overall_rating  30 non-null     object
 3   tourist_site_reviews                    30 non-null     object
 4   approx_visitors_per_year_in_thousands   30 non-null     int64 
 5   tourist_site_entance_fee_in_pesos       30 non-null     int64 
dtypes: int64(2), object(4)
memory usage: 1.6+ KB


Unnamed: 0_level_0,tourist_site_classification,tourist_site_location,tourist_site_other_info_overall_rating,tourist_site_reviews,approx_visitors_per_year_in_thousands,tourist_site_entance_fee_in_pesos
tourist_site_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
National Museum,Speciality Museums,Manila City,4.0/5.0,Review #1\nWe really loved the paintings at th...,730,0
Art in Island,Art Museums,Quezon City,4.5/5.0,Review #1\nit's a one-of-a-kind showcase of a...,50,500
Fort Santiago,Historic Sites • Parks,Manila City,4.0/5.0,"Review #1\nFort Santiago, as many already know...",37000,0
Bonifacio Global City,Points of Interest & Landmarks,Taguig City,4.5/5.0,"Review #1\nNothing to to with Pilipino spirit,...",0,0
Intramuros,Neighborhoods • Historic Walking Areas,Manila City,4.0/5.0,"Review #1\nVery good & cooperative staff, if a...",37000,75
Mall of Asia Arena,Arenas & Stadiums • Shopping Malls,Pasay City,4.0/5.0,Review #1\nThis is by far the best arena for g...,189800,50
Robinsons Place Mall,Shopping Malls,Manila City,4.0/5.0,Review #1\nWe went to here during a vacation a...,0,0
Runway Manila,Bridges,Pasay City,5.0/5.0,Review #1\nIts our first time to step at Runwa...,0,0
San Agustin Church,Religious Sites,Manila City,4.5/5.0,Review #1\nThis church is very historical. Thi...,10,0
San Agustin Museum,History Museums,Manila City,4.5/5.0,Review #1\nif you are a catholic this place is...,0,200


In [38]:
# Save the merged table; index=True writes the row labels as the first column.
output_csv = 'Tourist_site.csv'
final_df.to_csv(output_csv, index=True)