In [1]:
"""This module contains functions to scrape all the activities listed on a city page."""
import time
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import openpyxl

In [2]:
def get_city_url_list(city_url, max_pages=100):
    """Probe a city's paginated activity listing and collect the pages that exist.

    The city url is the first page of activities. Most cities have several
    pages, which are requested as ``<city_url>/<page_number>`` for page numbers
    2..max_pages. Every page that answers with HTTP 200 is kept.

    Args:
        city_url: URL of the first activity listing page of the city.
        max_pages: Highest page number to probe. This caps the run time for
            cities with a lot of pages.

    Returns:
        A dict mapping page number to the ``requests`` response object of that
        page (the response, not the URL, so callers can parse ``response.text``
        without fetching again). Empty if the first page did not return 200.

    Raises:
        requests.RequestException: If the request for the first page itself
            fails (timeout, connection error). Failures on later pages are
            reported and skipped instead of aborting the scan.
    """

    response = requests.get(city_url, timeout=5, allow_redirects=False)
    if response.status_code != 200:
        print(f"City url is not valid: {city_url}")
        return {}
    print("Checked validity for page 1 and it was 200")

    valid_pages = {1: response}
    # Avoid a double slash in page urls when the city url ends with "/".
    base_url = city_url.rstrip("/")
    # range() is empty when max_pages <= 1, so no extra guard is needed.
    for page_number in range(2, max_pages + 1):
        start = time.time()
        page_url = f"{base_url}/{page_number}"
        try:
            # NOTE(review): unlike the page 1 request, this one follows
            # redirects, so a page that redirects to an existing one counts as
            # valid - confirm this is intended for the target site.
            response = requests.get(page_url, timeout=5)
        except requests.RequestException as error:
            # One flaky page must not throw away the pages already collected.
            print(f"Request for page {page_number} failed: {error}. Skipping it")
            continue
        if response.status_code == 200:
            valid_pages[page_number] = response
        print(f"Checked validity for page {page_number} and it was {response.status_code}. Returned in {time.time() - start} seconds")
    print(f"Found {len(valid_pages)} valid pages")
    return valid_pages

In [3]:
def get_activity_urls(soup):
    """Collect the absolute activity urls linked from one city listing page.

    ``soup`` is the parsed (bs4) listing page. Every anchor carrying the
    activity-card class string is picked up, and its ``href`` is appended to
    the site root to form the returned url. The result is a list in document
    order.
    """

    card_anchors = soup.find_all("a", class_='text-dark highlight-able card-link')
    return ["https://viator.com" + anchor['href'] for anchor in card_anchors]

In [4]:
def extract_data(link):
    """Fetch one activity page and pull out its listing details.

    Args:
        link: URL of the activity page.

    Returns:
        A list of nine values, in this order: activity name, activity url,
        price per participant, valid number of participants, valid date,
        activity description, duration, average rating, review count.
        The description falls back to "No description available" when the
        overview section cannot be found.

    Raises:
        requests.HTTPError: If the page answers with an error status.
        AttributeError, IndexError, KeyError, TypeError: If any element other
            than the description is missing from the page. The selectors are
            tied to the page's current class names.
    """

    response = requests.get(link, timeout=5)
    # Fail with a clear HTTP error instead of a confusing "NoneType has no
    # attribute 'text'" when the site returns an error page.
    response.raise_for_status()
    soup = bs(response.text, 'html.parser')

    activity_name = soup.find("h1", class_="title__1Wwg title2__C3R7").text
    price_per_participant = soup.find_all("span", class_="moneyView__2HPx defaultColor__1NL9")[0].text
    valid_number_of_participants = soup.find("input", class_="input__MNXR md__1Fp3")['data-automation-value']
    valid_date = soup.find("input", class_="input__2pmO md__2bZz")["value"]

    # The description is optional: a missing overview wrapper or a missing
    # nested <div> both lead to the fallback text. Only "element not found"
    # errors are caught, so unrelated failures still surface.
    try:
        overview_block = soup.find("div", class_="overviewWrapper__bMs4")
        main_overview = overview_block.find_all("div")[0]
        activity_description = main_overview.find_all("div")[0].text
    except (AttributeError, IndexError):
        activity_description = "No description available"

    # The duration is read from the second <div> of the second "item" block.
    duration_block = soup.find_all("div", class_="item__3eVq")[1]
    duration = duration_block.find_all("div")[1].text

    avg_rating = soup.find("span", class_="averageRatingValue__Q1ep").text
    review_count = soup.find("div", class_="reviewCount__3sJa").text

    return [
        activity_name,
        link,
        price_per_participant,
        valid_number_of_participants,
        valid_date,
        activity_description,
        duration,
        avg_rating,
        review_count,
    ]

    

In [5]:
def main_scraper(city_url, max_pages=100, dataframe=None):
    """Scrape every activity listed on a city's pages into a dataframe.

    Args:
        city_url: URL of the first activity listing page of the city.
        max_pages: Highest listing page number to probe.
        dataframe: Optional dataframe to append the scraped rows to (it is
            modified in place). When omitted, a fresh empty dataframe with the
            nine activity columns is created for this call. A dataframe
            default would be built once at definition time and shared by every
            call, so rows from earlier runs would leak into later ones.

    Returns:
        The dataframe holding one row per successfully scraped activity.
        Activities whose extraction fails are reported and skipped.
    """

    if dataframe is None:
        dataframe = pd.DataFrame(columns=['Activity Name', 'Activity URL', 'Price per Participant', 'Valid Number of Participants', 'Valid Date', 'Activity Description', 'Duration', 'Average Rating', 'No. of Reviews'])

    valid_pages = get_city_url_list(city_url, max_pages)
    activity_urls = []
    for page_number, response in valid_pages.items():
        start = time.time()
        soup = bs(response.text, 'html.parser')
        activity_urls += get_activity_urls(soup)
        print(f"Activity urls for page {page_number} added to list in {time.time() - start} seconds")
    print(f"Found {len(activity_urls)} activity urls")

    # 1.8 is the per-activity figure (in seconds) used for the rough estimate.
    print(f"Estimated time to completion: {len(activity_urls) * 1.8} seconds")

    total = len(activity_urls)
    for position, link in enumerate(activity_urls, start=1):
        start = time.time()
        try:
            data = extract_data(link)
            dataframe.loc[len(dataframe)] = data
            # Derive the remaining count from the loop position so that failed
            # activities are not counted as still pending.
            print(f"Extracted data for {link} in {time.time() - start} seconds. {total - position} activities left")
        except Exception as e:
            # Best effort: one broken activity page must not stop the scrape.
            print(f"Error extracting data for {link} because {e}")
    print(f"Finished scraping {city_url}")
    return dataframe
    

In [6]:
if __name__ == "__main__":
    # Start from a fresh, empty frame and pass it in explicitly, so this run's
    # rows go into it rather than into main_scraper's default frame.
    df = pd.DataFrame(columns=['Activity Name', 'Activity URL', 'Price per Participant', 'Valid Number of Participants', 'Valid Date', 'Activity Description', 'Duration', 'Average Rating', 'No. of Reviews'])
    df = main_scraper("https://www.viator.com/Bangkok/d343-ttd", max_pages=84, dataframe=df)
    # Persist the scraped activities; index=False keeps the row index out of
    # the spreadsheet.
    df.to_excel("bangkok_activities.xlsx", index=False)

Checked validity for page 1 and it was 200
Checked validity for page 2 and it was 200. Returned in 1.0540146827697754 seconds
Checked validity for page 3 and it was 200. Returned in 0.8984968662261963 seconds
Checked validity for page 4 and it was 200. Returned in 0.9331660270690918 seconds
Checked validity for page 5 and it was 200. Returned in 1.4653961658477783 seconds
Checked validity for page 6 and it was 200. Returned in 1.0260207653045654 seconds
Checked validity for page 7 and it was 200. Returned in 0.905811071395874 seconds
Checked validity for page 8 and it was 200. Returned in 1.3069498538970947 seconds
Checked validity for page 9 and it was 200. Returned in 1.1144258975982666 seconds
Checked validity for page 10 and it was 200. Returned in 1.2990829944610596 seconds
Checked validity for page 11 and it was 200. Returned in 0.9520149230957031 seconds
Checked validity for page 12 and it was 200. Returned in 1.1556456089019775 seconds
Checked validity for page 13 and it was 20