# EventBrite SG Events information scraper.
This notebook pulls and consolidates event data from the Eventbrite Singapore listing page: https://www.eventbrite.sg/d/singapore--singapore/all-events/?page=1

Library imports

In [1]:
from datetime import datetime
from typing import Tuple, Union

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

Below are the Firefox profile options set to speed up page loading. Note: download and install GeckoDriver, and add its location to the system PATH environment variable so that Selenium's Firefox driver can work. Alternatively, instead of relying on the PATH variable, you may point Selenium at the GeckoDriver executable through the `Service` class of Selenium's Firefox service module, as done in the code cells further below.

In [2]:
# Firefox preferences written into the profile, listed in the order they are applied.
# The notebook text describes them as settings meant to speed up page loading.
firefox_preferences = {
    "network.http.pipelining": True,
    "network.http.proxy.pipelining": True,
    "network.http.pipelining.maxrequests": 8,
    "content.notify.interval": 500000,
    "content.notify.ontimer": True,
    "content.switch.threshold": 250000,
}

options = Options()
firefox_option = webdriver.FirefoxProfile()
for preference_name, preference_value in firefox_preferences.items():
    firefox_option.set_preference(preference_name, preference_value)

# Attach the customised profile to the options object that is passed to every
# webdriver.Firefox(...) created later in the notebook.
options.profile = firefox_option

## Main execution of info scraping

Supporting function 

In [3]:
def find_venue_date_info(href:str, options: Options, multi_date:bool) -> Union[Tuple[str,str], Tuple[list,list]]:
    """Open an event page in a fresh Firefox session and read its date and venue text.

    A new webdriver is started for every call and is always closed before the
    function returns, including when an unexpected error is raised.

    Args:
        href (str): URL of the event page to load.
        options (Options): Selenium Firefox options used to start the webdriver.
        multi_date (bool): When False, read a single date/location pair from the page.
            When True, click every entry of the page's date list and read the
            date/location shown after each click.

    Returns:
        Union[Tuple[str,str], Tuple[list,list]]: ``(datetime_info, location_info)``.
            With ``multi_date=False`` both are strings ("" when the element is not found).
            With ``multi_date=True`` both are lists of strings of equal length, one entry
            per date item found on the page (empty lists when no date item is found).
    """
    alternative_date_xpath = "//p[contains(text(), 'Date:')]"
    alternative_location_xpath = "//p[contains(text(), 'Venue:') or contains(text(), 'Location:') or contains(text(), 'Address:') or contains(text(), 'Place:')]"
    location_xpath = "//div[@class='location-info__address']"
    view_all_event_detail_xpath = "//button[@data-testid='view-event-details-button']"
    # Defined up front because BOTH branches below read it; previously it only existed in
    # the single-date branch, so the multi-date branch could not resolve the name.
    datetime_xpath = "//span[@class='date-info__full-datetime']"
    datetime_list_xpath = "//li[@class='child-event-dates-item']"

    # NOTE: the geckodriver location is hard-coded to a Windows path.
    service = Service(executable_path=r"C:\Program Files\geckodriver.exe")
    new_driver = webdriver.Firefox(service=service, options=options)

    # try/finally so the browser is closed on every exit path, not only the happy ones.
    try:
        new_driver.get(href)

        # Some pages hide details behind a "view all event details" button; click it if present.
        try:
            new_driver.find_element(by=By.XPATH, value=view_all_event_detail_xpath).click()
            print("Found view all event details to click")
        except NoSuchElementException:
            print("No view all event details to click")

        if not multi_date:
            try:
                datetime_info = new_driver.find_element(by=By.XPATH, value=datetime_xpath).text
            except NoSuchElementException:
                print("Cant find datetime element. Setting to None.")
                datetime_info = ""

            # Fall back to the alternative date xpath when the primary one gave nothing
            if datetime_info == "":
                try:
                    datetime_info = new_driver.find_element(by=By.XPATH, value=alternative_date_xpath).text
                except NoSuchElementException:
                    print("Cant find datetime element using alternative xpath. Setting to None.")
                    datetime_info = ""

            try:
                location_info = new_driver.find_element(by=By.XPATH, value=location_xpath).text
                # Only the first line of the address block is kept
                location_info = location_info.split("\n")[0]
            except NoSuchElementException:
                print("Cant find location element. Setting to None.")
                location_info = ""

            # Fall back to the alternative location xpath when the primary one gave nothing
            if location_info == "":
                try:
                    location_info = new_driver.find_element(by=By.XPATH, value=alternative_location_xpath).text
                except NoSuchElementException:
                    print("Cant find location element using alternative xpath. Setting to None.")
                    location_info = ""

            return datetime_info, location_info

        # Multiple dates: return parallel lists of date/location, one entry per date item.
        # The clickable elements are kept in their own variable; previously the result list
        # reused (and reset) the same name, so the loop below never executed.
        date_item_elements = new_driver.find_elements(by=By.XPATH, value=datetime_list_xpath)

        datetime_info_list = []
        location_info_list = []
        for date_item in date_item_elements:
            # Selecting a date item updates the date/location shown on the page
            date_item.click()
            try:
                datetime_info = new_driver.find_element(by=By.XPATH, value=datetime_xpath).text
            except NoSuchElementException:
                datetime_info = ""
            try:
                location_info = new_driver.find_element(by=By.XPATH, value=location_xpath).text
                location_info = location_info.split("\n")[0]
            except NoSuchElementException:
                location_info = ""
            # Append information
            datetime_info_list.append(datetime_info)
            location_info_list.append(location_info)

        return datetime_info_list, location_info_list
    finally:
        # Quit driver after completion or failure
        new_driver.quit()

In [4]:
#CFG
URL = "https://www.eventbrite.sg/d/singapore--singapore/all-events/?page=1"
# NOTE: the geckodriver location is hard-coded to a Windows path.
service = Service(executable_path=r"C:\Program Files\geckodriver.exe")
driver = webdriver.Firefox(service=service, options=options)

# try/finally guarantees the listing browser is closed even if scraping fails part-way.
try:
    driver.get(URL)

    # pagination xpath
    # NOTE(review): the class name inside contains() is not quoted, so XPath does not read it
    # as a string literal and the predicate presumably matches every <li>. It is left
    # unchanged because the "[2]" index may depend on that -- confirm against the live page
    # before quoting it.
    paginate_xpath= "//ul[@class='eds-pagination__navigation-group']/li[contains(@class, eds-pagination__navigation-minimal)][2]"

    try:
        # Webpage wait for required paginate stuff to load.
        # The wait raises TimeoutException (not NoSuchElementException) when the element never
        # appears, so it is handled here to make the single-page fallback reachable.
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, paginate_xpath)))
        total_page_div = driver.find_element(by=By.XPATH, value=paginate_xpath).text
        # Get the total page number from the html text in the format: 'of X'
        total_page = int(total_page_div.split(" ")[-1])
    except (TimeoutException, NoSuchElementException, ValueError):
        # ValueError covers pagination text whose last word is not a number.
        print("Unable to find pagination information. Only current page will be scraped.")
        total_page = 1

    # List to track extracted events. It is never cleared, so it accumulates across pages.
    event_tracking_list = []

    for page in range(1, total_page + 1):
        page_url = f"https://www.eventbrite.sg/d/singapore--singapore/all-events/?page={page}"
        print(f"Processing page {page}")
        #Switch page
        driver.get(page_url)
        event_xpath = "//div[contains(@class, 'event-card__horizontal')]/section[@class='event-card-details']"
        events_div_list = driver.find_elements(by=By.XPATH, value=event_xpath)
        for event in events_div_list:
            event_title_xpath = "./div/a"
            try:
                # Look the link up once; it carries both the title text and the href.
                event_link = event.find_element(by=By.XPATH, value=event_title_xpath)
                event_title = event_link.text
                event_href = event_link.get_attribute('href')
            except NoSuchElementException:
                print("Unable to get event title. Skipping current event")
                continue

            # Skip cases for Malaysia related event (hardcoded)
            if "in Johor" in event_title or "Pasir Gudang" in event_title:
                print("Detected event name hosted in Johor, skipping this event from inclusion")
                continue

            # Event date/location information
            event_date_location_xpath = "./div/p"
            event_date_location_ele_list = event.find_elements(by=By.XPATH, value=event_date_location_xpath)

            # When both date and venue info are available
            if len(event_date_location_ele_list) == 2:
                event_date = event_date_location_ele_list[0].text
                event_location = event_date_location_ele_list[1].text

                # Get actual date info from the href page, as eventbrite does not list the actual
                # date on the listing page if the event occurs within a few days of the scrape
                # date (e.g. "Thurs at XX:XX" when the page is scraped on Wed or before).
                if " at " in event_date:
                    event_date, event_location = find_venue_date_info(
                        href=event_href,
                        options=options,
                        multi_date=False
                    )
                # When there are more dates, as indicated by "+ X more" in the text
                elif " +" in event_date and " more" in event_date:
                    print(f"Multiple dates found for {event_title}")
                    event_date, event_location = find_venue_date_info(
                        href=event_href,
                        options=options,
                        multi_date=True
                    )
            # When date or venue info is lacking, or there are more than two entries
            else:
                event_date, event_location = find_venue_date_info(
                    href=event_href,
                    options=options,
                    multi_date=False
                )

            # For multiple dates case, loop through and record one row per date
            if isinstance(event_date, list) and isinstance(event_location, list):
                for date, location in zip(event_date, event_location):
                    event_metadata_list = [event_title, date, location, event_href]
                    print(event_metadata_list)
                    event_tracking_list.append(event_metadata_list)
            else:
                event_metadata_list = [event_title, event_date, event_location, event_href]
                event_tracking_list.append(event_metadata_list)
            print()

        # Due to the large number of pages to be processed, a file is saved after every page.
        # Should anything go wrong, we can identify the page from which rescraping is needed.
        # Each file holds every event collected so far, not only the current page.
        print("Saving to csv file")

        # Take a single timestamp so the column value and the file name always agree.
        check_time = datetime.now()
        datetime_now = check_time.strftime("%d/%m/%Y %H:%M:%S")
        file_name_date = check_time.strftime("%d%m%Y_%H%M%S")

        # Drop duplicates for cases with same date/location/url.
        # NOTE: missing values are stored as "" by find_venue_date_info, so dropna only
        # removes genuine None/NaN entries, not those empty-string placeholders.
        df = (
            pd.DataFrame(event_tracking_list, columns=["Event Title", "Date", "Location", "URL"])
            .drop_duplicates(subset=["Date", "Location", "URL"], keep="last")
            .dropna(subset=["Date", "Location", "URL"])
            .assign(Date_of_check=datetime_now)
        )

        dataset_filename = f"EventBrite_dataset_{file_name_date}_page_{page}.csv"
        df.to_csv(dataset_filename, index=False, encoding='utf-8')
finally:
    driver.quit()

Processing page 1

No view all event details to click
Cant find datetime element. Setting to None.
Cant find datetime element using alternative xpath. Setting to None.

No view all event details to click
Cant find datetime element. Setting to None.
Cant find datetime element using alternative xpath. Setting to None.


Found view all event details to click


No view all event details to click





No view all event details to click

No view all event details to click



No view all event details to click

No view all event details to click
Cant find datetime element. Setting to None.




Saving to csv file
Processing page 2
No view all event details to click

No view all event details to click


No view all event details to click


No view all event details to click

No view all event details to click

No view all event details to click


No view all event details to click





No view all event details to click



No view all event details to click



Saving to csv file
Processing page