# ActiveSG Events information scraper.
This notebook is used for pulling and consolidating Sports event data found on ActiveSG page https://www.activesgcircle.gov.sg/things-to-do/events.

A quick check found that the page only lists events from Jan 2023 onwards, so it is advisable to set the start date filter to Jan 2023 or later in order to capture both past and upcoming events.

Library imports

In [5]:
import calendar
from datetime import datetime
# Keep this plain import AFTER the `from datetime import datetime` line above:
# the last binding wins, and cells in this notebook call `datetime.datetime...`,
# which requires the name `datetime` to refer to the module.
import datetime
from typing import Tuple

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

Below are the Firefox profile options set to speed up page loading. Note: download and install GeckoDriver so that Selenium's Firefox driver can work. Either add the GeckoDriver location to the system PATH environment variable or, as an alternative, point to the executable through Selenium's Firefox `Service` class, as done in the code cells of this notebook.

In [2]:
# Firefox profile carrying the about:config overrides used by every browser
# session this notebook starts.
firefox_option = webdriver.FirefoxProfile()

# (preference name, value) pairs, applied in the order listed.
for pref_name, pref_value in (
    ("network.http.pipelining", True),
    ("network.http.proxy.pipelining", True),
    ("network.http.pipelining.maxrequests", 8),
    ("content.notify.interval", 500000),
    ("content.notify.ontimer", True),
    ("content.switch.threshold", 250000),
):
    firefox_option.set_preference(pref_name, pref_value)

# Shared Selenium options object; the profile above is attached to it.
options = Options()
options.profile = firefox_option

## Main execution of info scraping

Supporting function 

In [3]:
def find_venue_date_info(
    href: str,
    options: Options,
    executable_path: str = r"C:\Program Files\geckodriver.exe",
) -> Tuple[str, str]:
    """Open an event page in a fresh Firefox session and read its date and venue text.

    Args:
        href (str): URL of the event page for the webdriver to load.
        options (Options): Selenium Firefox options used to start the browser.
        executable_path (str): Location of the GeckoDriver binary. Defaults to
            the path previously hard-coded in this function.

    Returns:
        Tuple[str, str]: ``(datetime_info, location_info)``. ``datetime_info`` is
        the text of the ``date-info__full-datetime`` span; ``location_info`` is
        the first line of the ``location-info__address`` div. Either value is an
        empty string when its element is not found on the page.
    """
    service = Service(executable_path=executable_path)
    new_driver = webdriver.Firefox(service=service, options=options)

    datetime_xpath = "//span[@class='date-info__full-datetime']"
    location_xpath = "//div[@class='location-info__address']"

    # try/finally guarantees the browser is closed even if loading the page or
    # reading an element raises something other than NoSuchElementException.
    try:
        new_driver.get(href)

        try:
            datetime_info = new_driver.find_element(by=By.XPATH, value=datetime_xpath).text
        except NoSuchElementException:
            print("Cant find datetime element. Setting to empty string.")
            datetime_info = ""

        try:
            location_info = new_driver.find_element(by=By.XPATH, value=location_xpath).text
            # Only the first line of the address block is kept.
            location_info = location_info.split("\n")[0]
        except NoSuchElementException:
            print("Cant find location element. Setting to empty string.")
            location_info = ""
    finally:
        new_driver.quit()

    return datetime_info, location_info

DATE configuration section

In [8]:
# Date window configuration. Edit the two values below as needed; both must be
# written as dd/mm/yyyy.
START_DATE = "1/1/2023"
END_DATE = "1/12/2024"

# Convert each configured date to Unix seconds (the parsed date is treated as
# UTC by calendar.timegm).
start_unix_time, end_unix_time = (
    calendar.timegm(datetime.datetime.strptime(date_text, "%d/%m/%Y").timetuple())
    for date_text in (START_DATE, END_DATE)
)
print(start_unix_time, end_unix_time, sep="\n")

1672531200
1733011200


In [4]:
#CFG
# NOTE(review): URL points at ActiveSG, while the per-page URL inside the loop
# and the XPath selectors below target Eventbrite. Confirm which site this cell
# is meant to scrape before relying on the page count.
URL = "https://www.activesgcircle.gov.sg/things-to-do/events?page=1"
service = Service(executable_path=r"C:\Program Files\geckodriver.exe")
driver = webdriver.Firefox(service=service, options=options)

# List to track extracted events. Created before the try block so that rows
# collected before a failure remain available to later cells.
event_tracking_list = []

# try/finally guarantees the browser is closed even when scraping fails midway.
try:
    driver.get(URL)

    # pagination xpath
    # NOTE(review): the class name inside contains() is not quoted, so XPath
    # reads it as a child-node test rather than a string and the predicate does
    # not filter by class. Left unchanged because quoting it may select a
    # different <li>; verify against the live page.
    paginate_xpath = "//ul[@class='eds-pagination__navigation-group']/li[contains(@class, eds-pagination__navigation-minimal)][2]"

    try:
        # Wait for the pagination element to load. The wait raises
        # TimeoutException (not NoSuchElementException) when it never appears,
        # so it must sit inside the try for the fallback below to be reachable.
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, paginate_xpath)))
        total_page_div = driver.find_element(by=By.XPATH, value=paginate_xpath).text
        # Get the total page number from the html text in the format: 'of X'
        total_page = int(total_page_div.split(" ")[-1])
    except (TimeoutException, NoSuchElementException, ValueError):
        # ValueError covers pagination text that does not end in an integer.
        print("Unable to find pagination information. Only current page will be scraped.")
        total_page = 1

    event_xpath = "//div[contains(@class, 'event-card__horizontal')]/section[@class='event-card-details']"
    event_title_xpath = "./div/a"
    event_date_location_xpath = "./div/p"

    for page in range(1, total_page + 1):
        page_url = f"https://www.eventbrite.sg/d/singapore--singapore/all-events/?page={page}"
        # Switch page
        driver.get(page_url)
        events_div_list = driver.find_elements(by=By.XPATH, value=event_xpath)

        for event in events_div_list:
            # The title link carries both the display text and the event URL,
            # so it is looked up once.
            try:
                title_link = event.find_element(by=By.XPATH, value=event_title_xpath)
            except NoSuchElementException:
                print("Unable to get event title. Skipping current event")
                continue
            event_title = title_link.text
            event_href = title_link.get_attribute('href')
            print(f"Processing {event_title}")

            # Event date/location information
            event_date_location_ele_list = event.find_elements(by=By.XPATH, value=event_date_location_xpath)

            if len(event_date_location_ele_list) == 2:
                # Both date and venue are present on the listing card.
                event_date = event_date_location_ele_list[0].text
                event_location = event_date_location_ele_list[1].text

                # A card date containing " at " is replaced by the date read
                # from the event's own page; the card's location is kept.
                if " at " in event_date:
                    event_date, _ = find_venue_date_info(
                        href=event_href,
                        options=options,
                    )
            else:
                # Fewer or more than two <p> entries: read both values from
                # the event's own page instead.
                event_date, event_location = find_venue_date_info(
                    href=event_href,
                    options=options,
                )

            event_metadata_list = [event_title, event_date, event_location, event_href]
            print(event_metadata_list)
            event_tracking_list.append(event_metadata_list)
            print()
finally:
    driver.quit()

['REITs Symposium 2024', 'Sat, 11 May, 09:00', 'Suntec Convention Centre', 'https://www.eventbrite.sg/e/reits-symposium-2024-tickets-845501847787?aff=ebdssbdestsearch']
['[Last Few Tickets] PREP - Live in Singapore', 'Thu, 9 May, 19:00', '*SCAPE The Ground Theatre', 'https://www.eventbrite.sg/e/last-few-tickets-prep-live-in-singapore-tickets-855199152687?aff=ebdssbdestsearch']
['Doujin Market 2024', 'Sat, 11 May, 12:00', 'Suntec Singapore Convention & Exhibition Centre', 'https://www.eventbrite.sg/e/doujin-market-2024-tickets-854639749497?aff=ebdssbdestsearch']
['RGPS Fantasia 2024', 'Tue, 21 May, 19:00', "Paya Lebar Methodist Girls' Secondary School", 'https://www.eventbrite.com/e/rgps-fantasia-2024-tickets-868077502177?aff=ebdssbdestsearch']
['MS Spring Choir Concert', 'Thu, 9 May, 16:30', 'Auditorium, Singapore American School', 'https://www.eventbrite.sg/e/ms-spring-choir-concert-tickets-885325742127?aff=ebdssbdestsearch']
["Neuroscience of Happiness Seminar by Harvard's Dr Sara La

## Save data to csv

In [None]:
## Convert the collected event records to a dataframe and save it as CSV
if event_tracking_list:
    # In this notebook the name `datetime` is bound to the module (the plain
    # `import datetime` is the last datetime import to run), so the class is
    # reached as `datetime.datetime`. A single now() call keeps the column
    # value and the file-name stamp consistent with each other.
    check_time = datetime.datetime.now()
    datetime_now = check_time.strftime("%d/%m/%Y %H:%M:%S")
    file_name_date = check_time.strftime("%d%m%Y_%H%M%S")

    df = (
        pd.DataFrame(event_tracking_list, columns=["Event Title", "Date", "Location", "URL"])
        # Drop duplicates for cases with same date/location/url, keeping the last one seen
        .drop_duplicates(subset=["Date", "Location", "URL"], keep="last")
        # NOTE(review): find_venue_date_info returns "" (not None/NaN) for
        # missing values, so this dropna does not remove those rows. Confirm
        # whether empty strings should be dropped as well.
        .dropna(subset=["Date", "Location", "URL"])
        .assign(Date_of_check=datetime_now)
    )

    dataset_filename = f"EventBrite_dataset_{file_name_date}.csv"
    df.to_csv(dataset_filename, index=False, encoding='utf-8')


# Safety net: the scraping cell already quits the driver when it finishes, so
# this normally has nothing left to close.
driver.quit()
