# ActiveSG Events information scraper.
This notebook is used for pulling and consolidating Sports event data found on ActiveSG page https://www.activesgcircle.gov.sg/things-to-do/events.

A quick filtering which was done found that only events from Jan 2023 onwards are available from the page itself, hence it is advised, to filter date starting from Jan 2023 onwards so as to get past and upcoming events. 

Library imports

In [11]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver 
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from datetime import datetime
from selenium.webdriver.firefox.options import Options
import pandas as pd
import calendar
import time

Below are the firefox profile options set to speed up loading process. Note: Do download and Install GeckoDriver in Selenium and set the PATH variable in sys variable to point to it in order to ensure that the FireFox Driver for Selenium can work. You may point it via a Service module of selenium' firefox service instead of using PATH variable as an alternative as follows.

In [12]:
options=Options()
firefox_option = webdriver.FirefoxProfile()
firefox_option.set_preference("network.http.pipelining", True)
firefox_option.set_preference("network.http.proxy.pipelining", True)
firefox_option.set_preference("network.http.pipelining.maxrequests", 8)
firefox_option.set_preference("content.notify.interval", 500000)
firefox_option.set_preference("content.notify.ontimer", True)
firefox_option.set_preference("content.switch.threshold", 250000)

options.profile = firefox_option

## Main execution of info scraping

Supporting function 

DATE configuration section
- Set start date and end date desired for START_DATE, END_DATE

In [13]:
# START and END DATE setting. Modify the dates based on your need. Must be in dd/mm/yyyy
START_DATE = "1-Jan-2023"
END_DATE = "1-Dec-2024"

# Construct monthly start interval between the start/end dates
dates_list = pd.date_range(START_DATE,END_DATE, freq='MS').strftime("%d-%b-%Y").tolist()
unix_time_dict= {x: calendar.timegm(datetime.datetime.strptime(x, "%d-%b-%Y").timetuple()) for x in dates_list}
unix_time_dict

AttributeError: type object 'datetime.datetime' has no attribute 'datetime'

In [14]:
#CFG
service = Service(executable_path=r"C:\Program Files\geckodriver.exe")
driver = webdriver.Firefox(service=service, options=options)

event_tracking_list = []

# Loop thru each calendar month
for date, unix_time in unix_time_dict.items():
    # Note the unix time in the url is measured in millisec
    URL = f"https://www.activesgcircle.gov.sg/things-to-do/events?date={unix_time}000&filter=All&type=All&s=All"
    driver.get(URL)
    print(f"Page showing events for the month starting {date}")
    event_xpath= "//div[@id='oneDay']/div[@class='outerDiv']"
    # Webpage wait for required paginate stuff to load
    try:
        WebDriverWait(driver, 120).until(EC.presence_of_element_located((By.XPATH, event_xpath)))
    except TimeoutException:
        print("No events found")
        continue

    print("Finding total events listed in the page")
    total_events_ele_list =  driver.find_elements(by=By.XPATH, value=event_xpath)
    print(f"Total events for the page: {len(total_events_ele_list)}")

    if total_events_ele_list == []:
        print("No events for this month.., moving to next")
        continue
    for event in total_events_ele_list:
        # Case 1: Start and end date are different
        event_href_xpath = ".//a"
        try:
            event_href = event.find_element(by=By.XPATH, value=event_href_xpath).get_attribute("href")
        except NoSuchElementException:
            print("No href found.")
            event_href = ""

        event_title_xpath = ".//div[@class='cal-title']"
        try:
            event_title = event.find_element(by=By.XPATH, value=event_title_xpath).text
        except NoSuchElementException:
            print("No event title found.")
            event_title = ""
            
        event_location_xpath = ".//div[@class='venue']"
        try:
            event_location = event.find_element(by=By.XPATH, value=event_location_xpath).text
        except NoSuchElementException:
            print("No event location found.")
            event_location = ""

        # Case 1: Start and end date are different
        start_date_xpath = ".//div[@class='d-wrap']/div[@class='sDate']/div[contains(@class,'round')]/div[@class='b-date']"
        end_date_xpath = ".//div[@class='d-wrap']/div[@class='eDate']/div[contains(@class,'round')]/div[@class='b-date']"
        try:
            start_date = event.find_element(by=By.XPATH, value=start_date_xpath).text
            start_date = start_date.replace("\n", " ")
            end_date = event.find_element(by=By.XPATH, value=end_date_xpath).text
            end_date = end_date.replace("\n", " ")

            date_period = start_date + " - " + end_date
        except NoSuchElementException:
            print("No separate start/end date found")
            date_period = ""

        # Make another attempt
        if date_period == "":
            # Case 2: Start/end date the same
            print("Attempting to find date-day element")
            try:
                same_start_end_date_xpath = ".//div[@class='date-day']"
                date = event.find_element(by=By.XPATH, value=same_start_end_date_xpath).text
                date = date.replace("\n", " ")
                date_period = date + " - " + date
            except NoSuchElementException:
                print("No date-day element found")
                date_period = ""

        event_metadata_list = [event_title, date_period, event_location, event_href]
        print(event_metadata_list)
        event_tracking_list.append(event_metadata_list)
    print("Sleeping before going next page")
    time.sleep(5)
    print()

# Save to csv file
print("Saving to csv file")
if event_tracking_list:
    df = pd.DataFrame(event_tracking_list, columns=["Event Title", "Date", "Location", "URL"])

    # Drop duplicates for cases with same date/location/url
    df.drop_duplicates(subset=["Date", "Location", "URL"], inplace=True, keep="last")
    df.dropna(subset=["Date", "Location", "URL"], inplace=True)
    datetime_now = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
    file_name_date = datetime.now().strftime("%d%m%Y_%H%M%S")

    df["Date_of_check"] = datetime_now

    dataset_filename = f"ActiveSG_dataset_{file_name_date}.csv"
    df.to_csv(dataset_filename, index=False, encoding='utf-8')

driver.quit()

Page showing events for the month starting 01-Jan-2023
Finding total events listed in the page
Total events for the page: 3
['2023 ABL Invitational Tournament', '02 Jan - 08 Jan', 'OCBC Arena', 'https://www.activesgcircle.gov.sg/things-to-do/events/2023-abl-invitational-tournament-02/01/2023']
['Singapore Sail Grand Prix', '14 Jan - 15 Jan', 'Parkland Green, East Coast Park', 'https://www.activesgcircle.gov.sg/things-to-do/events/singapore-sail-grand-prix-14/01/2023']
['Ice Magic - City Ski Championship 2023', '21 Jan - 21 Jan', 'Bayfront Event Space', 'https://www.activesgcircle.gov.sg/things-to-do/events/ice-magic---city-ski-championship-2023-21/01/2023']
Sleeping before going next page

Page showing events for the month starting 01-Feb-2023
Finding total events listed in the page
Total events for the page: 4
['National Open Championships 2023', '04 Feb - 10 Feb', 'OCBC Arena', 'https://www.activesgcircle.gov.sg/things-to-do/events/national-open-championships-2023-04/02/2023']
No sep