# Singstat and Data.gov.sg Information crawler.
This notebook is used for pulling and consolidating datasets found in both sites

## Install dependencies for pulling data sources information

### Notes: BeautifulSoup can only scrape static website content. Selenium is required together with BeautifulSoup to read dynamically loaded content, which is the case for Singstat and data.gov.sg.

Library imports

In [1]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver 
from selenium.common.exceptions import NoSuchElementException

from datetime import datetime
from selenium.webdriver.firefox.options import Options
import pandas as pd

In [2]:
# Firefox profile preferences applied to the browser instances started by this notebook.
# They are kept in one table so that every preference is declared exactly once
# (the original cell set "browser.startup.homepage" twice with the same value).
FIREFOX_PREFERENCES = {
    "network.http.pipelining": True,
    "network.http.proxy.pipelining": True,
    "network.http.pipelining.maxrequests": 8,
    "content.notify.interval": 500000,
    "content.notify.ontimer": True,
    "content.switch.threshold": 250000,
    "browser.cache.memory.capacity": 65536,  # Increase the cache capacity.
    "browser.startup.homepage": "about:blank",
    "reader.parse-on-load.enabled": False,  # Disable reader, we won't need that.
    "browser.pocket.enabled": False,  # Disable Pocket as well.
    "loop.enabled": False,
    "browser.chrome.toolbar_style": 1,  # Text on Toolbar instead of icons
    "browser.display.show_image_placeholders": False,  # Don't show thumbnails on not loaded images.
    "browser.display.use_document_colors": False,  # Don't show document colors.
    "browser.display.use_document_fonts": 0,  # Don't load document fonts.
    "browser.display.use_system_colors": True,  # Use system colors.
    "browser.formfill.enable": False,  # Autofill on forms disabled.
    "browser.helperApps.deleteTempFileOnExit": True,  # Delete temporary files.
    "browser.shell.checkDefaultBrowser": False,
    "browser.startup.page": 0,  # blank
    "browser.tabs.forceHide": True,  # Disable tabs, we won't need that.
    "browser.urlbar.autoFill": False,  # Disable autofill on URL bar.
    "browser.urlbar.autocomplete.enabled": False,  # Disable autocomplete on URL bar.
    "browser.urlbar.showPopup": False,  # Disable list of URLs when typing on URL bar.
    "browser.urlbar.showSearch": False,  # Disable search bar.
    "extensions.checkCompatibility": False,  # Addon update disabled
    "extensions.checkUpdateSecurity": False,
    "extensions.update.autoUpdateEnabled": False,
    "extensions.update.enabled": False,
    "general.startup.browser": False,
    "plugin.default_plugin_disabled": False,
    "permissions.default.image": 2,  # Image load disabled
    # NOTE(review): the notes at the top of this notebook say the target pages are
    # dynamically loaded; confirm that disabling JavaScript here is intended.
    "javascript.enabled": False,
}

# Build the profile from the table above, then attach it to the options object
# that is passed to every webdriver.Firefox(...) call in later cells.
firefox_option = webdriver.FirefoxProfile()
for preference_name, preference_value in FIREFOX_PREFERENCES.items():
    firefox_option.set_preference(preference_name, preference_value)

options = Options()
options.profile = firefox_option

## Getting SG calendar of events from SmartLocal https://thesmartlocal.com/event-calendar/?a=alltime
Smartlocal also provides information on events for the following countries other than Singapore.
1) Thailand
2) Malaysia
3) Indonesia
4) Vietnam
5) Korea
6) Philippines

In the case where venue information is not available directly, we first attempt to get the venue/date info from the href provided, using the event information section at the bottom of the page.
Should the venue info be empty, we assume it is an island-wide event for simplicity. This is due to the varied ways in which page authors present the event information on the page.

An example of such a page is as follows:
- https://thesmartlocal.com/read/wellness-festival-singapore/

In [3]:
# Function that is being called to get information on venue/date via href provided when actual info is not available on main calendar of events
def find_venue_date_info(href:str, options, event_title: str) -> dict:
    """Open an event article in a new Firefox session and collect its date/venue info.

    Each event-information box (``div`` whose class contains ``wfnb-flex-box``) is
    inspected for a date and a venue. When both are present an entry keyed
    ``"<event_title>:#<n>"`` is recorded, where ``n`` is the 1-based position of the
    box. Otherwise a rule-based search of the article paragraphs mentioning
    'Venue' or 'Address' is used to build one entry per sub-event.

    Args:
        href: URL of the event article to open.
        options: Firefox options object passed to ``webdriver.Firefox``.
        event_title: Title of the event as shown on the main calendar.

    Returns:
        dict mapping an event (or sub-event) title to ``[date, venue]``.

    Raises:
        selenium.common.exceptions.TimeoutException: raised by ``WebDriverWait`` when
            no event-information box appears within 5 seconds. The browser session
            is still closed in that case.
    """
    all_event_info_xpath = './/div[contains(@class, "wfnb-flex-box")]'
    # The two XPaths below are evaluated relative to a single event-information box.
    date_xpath = './/div[@class="wfnb-info-metabox"]//div[2]'
    venue_xpath = './/div[@class="wfnb-info-metabox"]//div[1]'
    publish_event_date_xpath = ".//span[@id='meta-date']"
    # Worst case fall back for the venue
    default_venue_xpath = './/article//*[contains(text(), ", Singapore ")]'
    # Paragraphs in the content that mention a venue/address (called sub-events)
    sub_events_xpath = ".//p//*[contains(text(), 'Venue') or contains(text(), 'Address')]/../.."
    sub_info_xpath = ".//*//span[@style='text-decoration: underline']"
    after_date_n_time_xpath = ".//b[contains(text(), 'st') or contains(text(), 'th') or contains(text(), 'nd') or contains(text(), 'rd')]"
    # Hardcoded assumptions used to recognise a date inside paragraph text
    ordinals_list = ["st ", "th ", "nd ", "rd ","st,", "th,", "nd,", "rd,"]
    till_list = ["Now till", "now till", "till", "Till"]

    return_dict = {}
    new_driver = webdriver.Firefox(options=options)
    try:
        new_driver.get(href)

        # Wait for the section to be present BEFORE collecting its elements, otherwise
        # the list may be captured while the page is still loading.
        WebDriverWait(new_driver, 5).until(EC.presence_of_element_located((By.XPATH, all_event_info_xpath)))
        all_event_info_element_list = new_driver.find_elements(by=By.XPATH, value=all_event_info_xpath)

        # Get published event date as fallback date reference
        try:
            publish_event_date = new_driver.find_element(by=By.XPATH, value=publish_event_date_xpath).text
            publish_event_date = publish_event_date.replace("\n", " ")
        except NoSuchElementException:
            publish_event_date = ""

        # Get dates venues from paragraphs if either date/venue is not of assumed usable format. Attempt to get multiple venues from the page content instead
        print(f"Subevent of {event_title} schedules found : {len(all_event_info_element_list)}")
        # Iterate over the located elements themselves: XPath positions are 1-based and a
        # positional predicate on a '//' path is evaluated per parent, so indexing the
        # document-wide XPath with a 0-based counter would miss boxes.
        for info_idx, info_element in enumerate(all_event_info_element_list, start=1):
            print("Attempt to find date/venue info from event information section")
            try:
                date = info_element.find_element(by=By.XPATH, value=date_xpath).text
                date = date.replace("Date:","").replace("Dates:","")
                date = "".join(c for c in date if c not in (" ", "\t", "\n"))
                if date == "-":
                    date = ""
            except NoSuchElementException:
                date = ""

            try:
                venue = info_element.find_element(by=By.XPATH, value=venue_xpath).text
                venue = venue.strip()
                venue = "".join(c for c in venue if c not in ("\t", "\n"))
            except NoSuchElementException:
                venue = ""

            # Fallback search
            if venue == "":
                try:
                    venue = new_driver.find_element(by=By.XPATH, value=default_venue_xpath).text
                except NoSuchElementException:
                    if "RSAF" in event_title:
                        venue = "Paya Lebar Airbase"
                    else:
                        venue = ""

            # When date and venue info are available, update dictionary with index as suffix as representation
            if date != "" and venue != "":
                print("Date and venue info found..")
                # Use a separate key so event_title itself is not modified; otherwise the
                # suffixes would pile up on later iterations.
                indexed_event_title = f"{event_title}:#{info_idx}"
                return_dict[indexed_event_title] = [date, venue]
                continue

            # Use content sub-events/ and date instead
            print("Missing date/venue info, Attempt to use rule-based search for such info in the content.")

            # Get events from content (called sub-events). Returns a list of events, empty list if none (especially for online)
            sub_events_list = new_driver.find_elements(by=By.XPATH, value=sub_events_xpath)

            # When subevents section is not provided, use main original event title with published date and assume online
            if not sub_events_list:
                print("No sub events available, using publish event date as reference")
                if date != "":
                    return_dict[event_title] = [date, "Online"]
                else:
                    return_dict[event_title] = [publish_event_date, "Online"]
                continue

            # Case of at least 1 event provided, extract all such events info
            for sub_idx, sub_event in enumerate(sub_events_list, start=1):
                # Get the sub_event name if possible
                try:
                    sub_event_info = sub_event.find_element(by=By.XPATH, value=sub_info_xpath).text
                except NoSuchElementException:
                    # Use index as suffix to main event if event info is not available
                    sub_event_info = str(sub_idx)

                # Get the paragraph text, assume venue is the last line of text in <p> tag
                sub_event_para_list = sub_event.text.split("\n")

                # Remove identifiers such as venue and address
                sub_event_venue = str(sub_event_para_list[-1]).replace("Venue:","").replace("Address:", "").strip()

                # Construct new event name with subevents identified
                sub_event_title = event_title + ":#" + sub_event_info
                print(sub_event_title)

                # Get date information. (Can come as key info or value info)
                sub_event_date_list = [
                    content
                    for content in sub_event_para_list
                    if "Date" in content and any(ordinal in content for ordinal in ordinals_list)
                ]

                print(f"Subevent title and venue: {sub_event_title}, {sub_event_venue}")
                if not sub_event_date_list:
                    # When date becomes the key instead of values which is not in paragraph
                    print("Unable to find any sub event dates as it is not a value format of key-value pair")
                    date_element_list = sub_event.find_elements(by=By.XPATH, value=after_date_n_time_xpath)
                    sub_event_date = ",".join([date_element.text for date_element in date_element_list])
                else:
                    # Take the first element as date usually is positioned at the front.
                    print("Found date instance as value")
                    # Strip starting Date: string and remove leading/trailing space
                    sub_event_date = sub_event_date_list[0].replace("Date:", "").replace("Dates:", "").strip()

                # Replace "Now till" wording with the publish date followed by a dash.
                # Only the first matching variant is replaced.
                for till_string in till_list:
                    if till_string in sub_event_date:
                        sub_event_date = sub_event_date.replace(till_string, f"{publish_event_date} -")
                        break

                # Multiple dates case, we amend the event name by adding the loop index as suffix. No change to venue
                if "|" in sub_event_date:
                    print("Multiple dates detected")
                    for j, event_date in enumerate(sub_event_date.split("|")):
                        return_dict[f"{sub_event_title}_{j}"] = [event_date, sub_event_venue]
                else:
                    print("Single date detected")
                    return_dict[sub_event_title] = [sub_event_date, sub_event_venue]

                print(f"Date: {sub_event_date}")
    finally:
        # Always close the browser, including when the wait above times out.
        new_driver.quit()

    return return_dict

Actual execution. Please avoid using Chrome for the webdriver, as its API service keeps running even after the quit method is called upon completion of the necessary work. This causes increasing CPU usage for no useful purpose.

The Firefox profile options configured in cell In [2] above are used here to speed up the page loading process.

In [4]:
URL = "https://thesmartlocal.com/event-calendar/?a=alltime"

# XPATH expressions. results_xpath is document-wide; the others are evaluated
# relative to a single event card (or, for event_location_xpath, a single date row).
results_xpath= ".//div[contains(@class, 'bg-light')]"
event_header_xpath = ".//div[contains(@class, 'card-header')]//h5//a"
event_status_xpath = ".//div[contains(@class, 'card-header')]//div[contains(@class, 'status')]"
event_info_xpath = ".//div[contains(@class, 'card-body')]//p"
footer_xpath = ".//div[contains(@class, 'card-footer')]//a"
event_location_xpath = ".//a"

driver = webdriver.Firefox(options=options)
driver.get(URL)

# Webpage wait for required xpath to load
WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.XPATH, results_xpath)))

# Find elements
total_events = driver.find_elements(by=By.XPATH, value=results_xpath)
if total_events:
    print(f"Total events: {len(total_events)}")
else:
    # Do not call exit() here: in a notebook it shuts the kernel down and leaves the
    # browser running. With no events the loop below is simply skipped.
    print("Unable to find events related element via XPATH. Assuming No results.")

event_tracking_list = []
for event in total_events:
    # For each event, extract title, status of event, and the dates and location which event occur. For multiple dates/locations, this will be added as separate entries
    try:
        event_title = event.find_element(by=By.XPATH, value=event_header_xpath).text
    except NoSuchElementException:
        print("Unable to find event info, continuing to the next event available")
        continue

    print("Event: ", event_title)

    try:
        status = event.find_element(by=By.XPATH, value=event_status_xpath).text
    except NoSuchElementException:
        print("Unable to find event status info, assuming unknown state")
        status = "Unknown"

    # Status which is empty mean ongoing event based on the date of scraping in comparison with event date
    print("Status: ", status)
    event_dates_list = event.find_elements(by=By.XPATH, value=event_info_xpath)

    # Href of the event article, used to look up missing date/venue info
    try:
        href = event.find_element(by=By.XPATH, value=footer_xpath).get_attribute("href")
    except NoSuchElementException:
        print("Unable to find event href, detail page lookup will be skipped")
        href = ""
    print("Href: ", href)

    # Loop through the date/location rows of the event
    for event_date in event_dates_list:
        # Exclude next line info from text info collected
        date_info = event_date.text.split("\n")[0]
        date_info = date_info.replace("From ", "").replace("from ", "")
        # A lone dash means no date is given; treat it as missing so the detail page is consulted
        if date_info == "-":
            date_info = ""

        # Attempt to get location directly
        try:
            location_info = event_date.find_element(by=By.XPATH, value=event_location_xpath).text
        except NoSuchElementException:
            location_info = ""

        # If no info for location or date, access the page to pull more info. As all info will be pulled within, terminate the loop
        if (location_info == "" or date_info == "") and href != "":
            sub_event_dict = find_venue_date_info(
                href=href, options=options, event_title=event_title
            )
            for sub_event_title, date_and_venue in sub_event_dict.items():
                # Subevent title, date, venue unpacked
                event_metadata_list = [sub_event_title, *date_and_venue, status, href]
                print(event_metadata_list)
                event_tracking_list.append(event_metadata_list)
            break

        # Extract info through the loop if available
        event_metadata_list = [f"{event_title}", date_info, location_info, status, href]
        print(event_metadata_list)
        event_tracking_list.append(event_metadata_list)

# Find total entries based on date and location regardless of multiple same event information
print("Including subevents:",len(event_tracking_list))

Total events: 349
Event:  West Mall BT21-Themed CNY Activities & Promos
Status:  ENDED
Href:  https://thesmartlocal.com/read/west-mall-bt21-cny-2024/
['West Mall BT21-Themed CNY Activities & Promos', '05 Jan - 24 Feb 2024', '1 Bukit Batok Central, Singapore 658713', 'ENDED', 'https://thesmartlocal.com/read/west-mall-bt21-cny-2024/']
Event:  Comma Creative Arts Festival 2024
Status:  ENDED
Href:  https://thesmartlocal.com/read/comma-creative-arts-festival-2024/
['Comma Creative Arts Festival 2024', '12 - 28 Jan 2024', '2 Orchard Link, Singapore 237978', 'ENDED', 'https://thesmartlocal.com/read/comma-creative-arts-festival-2024/']
Event:  Light To Night 2024
Status:  ENDED
Href:  https://thesmartlocal.com/read/light-to-night-2024/
['Light To Night 2024', '19 Jan - 08 Feb 2024', 'Civic District', 'ENDED', 'https://thesmartlocal.com/read/light-to-night-2024/']
Event:  Creative Intersections: Traces Of Dragons
Status:  ENDED
Href:  https://thesmartlocal.com/read/singapore-art-week-2024-funa

## Save data to csv

In [None]:
## Convert the list of event records to a dataframe and save it as CSV
try:
    # Build the frame even when nothing was scraped so that `df` always exists
    # and the final df.head() cannot fail with a NameError.
    df = pd.DataFrame(event_tracking_list, columns=["Event Title", "Date", "Location", "Status", "URL"])

    # Take a single timestamp so the column value and the file name always agree.
    check_time = datetime.now()
    datetime_now = check_time.strftime("%d/%m/%Y %H:%M:%S")
    file_name_date = check_time.strftime("%d%m%Y_%H%M%S")

    df["Date_of_check"] = datetime_now

    # Only write a file when there is something to save.
    if event_tracking_list:
        dataset_filename = f"Smartlocal_dataset_{file_name_date}.csv"
        df.to_csv(dataset_filename, index=False, encoding='utf-8')
finally:
    # Close driver upon completion of saving file, including when saving fails
    driver.quit()

df.head()

Unnamed: 0,Event Title,Date,Location,Status,URL,Date_of_check
0,West Mall BT21-Themed CNY Activities & Promos:1,05 Jan - 24 Feb 2024,"1 Bukit Batok Central, Singapore 658713",ENDED,https://thesmartlocal.com/read/west-mall-bt21-...,08/04/2024 22:25:59
1,Comma Creative Arts Festival 2024:1,12 - 28 Jan 2024,"2 Orchard Link, Singapore 237978",ENDED,https://thesmartlocal.com/read/comma-creative-...,08/04/2024 22:25:59
2,Light To Night 2024:1,19 Jan - 08 Feb 2024,Civic District,ENDED,https://thesmartlocal.com/read/light-to-night-...,08/04/2024 22:25:59
3,Creative Intersections: Traces Of Dragons:1,19 Jan - 25 Feb 2024,Funan,ENDED,https://thesmartlocal.com/read/singapore-art-w...,08/04/2024 22:25:59
4,Artbox Avenue 2024:1,26 Jan - 04 Feb 2024,"Singapore Expo Hall 2, 1 Expo Drive, Singapore...",ENDED,https://thesmartlocal.com/read/artbox-avenue-2...,08/04/2024 22:25:59
