# Singstat and Data.gov.sg Information crawler.
This notebook is used for pulling and consolidating datasets found on both sites.

## Install dependencies for pulling data sources information

### Notes: BeautifulSoup can only scrape static website content. Selenium is required together with BeautifulSoup to read dynamically loaded content, which is the case for SingStat and data.gov.sg.

Library imports

In [1]:
from datetime import datetime
from types import ModuleType

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

## Getting SG calendar of events from SmartLocal https://thesmartlocal.com/event-calendar/?a=alltime
SmartLocal also provides event information for the following countries besides Singapore:
1) Thailand
2) Malaysia
3) Indonesia
4) Vietnam
5) Korea
6) Philippines

In [2]:
# Fallback lookup used when the main calendar listing does not show an event's venue/date:
# the event's own detail page (href) is opened and scraped instead.
def find_venue_date_info(href:str, wait_seconds:int = 5):
    """Open an event detail page and scrape its date and venue text.

    A dedicated Firefox session is started for the lookup and is always shut
    down before returning, including when the page fails to load or an
    unexpected error is raised.

    Args:
        href: URL of the event detail page.
        wait_seconds: Maximum number of seconds to wait for the date element
            to appear before treating the date as missing.

    Returns:
        A ``(date, venue)`` tuple of strings. ``date`` is ``""`` when no date
        element is found. ``venue`` is ``"All"`` when neither the venue
        heading nor an in-text "Venue" label is found.
    """
    date_xpath = '//h5[contains(text(), "Date")]//..'
    venue_xpath = '//h5[@class="wfnb-venue"]'
    fallback_venue_xpath = ".//b[contains(text(), 'Venue')]"

    new_driver = webdriver.Firefox()
    try:
        new_driver.get(href)

        # The wait raises TimeoutException (not NoSuchElementException) when the
        # date element never shows up. Swallow it here so the lookup below can
        # fall back to an empty date instead of aborting the whole crawl.
        try:
            WebDriverWait(new_driver, wait_seconds).until(
                EC.presence_of_element_located((By.XPATH, date_xpath))
            )
        except TimeoutException:
            pass

        try:
            date = new_driver.find_element(by=By.XPATH, value=date_xpath).text.strip()
        except NoSuchElementException:
            date = ""

        try:
            venue = new_driver.find_element(by=By.XPATH, value=venue_xpath).text.strip()
        except NoSuchElementException:
            # Try in-text venue
            venue_list = new_driver.find_elements(by=By.XPATH, value=fallback_venue_xpath)
            if venue_list:
                venue = venue_list[0].text.strip()
            else:
                # When there is no venue, assume whole of country
                print("No venue information. Assume whole of country event")
                venue = "All"
    finally:
        # Always release the browser, otherwise every failed lookup leaks a Firefox process.
        new_driver.quit()

    return date, venue

Actual execution. Please avoid using Chrome as the webdriver: its driver service keeps running even after the `quit` method is called once the work is complete, which causes increasing CPU usage for no useful purpose.

In [3]:
URL = "https://thesmartlocal.com/event-calendar/?a=alltime"

# XPath of every event card on the calendar page
results_xpath = ".//div[contains(@class, 'bg-light')]"
# XPaths evaluated relative to a single event card
event_header_xpath = ".//div[contains(@class, 'card-header')]//h5//a"
event_status_xpath = ".//div[contains(@class, 'card-header')]//div[contains(@class, 'status')]"
event_info_xpath = ".//div[contains(@class, 'card-body')]//p"
footer_xpath = ".//div[contains(@class, 'card-footer')]//a"
# XPath evaluated relative to a single date/location paragraph
event_location_xpath = ".//a"

# Firefox is used on purpose; Chrome's driver service was observed to linger after quit().
driver = webdriver.Firefox()

event_tracking_list = []
# Detail-page results keyed by href, so an event with several date rows opens
# its detail page (and a new browser) at most once.
detail_page_cache = {}

try:
    driver.get(URL)

    # Wait for the event cards to load. A timeout means nothing matched, which is
    # reported below as "no results" instead of raising.
    try:
        WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, results_xpath)))
    except TimeoutException:
        pass

    # Find elements
    total_events = driver.find_elements(by=By.XPATH, value=results_xpath)
    if total_events:
        print(f"Total events: {len(total_events)}")
    else:
        # Do not call exit() here: in a notebook it tears down the kernel. The loop
        # below is simply skipped when the list is empty.
        print("Unable to find events related element via XPATH. Assuming No results.")

    for event in total_events:
        # For each event, extract title, status of event, and the dates and locations at which
        # the event occurs. Multiple dates/locations are added as separate entries.
        try:
            event_title = event.find_element(by=By.XPATH, value=event_header_xpath).text
        except NoSuchElementException:
            print("Unable to find event info, continuing to the next event available")
            continue
        print("Event: ", event_title)

        try:
            status = event.find_element(by=By.XPATH, value=event_status_xpath).text
        except NoSuchElementException:
            print("Unable to find event status info, assuming unknown state")
            status = "Unknown"
        print("Status: ", status)

        event_dates_list = event.find_elements(by=By.XPATH, value=event_info_xpath)

        # Link to the event's detail page; "" means no detail page is available
        try:
            href_element = event.find_element(by=By.XPATH, value=footer_xpath)
            # get_attribute returns None when the anchor has no href
            href = href_element.get_attribute("href") or ""
        except NoSuchElementException:
            print("Unable to find event link, detail page lookup will be skipped")
            href = ""

        for event_date in event_dates_list:
            event_info_text = event_date.text
            # Only the first line holds the date; the following lines hold the location
            date_info = event_info_text.split("\n")[0]

            # Attempt to get location directly
            try:
                location_info = event_date.find_element(by=By.XPATH, value=event_location_xpath).text
            except NoSuchElementException:
                location_info = ""

            # If no info for location or date, access the detail page to pull more info
            if (location_info == "" or date_info == "-") and href != "":
                if href not in detail_page_cache:
                    detail_page_cache[href] = find_venue_date_info(href=href)
                date_info, location_info = detail_page_cache[href]

            print(event_info_text)
            print(date_info, location_info)
            print()
            event_tracking_list.append([event_title, date_info, location_info, status, href])
except BaseException:
    # Release the browser if scraping is interrupted or fails, then surface the error.
    driver.quit()
    raise

# Total entries, one per date/location row, so an event may be counted several times
print(len(event_tracking_list))

Total events: 349
Event:  West Mall BT21-Themed CNY Activities & Promos
Status:  ENDED
05 Jan - 24 Feb 2024
1 Bukit Batok Central, Singapore 658713
05 Jan - 24 Feb 2024 1 Bukit Batok Central, Singapore 658713

Event:  Comma Creative Arts Festival 2024
Status:  ENDED
12 - 28 Jan 2024
2 Orchard Link, Singapore 237978
12 - 28 Jan 2024 2 Orchard Link, Singapore 237978

Event:  Light To Night 2024
Status:  ENDED
19 Jan - 08 Feb 2024
Civic District
19 Jan - 08 Feb 2024 Civic District

Event:  Creative Intersections: Traces Of Dragons
Status:  ENDED
19 Jan - 25 Feb 2024
Funan
19 Jan - 25 Feb 2024 Funan

Event:  Artbox Avenue 2024
Status:  ENDED
26 Jan - 04 Feb 2024
Singapore Expo Hall 2, 1 Expo Drive, Singapore 486150
26 Jan - 04 Feb 2024 Singapore Expo Hall 2, 1 Expo Drive, Singapore 486150

Event:  Pinkfong & Baby Shark Playhouse
Status:  ENDED
26 Jan - 31 Mar 2024
6 Raffles Boulevard, #03-208/209 Marina Square, Singapore 039594
26 Jan - 31 Mar 2024 6 Raffles Boulevard, #03-208/209 Marina S

## Extract metadata of the dataset for storage

In [None]:
## Convert the collected event rows to a dataframe and save them as CSV
# A single timestamp is used so the "Date_of_check" column and the file name always agree.
checked_at = datetime.now()
datetime_now = checked_at.strftime("%d/%m/%Y %H:%M:%S")
file_name_date = checked_at.strftime("%d%m%Y_%H%M%S")

# The dataframe is built even when nothing was collected, so the preview at the
# end of the cell never fails on an undefined name.
df = pd.DataFrame(event_tracking_list, columns=["Event Title", "Date", "Location", "Status", "URL"])
df["Date_of_check"] = datetime_now

try:
    if event_tracking_list:
        dataset_filename = f"Smartlocal_dataset_{file_name_date}.csv"
        df.to_csv(dataset_filename, index=False)
    else:
        print("No events were collected; skipping CSV export.")
finally:
    # Close driver upon completion of saving file (also when saving fails)
    driver.quit()

df.head()