# Singstat and Data.gov.sg Information crawler.
This notebook pulls and consolidates the datasets found on both sites.

## Install dependencies for pulling data sources information

### Notes: BeautifulSoup can only scrape static website content. The Selenium library is required together with BeautifulSoup to read dynamically loaded content, which is the case for Singstat and Data.gov.sg.

Library imports

In [3]:
from datetime import datetime

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    NoSuchElementException,
    NoSuchWindowException,
    TimeoutException,
)
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
# Define options for webdriver: a Chrome-specific Options container, created
# empty here (no arguments are set on it at this point).
chrome_options = Options()

## Getting SG calendar of events from SmartLocal https://thesmartlocal.com/event-calendar/?a=alltime
SmartLocal also provides event information for the following countries besides Singapore:
1) Thailand
2) Malaysia
3) Indonesia
4) Vietnam
5) Korea
6) Philippines

In [21]:
# Scrape the SmartLocal event calendar into `event_tracking_list`.
#
# Produces one row per (event, date/location) pair in the form
# [event_title, date_info, location_info, status, href]. The browser session
# (`driver`) is intentionally left open; the export cell closes it.
URL = "https://thesmartlocal.com/event-calendar/?a=alltime"

# Seconds to wait for the dynamically rendered event cards before giving up
PAGE_LOAD_TIMEOUT_S = 5

# XPATH
# All expressions are relative (".//") so the per-event ones are resolved
# inside the card element they are called on. Defined once, outside the loop.
results_xpath = ".//div[contains(@class, 'bg-light')]"
event_header_xpath = ".//div[contains(@class, 'card-header')]//h5//a"
event_status_xpath = ".//div[contains(@class, 'card-header')]//div[contains(@class, 'status')]"
body_dates_xpath = ".//div[contains(@class, 'card-body')]//p"
footer_xpath = ".//div[contains(@class, 'card-footer')]//a"
event_location_xpath = ".//a"

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)
driver.get(URL)

# Webpage wait for required xpath to load, then find elements.
# WebDriverWait.until raises TimeoutException when nothing matches in time;
# treat that as "no results" so the notebook can continue and still close
# the browser in the export cell.
try:
    WebDriverWait(driver, PAGE_LOAD_TIMEOUT_S).until(
        EC.presence_of_element_located((By.XPATH, results_xpath))
    )
    total_events = driver.find_elements(by=By.XPATH, value=results_xpath)
except TimeoutException:
    total_events = []

if total_events:
    print(f"Total events: {len(total_events)}")
else:
    # exit() is deliberately not called: in a notebook it tears down the
    # kernel session instead of just ending this cell. With an empty list the
    # loop below is simply skipped.
    print("Unable to find events related element via XPATH. Assuming No results.")

event_tracking_list = []
for event in total_events:
    # For each event, extract title, status of event, and the dates and
    # locations at which it occurs. Multiple dates/locations are added as
    # separate entries.
    try:
        event_title = event.find_element(by=By.XPATH, value=event_header_xpath).text
    except NoSuchElementException:
        print("Unable to find event info, continuing to the next event available")
        continue
    print("Event: ", event_title)

    try:
        status = event.find_element(by=By.XPATH, value=event_status_xpath).text
    except NoSuchElementException:
        print("Unable to find event status info, assuming unknown state")
        status = "Unknown"
    print("Status: ", status)

    event_dates_list = event.find_elements(by=By.XPATH, value=body_dates_xpath)

    # Href of the event article linked from the card footer
    try:
        href_element = event.find_element(by=By.XPATH, value=footer_xpath)
        # get_attribute returns None when the anchor has no href; fall back
        # to the same placeholder used when the anchor itself is missing.
        href = href_element.get_attribute("href") or "Unknown"
    except NoSuchElementException:
        print("Unable to find event URL, assuming unknown URL")
        href = "Unknown"

    for event_date in event_dates_list:
        # Keep only the first line of the paragraph text (the date range);
        # subsequent lines carry other info such as the location.
        date_info = event_date.text.split("\n")[0]

        # Find location info
        try:
            location_info = event_date.find_element(by=By.XPATH, value=event_location_xpath).text
        except NoSuchElementException:
            location_info = "Unknown location"

        print(date_info, location_info)
        # Construct metadata list
        event_tracking_list.append([event_title, date_info, location_info, status, href])

# Total entries by date and location (an event with several dates/locations
# contributes several rows)
print(len(event_tracking_list))

Total events: 348
Event:  West Mall BT21-Themed CNY Activities & Promos
Status:  ENDED
05 Jan - 24 Feb 2024 1 Bukit Batok Central, Singapore 658713
Event:  Comma Creative Arts Festival 2024
Status:  ENDED
12 - 28 Jan 2024 2 Orchard Link, Singapore 237978
Event:  Light To Night 2024
Status:  ENDED
19 Jan - 08 Feb 2024 Civic District
Event:  Creative Intersections: Traces Of Dragons
Status:  ENDED
19 Jan - 25 Feb 2024 Funan
Event:  Artbox Avenue 2024
Status:  ENDED
26 Jan - 04 Feb 2024 Singapore Expo Hall 2, 1 Expo Drive, Singapore 486150
Event:  Pinkfong & Baby Shark Playhouse
Status:  
26 Jan - 31 Mar 2024 6 Raffles Boulevard, #03-208/209 Marina Square, Singapore 039594
Event:  Istana Open House 2024
Status:  ENDED
12 Feb 2024 Orchard Road, Singapore 238823
Event:  Sky Lantern Festival
Status:  ENDED
21 Feb 2024 Palawan Green, Siloso Beach Walk, Sentosa, Singapore 098236
Event:  Kampong Gelam Ramadan Bazaar 2024
Status:  
02 Mar - 05 Apr 2024 Kampong Gelam
Event:  Geylang Serai Ramadan

## Extract metadata of dataset for storing purpose

In [22]:
## Convert the scraped event records to a dataframe and export them to CSV

# Take a single timestamp so the "Date_of_check" column and the file name
# always agree (two separate datetime.now() calls can straddle a second).
checked_at = datetime.now()
datetime_now = checked_at.strftime("%d/%m/%Y %H:%M:%S")
file_name_date = checked_at.strftime("%d%m%Y_%H%M%S")

try:
    # Always build df, even when no events were collected, so that later
    # cells referencing df do not fail with NameError on an empty scrape.
    df = pd.DataFrame(
        event_tracking_list,
        columns=["Event Title", "Date", "Location", "Status", "URL"],
    )
    df["Date_of_check"] = datetime_now

    if not df.empty:
        dataset_filename = f"Smartlocal_dataset_{file_name_date}.csv"
        df.to_csv(dataset_filename, index=False)
    else:
        print("No events were collected; skipping CSV export.")
finally:
    # Close driver upon completion, even if the export above failed
    driver.quit()

In [23]:
# Preview the first five rows of the events table
df.head(n=5)

Unnamed: 0,Event Title,Date,Location,Status,URL,Date_of_check
0,West Mall BT21-Themed CNY Activities & Promos,05 Jan - 24 Feb 2024,"1 Bukit Batok Central, Singapore 658713",ENDED,https://thesmartlocal.com/read/west-mall-bt21-...,31/03/2024 14:13:10
1,Comma Creative Arts Festival 2024,12 - 28 Jan 2024,"2 Orchard Link, Singapore 237978",ENDED,https://thesmartlocal.com/read/comma-creative-...,31/03/2024 14:13:10
2,Light To Night 2024,19 Jan - 08 Feb 2024,Civic District,ENDED,https://thesmartlocal.com/read/light-to-night-...,31/03/2024 14:13:10
3,Creative Intersections: Traces Of Dragons,19 Jan - 25 Feb 2024,Funan,ENDED,https://thesmartlocal.com/read/singapore-art-w...,31/03/2024 14:13:10
4,Artbox Avenue 2024,26 Jan - 04 Feb 2024,"Singapore Expo Hall 2, 1 Expo Drive, Singapore...",ENDED,https://thesmartlocal.com/read/artbox-avenue-2...,31/03/2024 14:13:10
