# GovTech Events information scraper.
This notebook scrapes and consolidates the GovTech event pages listed at https://www.developer.tech.gov.sg/communities/events/conferences/

Library imports

In [1]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver 
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementNotInteractableException
from datetime import datetime
from selenium.webdriver.firefox.options import Options
import pandas as pd



The Firefox profile options below are set to speed up page loading. Note: download and install GeckoDriver, then add its location to the system PATH environment variable so that Selenium's Firefox driver can find it. Alternatively, pass the GeckoDriver path explicitly through Selenium's Firefox `Service` class instead of relying on the PATH variable, as done in the cells below.

In [2]:
# Firefox profile preferences intended to speed up page loading.
# They are applied to the profile in the order listed here.
firefox_option = webdriver.FirefoxProfile()
for pref_name, pref_value in (
    ("network.http.pipelining", True),
    ("network.http.proxy.pipelining", True),
    ("network.http.pipelining.maxrequests", 8),
    ("content.notify.interval", 500000),
    ("content.notify.ontimer", True),
    ("content.switch.threshold", 250000),
):
    firefox_option.set_preference(pref_name, pref_value)

# Attach the tuned profile to the options object shared by every browser session.
options = Options()
options.profile = firefox_option

In [21]:
def _format_iso_date(raw_timestamp: str) -> str:
    """Convert the date part of a ``YYYY-MM-DDT...`` string to ``DD Mon YY``.

    Raises:
        ValueError: If the part before ``T`` is not in ``%Y-%m-%d`` format.
    """
    date_part = raw_timestamp.split("T")[0]
    return datetime.strptime(date_part, '%Y-%m-%d').date().strftime("%d %b %y")


def click_href_extract_event_info(event_href: str, options: Options, timeout_seconds: int = 120) -> list:
    """Open one event page in its own Firefox session and scrape its metadata.

    Args:
        event_href: URL of the event page to load.
        options: Firefox options used to start the browser session.
        timeout_seconds: Maximum time to wait for the event info container
            to appear before attempting extraction anyway.

    Returns:
        ``[event_title, event_date, event_location, event_href]``. A field
        that could not be scraped is returned as an empty string.
    """
    service = Service(executable_path=r"C:\Program Files\geckodriver.exe")
    new_driver = webdriver.Firefox(service=service, options=options)

    # Pre-bind every field so a missing element can never leave a name undefined.
    event_title = ""
    event_date = ""
    event_location = ""

    try:
        new_driver.get(event_href)
        event_info_xpath = "//div[@class='is-fullwidth is-hidden-desktop']"

        # Wait for the container that carries the date attributes.
        wait = WebDriverWait(new_driver, timeout_seconds)
        try:
            wait.until(EC.presence_of_element_located((By.XPATH, event_info_xpath)))
        except TimeoutException:
            # Best effort: fall through and try to extract whatever is present.
            print("No event info element found")

        print("Getting information")

        # Event date: both attributes live on the same element, so look it up once.
        event_date_xpath = event_info_xpath + "/div"
        try:
            date_element = new_driver.find_element(by=By.XPATH, value=event_date_xpath)
            start_raw = date_element.get_attribute("data-start-date")
            end_raw = date_element.get_attribute("data-end-date")
            if start_raw and end_raw:
                event_date = _format_iso_date(start_raw) + " - " + _format_iso_date(end_raw)
                print("Found element for event date")
                print(event_date)
            else:
                # get_attribute returns None when the attribute is absent.
                print("Event date attributes missing")
        except NoSuchElementException:
            print("No event date element found")
        except ValueError:
            print("Event date is not in the expected YYYY-MM-DD format")
            event_date = ""

        # Several nodes can match the location XPath and some of them are not
        # visible (their text is empty), so take the first one that has text.
        event_location_xpath = "//div[@class='sgds-card-content padding--top padding--bottom']/div[3]/div[2]"
        # find_elements returns an empty list rather than raising when nothing matches.
        for event_location_elem in new_driver.find_elements(by=By.XPATH, value=event_location_xpath):
            location_text = event_location_elem.text
            if location_text:
                event_location = location_text
                print(event_location)
                break
        else:
            print("No event location element found")

        # Event title
        event_title_xpath = "//h1"
        try:
            event_title = str(new_driver.find_element(by=By.XPATH, value=event_title_xpath).text)
        except NoSuchElementException:
            print("No event title found.")
    finally:
        # Always close the browser, even if loading or scraping raised.
        new_driver.quit()

    event_metadata_list = [event_title, event_date, event_location, event_href]
    print(event_metadata_list)
    return event_metadata_list

## Main execution of info scraping

In [22]:
#CFG
service = Service(executable_path=r"C:\Program Files\geckodriver.exe")
driver = webdriver.Firefox(service=service, options=options)

event_tracking_list = []
event_hrefs = []

URL = "https://www.developer.tech.gov.sg/communities/events/conferences/"

view_all_button_xpath = "//button[@id='view-all-past-conferences-button']"
event_xpath= "//div[@class='sgds-card-content sgds-card-variant-calendar']//a"

try:
    driver.get(URL)

    # Wait for the "view all" button to load and click it to expand the page.
    try:
        # The wait returns the located element, so no second lookup is needed.
        view_all_button_element = WebDriverWait(driver, 60).until(
            EC.presence_of_element_located((By.XPATH, view_all_button_xpath))
        )
        print("Clicking view all button")
        view_all_button_element.click()
    except TimeoutException:
        print("No view all button found.")
    except ElementNotInteractableException:
        # The button exists but cannot be clicked; continue with the events already shown.
        print("Element not interactable..")

    total_events_ele_list = driver.find_elements(by=By.XPATH, value=event_xpath)
    print(f"Total events for the page: {len(total_events_ele_list)}")

    # Read the hrefs now: the elements are unusable once the listing browser is closed.
    # Anchors without an href are skipped so None is never passed to the event scraper.
    for event in total_events_ele_list:
        href = event.get_attribute("href")
        if href:
            event_hrefs.append(href)
finally:
    # Close the listing browser exactly once, before per-event browsers are started.
    driver.quit()

if not event_hrefs:
    print("No events available. Terminating program...")
else:
    # Access each event via its href
    for event_href in event_hrefs:
        event_metadata_list = click_href_extract_event_info(event_href=event_href, options=options)
        event_tracking_list.append(event_metadata_list)
    print()

# Save to csv file
if event_tracking_list:
    print("Saving to csv file")

    # One timestamp for both the column and the filename so the two always agree.
    checked_at = datetime.now()
    datetime_now = checked_at.strftime("%d/%m/%Y %H:%M:%S")
    file_name_date = checked_at.strftime("%d%m%Y_%H%M%S")

    df = (
        pd.DataFrame(event_tracking_list, columns=["Event Title", "Date", "Location", "URL"])
        # Drop duplicates for cases with same date/location/url
        .drop_duplicates(subset=["Date", "Location", "URL"], keep="last")
        # NOTE(review): only real NaN/None values are removed here; fields the
        # scraper could not find may arrive as "" and would be kept. Confirm intent.
        .dropna(subset=["Date", "Location", "URL"])
        .assign(Date_of_check=datetime_now)
    )

    # Filename to save
    dataset_filename = f"GovTechEvents_dataset_{file_name_date}.csv"
    df.to_csv(dataset_filename, index=False, encoding='utf-8')
else:
    print("No event data collected; nothing to save.")

Clicking view all button
Element not interactable..
Total events for the page: 9
Getting information
Found element for event date
06 Nov 24 - 07 Nov 24

Sands Expo and Convention Centre
['STACK Developer Conference 2024', '06 Nov 24 - 07 Nov 24', 'Sands Expo and Convention Centre', 'https://www.developer.tech.gov.sg/communities/events/conferences/stack-developer-conference-2024']
Getting information
Found element for event date
27 Mar 24 - 27 Mar 24

Suntec Convention Centre
['STACKx Smart City 2024', '27 Mar 24 - 27 Mar 24', 'Suntec Convention Centre', 'https://www.developer.tech.gov.sg/communities/events/conferences/stackx-smart-city-2024']
Getting information
Found element for event date
18 Jul 23 - 18 Jul 23

Suntec Singapore Convention and Exhibition Centre (Summit 1)
['STACKx Data & AI 2023', '18 Jul 23 - 18 Jul 23', 'Suntec Singapore Convention and Exhibition Centre (Summit 1)', 'https://www.developer.tech.gov.sg/communities/events/conferences/stackx-data-ai-2023']
Getting infor