# Singstat and Data.gov.sg Information crawler.
This notebook is used for pulling and consolidating the datasets found on both sites.

## Install dependencies for pulling data sources information

### Notes: BeautifulSoup can only scrape static website content. The Selenium library is required together with BeautifulSoup to read dynamically loaded content, which is the case for both SingStat and data.gov.sg.

Library imports

In [15]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException,StaleElementReferenceException, TimeoutException, NoSuchWindowException, ElementClickInterceptedException
from datetime import datetime
from typing import NewType
import pandas as pd
# Define options for webdriver
# NOTE: the event-calendar cell further down rebinds `chrome_options` to a
# fresh webdriver.ChromeOptions() before launching Chrome, so this instance
# only acts as a default until that cell runs.
chrome_options = Options()

# Getting calendar of events from SmartLocal https://thesmartlocal.com/event-calendar/?a=alltime

In [16]:
# Helper function
def load_all_data(dataset_on_display: list,
                  total_results: int,
                  load_more_button: object,
                  datasets_div_xpath: str,
                  load_more_xpath: str,
                  driver: "webdriver.Remote") -> list:
    """Keep clicking the "load more" button until every result is displayed.

    Args:
        dataset_on_display: Elements currently matched by ``datasets_div_xpath``.
        total_results: Number of results the page is expected to show in total.
        load_more_button: The "load more" element found by the caller; a falsy
            value means there is nothing to click and no loading is attempted.
        datasets_div_xpath: XPath matching one element per displayed result.
        load_more_xpath: XPath locating the "load more" button.
        driver: Selenium WebDriver positioned on the listing page.

    Returns:
        The most recently collected list of displayed result elements (the
        list passed in, unchanged, when no loading was necessary).
    """
    while len(dataset_on_display) < total_results and load_more_button:
        print("Loading more data")
        # Count displayed dataset
        dataset_on_display = driver.find_elements(by=By.XPATH, value=datasets_div_xpath)
        try:
            WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, load_more_xpath)))
        except TimeoutException:
            print("No load more button available after waiting. Assuming the end of page")

        try:
            load_more_button = driver.find_element(by=By.XPATH, value=load_more_xpath)
            print(f"Current display data: {len(dataset_on_display)}/{total_results}")
            load_more_button.click()
        except NoSuchElementException:
            print("Unable to click load more button due to no such element")
            # The button is gone, so nothing more can be loaded: clear the
            # handle so the loop ends instead of retrying forever.
            load_more_button = None
            dataset_on_display = driver.find_elements(by=By.XPATH, value=datasets_div_xpath)
            print(f"Current display data: {len(dataset_on_display)}/{total_results}")
        except (ElementClickInterceptedException, StaleElementReferenceException):
            # The button was covered or re-rendered between lookup and click;
            # refresh the count and let the next iteration try again.
            print("Encountered interference with clicking...")
            dataset_on_display = driver.find_elements(by=By.XPATH, value=datasets_div_xpath)
            print(f"Current display data: {len(dataset_on_display)}/{total_results}")

    return dataset_on_display

In [27]:
URL = "https://thesmartlocal.com/event-calendar/?a=alltime"
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)
driver.get(URL)

# XPATH of every event card, followed by the card-relative XPaths of its fields
results_xpath = "//div[contains(@class, 'bg-light')]"
event_header_xpath = ".//div[contains(@class, 'card-header')]//h5//a"
event_status_xpath = ".//div[contains(@class, 'card-header')]//div[contains(@class, 'status')]"
body_dates_xpath = ".//div[contains(@class, 'card-body')]//p"
event_date_xpath = ".//i[1]"
event_location_xpath = ".//i[2]"

# Webpage wait: a timeout means no event card became available, which is
# treated the same as an empty result list instead of aborting the cell.
try:
    WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, results_xpath)))
    # Find elements
    total_events = driver.find_elements(by=By.XPATH, value=results_xpath)
except TimeoutException:
    total_events = []

if total_events:
    print(f"Total events: {len(total_events)}")
else:
    # No exit() here: it would shut down the notebook kernel. With an empty
    # list the loop below simply does nothing.
    print("Cant extract any results information from page. Assuming No results")

for event in total_events:
    event_title = event.find_element(by=By.XPATH, value=event_header_xpath).text
    print("Event: ", event_title)
    status = event.find_element(by=By.XPATH, value=event_status_xpath).text
    print("Status: ", status)
    # Each <p> in the card body carries one date / location pair
    event_dates_list = event.find_elements(by=By.XPATH, value=body_dates_xpath)
    for event_date in event_dates_list:
        date_info = event_date.find_element(by=By.XPATH, value=event_date_xpath).text
        location_info = event_date.find_element(by=By.XPATH, value=event_location_xpath).text

        print(date_info, location_info)

Found load more button with 4859 total results available
Loading more data
Current display data: 20/4859
Loading more data
Current display data: 20/4859
Loading more data
Current display data: 40/4859
Loading more data
Current display data: 60/4859
Loading more data
Current display data: 80/4859
Loading more data
Current display data: 100/4859
Loading more data
Current display data: 120/4859
Loading more data
Current display data: 140/4859
Loading more data
Current display data: 160/4859
Loading more data
Current display data: 180/4859
Loading more data
Current display data: 200/4859
Loading more data
Current display data: 220/4859
Loading more data
Current display data: 240/4859
Loading more data
Current display data: 260/4859
Loading more data
Current display data: 280/4859
Loading more data
Current display data: 300/4859
Loading more data
Current display data: 320/4859
Loading more data
Current display data: 340/4859
Loading more data
Current display data: 360/4859
Loading more data

## Do an overall count after load more has been exhausted

In [None]:
# NOTE(review): `datasets_div_xpath` is not assigned in the cells shown here;
# it is presumably defined by an earlier cell - confirm before running on a
# fresh kernel.
try:
    total_dataset_on_display = driver.find_elements(by=By.XPATH, value=datasets_div_xpath)
except NoSuchWindowException:
    print("Window has been inadvertently closed")
    driver.quit()
    # Leave an empty list behind so later cells iterate over nothing rather
    # than hitting a NameError or elements bound to the closed session.
    total_dataset_on_display = []
else:
    print(len(total_dataset_on_display))

4859


## Extract metadata of dataset for storing purpose

In [None]:
# Collection dict to store collections info
collection_tracker_dict = {}

# Dataset dict to store dataset info
dataset_tracker_dict = {}

get_dataset_name_xpath = ".//div/div/p"
get_metadata_info_xpath = ".//div/div/div[contains(@class, 'chakra-wrap')]/ul/p[contains(@class, 'chakra-text')]"
# Not used in this cell; kept (hoisted out of the loop) in case another cell reads it.
REQUIRED_ACTUAL_META_LENGTH = 4


def _standardise_metadata(raw_fields: list) -> list:
    """Normalise one entry's metadata texts to [date, update, file type, source].

    Fields are consumed from the right-hand side of ``raw_fields``: first any
    source entries (text containing " (" and ending with ")"), then an
    upper-case file type, then an "Updated ..." / "... datasets" entry, and
    finally a date range containing " - ". Every slot that cannot be filled
    receives its "No ... info" placeholder, so an empty input yields four
    placeholders instead of raising IndexError.
    """
    remaining = list(raw_fields)  # work on a copy; the caller's list is untouched

    # Combine all data source info into a single entity
    sources = []
    while remaining and " (" in remaining[-1] and remaining[-1].endswith(")"):
        sources.append(remaining.pop())
    # Each source is followed by a comma (including the last one), matching
    # the format of previously exported files.
    source_info = ",".join(sources) + "," if sources else "No source info"

    # Next field from the right usually indicates the datatype
    if remaining and remaining[-1].isupper():
        file_info = remaining.pop()
    else:
        file_info = "No file info"

    # Then either the last-updated text or the number of datasets
    if remaining and (remaining[-1].startswith("Updated") or remaining[-1].endswith(" datasets")):
        update_info = remaining.pop()
    else:
        update_info = "No update info"

    # Finally the data period
    if remaining and " - " in remaining[-1]:
        date_info = remaining[-1]
    else:
        date_info = "No date info"

    return [date_info, update_info, file_info, source_info]


# NOTE(review): both trackers are keyed by name, so entries sharing a name
# overwrite each other.
for position, dataset in enumerate(total_dataset_on_display):

    # Extract href (get_attribute gives None when the attribute is missing)
    href = dataset.get_attribute('href')
    if href:
        print("Dataset link:")
        print(href)
    else:
        href = "No info"
        print("No href info found.")

    # Extract dataset name
    try:
        name = dataset.find_element(by=By.XPATH, value=get_dataset_name_xpath).text
        print("Dataset name:")
        print(name)
    except NoSuchElementException:
        # Use a unique placeholder so this entry neither reuses the previous
        # iteration's name nor raises NameError on the first iteration.
        name = f"Unnamed dataset #{position}"
        print("No name information found")

    # Multiple metadata information
    metadata_elements = dataset.find_elements(by=By.XPATH, value=get_metadata_info_xpath)
    # Filter alternate elements (excluding the dot separator)
    metadata_list = [element.text for element in metadata_elements[::2]]

    # date, updated, filetype, source structure ...
    std_metadata_list = _standardise_metadata(metadata_list)
    # ... with href info as last element (5th element representing url)
    std_metadata_list.append(href)

    print(std_metadata_list)
    # Metadata construct influences whether we are dealing with collections or dataset
    if any(" dataset" in metadata for metadata in std_metadata_list):
        collection_tracker_dict[name] = std_metadata_list
    else:
        dataset_tracker_dict[name] = std_metadata_list
    print()


MaxRetryError: HTTPConnectionPool(host='localhost', port=60884): Max retries exceeded with url: /session/88dd7089b59c478c31872bf33592e5f8/execute/sync (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x00000200EE0DD940>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

In [24]:
# Summarise how many entries ended up in each tracker
dataset_count = len(dataset_tracker_dict)
collection_count = len(collection_tracker_dict)
print(f"Datasets: {dataset_count}, Collections: {collection_count}")

Datasets: 3965, Collections: 714


In [25]:
def _export_tracker(tracker: dict, columns: list, index_name: str, file_prefix: str) -> pd.DataFrame:
    """Write one tracker dictionary to a timestamped CSV file and return the frame.

    Args:
        tracker: Mapping of entry name -> list of values, one per column.
        columns: Column labels for the values stored in ``tracker``.
        index_name: Label given to the index (the tracker keys).
        file_prefix: File name prefix; "_<ddmmYYYY_HHMMSS>.csv" is appended.

    Returns:
        The DataFrame that was written, including the "Date_of_check" column.
    """
    # One timestamp for both the column and the file name so they always agree
    checked_at = datetime.now()
    frame = pd.DataFrame.from_dict(tracker, orient="index", columns=columns)
    frame["Date_of_check"] = checked_at.strftime("%d/%m/%Y %H:%M:%S")
    frame.index.name = index_name
    frame.to_csv(f"{file_prefix}_{checked_at.strftime('%d%m%Y_%H%M%S')}.csv", index=True)
    return frame


try:
    ## Convert dataset dictionary to dataframe
    if dataset_tracker_dict:
        datasets_df = _export_tracker(
            dataset_tracker_dict,
            ["Data period", "Last updated", "Datatype", "Source", "Dataset url"],
            "Datasets",
            "Datagovsg_dataset",
        )

    ## Convert collections dictionary to dataframe
    # Built from collection_tracker_dict (the dataset tracker was used here
    # before, which wrote dataset rows into the collections file).
    if collection_tracker_dict:
        collections_df = _export_tracker(
            collection_tracker_dict,
            ["Data period", "Number of datasets", "Datatype", "Source", "Collections url"],
            "Collections",
            "Datagovsg_collections",
        )
finally:
    # Close driver upon completion, even when an export fails
    driver.quit()