# Singstat and Data.gov.sg Information crawler.
This notebook is used for pulling and consolidating datasets found on both sites. The cells shown below scrape the global holiday calendar from timeanddate.com.

## Install dependencies for pulling data sources information

### Note: BeautifulSoup can only scrape static page content. Selenium is required together with BeautifulSoup to read dynamically loaded content, which is the case for SingStat and data.gov.sg.

Library imports

In [1]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver 
from selenium.common.exceptions import NoSuchElementException,StaleElementReferenceException, TimeoutException, NoSuchWindowException, ElementClickInterceptedException
from datetime import datetime
import pandas as pd

## Getting Global Holiday calendar from Time and Date webpage, https://www.timeanddate.com/holidays/

In [10]:
# Open the "all countries" holiday index and collect one link per geo location.
# Defines (for later cells): URL, driver, geo_xpath, year_dropdown_xpath,
# year_xpath, geo_holiday_tracking_dict and href_list.

# Update URL here to point to correct source
URL = "https://www.timeanddate.com/holidays/?allcountries"
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)
driver.get(URL)

# XPATH to geolocations
geo_xpath = ".//article[@class='category-list']//ul//li//a"
year_dropdown_xpath = ".//label/select[@id='year']"
# XPath to all year options provided based on dropdown func in the page
year_xpath = ".//label/select[@id='year']//option"

# Webpage wait for required xpath to load. On timeout, close the browser before
# re-raising so a failed run does not leave a Chrome process behind.
try:
    WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, geo_xpath)))
except TimeoutException:
    driver.quit()
    raise

# Find elements
total_geo_locations_list = driver.find_elements(by=By.XPATH, value=geo_xpath)
if not total_geo_locations_list:
    # Raise instead of calling exit(): exit() shuts down the notebook kernel and
    # would leave the browser running.
    driver.quit()
    raise RuntimeError(
        "Unable to find geo locations related element via XPATH specified. Assuming No results."
    )
print(f"Total geo locations: {len(total_geo_locations_list)}")

# Dictionary to store holidays of each geo
geo_holiday_tracking_dict = {}
href_list = []

# Get all href info
for geo_location in total_geo_locations_list:
    try:
        href = geo_location.get_attribute("href")
    except Exception as href_error:
        # Best effort: skip links that cannot be read. Catching Exception
        # (not a bare except) keeps a kernel interrupt working.
        print(f"Skipping geo link, unable to read href ({href_error.__class__.__name__})")
        continue
    if not href:
        # get_attribute returns None when the anchor has no href; such an entry
        # would break the URL handling in the scraping loop.
        continue
    print(href)
    href_list.append(href)


Total geo locations: 233
https://www.timeanddate.com/holidays/afghanistan/
https://www.timeanddate.com/holidays/albania/
https://www.timeanddate.com/holidays/algeria/
https://www.timeanddate.com/holidays/american-samoa/
https://www.timeanddate.com/holidays/andorra/
https://www.timeanddate.com/holidays/angola/
https://www.timeanddate.com/holidays/anguilla/
https://www.timeanddate.com/holidays/antigua-and-barbuda/
https://www.timeanddate.com/holidays/argentina/
https://www.timeanddate.com/holidays/armenia/
https://www.timeanddate.com/holidays/aruba/
https://www.timeanddate.com/holidays/australia/
https://www.timeanddate.com/holidays/austria/
https://www.timeanddate.com/holidays/azerbaijan/
https://www.timeanddate.com/holidays/bahrain/
https://www.timeanddate.com/holidays/bangladesh/
https://www.timeanddate.com/holidays/barbados/
https://www.timeanddate.com/holidays/belarus/
https://www.timeanddate.com/holidays/belgium/
https://www.timeanddate.com/holidays/belize/
https://www.timeanddate.

Loop through all the collected hrefs and, for each geo location, gather the holidays listed for every available year

In [11]:
# Visit every geo location page, read the years offered by its year dropdown,
# then load each year's page and collect the rows of the holiday table.
# Result: geo_holiday_tracking_dict maps a location slug to a list of
# [year, date, day, name, type] rows.

# XPaths of the holiday table and its rows; identical for every location, so
# they are defined once rather than on each iteration.
holiday_table_xpath = ".//table[@id='holidays-table']"
holiday_details_xpath = ".//tr[contains(@class, 'showrow')]"

try:
    for href in href_list:
        # Location slug is the last path segment, with or without a trailing slash
        base_url = href.rstrip("/")
        geo_location_info = base_url.split("/")[-1]
        print(f"Current geo: {geo_location_info}")
        # For each geo location, go to the page of interest and get the information on available years
        driver.get(href)

        # A page that never shows the year options should not abort the whole crawl
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, year_xpath)))
        except TimeoutException:
            print(f"Year options did not load for {geo_location_info}. Moving to next geo")
            continue

        try:
            year_dropdown = driver.find_element(by=By.XPATH, value=year_dropdown_xpath)
            year_dropdown.click()
        except Exception as click_error:
            # Best effort, but no bare except: a kernel interrupt must still stop the loop
            print(f"Unable to find year dropdown to click ({click_error.__class__.__name__})")
            continue

        # Get all years availability for the geo location.
        # Read each option's text once (every .text is a round-trip to the browser).
        supported_years_list = []
        for year_option in driver.find_elements(by=By.XPATH, value=year_xpath):
            year_label = year_option.text
            if year_label and year_label != "Today":
                supported_years_list.append(year_label)
        print(f"Total years: {len(supported_years_list)}")

        holiday_list = []
        for year in supported_years_list:
            print(f"Processing year: {year}")
            # href already ends with "/", so join on the stripped base to avoid
            # requesting ".../<geo>//<year>"
            year_url = f"{base_url}/{year}"
            driver.get(year_url)

            # Wait for holiday table element to load. If nothing means no data.
            try:
                WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.XPATH, holiday_table_xpath)))
            except TimeoutException:
                print(f"No information found for year {year}. Moving to next year")
                continue

            holiday_details = driver.find_elements(
                by=By.XPATH, value=holiday_details_xpath
            )
            print(f"Total holidays for the year: {len(holiday_details)}")
            # Get date, day, name of holiday and type of holiday info from table
            for holiday_metadata in holiday_details:
                try:
                    date = holiday_metadata.find_element(by=By.XPATH, value=".//th").text
                    day = holiday_metadata.find_element(by=By.XPATH, value=".//td[1]").text
                    name = holiday_metadata.find_element(by=By.XPATH, value=".//td[2]").text
                    # Named holiday_type so the builtin `type` is not shadowed
                    holiday_type = holiday_metadata.find_element(by=By.XPATH, value=".//td[3]").text
                except (NoSuchElementException, StaleElementReferenceException) as row_error:
                    # A row missing an expected cell should not discard everything collected so far
                    print(f"Skipping incomplete holiday row ({row_error.__class__.__name__})")
                    continue
                holiday_info = [year, date, day, name, holiday_type]
                print(holiday_info)
                # Append such info to list
                holiday_list.append(holiday_info)

        # Store this location's rows. The next iteration navigates straight to
        # its own href, so there is no need to reload the main page in between.
        geo_holiday_tracking_dict[geo_location_info] = holiday_list
finally:
    # Always release the browser, including when the crawl fails or is interrupted
    driver.quit()

# Number of geo locations processed (one dictionary entry per location)
print(len(geo_holiday_tracking_dict))

Current geo: afghanistan
Total years: 41
Processing year: 2000
No information found for year 2000. Moving to next year
Processing year: 2001
No information found for year 2001. Moving to next year
Processing year: 2002
No information found for year 2002. Moving to next year
Processing year: 2003
No information found for year 2003. Moving to next year
Processing year: 2004
No information found for year 2004. Moving to next year
Processing year: 2005
No information found for year 2005. Moving to next year
Processing year: 2006
No information found for year 2006. Moving to next year
Processing year: 2007
No information found for year 2007. Moving to next year
Processing year: 2008
No information found for year 2008. Moving to next year
Processing year: 2009
No information found for year 2009. Moving to next year
Processing year: 2010
No information found for year 2010. Moving to next year
Processing year: 2011
No information found for year 2011. Moving to next year
Processing year: 2012
N

## Extract dataset metadata for storage

In [22]:
## Convert dataset dictionary to dataframe
# geo_holiday_tracking_dict maps a geo location to a list of
# [year, date, day, name, type] rows. Flatten it to one record per holiday and
# keep the location as its own column. DataFrame.from_dict rejects `columns`
# with its default orient, and would not flatten the nested rows anyway.
holiday_columns = ["Geo Location", "Event Year", "Date", "Day", "Name", "Type"]
holiday_records = [
    [geo_location, *holiday_row]
    for geo_location, holiday_rows in geo_holiday_tracking_dict.items()
    for holiday_row in holiday_rows
]
df = pd.DataFrame(holiday_records, columns=holiday_columns)

if df.empty:
    # Also covers the case where every location produced an empty holiday list
    print("No holiday records collected; no file written.")
else:
    # Take a single timestamp so the column value and the file name always agree
    check_time = datetime.now()
    datetime_now = check_time.strftime("%d/%m/%Y %H:%M:%S")
    file_name_date = check_time.strftime("%d%m%Y_%H%M%S")

    df["Date_of_check"] = datetime_now

    dataset_filename = f"Smartlocal_dataset_{file_name_date}.csv"
    df.to_csv(dataset_filename, index=False)
    print(f"Saved {len(df)} holiday records to {dataset_filename}")

# Close driver upon completion
driver.quit()

# Last expression of the cell, so the notebook renders a preview of the frame
df.head()