# Address to lat Lon conversion  
This notebook is used for generating lat lon for all known locations from the input files. 
It uses Nominatim tool to search addresses lat lon from OpenSourceMap service, which does not need any API key and is free.

In [1]:
# importing geopy library and Nominatim class
from geopy.distance import geodesic
from geopy.geocoders import Nominatim
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium import webdriver 
from selenium.common.exceptions import NoSuchElementException
import pandas as pd
import time
import math
import pandas as pd
# calling the Nominatim tool and create Nominatim class limiting to SG
loc = Nominatim(user_agent="my-latlon-application",timeout=10)


# Using identified mrt stations as markers for NSEW

- 'Tampines station': [1.3542907, 103.9446144428833],
- 'Jurong East station': [1.3321552, 103.7429844],
- 'Woodlands station': [1.4372552, 103.787231],
- 'Outram park station': [1.2804624, 103.83991801312565]

In [2]:
# Location
country_codes="sg"
nsew_dict = {
    "Tampines station": None,
    "Jurong East station": None,
    "Woodlands station": None,
    "Outram park station": None
}

# Mapping dictionary for used mrt stations
stn_geo_dict = {
    "Tampines station": "East",
    "Jurong East station": "West",
    "Woodlands station": "North",
    "Outram park station": "South"
}

for location in nsew_dict.keys():
    try:
        print(f"Lat lon for {location}")
        getLoc = loc.geocode(location, country_codes="sg")
        if getLoc:
            nsew_dict[location] = [getLoc.latitude, getLoc.longitude]
        else:
            print("Cannot decipher the location")
    except TimeoutError:
        print("Retrying")
        time.sleep(3)
        getLoc = loc.geocode(location, country_codes="sg")
        print(f"Address: {getLoc.address}")
        nsew_dict[location] = [getLoc.latitude, getLoc.longitude]
nsew_dict

Lat lon for Tampines station
Lat lon for Jurong East station
Lat lon for Woodlands station
Lat lon for Outram park station


{'Tampines station': [1.3542907, 103.9446144428833],
 'Jurong East station': [1.3321552, 103.7429844],
 'Woodlands station': [1.4372552, 103.787231],
 'Outram park station': [1.2804624, 103.83991801312565]}

## Helper functions

In [3]:
# Define a retry decorator
def retry(max_retries, wait_time):
    def decorator(func):
        def wrapper(*args, **kwargs):
            retries = 0
            while retries < max_retries:
                try:
                    result = func(*args, **kwargs)
                    return result
                except TimeoutError:
                    retries += 1
                    time.sleep(wait_time)
            else:
              raise TimeoutError(f"Max retries of function {func} exceeded")
        return wrapper
    return decorator

In [4]:
# Apply search with retry decorartor 
@retry(max_retries=5, wait_time=1)
def location_lat_lon(location_str: str, country_codes: str ="sg"):
    # Get the first information block
    location_str = str(location_str)
    if location_str:
        print(location_str)
        location_list = location_str.split(",")
        for location in location_list:
            getLoc = loc.geocode(location, country_codes=country_codes, limit=1)
            if getLoc:
                print("Lat and Lon info:")
                print(getLoc.latitude, getLoc.longitude)
                print()
                return {"Lat": getLoc.latitude, "Lon": getLoc.longitude}
            else:
                print("Using the subsequent information to try")
                print()
    # For null case, just end the program
    else:
        return {"Lat": "All", "Lon": "All"}


In [5]:
# Some postal code cannot be geocoded due to sensitivities as below such as Istana
getLoc = loc.geocode("Singapore 238823", country_codes="sg", limit=1)
if getLoc:
    print(getLoc.latitude, getLoc.longitude)

In [6]:
# read in smart local dataset
SMARTLOCAL_DATASET_PATH = "Smartlocal_dataset_31032024_141310.csv"
df = pd.read_csv(SMARTLOCAL_DATASET_PATH)
df.head()

Unnamed: 0,Event Title,Date,Location,Status,URL,Date_of_check
0,West Mall BT21-Themed CNY Activities & Promos,05 Jan - 24 Feb 2024,"1 Bukit Batok Central, Singapore 658713",ENDED,https://thesmartlocal.com/read/west-mall-bt21-...,31/03/2024 14:13:10
1,Comma Creative Arts Festival 2024,12 - 28 Jan 2024,"2 Orchard Link, Singapore 237978",ENDED,https://thesmartlocal.com/read/comma-creative-...,31/03/2024 14:13:10
2,Light To Night 2024,19 Jan - 08 Feb 2024,Civic District,ENDED,https://thesmartlocal.com/read/light-to-night-...,31/03/2024 14:13:10
3,Creative Intersections: Traces Of Dragons,19 Jan - 25 Feb 2024,Funan,ENDED,https://thesmartlocal.com/read/singapore-art-w...,31/03/2024 14:13:10
4,Artbox Avenue 2024,26 Jan - 04 Feb 2024,"Singapore Expo Hall 2, 1 Expo Drive, Singapore...",ENDED,https://thesmartlocal.com/read/artbox-avenue-2...,31/03/2024 14:13:10


There are missing location in the entries due to info unavailability on the page which info is scraped. TO determine the location, separate access to url link is required to extract relevant info out. This applies to date as well.

In [7]:
# Check of missing entries
missing_location_entries_df = df[df["Location"].isnull()]
missing_location_entries_df.shape

(84, 6)

Do location mapping based on scraped address in the best ability.
Note that the geocoder tool may not necessary provide the lat/lon info which could be done via GoogleMapV3.

In [8]:
# Direct location mapping 
df[["Lat","Lon"]]= df.apply(lambda row: location_lat_lon(row["Location"]), axis="columns", result_type="expand")

1 Bukit Batok Central, Singapore 658713


Lat and Lon info:
1.3500765000000001 103.74925662743149

2 Orchard Link, Singapore 237978
Lat and Lon info:
1.3010848 103.8352023

Civic District
Lat and Lon info:
1.2917349 103.8511298

Funan
Lat and Lon info:
1.291313 103.84994707397723

Singapore Expo Hall 2, 1 Expo Drive, Singapore 486150
Lat and Lon info:
1.3371524 103.9599267

6 Raffles Boulevard, #03-208/209 Marina Square, Singapore 039594
Lat and Lon info:
1.29161 103.8568156522325

Orchard Road, Singapore 238823
Lat and Lon info:
1.3068733 103.8284095

Palawan Green, Siloso Beach Walk, Sentosa, Singapore 098236
Lat and Lon info:
1.2505965 103.8181594

Kampong Gelam
Lat and Lon info:
1.3025865 103.85996496668358

1 Geylang Serai, Singapore 402001
Lat and Lon info:
1.3166958 103.8982195336871

Chinatown Point
Lat and Lon info:
1.28519745 103.8449051848437

18 Marina Gardens Drive, Gardens by the Bay, Singapore 018953
Lat and Lon info:
1.2821122 103.8654957

8 Sentosa Gateway, Singapore 098269
Lat and Lon info:
1.25574585 103.821

In [22]:
# Check nearest NSEW
def calc_distance(lat_loc: float, lon_loc:float, nsew_coord_dict: dict, stn_geo_mapping_dict: dict):
    region = "All"
    dist = None
    # Calculate distance when lat/lon is not nan
    if not math.isnan(lat_loc) and not math.isnan(lon_loc):
        #print(lat_loc, lon_loc)
        #event_coordinates
        event_coord = (lat_loc, lon_loc)

        dist = math.inf
        for k,v in nsew_coord_dict.items():
            nsew_coord = (v[0], v[1])
            new_dist = geodesic(event_coord, nsew_coord).km
            # Make necessary update and continue
            if new_dist < dist:
                dist = new_dist
                # Find the region using stn_geo_mapping_dict
                region = stn_geo_mapping_dict[k]
        #print(f"Nearest region: {region} and distance in km: {dist}")
    return {"Location": region, "Distance": dist}

def split_date_format(date_string):
    # Remove any spaces before and after the string
    date_string = str(date_string)
    date_string = date_string.strip()
    # Presence of date range with hyphens
    if date_string == "-":
        start, end = None, None
    elif "-" in date_string and len(date_string) > 1 :
        start, end = date_string.split("-", 1)
        # Strip excess spaces
        start = str(start).strip()
        end = str(end).strip()

        # Convert to datetime and extract month and year for filling start
        expected_format = "%d %b %Y"
        end = pd.to_datetime(end, format=expected_format)
        end_mth = end.strftime("%b")
        print(end_mth)
        end_year = str(end.year)

        # Convert back to typical format
        end = end.strftime(expected_format)
        if start.count(" ") == 0:
            start = " ".join([start, end_mth, end_year])
        if start.count(" ") == 1:
            start = " ".join([start, end_year])
        
    # Single date case where start is the same as end
    else:
        start, end = date_string, date_string
    return start, end

In [23]:
# Direct location mapping 
df[["Location", "Distance"]] = df.apply(lambda row: calc_distance(
        lat_loc=row["Lat"],
        lon_loc=row["Lon"],
        nsew_coord_dict=nsew_dict,
        stn_geo_mapping_dict=stn_geo_dict
    ),
    axis="columns",
    result_type="expand"
)

In [11]:
df.head()

Unnamed: 0,Event Title,Date,Location,Status,URL,Date_of_check,Lat,Lon,Distance
0,West Mall BT21-Themed CNY Activities & Promos,05 Jan - 24 Feb 2024,West,ENDED,https://thesmartlocal.com/read/west-mall-bt21-...,31/03/2024 14:13:10,1.350077,103.749257,2.100992
1,Comma Creative Arts Festival 2024,12 - 28 Jan 2024,South,ENDED,https://thesmartlocal.com/read/comma-creative-...,31/03/2024 14:13:10,1.301085,103.835202,2.339933
2,Light To Night 2024,19 Jan - 08 Feb 2024,South,ENDED,https://thesmartlocal.com/read/light-to-night-...,31/03/2024 14:13:10,1.291735,103.85113,1.763689
3,Creative Intersections: Traces Of Dragons,19 Jan - 25 Feb 2024,South,ENDED,https://thesmartlocal.com/read/singapore-art-w...,31/03/2024 14:13:10,1.291313,103.849947,1.638695
4,Artbox Avenue 2024,26 Jan - 04 Feb 2024,East,ENDED,https://thesmartlocal.com/read/artbox-avenue-2...,31/03/2024 14:13:10,1.337152,103.959927,2.548565


Quick view of date information extracted.

In [12]:
# Quick view of date formats
df["Date"].value_counts()

Date
13 - 14 Jul 2019             3
17 Nov 2023 - 01 Jan 2024    3
29 Nov - 01 Dec 2019         3
28 Sep 2019                  3
-                            3
                            ..
23 - 24 Jul 2022             1
16 Jul 2022                  1
12 - 19 Jul 2022             1
05 Jul - 14 Aug 2022         1
05 - 31 Mar 2021             1
Name: count, Length: 341, dtype: int64

## Process date and break into start and end dates
Date format can be in:
- 05 Jan- 24 Feb 2024 (missing start year)
- 12 - 28 Jan 2024 (missing start mth and year)

To standardise, we will use month and year information from the end date to update the start date whereeither its month and year are information, on the assumption that they are omitted for simplicity in representation 

In [13]:
# Split period to start/end dates
df[["Date_start", "Date_end"]] = df.apply(lambda row: split_date_format(row["Date"]), axis="columns", result_type="expand")

df.head()

Feb
Jan
Feb
Feb
Feb
Mar
Apr
Apr
Mar
Apr
Jun
Mar
Apr
Jan
Jan
Jan
Apr
Jan
Jan
Mar
Apr
Apr
Feb
Feb
Mar
Mar
Apr
Apr
Apr
May
Apr
Apr
Sep
Jun
Jul
Jun
Jul
Oct
Jul
Jul
Aug
Aug
Aug
Sep
Aug
Oct
Oct
Oct
Sep
Oct
Sep
Sep
Nov
Dec
Oct
Oct
Nov
Oct
Oct
Nov
Nov
Dec
Jan
Jan
Jan
Dec
Dec
Nov
Nov
Dec
Jan
Jan
Dec
Jan
Feb
Jan
Feb
Feb
Jan
Feb
Feb
Jul
May
Jul
May
Apr
Jun
May
Apr
Apr
May
Jul
Jun
Jun
Jun
Jun
Jun
Jun
Oct
Jul
Sep
Aug
Jul
Jul
Jul
Jul
Jan
Oct
Jul
Aug
Jul
Sep
Aug
Aug
Sep
Sep
Sep
Aug
Sep
Sep
Feb
Sep
Jan
Oct
Oct
Oct
Oct
Oct
Oct
Nov
Nov
Oct
Mar
Nov
Feb
Apr
Nov
Nov
Feb
Dec
Nov
Dec
Mar
Jan
Jan
Dec
Dec
Dec
Dec
Feb
Mar
Jan
Jan
Mar
Feb
Mar
Mar
Mar
Mar
Mar
Aug
Jun
Jan
Sep
Dec
Nov
Sep
Mar
Dec
Dec
Mar
Dec
Dec
Jan
Jan
Dec
Dec
Jan
Mar
Jan
Jan
Jan
Jan
Jan
Jan
Jan
Jan
Mar
May
Mar
Mar
Mar
Mar
Dec
Jun
Aug
Aug
Sep
Oct
Oct
Dec
Nov
Nov
Dec
Dec
Dec
Nov
Nov
Dec
Dec
Jan
Dec
Jan
Aug
Jun
Jun
Jul
Feb
Jul
Jun
Jun
Aug
Jul
Jul
Jul
Jul
Jul
Jul
Jul
Jul
Jul
Jul
Jul
Jul
Jul
Jul
Aug
Jul
Aug
Jul
Aug
Aug
Aug
Aug
Aug
Aug
Aug
Aug
Aug
Sep


Unnamed: 0,Event Title,Date,Location,Status,URL,Date_of_check,Lat,Lon,Distance,Date_start,Date_end
0,West Mall BT21-Themed CNY Activities & Promos,05 Jan - 24 Feb 2024,West,ENDED,https://thesmartlocal.com/read/west-mall-bt21-...,31/03/2024 14:13:10,1.350077,103.749257,2.100992,05 Jan 2024,24 Feb 2024
1,Comma Creative Arts Festival 2024,12 - 28 Jan 2024,South,ENDED,https://thesmartlocal.com/read/comma-creative-...,31/03/2024 14:13:10,1.301085,103.835202,2.339933,12 Jan 2024,28 Jan 2024
2,Light To Night 2024,19 Jan - 08 Feb 2024,South,ENDED,https://thesmartlocal.com/read/light-to-night-...,31/03/2024 14:13:10,1.291735,103.85113,1.763689,19 Jan 2024,08 Feb 2024
3,Creative Intersections: Traces Of Dragons,19 Jan - 25 Feb 2024,South,ENDED,https://thesmartlocal.com/read/singapore-art-w...,31/03/2024 14:13:10,1.291313,103.849947,1.638695,19 Jan 2024,25 Feb 2024
4,Artbox Avenue 2024,26 Jan - 04 Feb 2024,East,ENDED,https://thesmartlocal.com/read/artbox-avenue-2...,31/03/2024 14:13:10,1.337152,103.959927,2.548565,26 Jan 2024,04 Feb 2024


In [14]:
print(df.shape)
# Drop null first
df = df[~df["Date_start"].isnull()]
print(df.shape)
# Drop date
df.drop(columns="Date", axis=1 ,inplace=True)

(366, 11)
(363, 11)


In [15]:
# Convert to datetime
df["Date_start"] = pd.to_datetime(df["Date_start"], format="%d %b %Y")
df["Date_end"] = pd.to_datetime(df["Date_end"], format="%d %b %Y")

In [16]:
# Explode the dates
df["Date"] = df.apply(
    lambda x: pd.date_range(start=x["Date_start"], end=x["Date_end"]), axis=1
)
df = df.explode("Date")

In [24]:
column_of_interest = ["Date", "Location"]
for location in df["Location"].unique():
    print(location)
    df_location = df[df["Location"] == location][column_of_interest]

    # Drop duplicates
    df.drop_duplicates(inplace=True, keep="first")
    if location:
        filename = f"{location}_event_latlon.csv"
    else:
        filename = "nsew_latlon.csv"
    df_location.to_csv(filename, index=False)

West
South
East
All
North


# The following codes are more for filling those unknown locations

In [18]:
# Define chrome options
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)

NoSuchDriverException: Message: Unable to obtain driver for chrome using Selenium Manager.; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


As we access the actual pages, we expect some pages to have missing venue information under "Event information" (although this is primary point of information where location name can be pulled). Otherwise we seek for the text with venue or address as a fallback.

In [None]:
for location in missing_location_entries_df["URL"].unique():
    URL = location
    driver.get(URL)
    # venue XPATH
    venue_xpath= '//h5[@class="wfnb-venue"]'

In [None]:
def find_venue_info_xpath(xpath:str) -> str:
    try:
        val = driver.find_element(by=By.XPATH, value=xpath).text
    except NoSuchElementException:
        print("Unable to find info, returning none")
        val = None
    
    return val

In [None]:
# Process those entries which have missing location info
URL = "https://thesmartlocal.com/event-calendar/?a=alltime"
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)
driver.get(URL)

xpath_list = [
    '//h5[@class="wfnb-venue"]',
    '//b[contains(text(),"Venue:")]/../following-sibling::span',
    '//b[contains(text(),"Venue:")]/..' #Case when there is no sibling span
]
# venue default XPATH
venue_xpath= '//h5[@class="wfnb-venue"]'

venue = find_venue_info_xpath(xpath=venue_xpath)

# Second approach
if not venue:
    venue_xpath = '//b[contains(text(),"Venue:")]/../following-sibling::span'
    venue = find_venue_info_xpath(xpath=venue_xpath)


# Webpage wait for required xpath to load
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.XPATH, venue_xpath)))


Map out lat lon using defined function

In [None]:
df[["Lat","Lon"]] = df["Location"]