In [1]:
import requests_cache
import numpy as np
import pandas as pd
import os
import re
# from dotenv import load_dotenv
from modules.utils import *

In [2]:
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Build the connection URL from its components rather than by string
# interpolation: URL.create escapes each part, so credentials containing
# reserved characters such as '@', ':', '/' or '%' no longer corrupt the URL.
# Connection settings are read from the 'user', 'password', 'host' and
# 'database' environment variables; a missing one raises KeyError here.
engine = create_engine(
    URL.create(
        drivername="mysql+mariadbconnector",
        username=os.environ['user'],
        password=os.environ['password'],
        host=os.environ['host'],
        port=3306,
        database=os.environ['database'],
    )
)

In [74]:
# Read every row of the stg_addresses table into a DataFrame through the shared engine.
query = '''
        SELECT *
        FROM stg_addresses
        '''
df = pd.read_sql(sql=query, con=engine)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,town,street_name,block_number,full_address
0,ANG MO KIO,ANG MO KIO AVE 1,205,"205, ANG MO KIO AVE 1"
1,ANG MO KIO,ANG MO KIO AVE 1,207,"207, ANG MO KIO AVE 1"
2,ANG MO KIO,ANG MO KIO AVE 1,208,"208, ANG MO KIO AVE 1"
3,ANG MO KIO,ANG MO KIO AVE 1,215,"215, ANG MO KIO AVE 1"
4,ANG MO KIO,ANG MO KIO AVE 1,216,"216, ANG MO KIO AVE 1"
...,...,...,...,...
9518,YISHUN,YISHUN ST 81,876,"876, YISHUN ST 81"
9519,YISHUN,YISHUN ST 81,877,"877, YISHUN ST 81"
9520,YISHUN,YISHUN ST 81,878,"878, YISHUN ST 81"
9521,YISHUN,YISHUN ST 81,879,"879, YISHUN ST 81"


In [19]:
# HTTP session whose responses are cached in a SQLite store named 'logs/address_cache'.
session = requests_cache.CachedSession(cache_name='logs/address_cache', backend='sqlite')

In [81]:
# `logger` is the debug logger (not defined in this cell).
# Emit a banner, flanked by 50 dashes on each side, to mark the start of a new run.
logger.info("{0}New run started {0}".format('-' * 50))

# Getting latitude, longitude, postal code
# Upper bound (seconds) for a single OneMap request, so one stalled connection
# cannot hang the whole row-wise run.
_ONEMAP_TIMEOUT_SECONDS = 10


def _search_onemap(address, verbose: int = 1) -> list:
    '''
    Run one OneMap address search through the shared cached `session`
    and return the list stored under the response's 'results' key.
    ## Parameters
    address
        Free-text search value interpolated into the `searchVal` query parameter.
    verbose : int
        When equal to 1, the status code and final request URL are logged at INFO level.
    ## Raises
    Whatever `session.get`, `raise_for_status()` or `json()` raise, plus
    KeyError if the payload has no 'results' key; the caller handles these.
    '''
    call = f"https://www.onemap.gov.sg/api/common/elastic/search?searchVal={address}&returnGeom=Y&getAddrDetails=Y"
    # Caching is enabled in the session
    response = session.get(call, timeout=_ONEMAP_TIMEOUT_SECONDS)
    response.raise_for_status()
    data = response.json()
    if verbose == 1:
        logger.info(f'Response: {response.status_code} \tGet request call: {response.url}')
    return data['results']


def get_lat_long(address_df : pd.DataFrame, verbose: int=1):
    '''
    The actual API call to be called row-wise to get latitude, longitude, and postal code
    ## Parameters
    address_df : pd.DataFrame
        DataFrame that contains a combination of ['block_number'] and ['street_name'] as ['full_address'].
        When used with `df.apply(..., axis=1)` this is a single row; only `['full_address']` is read.
    verbose : int
        When equal to 1, every request made is logged at INFO level.
    ## Returns
    list
        `[latitude, longitude, postal_code]` taken from the first search result, with
        latitude and longitude converted to float and the postal code passed through as returned.
        `[nan, nan, nan]` when nothing matches or when any error occurs; errors are
        logged, never raised, so one bad row does not abort the whole run.
    '''
    # np.nan (lowercase) is the spelling that exists in every NumPy version;
    # the np.NaN alias was removed in NumPy 2.0.
    not_found = [np.nan, np.nan, np.nan]
    address = None  # bound up front so the error log below can always reference it

    try:
        address = address_df['full_address']
        results = _search_onemap(address, verbose)

        # For those with ST abbreviations
        if len(results) < 1:
            # Only expand a standalone "ST" token: the word boundary leaves names
            # such as "STIRLING" alone, and the lookahead skips "ST." (e.g. "ST. GEORGE'S").
            expanded = re.sub(r' ST\b(?!\.)', ' STREET', address)
            # Retry only if the expansion actually changed the search value.
            if expanded != address:
                results = _search_onemap(expanded, verbose)

        if len(results) < 1:
            logger.error(f'Error occurred - get_lat_long() API call: no results on the following call: {address}')
            return not_found

        best_match = results[0]
        return [float(best_match['LATITUDE']), float(best_match['LONGITUDE']), best_match['POSTAL']]

    except Exception as err:
        logger.error(f'Error occurred - get_lat_long() API call: {err} on the following call: {address}', exc_info=True)
        return not_found

# Run get_lat_long once per row; 'expand' turns each returned 3-item list into
# three columns, which are stored as latitude / longitude / postcode on df.
df[['latitude', 'longitude', 'postcode']] = df.apply(
    func=get_lat_long,
    axis='columns',
    result_type='expand',
)

In [8]:
# Write the frame to the "geolocations" table, replacing any existing table of
# that name; the DataFrame index is not written as a column.
df.to_sql(name="geolocations", con=engine, if_exists="replace", index=False)

2024-04-15 16:09:29,862 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-04-15 16:09:29,868 INFO sqlalchemy.engine.Engine DESCRIBE `hdb_prices_dev`.`geolocations`
2024-04-15 16:09:29,869 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-04-15 16:09:29,898 INFO sqlalchemy.engine.Engine DESCRIBE `hdb_prices_dev`.`geolocations`
2024-04-15 16:09:29,899 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-04-15 16:09:29,930 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `hdb_prices_dev`
2024-04-15 16:09:29,931 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-04-15 16:09:29,937 INFO sqlalchemy.engine.Engine SHOW FULL TABLES FROM `hdb_prices_dev`
2024-04-15 16:09:29,938 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-04-15 16:09:29,953 INFO sqlalchemy.engine.Engine SHOW CREATE TABLE `geolocations`
2024-04-15 16:09:29,954 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-04-15 16:09:29,960 INFO sqlalchemy.engine.Engine 
DROP TABLE geolocations
2024-04-15 16:09:29,961 INFO sqlalchemy.engine.Engin

9523

In [9]:
# Read the geolocations table back into a DataFrame through the shared engine.
query = '''
        SELECT *
        FROM geolocations
        '''
df = pd.read_sql(sql=query, con=engine)
# Bare last expression so the notebook renders the frame.
df

2024-04-15 16:09:38,476 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-04-15 16:09:38,477 INFO sqlalchemy.engine.Engine DESCRIBE `hdb_prices_dev`.`
        SELECT *
        FROM geolocations
        `
2024-04-15 16:09:38,479 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-04-15 16:09:38,481 INFO sqlalchemy.engine.Engine 
        SELECT *
        FROM geolocations
        
2024-04-15 16:09:38,483 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-04-15 16:09:38,552 INFO sqlalchemy.engine.Engine ROLLBACK


Unnamed: 0,town,street_name,block_number,full_address,latitude,longitude,postcode
0,ANG MO KIO,ANG MO KIO AVE 1,205,"205, ANG MO KIO AVE 1",1.366941,103.843582,560205
1,ANG MO KIO,ANG MO KIO AVE 1,207,"207, ANG MO KIO AVE 1",1.365821,103.842848,560207
2,ANG MO KIO,ANG MO KIO AVE 1,208,"208, ANG MO KIO AVE 1",1.365445,103.842715,560208
3,ANG MO KIO,ANG MO KIO AVE 1,215,"215, ANG MO KIO AVE 1",1.366558,103.841624,560215
4,ANG MO KIO,ANG MO KIO AVE 1,216,"216, ANG MO KIO AVE 1",1.366197,103.841505,560216
...,...,...,...,...,...,...,...
9518,YISHUN,YISHUN ST 81,876,"876, YISHUN ST 81",1.414745,103.835532,760876
9519,YISHUN,YISHUN ST 81,877,"877, YISHUN ST 81",1.413902,103.835454,760877
9520,YISHUN,YISHUN ST 81,878,"878, YISHUN ST 81",1.414053,103.835888,760878
9521,YISHUN,YISHUN ST 81,879,"879, YISHUN ST 81",1.414442,103.836118,760879


In [11]:
# Export the frame to CSV; with to_csv's default settings the index is written
# as the first (unnamed) column.
df.to_csv(path_or_buf='ingestion/geolocations.csv')