# HDB Resale Price Predictor & Visualisation

This project aims to create a data pipeline, using the available APIs (Data.gov.sg and OneMap), to build a web-based application for
1. HDB Price visualisation
2. HDB Price prediction

The prototype aims to read latest data directly from data.gov.sg and perform ETL (Extract, Transform, and Load) to a local/web database of choice.

In [16]:
import requests
import numpy as np
import pandas as pd
import json
import logging
import time
from requests.exceptions import HTTPError
from pprint import pprint
from functools import wraps

## Contents
1. API call data
2. Data Wrangling
3. Feature Engineering

## Getting the data through API call

### Wrapper functions
* To time function calls
* To handle HTTPErrors and other exceptions
* CACHE

In [17]:
# Append log records to app.log with a timestamp, logger name and level on every line.
# No `level` is passed, so the root logger keeps its default WARNING threshold:
# logging.info() records (e.g. the timing lines emitted by timeit) are not written to the file.
logging.basicConfig(filename='app.log', filemode='a', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Separator marking the start of a run; presumably logged at WARNING so it clears the threshold above.
logging.warning(f"{'-'*20}New run started {'-'*100}")

In [18]:
# Wrapper for timing function calls:
def timeit(func):
    '''
    Decorator that reports how long each call to the wrapped function takes.
    The wall-clock start time and the elapsed time are printed, and the elapsed
    time is also sent to logging.info. The wrapped function's result is returned unchanged.
    '''
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        '''
        Forwards any positional/keyword arguments straight to the wrapped function.
        '''
        called_at = time.strftime("%H:%M:%S", time.localtime())
        started = time.perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = time.perf_counter() - started
        label = f'{func.__name__}()'
        print(f'{label} called at \t{called_at} \texecution time: {elapsed:.4f} seconds')
        logging.info(f'{label} called at \texecution time: {elapsed:.4f} seconds')
        return outcome
    return timeit_wrapper

def error_handler(func=None, max_attempts=3, delay=120):
    '''
    Wrapper to catch and handle errors, retrying the call when it raises HTTPError

    Usable bare (``@error_handler``) or with settings
    (``@error_handler(max_attempts=5, delay=30)``).
    ## Parameters
    func : callable
        function to wrap; omitted when the decorator is given keyword settings
    max_attempts : int
        total number of times the function is called before giving up on HTTP errors
    delay : int or float
        seconds to sleep before retrying after a HTTP 429 response
    ## Behaviour of the wrapped function
    * HTTPError: logged and printed, then the call is retried. Once the last attempt
      has failed a HTTPError('Exceeded max tries ...') is raised, chained to the last error.
    * Any other Exception: logged and printed, not retried, and None is returned.
    '''
    if func is None:
        # Called with settings only: hand back a decorator that remembers them
        return lambda wrapped: error_handler(wrapped, max_attempts=max_attempts, delay=delay)

    @wraps(func)
    def error_handler_wrapper(*args, **kwargs):
        '''
        *args and **kwargs here allow parameters for the original function to be taken in
        and passed to the function contained in the wrapper, without needing to declare them in the wrapper function.
        '''
        for attempt in range(1, max_attempts + 1):
            try:
                return func(*args, **kwargs)
            except HTTPError as err:
                logging.error(f'{func.__name__}() encountered {err}')
                # Raise exception once the final attempt has failed
                # (no point sleeping when there is no retry left)
                if attempt >= max_attempts:
                    raise HTTPError(f'Exceeded max tries of {max_attempts}', response=err.response) from err
                print(f'{func.__name__}() encountered {err}')

                # err.response is the Response object from the requests module (it can be None),
                # .status_code gives the code as int
                status_code = getattr(err.response, 'status_code', None)
                if status_code == 429:
                    print(f'Sleeping for {delay} seconds', end = '\t')
                    time.sleep(delay)
                    print('Retrying...', end='\t')
            except Exception as err:
                # Non-HTTP failures are reported but deliberately not retried
                logging.error(f'{func.__name__}() encountered {err}')
                print(f'{func.__name__}() encountered {err}')
                return None
        return None
    return error_handler_wrapper

### Details for Data.gov.sg API call can be found at
https://data.gov.sg/dataset/ckan-datastore-search

In [19]:
@timeit
@error_handler
def get_token(location: str):
    '''
    Function to request an API token from OneMap and update the stored token if it has changed
    ##Parameters
        location: filepath (str)
            JSON file whose content is posted to the OneMap getToken endpoint;
            its 'access_token' and 'expiry_timestamp' entries are rewritten when a different token is returned
    Returns API token : str
    '''
    with open(location, 'r+') as fp:
        data = json.load(fp)
        response = requests.post("https://developers.onemap.sg/privateapi/auth/post/getToken", data=data)
        # Surface HTTP failures as HTTPError so error_handler can log and retry them
        response.raise_for_status()
        token = response.json()
        # .get() so a file that does not hold a token yet is treated as outdated
        if token['access_token'] != data.get('access_token'):
            print(f"New token found")
            data['access_token'] = token['access_token']
            data['expiry_timestamp'] = token['expiry_timestamp']
            fp.seek(0)
            json.dump(data, fp = fp, indent=4)
            # Drop leftover bytes in case the new content is shorter than the old one
            fp.truncate()
            print('Updated token json')
        # `data` now holds the current token; it must not be reloaded from the stale file text
        return data['access_token']

@timeit
@error_handler
def datagovsg_api_call(url: str, sort: str = 'month desc', limit: int = 100, 
                       months: list = None, 
                       years: list = None) -> pd.DataFrame:
    '''
    Function to build the API call and construct the pandas dataframe
    ## Parameters
    url: str
        url for API, with resource_id parameters
    sort: str
        field, by ascending/desc, default by Latest month
    limit: int
        maximum entries (the data.gov.sg API default is 100 if not specified);
        a falsy value leaves the limit parameter out of the call
    months: list
        months desired, int between 1-12, default all twelve months
    years: list
        years desired, int or str, default ["2023"]
    Returns Dataframe of data : pd.DataFrame
    '''
    # Defaults are created per call instead of sharing mutable lists between calls
    if months is None:
        months = range(1, 13)
    if years is None:
        years = ["2023"]

    # Filter value looks like {"month":["2023-01", "2023-02"]}; join() keeps it valid
    # even when no year/month combination is requested
    month_filter = ', '.join(f'"{year}-{str(month).zfill(2)}"' for year in years for month in months)
    month_dict = '{"month":[' + month_filter + ']}'
    url = url+f'&sort={sort}&filters={month_dict}'
    if limit: # API call's default is 100 even without specifying
        print(f'Call limit : {limit}')
        url = url+f'&limit={limit}'
    pprint(f'API call = {url}')
    response = requests.get(url)
    # Raise HTTPError on 4xx/5xx so the error_handler wrapper can deal with it
    response.raise_for_status()
    data = response.json()
    return pd.DataFrame(data['result']['records'])

In [20]:
df = datagovsg_api_call('https://data.gov.sg/api/action/datastore_search?resource_id=f1765b54-a209-4718-8d38-a39237f502b3', 
                        sort='month desc',
                        limit = 10000,
                        months = [5],
                        years=[2023])
df

Call limit : 10000
('API call = '
 'https://data.gov.sg/api/action/datastore_search?resource_id=f1765b54-a209-4718-8d38-a39237f502b3&sort=month '
 'desc&filters={"month":["2023-05"]}&limit=10000')
datagovsg_api_call() called at 	21:28:19 	execution time: 1.3859 seconds


Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,street_name,resale_price,month,remaining_lease,lease_commence_date,storey_range,_id,block
0,ANG MO KIO,2 ROOM,Improved,44,ANG MO KIO AVE 3,275000,2023-05,53 years 09 months,1978,01 TO 03,152258,314
1,ANG MO KIO,3 ROOM,New Generation,68,ANG MO KIO AVE 5,400000,2023-05,55 years 10 months,1980,07 TO 09,152292,538
2,ANG MO KIO,3 ROOM,New Generation,68,ANG MO KIO AVE 5,370000,2023-05,56 years 04 months,1980,01 TO 03,152293,524
3,ANG MO KIO,3 ROOM,New Generation,68,ANG MO KIO AVE 5,398000,2023-05,56 years 09 months,1981,10 TO 12,152294,502
4,ANG MO KIO,3 ROOM,New Generation,68,ANG MO KIO AVE 8,360000,2023-05,56 years 04 months,1980,04 TO 06,152295,505
...,...,...,...,...,...,...,...,...,...,...,...,...
1852,YISHUN,5 ROOM,Improved,112,YISHUN ST 51,710000,2023-05,93 years 08 months,2018,07 TO 09,154110,502B
1853,YISHUN,5 ROOM,Improved,113,YISHUN ST 51,658888,2023-05,91 years 10 months,2016,04 TO 06,154111,504D
1854,YISHUN,EXECUTIVE,Apartment,164,YISHUN AVE 4,1050000,2023-05,68 years 02 months,1992,07 TO 09,154112,661
1855,YISHUN,EXECUTIVE,Maisonette,146,YISHUN RING RD,810000,2023-05,64 years 03 months,1988,04 TO 06,154113,359


In [21]:
# from dataprep.eda import create_report
# create_report(df).show()

## Data wrangling steps
1. Reindexed dataframe using _id (unique to every resale transaction)
2. Changed room types into float values, with Executive as 5.5 rooms (extra study/balcony), and Multigeneration as 6 rooms
3. Storey range was converted to avg_storey, the avg floor would be used (every value is a difference of 3 storeys)
4. Resale_price, Floor area converted to float values
5. Month was converted into datetime format, to be used to detrend the time series moving average
6. Year/Month was separated into Year and Month for visualisation purposes
7. Remaining lease was converted into remaining years (float, e.g. "53 years 09 months" becomes 53.75)
8. Update capitalisation and street naming conventions (for purpose of API call later)
9. Categorised towns into regions (North, West, East, North-East, Central) https://www.hdb.gov.sg/about-us/history/hdb-towns-your-home

In [22]:
@timeit
def clean_df(df: pd.DataFrame):
    '''
    Function to clean the raw dataframe
    ##Parameters
    df : pd.DataFrame
        raw resale records; the columns read are '_id', 'flat_type', 'storey_range', 'resale_price',
        'floor_area_sqm', 'month', 'lease_commence_date', 'remaining_lease', 'street_name', 'block' and 'town'.
        The input is left untouched, all cleaning happens on a copy.
    ##Cleaning done
        1. Reindexed dataframe using _id (unique to every resale transaction)
        2. Changed room types into float values, with Executive as 5.5 rooms (extra study/balcony), and Multigeneration 6 rooms
        3. Storey range was converted to avg_storey, the avg floor would be used (every value is a difference of 3 storeys)
        4. Resale_price, Floor area converted to float values
        5. Month was converted into datetime format, to be used to detrend the time series moving average
        6. Year/Month was separated into Year and Month for visualisation purposes
        7. Remaining lease was converted into remaining years (float)
        8. Update capitalisation and street naming conventions (for purpose of API call later)
        9. Categorised towns into regions (North, West, East, North-East, Central);
           towns missing from the mapping end up with a NaN region
    Returns (cleaned dataframe, dataframe with 'block' and 'street_name'), both indexed by _id
    Raises whatever exception interrupted the cleaning, after printing the step it happened at
    '''
    def categorise_rooms(flat_type):
        '''
        Helper function for categorising number of rooms
        '''
        if flat_type[0] == 'E':
            return 5.5
        elif flat_type[0] == 'M':
            return 6.0
        else:
            return float(flat_type[0])

    def year_month_to_year(remaining_lease):
        '''
        Helper function to change "<y> years <m> months" (or just "<y> years") into years (float)
        '''
        remaining_lease = remaining_lease.split(' ')
        if len(remaining_lease) > 2:
            year = float(remaining_lease[0]) + float(remaining_lease[2])/12
        else:
            year = float(remaining_lease[0])
        return year

    # Work on a copy so the caller's raw dataframe survives and the cell can be re-run
    df = df.copy()
    step = 1
    try:
        # Step 1: set index to overall id
        df = df.set_index('_id')

        # Step 2-3: Create feature "rooms", "avg_storey"
        step = 2
        df['rooms'] = df['flat_type'].apply(categorise_rooms)
        step = 3
        # storey_range looks like "01 TO 03": average the first and last two characters
        df['avg_storey'] = df['storey_range'].apply(lambda x: (int(x[:2])+int(x[-2:]))/2)

        # Step 4-6: Change dtypes
        step = 4
        df['resale_price'] = df['resale_price'].astype('float')
        df['floor_area_sqm'] = df['floor_area_sqm'].astype('float')
        step = 5
        df['timeseries_month'] = pd.to_datetime(df['month'], format="%Y-%m")
        step = 6
        df['year'] = df['timeseries_month'].dt.year
        df['month'] = df['timeseries_month'].dt.month

        # Step 7: Calculate remaining_lease
        step = 7
        df['lease_commence_date'] = df['lease_commence_date'].astype('int')
        df['remaining_lease'] = df['remaining_lease'].apply(year_month_to_year)

        # Step 8: Change capitalization of strings
        step = 8
        for column in df.columns:
            if df[column].dtype == 'O':
                df[column] = df[column].str.title()

        # Update address abbreviations for onemap API call.
        # Word boundaries keep the patterns from rewriting the start of longer words
        # (e.g. "Stadium") and from swallowing the character that follows "St".
        abbreviations = {r'\bSth\b': 'South',
                         r'\bSt\b(?!\.)': 'Street',
                         r'\bSt\.': 'Saint',
                         r'\bNth\b': 'North',
                         r'\bAve\b': 'Avenue',
                         r'\bDr\b': 'Drive',
                         r'\bRd\b': 'Road'}
        for abbreviation, full in abbreviations.items():
            df['street_name'] = df['street_name'].str.replace(abbreviation, full, regex=True)

        # Step 9: Categorise town regions
        step = 9
        town_regions = {'Sembawang' : 'North',
                    'Woodlands' : 'North',
                    'Yishun' : 'North',
                    'Ang Mo Kio' : 'North-East',
                    'Hougang' : 'North-East',
                    'Punggol' : 'North-East',
                    'Sengkang' : 'North-East',
                    'Serangoon' : 'North-East',
                    'Bedok' : 'East',
                    'Pasir Ris' : 'East',
                    'Tampines' : 'East',
                    'Bukit Batok' : 'West',
                    'Bukit Panjang' : 'West',
                    'Choa Chu Kang' : 'West',
                    'Clementi' : 'West',
                    'Jurong East' : 'West',
                    'Jurong West' : 'West',
                    'Tengah' : 'West',
                    'Bishan' : 'Central',
                    'Bukit Merah' : 'Central',
                    'Bukit Timah' : 'Central',
                    'Central Area' : 'Central',
                    'Geylang' : 'Central',
                    'Kallang/Whampoa' : 'Central',
                    'Marine Parade' : 'Central',
                    'Queenstown' : 'Central',
                    'Toa Payoh' : 'Central'}
        df['region'] = df['town'].map(town_regions)
    except Exception as err:
        print(f"Error at step {step}, error message: {err}")
        # Re-raise the real error instead of failing later on an unbound temp_df
        raise

    # Address columns are returned separately for the location API calls
    temp_df = df[['block', 'street_name']]
    # Reorder columns
    # Unused columns - 'lease_commence_date', 'flat_model', 'storey_range', 'flat_type'
    df = df[['resale_price', 'year', 'month', 'timeseries_month', 'region', 'town', 'rooms', 'avg_storey', 'floor_area_sqm', 'remaining_lease']]
    return df, temp_df

In [23]:
df, address_df = clean_df(df)
display(df.dtypes)
df

clean_df() called at 	21:28:21 	execution time: 0.0229 seconds


resale_price               float64
year                         int32
month                        int32
timeseries_month    datetime64[ns]
region                      object
town                        object
rooms                      float64
avg_storey                 float64
floor_area_sqm             float64
remaining_lease            float64
dtype: object

Unnamed: 0_level_0,resale_price,year,month,timeseries_month,region,town,rooms,avg_storey,floor_area_sqm,remaining_lease
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
152258,275000.0,2023,5,2023-05-01,North-East,Ang Mo Kio,2.0,2.0,44.0,53.750000
152292,400000.0,2023,5,2023-05-01,North-East,Ang Mo Kio,3.0,8.0,68.0,55.833333
152293,370000.0,2023,5,2023-05-01,North-East,Ang Mo Kio,3.0,2.0,68.0,56.333333
152294,398000.0,2023,5,2023-05-01,North-East,Ang Mo Kio,3.0,11.0,68.0,56.750000
152295,360000.0,2023,5,2023-05-01,North-East,Ang Mo Kio,3.0,5.0,68.0,56.333333
...,...,...,...,...,...,...,...,...,...,...
154110,710000.0,2023,5,2023-05-01,North,Yishun,5.0,8.0,112.0,93.666667
154111,658888.0,2023,5,2023-05-01,North,Yishun,5.0,5.0,113.0,91.833333
154112,1050000.0,2023,5,2023-05-01,North,Yishun,5.5,8.0,164.0,68.166667
154113,810000.0,2023,5,2023-05-01,North,Yishun,5.5,5.0,146.0,64.250000


## 3. Feature Engineering (Location data)

Lastly, location plays a huge role in house pricing, hence the following location features are engineered:

3.1 Obtaining latitude, longitude, postal codes

3.2 Distance to city center

3.3 Obtaining MRT locations

3.4 Determine nearest MRT and traveling time

### 3.1 Latitude & longitude from address
Using street name and block, I utilized OneMap API to obtain the latitude, longitude, and postal codes of each flat https://www.onemap.gov.sg/docs

In [24]:
@error_handler
def get_location_data(address_df: pd.DataFrame):
    '''
    Function to look up latitude, longitude and postal code for every address via the OneMap search API
    ## Parameters
    address_df : pd.DataFrame
        should contain columns ['block'] and ['street_name'], one API call is made per row
    Returns pd.DataFrame with columns ['lat_long', 'postal_code', 'latitude', 'longitude', 'numpy_array'].
    If splitting/converting the API results fails, the error is printed and nothing is returned (None).
    '''
    # Getting latitude, longitude, postal code
    @timeit
    def get_lat_long(address_df : pd.DataFrame, sleeptime : float =0.15):
        '''
        API call to get latitude, longitude, and postal code for one address
        ## Parameters
        address_df :
            a single row of the address dataframe (this function is applied with axis=1),
            should contain ['block'] and ['street_name']
        sleeptime : float
            Incorporates sleep time to not exceed a max of 250 calls per min
            Default 0.15s
        Returns str formatted as '<latitude>,<longitude> <postal>' taken from the first search result,
        or '0,0 0' when the call or the parsing of its response fails
        '''
        # Lag time between calls
        time.sleep(sleeptime)

        # API call
        address = address_df['block'] + ', ' + address_df['street_name']
        try:
            call = f'https://developers.onemap.sg/commonapi/search?searchVal={address}&returnGeom=Y&getAddrDetails=Y'
            response = requests.get(call)
            response.raise_for_status()
            data = response.json()
            # Only the first search result is used
            return data['results'][0]['LATITUDE'] + ',' + data['results'][0]['LONGITUDE'] + ' ' + data['results'][0]['POSTAL']
        except Exception as err:
            # Covers HTTP errors as well as an empty 'results' list (IndexError)
            print(f'Error occurred - get_lat_long() API call: {err} on the following call:')
            pprint(call)
            return '0,0 0' # Still return 0 values

    def to_numpy_array(lat_long_df):
        '''
        Helper to pack one row's latitude (position 0) and longitude (position 1) into a numpy array
        '''
        # Build a numpy array from latitude and longitude
        combi = np.array([lat_long_df[0], lat_long_df[1]])
        return combi
    

    # This calls the API call function row wise
    position = address_df.apply(get_lat_long, axis=1)

    try:
        # Whitespace split: column 0 = 'lat,long', column 1 = postal code
        temp_df = position.str.split(expand=True)
        # Postal code 'NIL' is replaced by 0 so the int conversion below does not fail
        temp_df.iloc[:,1] = temp_df.iloc[:,1].apply(lambda x: 0 if x=='NIL' else x)
        # NOTE(review): the dtypes displayed after this cell still list postal_code as object - confirm whether this conversion sticks
        temp_df.iloc[:,1] = temp_df.iloc[:,1].astype('int')
        # Comma split: column 0 = latitude, column 1 = longitude
        lat_long_df = temp_df.iloc[:,0].str.split(pat=',', expand=True)
        lat_long_df = lat_long_df.astype('float')
        numpy_array = lat_long_df.apply(to_numpy_array, axis=1)
        
    except Exception as err:
        # Falls through without a return statement, so the caller receives None
        print(f"Error occurred - Splitting data : {err}")
    else:
        geo_data_df = pd.concat([temp_df, lat_long_df, numpy_array], axis=1)
        geo_data_df.columns = ['lat_long', 'postal_code', 'latitude', 'longitude', 'numpy_array']
        return geo_data_df

In [25]:
geo_data_df= get_location_data(address_df)
display(geo_data_df.dtypes)
geo_data_df


get_lat_long() called at 	21:28:21 	execution time: 0.2877 seconds
get_lat_long() called at 	21:28:21 	execution time: 0.2573 seconds
get_lat_long() called at 	21:28:21 	execution time: 0.2383 seconds
get_lat_long() called at 	21:28:21 	execution time: 0.2675 seconds
get_lat_long() called at 	21:28:22 	execution time: 0.2510 seconds
get_lat_long() called at 	21:28:22 	execution time: 0.2365 seconds
get_lat_long() called at 	21:28:22 	execution time: 0.2422 seconds
get_lat_long() called at 	21:28:22 	execution time: 0.2548 seconds
get_lat_long() called at 	21:28:23 	execution time: 0.2516 seconds
get_lat_long() called at 	21:28:23 	execution time: 0.2646 seconds
get_lat_long() called at 	21:28:23 	execution time: 0.2601 seconds
get_lat_long() called at 	21:28:23 	execution time: 0.2416 seconds
get_lat_long() called at 	21:28:24 	execution time: 0.2372 seconds
get_lat_long() called at 	21:28:24 	execution time: 0.2334 seconds
get_lat_long() called at 	21:28:24 	execution time: 0.2518 sec

lat_long        object
postal_code     object
latitude       float64
longitude      float64
numpy_array     object
dtype: object

Unnamed: 0_level_0,lat_long,postal_code,latitude,longitude,numpy_array
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
152258,"1.36622707120636,103.850085858983",560314,1.366227,103.850086,"[1.36622707120636, 103.850085858983]"
152292,"1.37559336678106,103.853347801104",560538,1.375593,103.853348,"[1.37559336678106, 103.853347801104]"
152293,"1.3733782941643,103.853042534069",560524,1.373378,103.853043,"[1.3733782941643, 103.853042534069]"
152294,"1.37572474765516,103.849934305516",560502,1.375725,103.849934,"[1.37572474765516, 103.849934305516]"
152295,"1.37485446863611,103.8502086106",560505,1.374854,103.850209,"[1.37485446863611, 103.8502086106]"
...,...,...,...,...,...
154110,"1.41683901466432,103.841551768426",762502,1.416839,103.841552,"[1.41683901466432, 103.841551768426]"
154111,"1.41833362541375,103.843721655882",764504,1.418334,103.843722,"[1.41833362541375, 103.843721655882]"
154112,"1.42201523201179,103.840714295544",760661,1.422015,103.840714,"[1.42201523201179, 103.840714295544]"
154113,"1.42741919415677,103.845702193715",760359,1.427419,103.845702,"[1.42741919415677, 103.845702193715]"


### 3.2 Distance to city center

The central district of Singapore has the highest housing prices. Properties nearer to the city center tend to have a higher price.

We will make use of this to create a new feature to test if it is significant in model building.

In [26]:
@error_handler
def distance_to(from_coordinates : pd.DataFrame, to_address : str , verbose : int=0):
    '''
    Function to measure how far each entry of a series of coordinates is from a named location
    ## Parameters
    from_coordinates : pd.Series
        each entry should be a single numpy array holding latitude, longitude
    to_address : str
        place and streetname, searched through the OneMap API (first result is used)
    verbose : int
        1 prints the coordinates found for to_address, 2 prints the workings for every entry

    Returns pd.Series with, per entry, the sum of the absolute latitude and longitude differences
    '''
    search_url = f'https://developers.onemap.sg/commonapi/search?searchVal={to_address}&returnGeom=Y&getAddrDetails=Y'
    reply = requests.get(search_url)
    reply.raise_for_status()
    top_hit = reply.json()['results'][0]
    target = np.array([float(top_hit['LATITUDE']), float(top_hit['LONGITUDE'])])
    if verbose==1:
        print(f'Coordinates of {to_address} : {target}')

    def offset_from_target(origin, coordinates):
        # Element-wise difference to the target, made absolute
        delta = origin - coordinates
        magnitude = np.absolute(delta)

        # Latitude and longitude offsets added into one number
        total = np.sum(magnitude)

        if verbose==2:
            print(f'Difference in distances: \n{delta}')
            print()
            print(f'Absolute difference: \n{magnitude}')
            print()
            print(f'Sum of distances \n {total}')

        return total

    return from_coordinates.apply(offset_from_target, coordinates=target)

In [27]:
dist_to_marina_bay = distance_to(geo_data_df['numpy_array'], 'Marina Bay', verbose=1)
dist_to_marina_bay = pd.Series(dist_to_marina_bay, name='dist_to_marina_bay')
df = pd.concat([df, dist_to_marina_bay, geo_data_df['latitude'], geo_data_df['longitude']], axis=1)

Coordinates of Marina Bay : [  1.2834542  103.86080905]


In [28]:
df

Unnamed: 0_level_0,resale_price,year,month,timeseries_month,region,town,rooms,avg_storey,floor_area_sqm,remaining_lease,dist_to_marina_bay,latitude,longitude
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
152258,275000.0,2023,5,2023-05-01,North-East,Ang Mo Kio,2.0,2.0,44.0,53.750000,0.093496,1.366227,103.850086
152292,400000.0,2023,5,2023-05-01,North-East,Ang Mo Kio,3.0,8.0,68.0,55.833333,0.099600,1.375593,103.853348
152293,370000.0,2023,5,2023-05-01,North-East,Ang Mo Kio,3.0,2.0,68.0,56.333333,0.097691,1.373378,103.853043
152294,398000.0,2023,5,2023-05-01,North-East,Ang Mo Kio,3.0,11.0,68.0,56.750000,0.103145,1.375725,103.849934
152295,360000.0,2023,5,2023-05-01,North-East,Ang Mo Kio,3.0,5.0,68.0,56.333333,0.102001,1.374854,103.850209
...,...,...,...,...,...,...,...,...,...,...,...,...,...
154110,710000.0,2023,5,2023-05-01,North,Yishun,5.0,8.0,112.0,93.666667,0.152642,1.416839,103.841552
154111,658888.0,2023,5,2023-05-01,North,Yishun,5.0,5.0,113.0,91.833333,0.151967,1.418334,103.843722
154112,1050000.0,2023,5,2023-05-01,North,Yishun,5.5,8.0,164.0,68.166667,0.158656,1.422015,103.840714
154113,810000.0,2023,5,2023-05-01,North,Yishun,5.5,5.0,146.0,64.250000,0.159072,1.427419,103.845702


In [29]:
name = input('Name save file: e.g. <2023_apr>\n')
if name != '':
    filename= f'static/{name}.csv'
    df.to_csv(filename)
    print(f'File saved as {filename}')

File saved as static/2023_05_dist.csv


### 3.3 MRT Locations
The location of all MRT stations was also obtained using OneMap API and saved as a json file locally

In [11]:
@timeit
@error_handler
def update_mrt_coordinates(mrt_stations=None, filepath='static/mrt_dict.json'):
    '''
    Function to API call for MRT station coordinates and write to json file
    ## Parameters
    mrt_stations : list
        list of mrt station names, default to All stations if nothing is given;
        repeated names are only looked up once
    filepath : str
        filepath and name of json file to write to, should end with .json
    Returns None

    Stations whose lookup fails are reported and left out of the file.
    '''
    if not mrt_stations:
        mrt_stations = ['Admiralty MRT', 'Aljunied MRT', 'Ang Mo Kio MRT', 'Bakau LRT', 'Bangkit LRT', 'Bartley MRT', 'Bayfront MRT',
                        'Bayshore MRT', 'Beauty World MRT', 'Bedok MRT', 'Bedok North MRT', 'Bedok Reservoir MRT', 'Bencoolen MRT',
                        'Bendemeer MRT', 'Bishan MRT', 'Boon Keng MRT', 'Boon Lay MRT', 'Botanic Gardens MRT', 'Braddell MRT',
                        'Bras Basah MRT', 'Buangkok MRT', 'Bugis MRT', 'Bukit Batok MRT', 'Bukit Brown MRT', 'Bukit Gombak MRT',
                        'Bukit Panjang MRT', 'Buona Vista MRT', 'Caldecott MRT', 'Cashew MRT', 'Changi Airport MRT',
                        'Chinatown MRT', 'Chinese Garden MRT', 'Choa Chu Kang MRT', 'City Hall MRT', 'Clarke Quay MRT',
                        'Clementi MRT', 'Commonwealth MRT', 'Compassvale LRT', 'Cove LRT', 'Dakota MRT', 'Dhoby Ghaut MRT',
                        'Downtown MRT', 'Xilin MRT', 'Tampines East MRT', 'Mayflower MRT', 'Upper Thomson MRT',
                        'Lentor MRT', 'Woodlands North MRT', 'Woodlands South MRT', 'Esplanade MRT', 'Eunos MRT',
                        'Expo MRT', 'Fajar LRT', 'Farmway LRT', 'Farrer Park MRT', 'Fort Canning MRT',
                        'Gardens by the Bay MRT', 'Geylang Bahru MRT', 'HarbourFront MRT', 'Haw Par Villa MRT', 'Hillview MRT',
                        'Holland Village MRT', 'Hougang MRT', 'Jalan Besar MRT', 'Joo Koon MRT', 'Jurong East MRT',
                        'Jurong West MRT', 'Kadaloor LRT', 'Kaki Bukit MRT', 'Kallang MRT', 'Kembangan MRT', 'Keppel MRT',
                        'King Albert Park MRT', 'Kovan MRT', 'Kranji MRT', 'Labrador Park MRT', 'Lakeside MRT', 'Lavender MRT',
                        'Layar LRT', 'Little India MRT', 'Lorong Chuan MRT', 'MacPherson MRT', 'Marina Bay MRT', 'Marina South Pier MRT',
                        'Marsiling MRT', 'Marymount MRT', 'Mattar MRT', 'Meridian LRT', 'Mountbatten MRT',
                        'Newton MRT', 'Nibong LRT', 'Nicoll Highway MRT', 'Novena MRT', 'Oasis LRT', 'One-North MRT', 'Orchard MRT',
                        'Outram Park MRT', 'Paya Lebar MRT', 'Pasir Ris MRT',
                        'Pioneer MRT', 'Potong Pasir MRT', 'Promenade MRT', 'Punggol MRT', 'Queenstown MRT', 'Raffles Place MRT', 'Redhill MRT',
                        'Riviera LRT', 'Rochor MRT', 'Sembawang MRT', 'Sengkang MRT', 'Serangoon MRT', 'Simei MRT', 'Sixth Avenue MRT', 
                        'Somerset MRT', 'Springleaf MRT', 'Stadium MRT', 'Stevens MRT', 'Sumang LRT', 'Tai Seng MRT', 'Tampines MRT', 
                        'Tampines East MRT', 'Tampines West MRT', 'Tanah Merah MRT', 'Tanjong Pagar MRT', 'Tanjong Rhu MRT', 'Teck Lee LRT', 
                        'Telok Ayer MRT', 'Telok Blangah MRT', 'Thanggam LRT', 'Tiong Bahru MRT', 'Toa Payoh MRT', 
                        'Tuas Crescent MRT', 'Tuas Link MRT', 'Tuas West Road MRT', 'Ubi MRT', 'Upper Changi MRT', 
                        'Woodlands MRT', 'Woodlands South MRT', 'Woodlands North MRT', 'Yew Tee MRT', 'Yio Chu Kang MRT', 'Yishun MRT']
    # Future stations - 'Tampines North MRT', 'Tengah MRT'

    mrt_coordinates = {}

    # dict.fromkeys drops repeated names (keeping first-seen order) so each station costs one API call
    for mrt in dict.fromkeys(mrt_stations):
        # Reset per station so the error report below never shows a previous station's response
        data = None
        try:
            response = requests.get(f"https://developers.onemap.sg/commonapi/search?searchVal={mrt}&returnGeom=Y&getAddrDetails=Y")
            response.raise_for_status()
            data = response.json()
            # Station name as key, (latitude, longitude) of the first search result as value
            first_hit = data['results'][0]
            mrt_coordinates[mrt] = (float(first_hit['LATITUDE']), float(first_hit['LONGITUDE']))
        except HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
        except Exception as err:
            print(f'Other error occurred: {err}')
            print(f'Error for {mrt, data}')

    with open(filepath, 'w') as f:
        json.dump(mrt_coordinates, f, indent=4)

@timeit
@error_handler
def get_mrt_coordinates(filepath = 'static/mrt_dict.json'):
    '''
    Function to load the saved MRT coordinates back from their json file
    ## Parameters
    filepath : str
        location of the json file
    Returns data : dictionary parsed from the file
    '''
    with open(filepath, 'r') as handle:
        return json.load(handle)


Load Json file and convert to numpy array to utilize matrix operations.

In [12]:
mrt_coordinates_dict = get_mrt_coordinates()

# Convert coordinates into numpy arrays
mrt_stations = np.array(list(mrt_coordinates_dict.keys()))
mrt_coordinates = np.array(list(mrt_coordinates_dict.values()))

get_mrt_coordinates() called at 	09:29:45 	execution time: 0.0011 seconds


### 3.4 Nearest MRT stations and Minimum distance/time
* Using the matrix operations, we are able to find the nearest MRT station by absolute distance 
* Then use route_api_call(), a wrapper around OneMap's routing API, to get the distance/time to those MRT stations

In [13]:
@error_handler
def find_nearest_stations(geo_data_df : pd.DataFrame, mrt_stations : np.array=mrt_stations, mrt_coordinates : np.array=mrt_coordinates, 
                          n_nearest_stations: int=2, verbose : int=0):
    '''
    Function to determine nearest MRT station of the resale_flat based on latitude and longitude
    ## Parameters
    geo_data_df : pd.DataFrame
        one row of the geo data (the function is applied with axis=1); its ['numpy_array']
        entry holds the flat's [latitude, longitude]
    mrt_stations : np.array
        station names, in the same order as mrt_coordinates
    mrt_coordinates : np.array
        one (latitude, longitude) pair per station
    n_nearest_stations: int=2
        how many stations to return
    verbose : int=0
        1 prints the intermediate workings

    Returns a list of the n_nearest_stations station names, nearest first
    Raises ValueError if more stations are requested than are available
    '''
    if n_nearest_stations > len(mrt_stations):
        raise ValueError(f'Requested {n_nearest_stations} nearest stations but only {len(mrt_stations)} are available')

    # Matrix substraction to get difference with each MRT, convert to absolute values
    distance_diff = geo_data_df['numpy_array'] - mrt_coordinates
    absolute_dist = np.absolute(distance_diff)

    # Matrix sum over latitude and longitude of each entry
    sum_of_distances = np.sum(absolute_dist, axis=1)

    # Station indices ordered nearest first. Picking by index (instead of searching for the
    # distance value) keeps equally distant stations from being reported as the same station twice.
    nearest_order = np.argsort(sum_of_distances, kind='stable')
    sorted_distances = sum_of_distances[nearest_order]
    nearest_stations = [mrt_stations[idx] for idx in nearest_order[:n_nearest_stations]]

    if verbose==1:
        print(f'Difference in distances: \n{distance_diff[:5]}')
        print()
        print(f'Absolute difference: \n{absolute_dist[:5]}')
        print()
        print(f'Sum of distances \n {sum_of_distances[:5]}')
        print()
        print(f'Sorted distances\n{sorted_distances[:5]}')
        print()
        print(f'Top {n_nearest_stations}')
        print(nearest_stations)

    return nearest_stations

In [14]:
n_nearest_stations = 2
# Matrix operations to find nearest MRT stations for each row
nearest_stations = geo_data_df.apply(find_nearest_stations, n_nearest_stations=n_nearest_stations, axis=1, verbose=0)
nearest_stations_df = pd.DataFrame(nearest_stations.tolist(), index=geo_data_df.index, columns=['nearest_station_'+ str(x) for x in range(n_nearest_stations)])
geo_data_df = pd.concat([geo_data_df, nearest_stations_df], axis=1)
geo_data_df

Unnamed: 0_level_0,lat_long,postal_code,latitude,longitude,numpy_array,nearest_station_0,nearest_station_1
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
150072,"1.36622707120636,103.850085858983",560314,1.366227,103.850086,"[1.36622707120636, 103.850085858983]",Ang Mo Kio MRT,Bishan MRT
150073,"1.37340092645025,103.849073244454",560510,1.373401,103.849073,"[1.37340092645025, 103.849073244454]",Ang Mo Kio MRT,Yio Chu Kang MRT
150074,"1.36558833593063,103.840518883254",560220,1.365588,103.840519,"[1.36558833593063, 103.840518883254]",Mayflower MRT,Ang Mo Kio MRT
150075,"1.3652661423815,103.844538059044",562308,1.365266,103.844538,"[1.3652661423815, 103.844538059044]",Ang Mo Kio MRT,Mayflower MRT
150076,"1.37392238703482,103.855621370524",560541,1.373922,103.855621,"[1.37392238703482, 103.855621370524]",Ang Mo Kio MRT,Yio Chu Kang MRT
...,...,...,...,...,...,...,...
152254,"1.43515573230958,103.839804271982",760258,1.435156,103.839804,"[1.43515573230958, 103.839804271982]",Yishun MRT,Sembawang MRT
152255,"1.43611425409216,103.837605235958",760293,1.436114,103.837605,"[1.43611425409216, 103.837605235958]",Yishun MRT,Sembawang MRT
152256,"1.42171348124463,103.83554529409",760604,1.421713,103.835545,"[1.42171348124463, 103.83554529409]",Yishun MRT,Lentor MRT
152257,"1.42601954522146,103.82993946372",760723,1.426020,103.829939,"[1.42601954522146, 103.82993946372]",Yishun MRT,Sembawang MRT


### Get minimum distance/time using OneMap API call

In [15]:
def route_api_call(routeType: str, start: str, end: str, metric: str, credentials : str,
                   date = '01-26-2023', time_start = '07:35:00', mode = 'TRANSIT', 
                   maxWalkDistance = 1000, numItineraries = 2, verbose=0, recursive_call=None,
                   timeout=30):
    '''
    Function to api call OneMap for routing
    ## Parameters
    routeType : str
        option between ['walk','drive','cycle', 'pt']
    start : str
        origin, inserted as-is into the `start` query parameter
        (callers in this notebook pass a 'latitude,longitude' string)
    end : str
        destination, inserted as-is into the `end` query parameter
    metric : str
        'time' returns the time taken, any other value returns the distance
    credentials : str
        OneMap API token, sent as the `token` query parameter
    Below only applicable if routeType == 'pt'
        date : str MM-DD-YYYY
            default '01-26-2023'
        time_start : str HH:MM:SS
            sent as the `time` query parameter
            default '07:35:00'
        mode : str 
            choose between TRANSIT, BUS, RAIL
            default 'TRANSIT'
        maxWalkDistance : int
            max walking distance allowed, in meters
            default 1000
        numItineraries : int 
            number of suggested routes
            default 2
    verbose : int 
        1 to print time and distance, 2 for the whole json response
        default 0
    recursive_call : any
        unused, kept only so existing callers do not break
    timeout : float
        seconds to wait for the OneMap server before requests raises a
        Timeout, so one stalled connection cannot hang a whole batch
        default 30
    
    ### Returns time if metric == 'time', otherwise distance, for chosen routeType
        time is in seconds 
        total_distance is in metres.
        For routeType 'pt' only the first itinerary is used, time is
        walkTime + transitTime + waitingTime, and the "distance" returned is
        that same time value (see NOTE in the code).

    ### Raises
        KeyError if routeType is not one of the four options (raised before any API call)
        requests HTTPError (via raise_for_status) on a non-success response
    '''
    # No artificial delay between calls: a previous time.sleep(0.24) meant to stay within
    # 250 calls per minute was removed because server response time already averages about 0.7s per call

    # Fail fast on an unknown routeType, before spending an API call
    if routeType not in ('walk', 'drive', 'cycle', 'pt'):
        raise KeyError("Enter valid routeType, choose between 'walk','drive','cycle', 'pt'")

    # Query common to every routeType
    url = f"https://developers.onemap.sg/privateapi/routingsvc/route?start={start}&end={end}&routeType={routeType}&token={credentials}"
    if routeType == 'pt':
        # Extra options only sent for public transport
        url += f"&date={date}&time={time_start}&mode={mode}&maxWalkDistance={maxWalkDistance}&numItineraries={numItineraries}"

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    data = response.json()

    # Public transport
    if routeType == 'pt':
        itinerary = data['plan']['itineraries'][0]
        summary = {key: itinerary[key] for key in ('walkTime', 'transitTime', 'waitingTime')}
        time_taken = sum(summary.values())
        # NOTE(review): the pt response is not read for a total distance here, so
        # 'distance' mirrors the total time exactly as before - confirm whether
        # walkDistance should be returned instead when metric != 'time'
        distance = time_taken
        pt_walk_distance = itinerary['walkDistance']
        if verbose==1:
            pprint(summary)
            print(f'Total public transport time: {time_taken}')
            print(f'Walk distance to public transport: {pt_walk_distance}')

    # Walk, drive, cycle
    else:
        route_summary = data['route_summary']
        time_taken = route_summary['total_time']
        distance = route_summary['total_distance']
        if verbose==1:
            # Label the printout with the mode actually requested, not always "Walking"
            label = {'walk': 'Walking', 'drive': 'Driving', 'cycle': 'Cycling'}[routeType]
            print(f'{label} time: {time_taken}')
            print(f'{label} distance: {distance}')

    # Full response dump for debugging
    if verbose==2:
        pprint(data)

    return time_taken if metric=='time' else distance

@timeit
@error_handler
def time_taken_to_station(geo_data_df, credentials, mrt_coordinates_dict=mrt_coordinates_dict,
                          n_nearest_stations=n_nearest_stations):
    '''
    Query route_api_call() for one address and collect the results in a list:
    [walking distance to the first listed station,
     public transport time to station 0, public transport time to station 1, ...]

    geo_data_df is expected to be a single row (this function is passed to
    DataFrame.apply with axis=1), holding 'lat_long' and the
    'nearest_station_<i>' entries. A falsy API result is recorded as 0.
    '''
    origin = geo_data_df['lat_long']
    # One label per nearest-station column obtained previously, defaulted to 2
    station_labels = [f'nearest_station_{x}' for x in range(n_nearest_stations)]
    stations = geo_data_df[station_labels]

    results = []
    for position, station_name in enumerate(stations):
        # Station coordinates joined into a "latitude,longitude" string (1.121231,102.123123)
        destination = ','.join(map(str, mrt_coordinates_dict[station_name]))

        # Walking distance is only requested for the first (closest) station
        if position == 0:
            walk_distance = route_api_call('walk', origin, destination, 'distance', credentials)
            results.append(walk_distance if walk_distance else 0)

        # Public transport time is requested for every station
        transit_time = route_api_call('pt', origin, destination, 'time', credentials, numItineraries = 1)
        results.append(transit_time if transit_time else 0)

    return results

Due to the large number of API calls, we split the data into batches and extract the results batch by batch.

In [16]:
@error_handler
def split_df(geo_data_df: pd.DataFrame, interval: int=500):
    '''
    Cut a DataFrame into consecutive row batches of at most `interval` rows
    (the last batch holds whatever rows remain), print how many batches
    were produced and return them as a list.
    '''
    row_count = len(geo_data_df.index)
    batches = [
        geo_data_df.iloc[offset:offset + interval, :]
        for offset in range(0, row_count, interval)
    ]
    print(f'Number of dataframes split into: {len(batches)}')
    return batches

# `function` is not a built-in name, so it cannot be used as an annotation; Callable is
from typing import Callable


def iterate_function(splitted_df_list: list, results: list, func: Callable, start: int, stop: int):
    '''
    Apply func row by row to the batches whose position satisfies start <= index < stop.
    Appends to results (list) in place, one entry per processed batch; returns nothing.

    ## Parameters
    splitted_df_list : list
        batches (DataFrames) to process
    results : list
        list that receives the output of each processed batch
    func : Callable
        function handed to DataFrame.apply(axis=1); it is called with the keyword
        arguments credentials and n_nearest_stations, both read from the
        notebook's global variables, which must exist before this is run
    start : int
        position of the first batch to process
    stop : int
        position after the last batch to process

    After each batch the user is prompted and can answer 'n' to stop early.
    '''
    print(f'Writing to {id(results)} with {len(results)} elements already present')
    for index, splitted_df in enumerate(splitted_df_list):
        if index >= stop:
            # Past the requested range, no later batch can qualify
            break
        if index < start:
            continue
        time_distance = splitted_df.apply(func, credentials=credentials, n_nearest_stations=n_nearest_stations, axis=1)
        results.append(time_distance)
        cont = input(f'Done with index {index}, continue? Y/N \n')
        if cont.lower() == 'n':
            break
    print(f'Length of updated results list: {len(results)}')

# Break geo_data_df into batches of at most 400 rows so the API calls can be run batch by batch
splitted_df_list = split_df(geo_data_df, interval=400)

Number of dataframes split into: 6


Run the code in batches, appending each batch's results to a list in place.

In [17]:
# get_token is defined elsewhere in the notebook; presumably it returns the OneMap API token
# that the routing calls send as `credentials` - confirm against its definition
credentials=get_token("venv/onemap.json")
# Collects one result per processed batch; iterate_function appends to it in place
time_distance_list = []
# Process every batch: positions 0 up to (but not including) len(splitted_df_list)
iterate_function(splitted_df_list, time_distance_list, time_taken_to_station, 0, len(splitted_df_list))

New token found
Updated token json
get_token() called at 	09:29:45 	execution time: 0.4343 seconds
Writing to results list with 0 elements already present
time_taken_to_station() called at 	09:29:45 	execution time: 0.8514 seconds
time_taken_to_station() called at 	09:29:46 	execution time: 0.7381 seconds
time_taken_to_station() called at 	09:29:47 	execution time: 1.0606 seconds
time_taken_to_station() called at 	09:29:48 	execution time: 1.0195 seconds
time_taken_to_station() called at 	09:29:49 	execution time: 1.0931 seconds
time_taken_to_station() called at 	09:29:50 	execution time: 1.0073 seconds
time_taken_to_station() called at 	09:29:51 	execution time: 1.0158 seconds
time_taken_to_station() called at 	09:29:52 	execution time: 1.3459 seconds
time_taken_to_station() called at 	09:29:53 	execution time: 1.1169 seconds
time_taken_to_station() called at 	09:29:55 	execution time: 1.2286 seconds
time_taken_to_station() called at 	09:29:56 	execution time: 0.9183 seconds
time_take

Put the DataFrame back together if all runs were successful

In [45]:
# Every batch must have produced a result before the pieces are stitched back together
if len(splitted_df_list) != len(time_distance_list):
    raise IndexError('Mismatch in length of starting and results list')

# Stack the per-batch results, then spread each row's list of values across columns
time_distance = pd.DataFrame(
    pd.concat(time_distance_list).to_dict()
).transpose()
# First value is the walking distance, followed by one public transport time per station
time_distance.columns = ['dist_to_station'] + [f'time_route_{x}' for x in range(n_nearest_stations)]
display(time_distance)

Unnamed: 0,dist_to_station,time_route_0,time_route_1
150072,498,213,686
150073,463,306,492
150074,1150,523,533
150075,1102,409,694
150076,1201,650,500
...,...,...,...
152254,1195,380,705
152255,1103,474,799
152256,993,487,1339
152257,888,575,1030


### Determine minimum time

In [46]:
# Keep only the public transport time columns (walking distance removed)
temp_df = time_distance.drop(columns=['dist_to_station'])
# Fastest public transport option for each row
min_pt_time = temp_df.min(axis=1).rename('min_pt_time')
# geo_data_df is replaced here: only dist_to_station and min_pt_time are kept,
# the earlier columns (lat_long, latitude, longitude, postal_code, ...) are not carried over
geo_data_df = pd.concat([time_distance['dist_to_station'], min_pt_time], axis=1)
geo_data_df

Unnamed: 0,dist_to_station,min_pt_time
150072,498,213
150073,463,306
150074,1150,523
150075,1102,409
150076,1201,500
...,...,...
152254,1195,380
152255,1103,474
152256,993,487
152257,888,575


## Tidying up the full dataframe

In [47]:
# Add the station features held in geo_data_df to the main dataframe as extra columns (axis=1)
df = pd.concat([df, geo_data_df], axis=1)
df

Unnamed: 0,resale_price,year,month,timeseries_month,region,town,rooms,avg_storey,floor_area_sqm,remaining_lease,dist_to_station,min_pt_time
150072,298000.0,2023,4,2023-04-01,North-East,Ang Mo Kio,2.0,8.0,44.0,53.750000,498,213
150073,305000.0,2023,4,2023-04-01,North-East,Ang Mo Kio,2.0,8.0,44.0,56.166667,463,306
150074,372000.0,2023,4,2023-04-01,North-East,Ang Mo Kio,3.0,2.0,82.0,53.333333,1150,523
150075,585000.0,2023,4,2023-04-01,North-East,Ang Mo Kio,3.0,20.0,70.0,88.416667,1102,409
150076,350000.0,2023,4,2023-04-01,North-East,Ang Mo Kio,3.0,5.0,68.0,56.833333,1201,500
...,...,...,...,...,...,...,...,...,...,...,...,...
152254,808000.0,2023,4,2023-04-01,North,Yishun,4.5,2.0,154.0,61.166667,1195,380
152255,892000.0,2023,4,2023-04-01,North,Yishun,4.5,2.0,169.0,68.333333,1103,474
152256,935000.0,2023,4,2023-04-01,North,Yishun,4.5,5.0,164.0,68.166667,993,487
152257,780000.0,2023,4,2023-04-01,North,Yishun,4.5,11.0,142.0,62.166667,888,575
