# HDB Resale Price Predictor & Visualisation

This project aims to create a data pipeline with the help of availale APIs (Data.gov.sg and OneMap) to build a web-based application for
1. HDB Price visualisation
2. HDB Price prediction

The prototype aims to read latest data directly from data.gov.sg and perform ETL (Extract, Transform, and Load) to a local/web database of choice.

In [56]:
import requests
import requests_cache
import numpy as np
import pandas as pd
import json
import logging
import time
from requests.exceptions import HTTPError
from pprint import pprint
from functools import wraps
from geopy.distance import geodesic as GD

## Data Wrangling Contents
1. API call data
2. Data Wrangling
3. Feature Engineering

## 1. Getting the data through API call

### Wrapper functions
* To time function calls
* To error handle HTTPerrors and other Exceptions
* To cache API calls

In [57]:
logging.basicConfig(filename='wrangling.log', filemode='a', format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logging.warning(f"{'-'*20}New run started {'-'*100}")

# Enable caching
session = requests_cache.CachedSession('F:\python_stuff\hdb_project_cache')

In [58]:
# Wrapper for timing function calls:
def timeit(func):
    '''
    Wrapper to time function call
    '''
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        '''
        *args and **kwargs here allow parameters for the original function to be taken in
        and passed to the function contained in the wrapper.
        '''
        current_time = time.strftime("%H:%M:%S", time.localtime())
        start = time.perf_counter()
        result = func(*args, **kwargs)
        end = time.perf_counter()
        time_taken = end-start
        print(f'{func.__name__}() called at \t{current_time} \texecution time: {time_taken:.4f} seconds')
        logging.info(f'{func.__name__}() called at \texecution time: {time_taken:.4f} seconds')
        return result
    return timeit_wrapper

def error_handler(func, max_attempts=3, delay=120):
    '''
    Wrapper to catch and handle errors
    '''
    @wraps(func)
    def error_handler_wrapper(*args, **kwargs):
        '''
        *args and **kwargs here allow parameters for the original function to be taken in
        and passed to the function contained in the wrapper, without needed to declare them in the wrapper function.
        '''
        for i in range(max_attempts):
            try:
                result = func(*args, **kwargs)
            except HTTPError as err:
                logging.error(f'{func.__name__}() encountered {err}')
                # Raise exception if we reach max tries
                if i == max_attempts:
                    raise HTTPError(f'Exceeded max tries of {max_attempts}')
                print(f'{func.__name__}() encountered {err}')

                # err.response gives us the Response object from requests module, we can call .status_code to get the code as int
                if err.response.status_code == 429:
                    print(f'Sleeping for {delay} seconds', end = '\t')
                    time.sleep(delay)
                    print('Retrying...', end='\t')
            except Exception as err:
                logging.error(f'{func.__name__}() encountered {err}') 
                print(f'{func.__name__}() encountered {err}')
                break
            else:
                return result
    return error_handler_wrapper

### Details for Data.gov.sg API call can be found at
https://data.gov.sg/dataset/ckan-datastore-search

In [59]:
@timeit
@error_handler
def get_token(location: str):
    '''
    Function to check if API token is still valid and updates API token if outdated
    ##Parameters
        location: filepath (str)
    Returns API token : str
    '''
    with open(location, 'r+') as fp:
        file = fp.read()
        data = json.loads(file)
        response = requests.post("https://developers.onemap.sg/privateapi/auth/post/getToken", data=data)
        token = response.json()
        if token['access_token'] != data['access_token']:
            print(f"New token found")
            data['access_token'] = token['access_token']
            data['expiry_timestamp'] = token['expiry_timestamp']
            fp.seek(0)
            json.dump(data, fp = fp, indent=4)
            print('Updated token json')
            data = json.loads(file)
        return data['access_token']

@timeit
@error_handler
def datagovsg_api_call(url: str, sort: str = 'month desc', limit: int = 100, 
                       months:list =[1,2,3,4,5,6,7,8,9,10,11,12], 
                       years:list =["2022"]) -> pd.DataFrame:
    '''
    Function to build the API call and construct the pandas dataframe
    ## Parameters
    url: str
        url for API, with resource_id parameters
    sort: str
        field, by ascending/desc, default by Latest month
    limit: int
        maximum entries (API default by OneMap is 100, if not specified)
    months: list
        months desired, int between 1-12
    years: list
        months desired , int
    Returns Dataframe of data : pd.DataFrame
    '''
    month_dict = '{"month":['
    for year in years:
        for month in months: # months 1-12
            month_dict = month_dict + f'"{year}-{str(month).zfill(2)}", '
    month_dict = month_dict[:-2] # Cancel out extra strings <, >
    month_dict = month_dict + ']}'
    url = url+f'&sort={sort}&filters={month_dict}'
    if limit: # API call's default is 100 even without specifying
        print(f'Call limit : {limit}')
        url = url+f'&limit={limit}'
    pprint(f'API call = {url}')
    response = requests.get(url)
    response.raise_for_status()
    data = response.json()
    df = pd.DataFrame(data['result']['records'])
    return df

In [60]:
df = datagovsg_api_call('https://data.gov.sg/api/action/datastore_search?resource_id=f1765b54-a209-4718-8d38-a39237f502b3', 
                        sort='month desc',
                        limit = 1000000,
                        months = [1,2,3,4,5,6,7,8,9,10,11,12],
                        years=[2022,2023])
df

Call limit : 1000000
('API call = '
 'https://data.gov.sg/api/action/datastore_search?resource_id=f1765b54-a209-4718-8d38-a39237f502b3&sort=month '
 'desc&filters={"month":["2022-01", "2022-02", "2022-03", "2022-04", '
 '"2022-05", "2022-06", "2022-07", "2022-08", "2022-09", "2022-10", "2022-11", '
 '"2022-12", "2023-01", "2023-02", "2023-03", "2023-04", "2023-05", "2023-06", '
 '"2023-07", "2023-08", "2023-09", "2023-10", "2023-11", '
 '"2023-12"]}&limit=1000000')
datagovsg_api_call() called at 	19:27:37 	execution time: 2.2712 seconds


Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,street_name,resale_price,month,remaining_lease,lease_commence_date,storey_range,_id,block
0,GEYLANG,4 ROOM,Model A,104,DAKOTA CRES,788000,2023-06,60 years 06 months,1984,10 TO 12,154888,58
1,GEYLANG,4 ROOM,Model A,93,CIRCUIT RD,875000,2023-06,92 years 06 months,2016,16 TO 18,154887,17B
2,GEYLANG,4 ROOM,Model A,93,CIRCUIT RD,770000,2023-06,92 years 07 months,2016,04 TO 06,154886,18D
3,GEYLANG,3 ROOM,Improved,58,PAYA LEBAR WAY,300000,2023-06,48 years 04 months,1972,04 TO 06,154885,91
4,GEYLANG,3 ROOM,Improved,58,PAYA LEBAR WAY,295000,2023-06,48 years 04 months,1972,01 TO 03,154884,91
...,...,...,...,...,...,...,...,...,...,...,...,...
39140,CHOA CHU KANG,4 ROOM,Premium Apartment,102,CHOA CHU KANG AVE 2,439888,2022-01,76 years 06 months,1999,13 TO 15,117232,296A
39141,CHOA CHU KANG,4 ROOM,Model A,104,CHOA CHU KANG AVE 2,420000,2022-01,72 years,1994,04 TO 06,117231,252
39142,CHOA CHU KANG,4 ROOM,Premium Apartment,101,CHOA CHU KANG AVE 2,410000,2022-01,76 years 06 months,1999,01 TO 03,117230,297C
39143,CLEMENTI,3 ROOM,New Generation,67,CLEMENTI AVE 2,400000,2022-01,55 years 06 months,1978,10 TO 12,117369,334


## 2. Data wrangling steps
1. Reindexed dataframe using _id (unique to every resale transaction)
2. Changed room types into float values, with Executive as 5.5 rooms (extra study/balcony/bathroom)
3. Storey range was converted to avg_storey, the avg floor would be used (every value is a difference of 3 storeys)
4. Resale_price, Floor area converted to float values
5. Month was converted into datetime format, to be used to detrend the time series moving average
6. Year/Month was separated into Year and Month for visualisation purposes
7. Remaining lease was converted into remaining months (float)
8. Update capitalisation and street naming conventions (for purpose of API call later)
9. Categorised towns into regions (North, West, East, North-East, Central) https://www.hdb.gov.sg/about-us/history/hdb-towns-your-home

In [61]:
@timeit
def clean_df(df: pd.DataFrame):
    '''
    Function to clean the raw dataframe
    ##Parameters
    pd.DataFrame
    ##Cleaning done
        1. Reindexed dataframe using _id (unique to every resale transaction)
        2. Changed room types into float values, with Executive as 4.5 rooms (extra study/balcony), and Multigeneration 6 rooms
        3. Storey range was converted to avg_storey, the avg floor would be used (every value is a difference of 3 storeys)
        4. Resale_price, Floor area converted to float values
        5. Month was converted into datetime format, to be used to detrend the time series moving average
        6. Year/Month was separated into Year and Month for visualisation purposes
        7. Remaining lease was converted into remaining months (float)
        8. Update capitalisation and street naming conventions (for purpose of API call later)
        9. Categorised towns into regions (North, West, East, North-East, Central) 
    Returns the cleaned dataframe
    '''
    try:
        # Start
        # Step 1: set index to overall id
        step = 1
        df.set_index('_id', inplace=True)
            
        # Step 2: Create feature "rooms", "avg_storey"
        def categorise_rooms(flat_type):
            '''
            Helper function for categorising number of rooms
            '''
            if flat_type[0] == 'E' or flat_type[0] == 'M':
                return 5.5
            else:
                return float(flat_type[0])
        
        step = 2
        df['rooms'] = df['flat_type'].apply(categorise_rooms)
        step = 3
        df['avg_storey'] = df['storey_range'].apply(lambda x: (int(x[:2])+int(x[-2:]))/2)

        # Step 4-6: Change dtypes
        df['resale_price'] = df['resale_price'].astype('float')
        df['floor_area_sqm'] = df['floor_area_sqm'].astype('float')
        step = 5
        df['timeseries_month'] = pd.to_datetime(df['month'], format="%Y-%m")
        step = 6
        df['year'] = df['timeseries_month'].dt.year
        df['month'] = df['timeseries_month'].dt.month
        step = 7
        df['lease_commence_date'] = df['lease_commence_date'].astype('int')
        
        # Calculate remaining_lease
        def year_month_to_year(remaining_lease):
            '''
            Helper function to change year & months, into years (float)
            '''
            remaining_lease = remaining_lease.split(' ')
            if len(remaining_lease) > 2:
                year = float(remaining_lease[0]) + float(remaining_lease[2])/12
            else:
                year = float(remaining_lease[0])
            return year
        
        df['remaining_lease'] = df['remaining_lease'].apply(year_month_to_year)

        step = 8
        # Step 8: Change capitalization of strings
        for column in df.columns:
            if df[column].dtype == 'O':
                df[column] = df[column].str.title()
        
        # Update address abbreviations for onemap API call
        abbreviations = {'Sth':'South', 
                        '[S][t][^.ri]':'Street ', 
                        '[S][t]$':'Street',
                        '[S][t][.]':'Saint', 
                        'Nth':'North', 
                        'Ave':'Avenue', 
                        'Dr':'Drive', 
                        'Rd':'Road'}
        for abbreviation, full in abbreviations.items():
            df['street_name'] = df['street_name'].str.replace(abbreviation, full, regex=True)
        
        # Step 9: Categorise town regions
        step = 9
        town_regions = {'Sembawang' : 'North',
                    'Woodlands' : 'North',
                    'Yishun' : 'North',
                    'Ang Mo Kio' : 'North-East',
                    'Hougang' : 'North-East',
                    'Punggol' : 'North-East',
                    'Sengkang' : 'North-East',
                    'Serangoon' : 'North-East',
                    'Bedok' : 'East',
                    'Pasir Ris' : 'East',
                    'Tampines' : 'East',
                    'Bukit Batok' : 'West',
                    'Bukit Panjang' : 'West',
                    'Choa Chu Kang' : 'West',
                    'Clementi' : 'West',
                    'Jurong East' : 'West',
                    'Jurong West' : 'West',
                    'Tengah' : 'West',
                    'Bishan' : 'Central',
                    'Bukit Merah' : 'Central',
                    'Bukit Timah' : 'Central',
                    'Central Area' : 'Central',
                    'Geylang' : 'Central',
                    'Kallang/Whampoa' : 'Central',
                    'Marine Parade' : 'Central',
                    'Queenstown' : 'Central',
                    'Toa Payoh' : 'Central'}      
        df['region'] = df['town'].map(town_regions)
    except Exception as err:
        print(f"Error at step {step}, error message: {err}")
    else:
        # Reorder columns
        temp_df = df[['block', 'street_name', 'town']]
        df = df[['resale_price', 'year', 'month', 'timeseries_month', 'region', 'town', 'rooms', 'avg_storey', 'floor_area_sqm', 'remaining_lease']]
                # Unused columns - 'lease_commence_date', 'flat_model', 'storey_range', 'flat_type'
    return df, temp_df

In [62]:
df, address_df = clean_df(df)
display(df.dtypes)
df

clean_df() called at 	19:27:39 	execution time: 0.6559 seconds


resale_price               float64
year                         int32
month                        int32
timeseries_month    datetime64[ns]
region                      object
town                        object
rooms                      float64
avg_storey                 float64
floor_area_sqm             float64
remaining_lease            float64
dtype: object

Unnamed: 0_level_0,resale_price,year,month,timeseries_month,region,town,rooms,avg_storey,floor_area_sqm,remaining_lease
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
154888,788000.0,2023,6,2023-06-01,Central,Geylang,4.0,11.0,104.0,60.500000
154887,875000.0,2023,6,2023-06-01,Central,Geylang,4.0,17.0,93.0,92.500000
154886,770000.0,2023,6,2023-06-01,Central,Geylang,4.0,5.0,93.0,92.583333
154885,300000.0,2023,6,2023-06-01,Central,Geylang,3.0,5.0,58.0,48.333333
154884,295000.0,2023,6,2023-06-01,Central,Geylang,3.0,2.0,58.0,48.333333
...,...,...,...,...,...,...,...,...,...,...
117232,439888.0,2022,1,2022-01-01,West,Choa Chu Kang,4.0,14.0,102.0,76.500000
117231,420000.0,2022,1,2022-01-01,West,Choa Chu Kang,4.0,5.0,104.0,72.000000
117230,410000.0,2022,1,2022-01-01,West,Choa Chu Kang,4.0,2.0,101.0,76.500000
117369,400000.0,2022,1,2022-01-01,West,Clementi,3.0,11.0,67.0,55.500000


## 3. Feature Engineering (Geodata)

Lastly, location plays a huge role in house pricing, hence

3.1 Obtaining latitude, longitude, postal codes

3.2 Distance to city center

3.3 Obtaining MRT locations

3.4 Determine nearest MRT and traveling time

### 3.1 Latitude & longitude from address
Using street name and block, I utilized OneMap API to obtain the latitude, longitude, and postal codes of each flat https://www.onemap.gov.sg/docs

In [63]:
@error_handler
def get_location_data(address_df: pd.DataFrame):
    # Getting latitude, longitude, postal code
    @timeit
    def get_lat_long(address_df : pd.DataFrame, sleeptime : float =0.15):
        '''
        API call to get latitude, longitude, and postal code
        ## Parameters
        df : pd.DataFrame
            dataframe for cleaning, should contain columns ['block'] and ['street_name]
        sleeptime : float
            Incorporates sleep time to not exceed a max of 250 calls per min
            Default 0.15s
        '''
        # Lag time between calls - No longer needed with Cache, since we will not likely exceed the call limit
        # time.sleep(sleeptime)

        # API call
        address = address_df['block'] + ', ' + address_df['street_name']
        try:
            call = f'https://developers.onemap.sg/commonapi/search?searchVal={address}&returnGeom=Y&getAddrDetails=Y'
            # Caching is enabled in the session
            response = session.get(call)
            response.raise_for_status()
            data = response.json()

            # Returns a the results in string
            return data['results'][0]['LATITUDE'] + ',' + data['results'][0]['LONGITUDE'] + ' ' + data['results'][0]['POSTAL']
        except Exception as err:
            print(f'Error occurred - get_lat_long() API call: {err} on the following call:')
            pprint(call)
            return '0,0 0' # Still return 0 values

    def to_numpy_array(lat_long_df):
        # Build a numpy array from latitude and longitude
        combi = np.array([lat_long_df[0], lat_long_df[1]])
        return combi
    

    # This calls the API call function row wise
    position = address_df.apply(get_lat_long, axis=1)

    try:
        # Split the string into two columns (column 0 is the latitude and longitude, column 1 is the postal code)
        temp_df = position.str.split(expand=True)
        # Postal code
        temp_df.iloc[:,1] = temp_df.iloc[:,1].apply(lambda x: 0 if x=='NIL' else x)
        temp_df.iloc[:,1] = temp_df.iloc[:,1].astype('int')
        # Latitude and longitude split (by ,)
        lat_long_df = temp_df.iloc[:,0].str.split(pat=',', expand=True)
        lat_long_df = lat_long_df.astype('float')
        # Convert into numpy array, for faster matrix operations later
        numpy_array = lat_long_df.apply(to_numpy_array, axis=1)
        
    except Exception as err:
        print(f"Error occurred - Splitting data : {err}")
    else:
        geo_data_df = pd.concat([temp_df, lat_long_df, numpy_array], axis=1)
        geo_data_df.columns = ['lat_long', 'postal_code', 'latitude', 'longitude', 'numpy_array']
        return geo_data_df

In [64]:
geo_data_df= get_location_data(address_df)
display(geo_data_df.dtypes)
geo_data_df

get_lat_long() called at 	19:27:40 	execution time: 0.0036 seconds
get_lat_long() called at 	19:27:40 	execution time: 0.0034 seconds
get_lat_long() called at 	19:27:40 	execution time: 0.0034 seconds
get_lat_long() called at 	19:27:40 	execution time: 0.0031 seconds
get_lat_long() called at 	19:27:40 	execution time: 0.0029 seconds
get_lat_long() called at 	19:27:40 	execution time: 0.0029 seconds
get_lat_long() called at 	19:27:40 	execution time: 0.0022 seconds
get_lat_long() called at 	19:27:40 	execution time: 0.0028 seconds
get_lat_long() called at 	19:27:40 	execution time: 0.0026 seconds
get_lat_long() called at 	19:27:40 	execution time: 0.0025 seconds
get_lat_long() called at 	19:27:40 	execution time: 0.0027 seconds
get_lat_long() called at 	19:27:40 	execution time: 0.0031 seconds
get_lat_long() called at 	19:27:40 	execution time: 0.0036 seconds
get_lat_long() called at 	19:27:40 	execution time: 0.0040 seconds
get_lat_long() called at 	19:27:40 	execution time: 0.0026 sec

lat_long        object
postal_code     object
latitude       float64
longitude      float64
numpy_array     object
dtype: object

Unnamed: 0_level_0,lat_long,postal_code,latitude,longitude,numpy_array
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
154888,"1.30730729862781,103.889390130934",390058,1.307307,103.889390,"[1.30730729862781, 103.889390130934]"
154887,"1.32994002346249,103.889189084492",372017,1.329940,103.889189,"[1.32994002346249, 103.889189084492]"
154886,"1.32940434763773,103.889196979567",374018,1.329404,103.889197,"[1.32940434763773, 103.889196979567]"
154885,"1.32268116095086,103.886737921495",370091,1.322681,103.886738,"[1.32268116095086, 103.886737921495]"
154884,"1.32268116095086,103.886737921495",370091,1.322681,103.886738,"[1.32268116095086, 103.886737921495]"
...,...,...,...,...,...
117232,"1.37714154932429,103.740943329677",681296,1.377142,103.740943,"[1.37714154932429, 103.740943329677]"
117231,"1.37803539434682,103.745020692529",680252,1.378035,103.745021,"[1.37803539434682, 103.745020692529]"
117230,"1.37780775426379,103.74306718468",683297,1.377808,103.743067,"[1.37780775426379, 103.74306718468]"
117369,"1.31518502692041,103.768432708555",120334,1.315185,103.768433,"[1.31518502692041, 103.768432708555]"


### 3.2 Distance to city center

The central district of Singapore has the highest housing prices. Property nearer to the city centre tend to have a higher price.

We will make use of this to create a new feature to test if it is significant in model building.

In [65]:
@error_handler
def distance_to(df_series : pd.Series, to_address : str , dist_type : str='latlong', verbose : int=0):
    '''
    Function to determine distance to a location (from a series of locations in a dataframe
    ## Parameters
    df_series : pd.Series contains numpy array containing [latitude, longitude]
    to_address : str
        place and streetname
    dist_type : str
        type of distance (latlong, or geodesic)
    verbose : int
        whether to show the workings of the function

    Returns np.Series of distance between input and location
    '''
    # if an address is given
    if isinstance(to_address, str):
        call = f'https://developers.onemap.sg/commonapi/search?searchVal={to_address}&returnGeom=Y&getAddrDetails=Y'
        response = requests.get(call)
        response.raise_for_status()
        data = response.json()
        to_coordinates = np.array([float(data['results'][0]['LATITUDE']), float(data['results'][0]['LONGITUDE'])])

    if verbose==1:
        print(f'Coordinates of {to_address} : {to_coordinates}')

    def matrix_operations(from_coordinates, to_coordinates):
        # Matrix substraction to get difference 
        distance_diff = from_coordinates - to_coordinates
        absolute_dist = np.absolute(distance_diff)

        #Matrix sum over latitude and longitude of each entry
        sum_of_distances = np.sum(absolute_dist)

        if verbose==2:
            print(f'Difference in distances: \n{distance_diff}')
            print()
            print(f'Absolute difference: \n{absolute_dist}')
            print()
            print(f'Sum of distances \n {sum_of_distances}')
        
        return sum_of_distances

    def geodesic_operations(from_coordinates, coordinates):
        from_coordinates = tuple(from_coordinates)
        coordinates = tuple(coordinates)
        geodesic_dist = GD(from_coordinates, coordinates).kilometers
        return np.round(geodesic_dist,2)
    
    if dist_type == 'geodesic':
        diff_dist = df_series.apply(geodesic_operations, coordinates=to_coordinates)
    else:
        diff_dist = df_series.apply(matrix_operations, coordinates=to_coordinates)

    return diff_dist

In [66]:
dist_to_marina_bay = distance_to(geo_data_df['numpy_array'], 'Marina Bay', dist_type='geodesic', verbose=1)
dist_to_marina_bay = pd.Series(dist_to_marina_bay, name='dist_to_marina_bay')
df = pd.concat([df, dist_to_marina_bay, geo_data_df['latitude'], geo_data_df['longitude'], geo_data_df['postal_code']], axis=1)
df

Coordinates of Marina Bay : [  1.28466204 103.86100592]


Unnamed: 0_level_0,resale_price,year,month,timeseries_month,region,town,rooms,avg_storey,floor_area_sqm,remaining_lease,dist_to_marina_bay,latitude,longitude,postal_code
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
154888,788000.0,2023,6,2023-06-01,Central,Geylang,4.0,11.0,104.0,60.500000,4.03,1.307307,103.889390,390058
154887,875000.0,2023,6,2023-06-01,Central,Geylang,4.0,17.0,93.0,92.500000,5.91,1.329940,103.889189,372017
154886,770000.0,2023,6,2023-06-01,Central,Geylang,4.0,5.0,93.0,92.583333,5.86,1.329404,103.889197,374018
154885,300000.0,2023,6,2023-06-01,Central,Geylang,3.0,5.0,58.0,48.333333,5.09,1.322681,103.886738,370091
154884,295000.0,2023,6,2023-06-01,Central,Geylang,3.0,2.0,58.0,48.333333,5.09,1.322681,103.886738,370091
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117232,439888.0,2022,1,2022-01-01,West,Choa Chu Kang,4.0,14.0,102.0,76.500000,16.83,1.377142,103.740943,681296
117231,420000.0,2022,1,2022-01-01,West,Choa Chu Kang,4.0,5.0,104.0,72.000000,16.53,1.378035,103.745021,680252
117230,410000.0,2022,1,2022-01-01,West,Choa Chu Kang,4.0,2.0,101.0,76.500000,16.68,1.377808,103.743067,683297
117369,400000.0,2022,1,2022-01-01,West,Clementi,3.0,11.0,67.0,55.500000,10.84,1.315185,103.768433,120334


### 3.3 MRT Locations
The location of all MRT stations was also obtained using OneMap API and saved as a json file locally

In [67]:
@timeit
@error_handler
def update_mrt_coordinates(mrt_stations=None, filepath='static/mrt_dict.json'):
    '''
    Function to API call for MRT station coordinates and write to json file
    ## Parameters
    mrt_stations : list
        list of mrt station names, default to All stations if nothing is given
    filepath : str
        filepath and name of json file to write to, should end with .json
    Returns None
    '''
    if not mrt_stations:
        mrt_stations = ['Admiralty MRT', 'Aljunied MRT', 'Ang Mo Kio MRT', 'Bakau LRT', 'Bangkit LRT', 'Bartley MRT', 'Bayfront MRT',
                        'Bayshore MRT', 'Beauty World MRT', 'Bedok MRT', 'Bedok North MRT', 'Bedok Reservoir MRT', 'Bencoolen MRT',
                        'Bendemeer MRT', 'Bishan MRT', 'Boon Keng MRT', 'Boon Lay MRT', 'Botanic Gardens MRT', 'Braddell MRT',
                        'Bras Basah MRT', 'Buangkok MRT', 'Bugis MRT', 'Bukit Batok MRT', 'Bukit Brown MRT', 'Bukit Gombak MRT',
                        'Bukit Panjang MRT', 'Buona Vista MRT', 'Caldecott MRT', 'Cashew MRT', 'Changi Airport MRT',
                        'Chinatown MRT', 'Chinese Garden MRT', 'Choa Chu Kang MRT', 'City Hall MRT', 'Clarke Quay MRT',
                        'Clementi MRT', 'Commonwealth MRT', 'Compassvale LRT', 'Cove LRT', 'Dakota MRT', 'Dhoby Ghaut MRT',
                        'Downtown MRT', 'Xilin MRT', 'Tampines East MRT', 'Mayflower MRT', 'Upper Thomson MRT',
                        'Lentor MRT', 'Woodlands North MRT', 'Woodlands South MRT', 'Esplanade MRT', 'Eunos MRT',
                        'Expo MRT', 'Fajar LRT', 'Farmway LRT', 'Farrer Park MRT', 'Fort Canning MRT',
                        'Gardens by the Bay MRT', 'Geylang Bahru MRT', 'HarbourFront MRT', 'Haw Par Villa MRT', 'Hillview MRT',
                        'Holland Village MRT', 'Hougang MRT', 'Jalan Besar MRT', 'Joo Koon MRT', 'Jurong East MRT',
                        'Jurong West MRT', 'Kadaloor LRT', 'Kaki Bukit MRT', 'Kallang MRT', 'Kembangan MRT', 'Keppel MRT',
                        'King Albert Park MRT', 'Kovan MRT', 'Kranji MRT', 'Labrador Park MRT', 'Lakeside MRT', 'Lavender MRT',
                        'Layar LRT', 'Little India MRT', 'Lorong Chuan MRT', 'MacPherson MRT', 'Marina Bay MRT', 'Marina South Pier MRT',
                        'Marsiling MRT', 'Marymount MRT', 'Mattar MRT', 'Meridian LRT', 'Mountbatten MRT',
                        'Newton MRT', 'Nibong LRT', 'Nicoll Highway MRT', 'Novena MRT', 'Oasis LRT', 'One-North MRT', 'Orchard MRT',
                        'Outram Park MRT', 'Paya Lebar MRT', 'Pasir Ris MRT', 'Paya Lebar MRT', 'Pasir Ris MRT', 'Paya Lebar MRT', 'Pasir Ris MRT', 
                        'Pioneer MRT', 'Potong Pasir MRT', 'Promenade MRT', 'Punggol MRT', 'Queenstown MRT', 'Raffles Place MRT', 'Redhill MRT',
                        'Riviera LRT', 'Rochor MRT', 'Sembawang MRT', 'Sengkang MRT', 'Serangoon MRT', 'Simei MRT', 'Sixth Avenue MRT', 
                        'Somerset MRT', 'Springleaf MRT', 'Stadium MRT', 'Stevens MRT', 'Sumang LRT', 'Tai Seng MRT', 'Tampines MRT', 
                        'Tampines East MRT', 'Tampines West MRT', 'Tanah Merah MRT', 'Tanjong Pagar MRT', 'Tanjong Rhu MRT', 'Teck Lee LRT', 
                        'Telok Ayer MRT', 'Telok Blangah MRT', 'Thanggam LRT', 'Tiong Bahru MRT', 'Toa Payoh MRT', 
                        'Tuas Crescent MRT', 'Tuas Link MRT', 'Tuas West Road MRT', 'Ubi MRT', 'Upper Changi MRT', 
                        'Woodlands MRT', 'Woodlands South MRT', 'Woodlands North MRT', 'Yew Tee MRT', 'Yio Chu Kang MRT', 'Yishun MRT']
    # Future stations - 'Tampines North MRT', 'Tengah MRT'

    mrt_coordinates = {}
    for mrt in mrt_stations:
        response = requests.get(f"https://developers.onemap.sg/commonapi/search?searchVal={mrt}&returnGeom=Y&getAddrDetails=Y")
        response.raise_for_status()
        data = response.json()
        # string (lat,long) as key
        # mrt_coordinates[f"{data['results'][0]['LATITUDE']},{data['results'][0]['LONGITUDE']}"] = mrt
        mrt_coordinates[mrt] = (float(data['results'][0]['LATITUDE']),float(data['results'][0]['LONGITUDE']))
        
    with open(filepath, 'w')as f:
        json.dump(mrt_coordinates, f, indent=4)

@timeit
@error_handler
def get_mrt_coordinates(filepath = 'static/mrt_dict.json'):
    '''
    Function to read saved mrt_coordinates from json file
    ## Parameters
    filepath : str
        filepath to json file
    Returns data : dictionary
    '''
    with open(filepath, 'r') as f:
        file = f.read()
        data = json.loads(file)
        return data


Load Json file and convert to numpy array to utilize matrix operations.

In [68]:
mrt_coordinates_dict = get_mrt_coordinates()

# Convert coordinates into numpy arrays
mrt_stations = np.array(list(mrt_coordinates_dict.keys()))
mrt_coordinates = np.array(list(mrt_coordinates_dict.values()))

get_mrt_coordinates() called at 	19:34:29 	execution time: 0.0289 seconds


### 3.4 Nearest MRT stations and Minimum distance/time
* Using the matrix operations, we are able to find the nearest MRT station by absolute distance 
* Then use OneMap's route_api_call() to get distance/time to MRT stations

In [69]:
@error_handler
def find_nearest_stations(geo_data_df : pd.DataFrame, mrt_stations : np.array=mrt_stations, mrt_coordinates : np.array=mrt_coordinates, 
                          n_nearest_stations: int=2, verbose : int=0):
    '''
    Function to determine nearest MRT station of the resale_flat based on latitude and longitude
    ## Parameters
        geo_data_df : pd.DataFrame
        mrt_stations : np.array
        mrt_coordinates : np.array
        n_nearest_stations: int=2
        verbose : int=0

    Returns a list of n_nearest stations
    '''
    # Matrix substraction to get difference with each MRT, convert to absolute values
    distance_diff = geo_data_df['numpy_array'] - mrt_coordinates
    absolute_dist = np.absolute(distance_diff)

    # Matrix sum over latitude and longitude of each entry
    sum_of_distances = np.sum(absolute_dist, axis=1)

    # Sort and search based on desired n_nearest_stations
    sorted_distances = np.sort(sum_of_distances)
    nearest_stations = []
    for n in range(n_nearest_stations):
        idx = np.where(sum_of_distances==sorted_distances[n])
        from_coordinates = tuple(geo_data_df['numpy_array'])
        to_coordinates = tuple(mrt_coordinates[idx][0])
        geodesic_dist = GD(from_coordinates, to_coordinates).kilometers
        nearest_stations.append(mrt_stations[idx][0])
        nearest_stations.append(np.round(geodesic_dist,2))

    if verbose==1:
        print(f'Difference in distances: \n{distance_diff[:5]}')
        print()
        print(f'Absolute difference: \n{absolute_dist[:5]}')
        print()
        print(f'Sum of distances \n {sum_of_distances[:5]}')
        print()
        print(f'Sorted distances\n{sorted_distances[:5]}')
        print()
        print(f'Top {n_nearest_stations}')
        print(nearest_stations)

    return nearest_stations

In [70]:
n_nearest_stations = 1
# Matrix operations to find nearest MRT stations for each row
nearest_stations = geo_data_df.apply(find_nearest_stations, n_nearest_stations=n_nearest_stations, axis=1, verbose=0)
nearest_stations_df = pd.DataFrame(nearest_stations.tolist(), index=geo_data_df.index, columns=['nearest_station_'+ str(x) for x in range(n_nearest_stations)] + ['dist_to_station_'+ str(x) for x in range(n_nearest_stations)])
nearest_stations_df

Unnamed: 0_level_0,nearest_station_0,dist_to_station_0
_id,Unnamed: 1_level_1,Unnamed: 2_level_1
154888,Dakota MRT,0.14
154887,MacPherson MRT,0.42
154886,MacPherson MRT,0.36
154885,MacPherson MRT,0.48
154884,MacPherson MRT,0.48
...,...,...
117232,Choa Chu Kang MRT,0.99
117231,Choa Chu Kang MRT,0.81
117230,Choa Chu Kang MRT,0.85
117369,Clementi MRT,0.36


In [71]:
df = pd.concat([df, nearest_stations_df], axis=1)
df

Unnamed: 0_level_0,resale_price,year,month,timeseries_month,region,town,rooms,avg_storey,floor_area_sqm,remaining_lease,dist_to_marina_bay,latitude,longitude,postal_code,nearest_station_0,dist_to_station_0
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
154888,788000.0,2023,6,2023-06-01,Central,Geylang,4.0,11.0,104.0,60.500000,4.03,1.307307,103.889390,390058,Dakota MRT,0.14
154887,875000.0,2023,6,2023-06-01,Central,Geylang,4.0,17.0,93.0,92.500000,5.91,1.329940,103.889189,372017,MacPherson MRT,0.42
154886,770000.0,2023,6,2023-06-01,Central,Geylang,4.0,5.0,93.0,92.583333,5.86,1.329404,103.889197,374018,MacPherson MRT,0.36
154885,300000.0,2023,6,2023-06-01,Central,Geylang,3.0,5.0,58.0,48.333333,5.09,1.322681,103.886738,370091,MacPherson MRT,0.48
154884,295000.0,2023,6,2023-06-01,Central,Geylang,3.0,2.0,58.0,48.333333,5.09,1.322681,103.886738,370091,MacPherson MRT,0.48
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117232,439888.0,2022,1,2022-01-01,West,Choa Chu Kang,4.0,14.0,102.0,76.500000,16.83,1.377142,103.740943,681296,Choa Chu Kang MRT,0.99
117231,420000.0,2022,1,2022-01-01,West,Choa Chu Kang,4.0,5.0,104.0,72.000000,16.53,1.378035,103.745021,680252,Choa Chu Kang MRT,0.81
117230,410000.0,2022,1,2022-01-01,West,Choa Chu Kang,4.0,2.0,101.0,76.500000,16.68,1.377808,103.743067,683297,Choa Chu Kang MRT,0.85
117369,400000.0,2022,1,2022-01-01,West,Clementi,3.0,11.0,67.0,55.500000,10.84,1.315185,103.768433,120334,Clementi MRT,0.36


In [72]:
name = input('Name save file: e.g. <2023_apr>\n')
if name != '':
    filename= f'static/{name}.csv'
    df.to_csv(filename)
    print(f'File saved as {filename}')

File saved as static/22_23_Jun.csv


# Retired code below, too slow due to numerous API calls

### Get minimum distance/time using OneMap API call

Due to the large amount of API calls, we will split the data into batches to extract the data.

Run the code by batches while appending the results to a list inplace

Put the DataFrame back together if all runs successful

### Determine minimum time

## Tidying up the full dataframe