# HDB Resale Price Predictor & Visualisation

This project aims to create a data pipeline, using the available APIs (Data.gov.sg and OneMap), to build a web-based application for
1. HDB Price visualisation
2. HDB Price prediction

The prototype aims to read the latest data directly from data.gov.sg and perform ETL (Extract, Transform, and Load) into a local/web database of choice.

In [1]:
import requests
from requests.exceptions import HTTPError
import pandas as pd
from time import sleep
from pprint import pprint
import json

class DateTimeEncoder(json.JSONEncoder):
    '''
    JSON encoder that serialises date and datetime objects as ISO 8601 strings
    Usage: json.dumps(obj, cls=DateTimeEncoder)
    '''
    # Override the default method
    def default(self, obj):
        '''
        Returns obj.isoformat() for date/datetime objects
        Raises TypeError (via the base class) for any other unsupported type
        '''
        # Imported locally: the file never imports these names at the top, and
        # a plain module-level `date` variable would shadow a global import
        from datetime import date, datetime
        if isinstance(obj, (date, datetime)):
            return obj.isoformat()
        # Defer to the base class so unsupported types raise TypeError instead
        # of being silently written out as null
        return super().default(obj)

def get_token(location):
    '''
    Function to check if API token is still valid and updates API token if outdated
    Inputs:
        location: path to a json file holding the OneMap credentials; the
                  'access_token' and 'expiry_timestamp' keys are (re)written here
    Returns the API token, or None if the file/API call failed (the error is printed)
    '''
    try:
        with open(location, 'r+') as fp:
            data = json.load(fp)
            # The whole json content is posted as the form payload, as before
            response = requests.post("https://developers.onemap.sg/privateapi/auth/post/getToken",
                                     data=data, timeout=30)
            response.raise_for_status()
            token = response.json()
            # .get() so that a credentials file without a token yet (first run)
            # is treated as outdated instead of raising KeyError
            if token['access_token'] != data.get('access_token'):
                print(f"New token found")
                data['access_token'] = token['access_token']
                data['expiry_timestamp'] = token['expiry_timestamp']
                fp.seek(0)
                json.dump(data, fp = fp, indent=4)
                # Drop any leftover bytes when the new content is shorter than the old
                fp.truncate()
                print('Updated token json')
            # data now holds the refreshed token; it must not be reloaded from
            # the text read before the update, which still carries the old one
            return data['access_token']
    except Exception as err:
        print(err)

# OneMap API token, passed as the `token` query parameter of the routing calls.
# get_token prints the error and returns None on failure, so this may be None.
credentials = get_token("venv/onemap.json")

def datagovsg_api_call(url, sort = 'month desc', limit = 100, years=("2023",)):
    '''
    Function to build the API call and construct the pandas dataframe
    Inputs:
        url: url for API, with resource_id parameters
        sort: field, by ascending/desc
        limit: maximum entries returned; a falsy value (None/0) leaves the
               limit parameter out of the call
        years: iterable of years of data required (a single year such as
               "2023" or 2023 is also accepted); if empty, no month filter
               is applied
    Returns a pandas dataframe of the data, or None if the call failed
    (the error is printed)
    '''
    # A bare string would otherwise be iterated character by character
    if isinstance(years, (str, int)):
        years = [years]

    # Every month of every requested year, e.g. "2023-01" ... "2023-12"
    months = [f'{year}-{month:02d}' for year in years for month in range(1, 13)]

    url = url+f'&sort={sort}'
    if months:
        # separators chosen so the filter text is identical to the hand-built
        # form: {"month":["2023-01", "2023-02", ...]}
        month_dict = json.dumps({"month": months}, separators=(', ', ':'))
        url = url+f'&filters={month_dict}'
    if limit:
        print(f'Call limit : {limit}')
        url = url+f'&limit={limit}'
    pprint(f'API call = {url}')
    try:
        # timeout so that a stalled connection cannot hang the pipeline
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        df = pd.DataFrame(data['result']['records'])
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'Other error occurred: {err}')
    else:
        return df

New token found
Updated token json


In [7]:
# Query the resale dataset with the function defaults (sorted by month desc,
# limit of 100 records, months of 2023) and display the raw dataframe
df = datagovsg_api_call('https://data.gov.sg/api/action/datastore_search?resource_id=f1765b54-a209-4718-8d38-a39237f502b3')
df

Call limit : 100
('API call = '
 'https://data.gov.sg/api/action/datastore_search?resource_id=f1765b54-a209-4718-8d38-a39237f502b3&sort=month '
 'desc&filters={"month":["2023-01", "2023-02", "2023-03", "2023-04", '
 '"2023-05", "2023-06", "2023-07", "2023-08", "2023-09", "2023-10", "2023-11", '
 '"2023-12"]}&limit=100')


Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,street_name,resale_price,month,remaining_lease,lease_commence_date,storey_range,_id,block
0,BUKIT PANJANG,4 ROOM,Premium Apartment,93,SEGAR RD,560000,2023-05,88 years 07 months,2012,10 TO 12,152335,456
1,HOUGANG,4 ROOM,Model A,103,HOUGANG AVE 10,565000,2023-05,63 years 01 month,1987,04 TO 06,152374,512
2,BEDOK,EXECUTIVE,Maisonette,150,BEDOK RESERVOIR RD,855000,2023-05,61 years 06 months,1985,10 TO 12,152299,115
3,BUKIT BATOK,5 ROOM,Improved,112,BT BATOK WEST AVE 8,720000,2023-05,94 years 09 months,2019,01 TO 03,152320,445A
4,CHOA CHU KANG,4 ROOM,Model A,114,CHOA CHU KANG ST 53,500888,2023-05,71 years 01 month,1995,07 TO 09,152345,702
...,...,...,...,...,...,...,...,...,...,...,...,...
95,BUKIT PANJANG,4 ROOM,Model A,92,SENJA RD,555000,2023-05,91 years 07 months,2015,13 TO 15,152336,636B
96,BUKIT PANJANG,5 ROOM,Improved,122,BT PANJANG RING RD,590000,2023-05,65 years 04 months,1989,10 TO 12,152338,434
97,BUKIT MERAH,2 ROOM,Standard,43,HAVELOCK RD,260000,2023-05,48 years 11 months,1973,04 TO 06,152322,50
98,CENTRAL AREA,4 ROOM,Type S1,94,CANTONMENT RD,1218000,2023-05,86 years 09 months,2011,31 TO 33,152339,1A


In [15]:
# from dataprep.eda import create_report
# create_report(df).show()

## Data wrangling and feature engineering steps

1. Reindexed dataframe using _id (unique to every resale transaction)
2. Categorised towns into regions (North, West, East, North-East, Central) based on HDB's categorisation https://www.hdb.gov.sg/about-us/history/hdb-towns-your-home
3. Changed room types into float values, with Executive as 4.5 rooms (extra study/balcony), and Multigeneration 6 rooms
4. Floor area converted to float values
5. Month was converted into datetime format, to be used to detrend the time series moving average
6. Year/Month was separated into Year and Month for visualisation purposes
7. Storey range was converted to avg_storey: the exact floor is not given, so the midpoint of the range is used (every range spans 3 storeys)
8. Remaining lease was converted from "years and months" into remaining years (float)

Lastly, location plays a huge role in house pricing, hence
1. Using street name and block, I utilized OneMap API to obtain the latitude, longitude, and postal codes of each flat https://www.onemap.gov.sg/docs
2. The location of all MRT stations was also obtained using OneMap API and saved as a json file locally
3. Using the two data above, I am able to determine the nearest MRT station
4. The minimum travelling time (walk and public transport) to the nearest MRT will be an additional feature of the dataset

In [8]:
def clean_df(df):
    '''
    Function to clean the raw dataframe and engineer the model features
    Inputs:
        df: raw resale dataframe with string columns '_id', 'town', 'flat_type',
            'storey_range', 'lease_commence_date', 'resale_price',
            'floor_area_sqm', 'month' ("YYYY-MM"), 'remaining_lease',
            'street_name' and 'block'
    Returns a new dataframe indexed by _id (the input is not modified).
    On success only the feature columns are kept, in a fixed order. If any
    address could not be geocoded/split, the error is printed and the frame is
    returned with all working columns and without the final reordering.
    Raises KeyError if a town has no region mapping.
    NOTE: postal_code is stored as int, so leading zeros are dropped
    '''
    # Imported locally as it is only needed for the OneMap address query
    from urllib.parse import quote

    # Start
    # set index to overall id
    # (not inplace: this returns a new frame, so the caller's raw dataframe is
    # left untouched and the function can be re-run on it)
    df = df.set_index('_id')

    # Create feature "rooms", "avg_storey"
    def categorise_rooms(flat_type):
        '''
        Helper function for categorising number of rooms
        Executive -> 4.5, Multi-generation -> 6.0, otherwise the leading digit
        '''
        if flat_type[0] == 'E':
            return 4.5
        elif flat_type[0] == 'M':
            return 6.0
        else:
            return float(flat_type[0])

    df['rooms'] = df['flat_type'].apply(categorise_rooms)
    # "10 TO 12" -> 11.0 (midpoint of the first and last two characters)
    df['avg_storey'] = df['storey_range'].apply(lambda x: (int(x[:2])+int(x[-2:]))/2)

    # Change dtypes
    df['lease_commence_date'] = df['lease_commence_date'].astype('int')
    df['resale_price'] = df['resale_price'].astype('float')
    df['floor_area_sqm'] = df['floor_area_sqm'].astype('float')
    # The source month is "YYYY-MM"; the format must match exactly because
    # recent pandas versions reject "%Y-%m-%d" for a value without a day
    df['timeseries_month'] = pd.to_datetime(df['month'], format="%Y-%m")
    df['year'] = df['timeseries_month'].dt.year
    df['month'] = df['timeseries_month'].dt.month

    # Calculate remaining_lease
    def year_month_to_year(remaining_lease):
        '''
        Helper function to change year & months, into years (float)
        e.g. "63 years 01 month" -> 63.0833...
        '''
        remaining_lease = remaining_lease.split(' ')
        if len(remaining_lease) > 2:
            year = float(remaining_lease[0]) + float(remaining_lease[2])/12
        else:
            year = float(remaining_lease[0])
        return year

    df['remaining_lease'] = df['remaining_lease'].apply(year_month_to_year)

    # Change capitalization of strings
    for column in df.columns:
        if df[column].dtype == 'O':
            df[column] = df[column].str.title()

    # Update address abbreviations for onemap API call
    df['original_street_name'] = df['street_name']
    # \b keeps each replacement to whole words, so names that merely start with
    # the same letters (e.g. "Stirling", "Strathmore", "Drive") are untouched.
    # "St." (with a dot) means Saint and is handled before the plain "St".
    abbreviations = {r'\bSth\b':'South',
                     r'\bNth\b':'North',
                     r'\bSt\.':'Saint',
                     r'\bSt\b(?!\.)':'Street',
                     r'\bAve\b':'Avenue',
                     r'\bDr\b':'Drive',
                     r'\bRd\b':'Road'}
    for abbreviation, full in abbreviations.items():
        df['street_name'] = df['street_name'].str.replace(abbreviation, full, regex=True)

    # Categorise town regions
    town_regions = {'Sembawang' : 'North',
                'Woodlands' : 'North',
                'Yishun' : 'North',
                'Ang Mo Kio' : 'North-East',
                'Hougang' : 'North-East',
                'Punggol' : 'North-East',
                'Sengkang' : 'North-East',
                'Serangoon' : 'North-East',
                'Bedok' : 'East',
                'Pasir Ris' : 'East',
                'Tampines' : 'East',
                'Bukit Batok' : 'West',
                'Bukit Panjang' : 'West',
                'Choa Chu Kang' : 'West',
                'Clementi' : 'West',
                'Jurong East' : 'West',
                'Jurong West' : 'West',
                'Tengah' : 'West',
                'Bishan' : 'Central',
                'Bukit Merah' : 'Central',
                'Bukit Timah' : 'Central',
                'Central Area' : 'Central',
                'Geylang' : 'Central',
                'Kallang/Whampoa' : 'Central',
                'Marine Parade' : 'Central',
                'Queenstown' : 'Central',
                'Toa Payoh' : 'Central'}
    # Still a KeyError for an unmapped town, but naming every offender at once
    unknown_towns = sorted(set(df['town']) - set(town_regions))
    if unknown_towns:
        raise KeyError(f'No region mapping for town(s): {unknown_towns}')
    df['region'] = df['town'].map(town_regions)

    # Getting latitude, longitude, postal code
    max_calls_per_min = 250

    def get_lat_long(row):
        '''
        API call to get latitude, longitude, and postal code of one row
        Incorporates sleep time to not exceed a max of 250 calls per min
        Returns "lat,long postal" or None if the lookup failed (error is printed)
        '''
        # 60/250 = 0.24s between calls keeps the rate at or below the cap
        sleep(60 / max_calls_per_min)
        address = row['block'] + ', ' + row['street_name']
        # quote() so that characters such as & or # in an address cannot
        # break the query string
        call = f'https://developers.onemap.sg/commonapi/search?searchVal={quote(address)}&returnGeom=Y&getAddrDetails=Y'
        try:
            response = requests.get(call, timeout=30)
            response.raise_for_status()
            data = response.json()
            # Best match is taken; an empty result list raises IndexError below
            best = data['results'][0]
            return best['LATITUDE'] + ',' + best['LONGITUDE'] + ' ' + best['POSTAL']
        except HTTPError as http_err:
            print(f'HTTP error occurred during get_lat_long: {http_err}')
        except Exception as err:
            print(f'Error occurred during get_lat_long: {err} on the following call:')
            pprint(call)

    df['position'] = df.apply(get_lat_long, axis=1)
    try:
        # position is "lat,long postal"; a failed lookup leaves None, which
        # raises here and is reported below
        df['postal_code'] = df['position'].apply(lambda x: x.split()[1]).astype('int')
        df['lat_long'] = df['position'].apply(lambda x: x.split()[0])
        # Second split to get latitude and longitude as floats
        df['lat'] = df['lat_long'].apply(lambda x: float(x.split(',')[0]))
        df['long'] = df['lat_long'].apply(lambda x: float(x.split(',')[1]))

    except Exception as err:
        print(f'Error splitting postal_code from lat_long: {err}')
    else:
        # Reorder columns
        df = df[['resale_price', 'year', 'month', 'timeseries_month', 'region', 'town', 'rooms', 'avg_storey', 'floor_area_sqm', 'remaining_lease',
                'lat_long', 'lat', 'long', 'postal_code']]
                # Unused columns - 'block', 'street_name', 'original_street_name', 'lease_commence_date', 'flat_model', 'storey_range', 'flat_type'
    return df

In [9]:
# Clean the raw dataframe and list the resulting column dtypes
df = clean_df(df)
df.dtypes

resale_price               float64
year                         int64
month                        int64
timeseries_month    datetime64[ns]
region                      object
town                        object
rooms                      float64
avg_storey                 float64
floor_area_sqm             float64
remaining_lease            float64
lat_long                    object
lat                        float64
long                       float64
postal_code                  int32
dtype: object

In [10]:
# df.to_csv('check.csv')
# Display the cleaned dataframe
df

Unnamed: 0_level_0,resale_price,year,month,timeseries_month,region,town,rooms,avg_storey,floor_area_sqm,remaining_lease,lat_long,lat,long,postal_code
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
152335,560000.0,2023,5,2023-05-01,West,Bukit Panjang,4.0,11.0,93.0,88.583333,"1.38754537018879,103.770442353455",1.387545,103.770442,670456
152374,565000.0,2023,5,2023-05-01,North-East,Hougang,4.0,5.0,103.0,63.083333,"1.37127315314256,103.88861964542",1.371273,103.888620,530512
152299,855000.0,2023,5,2023-05-01,East,Bedok,4.5,11.0,150.0,61.500000,"1.33067394377665,103.908738351193",1.330674,103.908738,470115
152320,720000.0,2023,5,2023-05-01,West,Bukit Batok,5.0,2.0,112.0,94.750000,"1.35097419119069,103.739240258216",1.350974,103.739240,651445
152345,500888.0,2023,5,2023-05-01,West,Choa Chu Kang,4.0,8.0,114.0,71.083333,"1.39179994537682,103.745621182307",1.391800,103.745621,680702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152336,555000.0,2023,5,2023-05-01,West,Bukit Panjang,4.0,14.0,92.0,91.583333,"1.38762470759976,103.758331445094",1.387625,103.758331,672636
152338,590000.0,2023,5,2023-05-01,West,Bukit Panjang,5.0,11.0,122.0,65.333333,"1.38545924836708,103.77050511557",1.385459,103.770505,670434
152322,260000.0,2023,5,2023-05-01,Central,Bukit Merah,2.0,5.0,43.0,48.916667,"1.28977813992267,103.826793758887",1.289778,103.826794,160050
152339,1218000.0,2023,5,2023-05-01,Central,Central Area,4.0,32.0,94.0,86.750000,"1.27784271048219,103.840965923691",1.277843,103.840966,85101


In [2]:
def update_mrt_coordinates(mrt_stations=None):
    '''
    Function to API call for MRT station coordinates
    Input: list of mrt station names, default to All stations if nothing is given
    Saves {station name: [latitude, longitude]} to static/mrt_dict.json
    Stations whose lookup fails are reported and left out; if no station
    could be resolved the existing file is left untouched
    Returns None
    '''
    if not mrt_stations:
        mrt_stations = ['Admiralty MRT', 'Aljunied MRT', 'Ang Mo Kio MRT', 'Bakau LRT', 'Bangkit LRT', 'Bartley MRT', 'Bayfront MRT',
                        'Bayshore MRT', 'Beauty World MRT', 'Bedok MRT', 'Bedok North MRT', 'Bedok Reservoir MRT', 'Bencoolen MRT',
                        'Bendemeer MRT', 'Bishan MRT', 'Boon Keng MRT', 'Boon Lay MRT', 'Botanic Gardens MRT', 'Braddell MRT',
                        'Bras Basah MRT', 'Buangkok MRT', 'Bugis MRT', 'Bukit Batok MRT', 'Bukit Brown MRT', 'Bukit Gombak MRT',
                        'Bukit Panjang MRT', 'Buona Vista MRT', 'Caldecott MRT', 'Cashew MRT', 'Changi Airport MRT',
                        'Chinatown MRT', 'Chinese Garden MRT', 'Choa Chu Kang MRT', 'City Hall MRT', 'Clarke Quay MRT',
                        'Clementi MRT', 'Commonwealth MRT', 'Compassvale LRT', 'Cove LRT', 'Dakota MRT', 'Dhoby Ghaut MRT',
                        'Downtown MRT', 'Xilin MRT', 'Tampines East MRT', 'Mayflower MRT', 'Upper Thomson MRT',
                        'Lentor MRT', 'Woodlands North MRT', 'Woodlands South MRT', 'Esplanade MRT', 'Eunos MRT',
                        'Expo MRT', 'Fajar LRT', 'Farmway LRT', 'Farrer Park MRT', 'Fort Canning MRT',
                        'Gardens by the Bay MRT', 'Geylang Bahru MRT', 'HarbourFront MRT', 'Haw Par Villa MRT', 'Hillview MRT',
                        'Holland Village MRT', 'Hougang MRT', 'Jalan Besar MRT', 'Joo Koon MRT', 'Jurong East MRT',
                        'Jurong West MRT', 'Kadaloor LRT', 'Kaki Bukit MRT', 'Kallang MRT', 'Kembangan MRT', 'Keppel MRT',
                        'King Albert Park MRT', 'Kovan MRT', 'Kranji MRT', 'Labrador Park MRT', 'Lakeside MRT', 'Lavender MRT',
                        'Layar LRT', 'Little India MRT', 'Lorong Chuan MRT', 'MacPherson MRT', 'Marina Bay MRT', 'Marina South Pier MRT',
                        'Marsiling MRT', 'Marymount MRT', 'Mattar MRT', 'Meridian LRT', 'Mountbatten MRT',
                        'Newton MRT', 'Nibong LRT', 'Nicoll Highway MRT', 'Novena MRT', 'Oasis LRT', 'One-North MRT', 'Orchard MRT',
                        'Outram Park MRT', 'Paya Lebar MRT', 'Pasir Ris MRT', 'Paya Lebar MRT', 'Pasir Ris MRT', 'Paya Lebar MRT', 'Pasir Ris MRT', 
                        'Pioneer MRT', 'Potong Pasir MRT', 'Promenade MRT', 'Punggol MRT', 'Queenstown MRT', 'Raffles Place MRT', 'Redhill MRT',
                        'Riviera LRT', 'Rochor MRT', 'Sembawang MRT', 'Sengkang MRT', 'Serangoon MRT', 'Simei MRT', 'Sixth Avenue MRT', 
                        'Somerset MRT', 'Springleaf MRT', 'Stadium MRT', 'Stevens MRT', 'Sumang LRT', 'Tai Seng MRT', 'Tampines MRT', 
                        'Tampines East MRT', 'Tampines West MRT', 'Tanah Merah MRT', 'Tanjong Pagar MRT', 'Tanjong Rhu MRT', 'Teck Lee LRT', 
                        'Telok Ayer MRT', 'Telok Blangah MRT', 'Thanggam LRT', 'Tiong Bahru MRT', 'Toa Payoh MRT', 
                        'Tuas Crescent MRT', 'Tuas Link MRT', 'Tuas West Road MRT', 'Ubi MRT', 'Upper Changi MRT', 
                        'Woodlands MRT', 'Woodlands South MRT', 'Woodlands North MRT', 'Yew Tee MRT', 'Yio Chu Kang MRT', 'Yishun MRT']
    # Future stations - 'Tampines North MRT', 'Tengah MRT'

    mrt_coordinates = {}

    # dict.fromkeys drops repeated names (the list above has a few) while
    # keeping the order, so every station is only requested once
    for mrt in dict.fromkeys(mrt_stations):
        # Reset per station so the error report below never shows the payload
        # of a previous station (or fails because nothing was received yet)
        data = None
        try:
            response = requests.get(f"https://developers.onemap.sg/commonapi/search?searchVal={mrt}&returnGeom=Y&getAddrDetails=Y",
                                    timeout=30)
            response.raise_for_status()
            data = response.json()
            # First search result is taken as the station
            best = data['results'][0]
            mrt_coordinates[mrt] = (float(best['LATITUDE']), float(best['LONGITUDE']))
        except HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
        except Exception as err:
            print(f'Other error occurred: {err}')
            print(f'Error for {mrt, data}')

    if not mrt_coordinates:
        # Nothing resolved (e.g. API unreachable): keep the existing file
        # rather than replacing it with an empty dictionary
        print('No MRT coordinates retrieved, static/mrt_dict.json not updated')
        return

    with open('static/mrt_dict.json', 'w') as f:
        json.dump(mrt_coordinates, f, indent=4)

def get_mrt_coordinates(location = 'static/mrt_dict.json'):
    '''
    Function to load the saved MRT station coordinates
    Input: path of the json file to read
    Returns the parsed json content
    '''
    with open(location, 'r') as mrt_file:
        return json.load(mrt_file)


In [3]:
# Load the station coordinates from the default file static/mrt_dict.json
mrt_coordinates = get_mrt_coordinates()

In [4]:
# Trial of the OneMap routing service for one start/end pair ("lat,long"):
# first a walking route, then a public transport route
start = "1.32283703302242,103.939124525951"
routeType = ['walk', 'pt', 'drive', 'cycle']
end = "1.36126901451361,103.854642365822"

# Second coordinate pair; this overrides the one above and is the one queried
start = "1.32953680475668,103.940406562732"
end = "1.44058856161847,103.800990519771"

# Only the first two route types ('walk' and 'pt') are requested
for num in range(2):
    try:
        if num == 0:
            response = requests.get(f"https://developers.onemap.sg/privateapi/routingsvc/route?start={start}&end={end}&routeType={routeType[num]}&token={credentials}",
                                    timeout=30)
        else:
            # Public transport needs the extra trip parameters
            date = '01-26-2023'
            time = '07:35:00'
            mode = 'TRANSIT'
            maxWalkDistance = 1000
            numItineraries = 3
            response = requests.get(f"https://developers.onemap.sg/privateapi/routingsvc/route?start={start}&end={end}&routeType={routeType[num]}&token={credentials}&date={date}&time={time}&mode={mode}&maxWalkDistance={maxWalkDistance}&numItineraries={numItineraries}",
                                    timeout=30)
        response.raise_for_status()
        data = response.json()
        # pprint(data)
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
        # Raw body: an error response is not guaranteed to be valid json
        print(response.text)
    except Exception as err:
        print(f'Other error occurred: {err}')
    else:
        # Results are only read when the request succeeded, so a failed call
        # can neither hit an undefined `data` nor show the previous route
        if num == 0:
            # walking will only have 1 best route
            walk_time = data['route_summary']['total_time']
            walk_distance = data['route_summary']['total_distance']
            print(f'Walk time: {walk_time}')
            print(f'Walk distance: {walk_distance}')
        else:
            pprint(data)


Walk time: 18681
Walk distance: 25950
{'debugOutput': {'pathCalculationTime': 69,
                 'pathTimes': [37, 20, 12],
                 'precalculationTime': 72,
                 'renderingTime': 1,
                 'timedOut': False,
                 'totalTime': 142},
 'elevationMetadata': {'ellipsoidToGeoidDifference': 7.3773084368695,
                       'geoidElevation': False},
 'plan': {'date': 1674689700000,
          'from': {'lat': 1.3295368047567,
                   'lon': 103.94040656273,
                   'name': 'Origin',
                   'orig': '',
                   'vertexType': 'NORMAL'},
          'itineraries': [{'duration': 4074,
                           'elevationGained': 0,
                           'elevationLost': 0,
                           'endTime': 1674694039000,
                           'fare': '1.65',
                           'legs': [{'agencyTimeZoneOffset': 28800000,
                                     'arrivalDelay': 0,
        