# HDB Resale Price Predictor & Visualisation

This project aims to create a data pipeline with the help of available APIs (Data.gov.sg and OneMap) to build a web-based application for
1. HDB Price visualisation
2. HDB Price prediction

The prototype aims to read latest data directly from data.gov.sg and perform ETL (Extract, Transform, and Load) to a local/web database of choice.

In [287]:
import requests
import numpy as np
import pandas as pd
import json
from requests.exceptions import HTTPError
from pprint import pprint
from time import sleep, perf_counter
from functools import wraps

## Contents
1. API call data
2. Data Wrangling
3. Feature Engineering

## Getting the data through API call

In [288]:
# Wrapper for timing function calls:
def timeit(func):
    '''
    Decorator that prints how long each call to the wrapped function takes.
    ## Parameters
    func : callable
        function to be timed
    Returns a wrapper that forwards all arguments to func and returns its result
    '''
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        # Positional and keyword arguments are forwarded untouched to func
        began = perf_counter()
        outcome = func(*args, **kwargs)
        elapsed = perf_counter() - began
        print(f'Function {func.__name__}() execution time: {elapsed:.4f} seconds')
        return outcome

    return timeit_wrapper

In [289]:
def get_token(location: str):
    '''
    Function to request an API token and update the saved token if it is outdated.
    The whole content of the json file is posted to the OneMap getToken endpoint,
    and the returned token is compared with the one stored in the file.
    ##Parameters
        location: filepath (str)
            json file holding the credentials and the last saved 'access_token'
    Returns API token : str
        None is returned (and the error printed) if the file cannot be read/parsed
        or the request fails
    '''
    try:
        with open(location, 'r+') as fp:
            data = json.load(fp)
            response = requests.post("https://developers.onemap.sg/privateapi/auth/post/getToken", data=data)
            # Fail with the HTTP status rather than a confusing KeyError further down
            response.raise_for_status()
            token = response.json()
            # .get(): a credentials file that has never stored a token is still valid
            if token['access_token'] != data.get('access_token'):
                print("New token found")
                data['access_token'] = token['access_token']
                data['expiry_timestamp'] = token['expiry_timestamp']
                fp.seek(0)
                json.dump(data, fp = fp, indent=4)
                # Drop leftovers of the previous content if the new json is shorter
                fp.truncate()
                print('Updated token json')
            # data now holds the current token (it is no longer re-parsed from the stale file text)
            return data['access_token']
    except Exception as err:
        print(err)
        return None

@timeit
def datagovsg_api_call(url: str, sort: str = 'month desc', limit: int = 100, 
                       months: list = None, 
                       years: list = None) -> pd.DataFrame:
    '''
    Function to build the API call and construct the pandas dataframe
    ## Parameters
    url: str
        url for API, with resource_id parameters
    sort: str
        field, by ascending/desc, default by Latest month
    limit: int
        maximum entries; a falsy value leaves the parameter out of the call
        (the API then applies its own default of 100)
    months: list
        months desired, int between 1-12, default all twelve months
    years: list
        years desired, default ["2023"]
    Returns Dataframe of data : pd.DataFrame
        None is returned (and the error printed) if the call or the parsing fails
    '''
    # Defaults are created per call instead of sharing mutable lists in the signature
    if months is None:
        months = list(range(1, 13))
    if years is None:
        years = ["2023"]

    # One "YYYY-MM" entry per year/month combination.
    # Joining keeps the filter valid json even when there are no entries.
    periods = [f'"{year}-{str(month).zfill(2)}"' for year in years for month in months]
    month_filter = '{"month":[' + ', '.join(periods) + ']}'

    url = url+f'&sort={sort}&filters={month_filter}'
    if limit: # API call's default is 100 even without specifying
        print(f'Call limit : {limit}')
        url = url+f'&limit={limit}'
    pprint(f'API call = {url}')
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        return pd.DataFrame(data['result']['records'])
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'Other error occurred: {err}')
    return None

In [290]:
# Get the OneMap API token from the local credentials file; it is used later by the routing calls
credentials = get_token("venv/onemap.json")
# Request up to 100 resale records for April 2023 from data.gov.sg, sorted by month descending
df = datagovsg_api_call('https://data.gov.sg/api/action/datastore_search?resource_id=f1765b54-a209-4718-8d38-a39237f502b3', 
                        sort='month desc',
                        limit = 100,
                        months = [4],
                        years=[2023])
# Display the raw records
df

Call limit : 100
('API call = '
 'https://data.gov.sg/api/action/datastore_search?resource_id=f1765b54-a209-4718-8d38-a39237f502b3&sort=month '
 'desc&filters={"month":["2023-04"]}&limit=100')
Function datagovsg_api_call() execution time: 1.0154 seconds


Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,street_name,resale_price,month,remaining_lease,lease_commence_date,storey_range,_id,block
0,ANG MO KIO,2 ROOM,Improved,44,ANG MO KIO AVE 3,298000,2023-04,53 years 09 months,1978,07 TO 09,143402,314
1,ANG MO KIO,2 ROOM,Improved,44,ANG MO KIO AVE 8,305000,2023-04,56 years 02 months,1980,07 TO 09,143407,510
2,ANG MO KIO,3 ROOM,New Generation,82,ANG MO KIO AVE 1,372000,2023-04,53 years 04 months,1977,01 TO 03,143423,220
3,ANG MO KIO,3 ROOM,Model A,70,ANG MO KIO AVE 1,585000,2023-04,88 years 05 months,2012,19 TO 21,143424,308B
4,ANG MO KIO,3 ROOM,New Generation,68,ANG MO KIO AVE 10,350000,2023-04,56 years 10 months,1981,04 TO 06,143466,541
...,...,...,...,...,...,...,...,...,...,...,...,...
95,BEDOK,3 ROOM,Improved,59,BEDOK NTH ST 3,308000,2023-04,54 years,1978,10 TO 12,143933,504
96,BEDOK,3 ROOM,New Generation,67,BEDOK NTH ST 3,340000,2023-04,55 years 03 months,1979,04 TO 06,143934,526
97,BEDOK,3 ROOM,New Generation,68,BEDOK NTH ST 3,370000,2023-04,55 years 04 months,1979,10 TO 12,143935,530
98,BEDOK,3 ROOM,Improved,59,BEDOK NTH ST 3,352000,2023-04,54 years,1978,13 TO 15,143936,501


In [291]:
# from dataprep.eda import create_report
# create_report(df).show()

## Data wrangling steps
1. Reindexed dataframe using _id (unique to every resale transaction)
2. Changed room types into float values, with Executive as 4.5 rooms (extra study/balcony), and Multigeneration 6 rooms
3. Storey range was converted to avg_storey, the avg floor would be used (every value is a difference of 3 storeys)
4. Resale_price, Floor area converted to float values
5. Month was converted into datetime format, to be used to detrend the time series moving average
6. Year/Month was separated into Year and Month for visualisation purposes
7. Remaining lease was converted into remaining years (float), e.g. "53 years 09 months" becomes 53.75
8. Update capitalisation and street naming conventions (for purpose of API call later)
9. Categorised towns into regions (North, West, East, North-East, Central) https://www.hdb.gov.sg/about-us/history/hdb-towns-your-home

In [292]:
@timeit
def clean_df(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Function to clean the raw dataframe
    ##Parameters
    df : pd.DataFrame
        raw resale records; the columns read are '_id', 'flat_type', 'storey_range',
        'resale_price', 'floor_area_sqm', 'month', 'lease_commence_date',
        'remaining_lease', 'street_name', 'town' and 'block'.
        The frame passed in is not modified.
    ##Cleaning done
        1. Reindexed dataframe using _id (unique to every resale transaction)
        2. Changed room types into float values, with Executive as 4.5 rooms (extra study/balcony), and Multigeneration 6 rooms
        3. Storey range was converted to avg_storey, the avg floor would be used (every value is a difference of 3 storeys)
        4. Resale_price, Floor area converted to float values
        5. Month was converted into datetime format, to be used to detrend the time series moving average
        6. Year/Month was separated into Year and Month for visualisation purposes
        7. Remaining lease was converted into remaining years (float)
        8. Update capitalisation and street naming conventions (for purpose of API call later)
        9. Categorised towns into regions (North, West, East, North-East, Central) 
    Returns the cleaned dataframe
        If a step fails, the step number and error are printed and the partially
        cleaned frame (without the final column reordering) is returned.
    '''
    try:
        # Start
        # Step 1: set index to overall id
        # set_index returns a new frame, so the caller's dataframe stays untouched
        step = 1
        df = df.set_index('_id')
            
        # Step 2: Create feature "rooms", "avg_storey"
        def categorise_rooms(flat_type):
            '''
            Helper function for categorising number of rooms
            Only the first character is inspected: 'E'xecutive, 'M'ulti-generation,
            otherwise the leading digit of e.g. "3 ROOM"
            '''
            if flat_type[0] == 'E':
                return 4.5
            elif flat_type[0] == 'M':
                return 6.0
            else:
                return float(flat_type[0])
        
        step = 2
        df['rooms'] = df['flat_type'].apply(categorise_rooms)
        step = 3
        # "07 TO 09" -> (7 + 9) / 2 = 8.0
        df['avg_storey'] = df['storey_range'].apply(lambda x: (int(x[:2])+int(x[-2:]))/2)

        # Step 4-6: Change dtypes
        step = 4
        df['resale_price'] = df['resale_price'].astype('float')
        df['floor_area_sqm'] = df['floor_area_sqm'].astype('float')
        step = 5
        # The source values look like "2023-04" (no day), so the format must not contain %d;
        # parsing yields the first day of that month
        df['timeseries_month'] = pd.to_datetime(df['month'], format="%Y-%m")
        step = 6
        df['year'] = df['timeseries_month'].dt.year
        df['month'] = df['timeseries_month'].dt.month
        step = 7
        df['lease_commence_date'] = df['lease_commence_date'].astype('int')
        
        # Calculate remaining_lease
        def year_month_to_year(remaining_lease):
            '''
            Helper function to change year & months, into years (float)
            "53 years 09 months" -> 53.75, "54 years" -> 54.0
            '''
            remaining_lease = remaining_lease.split(' ')
            if len(remaining_lease) > 2:
                year = float(remaining_lease[0]) + float(remaining_lease[2])/12
            else:
                year = float(remaining_lease[0])
            return year
        
        df['remaining_lease'] = df['remaining_lease'].apply(year_month_to_year)

        step = 8
        # Step 8: Change capitalization of strings
        # Text columns may be object dtype or the dedicated string dtype depending on pandas version
        for column in df.columns:
            if pd.api.types.is_object_dtype(df[column]) or pd.api.types.is_string_dtype(df[column]):
                df[column] = df[column].str.title()
        
        # Update address abbreviations for onemap API call
        df['original_street_name'] = df['street_name']
        # Patterns are anchored on word boundaries so that only stand-alone abbreviations are
        # expanded and names that merely start with the same letters are left alone.
        # "St." (with the dot) is Saint, a bare "St" is Street.
        abbreviations = {r'\bSth\b': 'South',
                         r'\bNth\b': 'North',
                         r'\bSt\b(?!\.)': 'Street',
                         r'\bSt\.': 'Saint',
                         r'\bAve\b': 'Avenue',
                         r'\bDr\b': 'Drive',
                         r'\bRd\b': 'Road'}
        for abbreviation, full in abbreviations.items():
            df['street_name'] = df['street_name'].str.replace(abbreviation, full, regex=True)
        
        # Step 9: Categorise town regions
        step = 9
        town_regions = {'Sembawang' : 'North',
                    'Woodlands' : 'North',
                    'Yishun' : 'North',
                    'Ang Mo Kio' : 'North-East',
                    'Hougang' : 'North-East',
                    'Punggol' : 'North-East',
                    'Sengkang' : 'North-East',
                    'Serangoon' : 'North-East',
                    'Bedok' : 'East',
                    'Pasir Ris' : 'East',
                    'Tampines' : 'East',
                    'Bukit Batok' : 'West',
                    'Bukit Panjang' : 'West',
                    'Choa Chu Kang' : 'West',
                    'Clementi' : 'West',
                    'Jurong East' : 'West',
                    'Jurong West' : 'West',
                    'Tengah' : 'West',
                    'Bishan' : 'Central',
                    'Bukit Merah' : 'Central',
                    'Bukit Timah' : 'Central',
                    'Central Area' : 'Central',
                    'Geylang' : 'Central',
                    'Kallang/Whampoa' : 'Central',
                    'Marine Parade' : 'Central',
                    'Queenstown' : 'Central',
                    'Toa Payoh' : 'Central'}      
        # A town missing from the mapping raises KeyError, reported below as a step 9 error
        df['region'] = df['town'].apply(lambda x: town_regions[x])
    except Exception as err:
        print(f"Error at step {step}, error message: {err}")
    else:
        # Reorder columns
        df = df[['resale_price', 'year', 'month', 'timeseries_month', 'region', 'town', 'rooms', 'avg_storey', 'floor_area_sqm', 'remaining_lease',
                 'block', 'street_name', 'original_street_name']]
                # Unused columns - 'lease_commence_date', 'flat_model', 'storey_range', 'flat_type'
    return df

In [293]:
# Clean the raw records; the name df now refers to the cleaned dataframe
df = clean_df(df)
# Check the dtypes of the cleaned columns
df.dtypes

Function clean_df() execution time: 0.0188 seconds


resale_price                   float64
year                             int64
month                            int64
timeseries_month        datetime64[ns]
region                          object
town                            object
rooms                          float64
avg_storey                     float64
floor_area_sqm                 float64
remaining_lease                float64
block                           object
street_name                     object
original_street_name            object
dtype: object

In [294]:
# Display the cleaned dataframe
df

Unnamed: 0_level_0,resale_price,year,month,timeseries_month,region,town,rooms,avg_storey,floor_area_sqm,remaining_lease,block,street_name,original_street_name
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
143402,298000.0,2023,4,2023-04-01,North-East,Ang Mo Kio,2.0,8.0,44.0,53.750000,314,Ang Mo Kio Avenue 3,Ang Mo Kio Ave 3
143407,305000.0,2023,4,2023-04-01,North-East,Ang Mo Kio,2.0,8.0,44.0,56.166667,510,Ang Mo Kio Avenue 8,Ang Mo Kio Ave 8
143423,372000.0,2023,4,2023-04-01,North-East,Ang Mo Kio,3.0,2.0,82.0,53.333333,220,Ang Mo Kio Avenue 1,Ang Mo Kio Ave 1
143424,585000.0,2023,4,2023-04-01,North-East,Ang Mo Kio,3.0,20.0,70.0,88.416667,308B,Ang Mo Kio Avenue 1,Ang Mo Kio Ave 1
143466,350000.0,2023,4,2023-04-01,North-East,Ang Mo Kio,3.0,5.0,68.0,56.833333,541,Ang Mo Kio Avenue 10,Ang Mo Kio Ave 10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
143933,308000.0,2023,4,2023-04-01,East,Bedok,3.0,11.0,59.0,54.000000,504,Bedok North Street 3,Bedok Nth St 3
143934,340000.0,2023,4,2023-04-01,East,Bedok,3.0,5.0,67.0,55.250000,526,Bedok North Street 3,Bedok Nth St 3
143935,370000.0,2023,4,2023-04-01,East,Bedok,3.0,11.0,68.0,55.333333,530,Bedok North Street 3,Bedok Nth St 3
143936,352000.0,2023,4,2023-04-01,East,Bedok,3.0,14.0,59.0,54.000000,501,Bedok North Street 3,Bedok Nth St 3


## 3. Feature Engineering (Location data)

Lastly, location plays a huge role in house pricing, hence the following location features are added:

3.1 Obtaining latitude, longitude, postal codes

3.2 Obtaining MRT locations

3.3 Determine nearest MRT and traveling time

### 3.1 
Using street name and block, I utilized OneMap API to obtain the latitude, longitude, and postal codes of each flat https://www.onemap.gov.sg/docs

In [295]:
@timeit
def get_location_data(df: pd.DataFrame):
    '''
    Function to look up latitude, longitude and postal code of every row through the OneMap search API
    ## Parameters
    df : pd.DataFrame
        should contain string columns ['block'] and ['street_name']
    Returns pd.DataFrame with the same index as df and columns
        ['lat_long', 'postal_code', 'latitude', 'longitude', 'numpy_array']
        None is returned (and the error printed) if the responses cannot be split/converted
    '''
    # Placeholder with the same "lat,long postal" layout as a real answer, so a failed
    # lookup still goes through the splitting below
    no_result = '0,0 0'

    # Getting latitude, longitude, postal code
    def get_lat_long(row, sleeptime : float =0.15):
        '''
        API call to get latitude, longitude, and postal code
        ## Parameters
        row : pd.Series
            one row of the dataframe, should contain ['block'] and ['street_name']
        sleeptime : float
            Incorporates sleep time to not exceed a max of 250 calls per min
            Default 0.15s
        Returns "latitude,longitude postal" : str, '0,0 0' if the lookup fails
        '''
        # Lag time between calls
        sleep(sleeptime)

        # API call
        address = row['block'] + ', ' + row['street_name']
        call = f'https://developers.onemap.sg/commonapi/search?searchVal={address}&returnGeom=Y&getAddrDetails=Y'
        try:
            response = requests.get(call)
            response.raise_for_status()
            data = response.json()
            # Only the first search result is used
            best_match = data['results'][0]
            return best_match['LATITUDE'] + ',' + best_match['LONGITUDE'] + ' ' + best_match['POSTAL']
        except HTTPError as http_err:
            print(f'HTTP error - get_lat_long() API call: {http_err}')
        except Exception as err:
            print(f'Error occurred - get_lat_long() API call: {err} on the following call:')
            pprint(call)
        # Both failure paths return the placeholder; a None here would break the split below
        return no_result

    def to_numpy_array(lat_long_row):
        combi = np.array([lat_long_row[0], lat_long_row[1]])
        return combi
    

    # This calls the API call function row wise
    position = df.apply(get_lat_long, axis=1)

    try:
        # Column 0: "lat,long", column 1: postal code
        temp_df = position.str.split(expand=True)
        postal_codes = temp_df[1].apply(lambda x: 0 if x=='NIL' else x)
        # Replace the whole column so that the integer dtype is kept
        temp_df[1] = postal_codes.astype('int')
        lat_long_df = temp_df[0].str.split(pat=',', expand=True)
        lat_long_df = lat_long_df.astype('float')
        numpy_array = lat_long_df.apply(to_numpy_array, axis=1)
        
    except Exception as err:
        print(f"Error occurred - Splitting data : {err}")
        return None

    geo_data_df = pd.concat([temp_df, lat_long_df, numpy_array], axis=1)
    geo_data_df.columns = ['lat_long', 'postal_code', 'latitude', 'longitude', 'numpy_array']
    return geo_data_df

In [296]:
# Look up coordinates and postal code for every flat (one OneMap request per row)
geo_data_df= get_location_data(df)
# Check the dtypes of the location columns
geo_data_df.dtypes

Function get_location_data() execution time: 23.7905 seconds


lat_long        object
postal_code      int32
latitude       float64
longitude      float64
numpy_array     object
dtype: object

In [297]:
# Display the location data
geo_data_df

Unnamed: 0_level_0,lat_long,postal_code,latitude,longitude,numpy_array
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
143402,"1.36622707120636,103.850085858983",560314,1.366227,103.850086,"[1.36622707120636, 103.850085858983]"
143407,"1.37340092645025,103.849073244454",560510,1.373401,103.849073,"[1.37340092645025, 103.849073244454]"
143423,"1.36558833593063,103.840518883254",560220,1.365588,103.840519,"[1.36558833593063, 103.840518883254]"
143424,"1.3652661423815,103.844538059044",562308,1.365266,103.844538,"[1.3652661423815, 103.844538059044]"
143466,"1.37392238703482,103.855621370524",560541,1.373922,103.855621,"[1.37392238703482, 103.855621370524]"
...,...,...,...,...,...
143933,"1.33188000305479,103.932389359044",460504,1.331880,103.932389,"[1.33188000305479, 103.932389359044]"
143934,"1.33414486865912,103.929639233409",460526,1.334145,103.929639,"[1.33414486865912, 103.929639233409]"
143935,"1.33371332018833,103.927020293564",460530,1.333713,103.927020,"[1.33371332018833, 103.927020293564]"
143936,"1.33164071248973,103.931167665142",460501,1.331641,103.931168,"[1.33164071248973, 103.931167665142]"


### 3.2 
The location of all MRT stations was also obtained using OneMap API and saved as a json file locally

In [298]:
def update_mrt_coordinates(mrt_stations=None, filepath='static/mrt_dict.json'):
    '''
    Function to API call for MRT station coordinates and write to json file
    Stations whose lookup fails are reported and left out of the file.
    ## Parameters
    mrt_stations : list
        list of mrt station names, default to All stations if nothing is given
    filepath : str
        filepath and name of json file to write to, should end with .json
        (the file is overwritten)
    Returns None
    '''
    if not mrt_stations:
        mrt_stations = ['Admiralty MRT', 'Aljunied MRT', 'Ang Mo Kio MRT', 'Bakau LRT', 'Bangkit LRT', 'Bartley MRT', 'Bayfront MRT',
                        'Bayshore MRT', 'Beauty World MRT', 'Bedok MRT', 'Bedok North MRT', 'Bedok Reservoir MRT', 'Bencoolen MRT',
                        'Bendemeer MRT', 'Bishan MRT', 'Boon Keng MRT', 'Boon Lay MRT', 'Botanic Gardens MRT', 'Braddell MRT',
                        'Bras Basah MRT', 'Buangkok MRT', 'Bugis MRT', 'Bukit Batok MRT', 'Bukit Brown MRT', 'Bukit Gombak MRT',
                        'Bukit Panjang MRT', 'Buona Vista MRT', 'Caldecott MRT', 'Cashew MRT', 'Changi Airport MRT',
                        'Chinatown MRT', 'Chinese Garden MRT', 'Choa Chu Kang MRT', 'City Hall MRT', 'Clarke Quay MRT',
                        'Clementi MRT', 'Commonwealth MRT', 'Compassvale LRT', 'Cove LRT', 'Dakota MRT', 'Dhoby Ghaut MRT',
                        'Downtown MRT', 'Xilin MRT', 'Tampines East MRT', 'Mayflower MRT', 'Upper Thomson MRT',
                        'Lentor MRT', 'Woodlands North MRT', 'Woodlands South MRT', 'Esplanade MRT', 'Eunos MRT',
                        'Expo MRT', 'Fajar LRT', 'Farmway LRT', 'Farrer Park MRT', 'Fort Canning MRT',
                        'Gardens by the Bay MRT', 'Geylang Bahru MRT', 'HarbourFront MRT', 'Haw Par Villa MRT', 'Hillview MRT',
                        'Holland Village MRT', 'Hougang MRT', 'Jalan Besar MRT', 'Joo Koon MRT', 'Jurong East MRT',
                        'Jurong West MRT', 'Kadaloor LRT', 'Kaki Bukit MRT', 'Kallang MRT', 'Kembangan MRT', 'Keppel MRT',
                        'King Albert Park MRT', 'Kovan MRT', 'Kranji MRT', 'Labrador Park MRT', 'Lakeside MRT', 'Lavender MRT',
                        'Layar LRT', 'Little India MRT', 'Lorong Chuan MRT', 'MacPherson MRT', 'Marina Bay MRT', 'Marina South Pier MRT',
                        'Marsiling MRT', 'Marymount MRT', 'Mattar MRT', 'Meridian LRT', 'Mountbatten MRT',
                        'Newton MRT', 'Nibong LRT', 'Nicoll Highway MRT', 'Novena MRT', 'Oasis LRT', 'One-North MRT', 'Orchard MRT',
                        'Outram Park MRT', 'Paya Lebar MRT', 'Pasir Ris MRT',
                        'Pioneer MRT', 'Potong Pasir MRT', 'Promenade MRT', 'Punggol MRT', 'Queenstown MRT', 'Raffles Place MRT', 'Redhill MRT',
                        'Riviera LRT', 'Rochor MRT', 'Sembawang MRT', 'Sengkang MRT', 'Serangoon MRT', 'Simei MRT', 'Sixth Avenue MRT', 
                        'Somerset MRT', 'Springleaf MRT', 'Stadium MRT', 'Stevens MRT', 'Sumang LRT', 'Tai Seng MRT', 'Tampines MRT', 
                        'Tampines West MRT', 'Tanah Merah MRT', 'Tanjong Pagar MRT', 'Tanjong Rhu MRT', 'Teck Lee LRT', 
                        'Telok Ayer MRT', 'Telok Blangah MRT', 'Thanggam LRT', 'Tiong Bahru MRT', 'Toa Payoh MRT', 
                        'Tuas Crescent MRT', 'Tuas Link MRT', 'Tuas West Road MRT', 'Ubi MRT', 'Upper Changi MRT', 
                        'Woodlands MRT', 'Yew Tee MRT', 'Yio Chu Kang MRT', 'Yishun MRT']
    # Future stations - 'Tampines North MRT', 'Tengah MRT'

    # Results are keyed by station name, so a repeated name would only cost an extra API call;
    # drop repeats while keeping the first-seen order
    mrt_stations = list(dict.fromkeys(mrt_stations))

    mrt_coordinates = {}

    for mrt in mrt_stations:
        # Reset per station so the error message never shows the previous station's
        # response (or fails on an unset name when the request itself fails)
        data = None
        try:
            response = requests.get(f"https://developers.onemap.sg/commonapi/search?searchVal={mrt}&returnGeom=Y&getAddrDetails=Y")
            response.raise_for_status()
            data = response.json()
            # string (lat,long) as key
            # mrt_coordinates[f"{data['results'][0]['LATITUDE']},{data['results'][0]['LONGITUDE']}"] = mrt
            # Only the first search result is used
            mrt_coordinates[mrt] = (float(data['results'][0]['LATITUDE']),float(data['results'][0]['LONGITUDE']))
        except HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
        except Exception as err:
            print(f'Other error occurred: {err}')
            print(f'Error for {mrt, data}')

    with open(filepath, 'w')as f:
        json.dump(mrt_coordinates, f, indent=4)

def get_mrt_coordinates(filepath = 'static/mrt_dict.json'):
    '''
    Function to load the mrt coordinates previously saved to a json file
    ## Parameters
    filepath : str
        location of the json file to read
    Returns the decoded json content : dictionary
    '''
    with open(filepath, 'r') as handle:
        return json.load(handle)


Load Json file and convert to numpy array to utilize matrix operations.

In [299]:
# Load the saved station coordinates from the default json file (static/mrt_dict.json)
mrt_coordinates_dict = get_mrt_coordinates()

# Convert coordinates into numpy arrays
# Both arrays come from the same dict, so position i refers to the same station in each
mrt_stations = np.array(list(mrt_coordinates_dict.keys()))
mrt_coordinates = np.array(list(mrt_coordinates_dict.values()))

### 3.3 
* Using the two data above, I am able to determine the nearest MRT station.
* The minimum travelling time (walk and public transport) to the nearest MRT will be an additional feature of the dataset.

In [300]:
def find_nearest_stations(geo_data_df : pd.DataFrame, mrt_stations : np.array=mrt_stations, mrt_coordinates : np.array=mrt_coordinates, 
                          n_nearest_stations: int=2, verbose : int=0):
    '''
    Function to determine nearest MRT station of the resale_flat based on latitude and longitude
    Closeness is measured as |latitude difference| + |longitude difference| (in degrees).
    ## Parameters
    geo_data_df : pd.DataFrame
        when used through DataFrame.apply(axis=1) this is a single row;
        only its ['numpy_array'] entry (latitude, longitude) is read
    mrt_stations : np.array
        station names, in the same order as mrt_coordinates
    mrt_coordinates : np.array
        one (latitude, longitude) pair per station
    n_nearest_stations: int=2
        number of stations to return
    verbose : int=0
        1 to print the intermediate arrays

    Returns a list of n_nearest stations, nearest first
        (shorter if fewer stations are available)
    '''
    # Matrix substraction to get difference with each MRT, convert to absolute values
    distance_diff = geo_data_df['numpy_array'] - mrt_coordinates
    absolute_dist = np.absolute(distance_diff)

    # Matrix sum over latitude and longitude of each entry
    sum_of_distances = np.sum(absolute_dist, axis=1)

    # Rank the stations by distance. Ranking by position (instead of searching for each
    # sorted value again) keeps equally distant stations apart, so a tie cannot return
    # the same station twice.
    nearest_order = np.argsort(sum_of_distances, kind='stable')
    sorted_distances = sum_of_distances[nearest_order]
    nearest_stations = list(mrt_stations[nearest_order[:n_nearest_stations]])

    if verbose==1:
        print(f'Difference in distances: \n{distance_diff[:5]}')
        print()
        print(f'Absolute difference: \n{absolute_dist[:5]}')
        print()
        print(f'Sum of distances \n {sum_of_distances[:5]}')
        print()
        print(f'Sorted distances\n{sorted_distances[:5]}')
        print()
        print(f'Top {n_nearest_stations}')
        print(nearest_stations)

    return nearest_stations

In [301]:
# Run find_nearest_stations on every row; each row gives a list with the 2 nearest station names
nearest_stations = geo_data_df.apply(find_nearest_stations, n_nearest_stations=2, axis=1, verbose=0)
# Spread the lists over columns (labelled 0 and 1), keeping the index of geo_data_df
nearest_stations_df = pd.DataFrame(nearest_stations.tolist(), index=geo_data_df.index)
# Append the station columns to the location data
geo_data_df = pd.concat([geo_data_df, nearest_stations_df], axis=1)

In [305]:
def route_api_call(routeType, date = '01-26-2023', time = '07:35:00', mode = 'TRANSIT', maxWalkDistance = 1000, numItineraries = 2, verbose=0,
                   start=None, end=None, token=None):
    '''
    Function to api call OneMap for routing
    ## Parameters
    routeType : str
        option between ['walk','drive','cycle', 'pt']
    Below only applicable if routeType == 'pt'
        date : str MM-DD-YYYY
            default '01-26-2023'
        time : str HH:MM:SS
            default '07:35:00'
        mode : str 
            choose between TRANSIT, BUS, RAIL
            default 'TRANSIT'
        maxWalkDistance : int
            max walking distance allowed, in meters
            default 1000
        numItineraries : int 
            number of suggested routes
            default 2
    verbose : int 
        1 to print time and distance, 2 for the whole json response
        default 0
    start : str
        origin as "latitude,longitude"
    end : str
        destination as "latitude,longitude"
    token : str
        OneMap API token
    When start, end or token is not given, the module-level names
    start, end and credentials are used instead (previous behaviour).
    
    ### Returns (time, distance) for chosen routeType
        time is in seconds 
        total_distance is in metres.
        For 'pt' the time is walk + transit + waiting time of the first itinerary
        and the distance is its walking distance.
    Raises KeyError for an unknown routeType, ValueError if start/end are missing,
    and the requests HTTP error if the call fails.
    '''
    if routeType not in ('walk', 'drive', 'cycle', 'pt'):
        raise KeyError("Enter valid routeType, choose between 'walk','drive','cycle', 'pt'")

    # Explicit arguments take priority; the module-level names are only a fallback
    module_names = globals()
    if start is None:
        start = module_names.get('start')
    if end is None:
        end = module_names.get('end')
    if token is None:
        token = module_names.get('credentials')
    if start is None or end is None:
        raise ValueError("route_api_call needs start and end, each as 'latitude,longitude'")

    route_url = (f"https://developers.onemap.sg/privateapi/routingsvc/route?start={start}&end={end}"
                 f"&routeType={routeType}&token={token}")

    # Public transport
    if routeType == 'pt':
        response = requests.get(f"{route_url}&date={date}&time={time}&mode={mode}&maxWalkDistance={maxWalkDistance}&numItineraries={numItineraries}")
        response.raise_for_status()
        data = response.json()
        # Only the first suggested itinerary is summarised
        itinerary = data['plan']['itineraries'][0]
        summary = {'walkTime': itinerary['walkTime'],
                   'transitTime': itinerary['transitTime'],
                   'waitingTime': itinerary['waitingTime']
                   }
        pt_time = sum(summary.values())
        pt_walk_distance = itinerary['walkDistance']
        if verbose==1:
            pprint(summary)
            print(f'Total time: {pt_time}')
            print(f'Walk distance: {pt_walk_distance}')
        if verbose==2:
            pprint(data)

        return pt_time, pt_walk_distance

    # Walk, drive, cycle
    response = requests.get(route_url)
    response.raise_for_status()
    data = response.json()
    total_time = data['route_summary']['total_time']
    distance = data['route_summary']['total_distance']
    if verbose==1:
        print(f'Total time: {total_time}')
        print(f'Total distance: {distance}')
    if verbose==2:
        pprint(data)

    return total_time, distance


def time_taken_to_station(geo_data_df, mrt_coordinates_dict=mrt_coordinates_dict, credentials=credentials,
                          n_nearest_stations=2):
    '''
    Function to add walking and public transport travel data to the nearest stations
    ## Parameters
    geo_data_df : pd.Series
        one row of the location data (used with DataFrame.apply(axis=1)); should contain
        ['lat_long'] and the station names under the labels 0 .. n_nearest_stations-1
    mrt_coordinates_dict : dictionary
        station name -> (latitude, longitude)
    credentials : str
        OneMap API token
    n_nearest_stations : int
        number of station columns to process, default 2
    Returns a copy of the row with, per station index i, the entries
        'walkTime<i>' = (time, distance) and 'pt_time<i>' = (time, walk distance)
        Stations whose routing call fails are reported and skipped.
    '''
    # Work on a copy so the row handed in by apply() is not modified
    geo_data_df = geo_data_df.copy()
    start = geo_data_df['lat_long']
    columns = geo_data_df[[x for x in range(n_nearest_stations)]]

    for index, mrt_station in enumerate(columns):
        end = ','.join(str(x) for x in mrt_coordinates_dict[mrt_station])
        try:
            # start/end/token are passed explicitly: route_api_call cannot see the
            # local variables of this function
            walkTime, walkDistance = route_api_call('walk', start=start, end=end, token=credentials)
            pt_time, pt_walk_distance = route_api_call('pt', numItineraries = 1, start=start, end=end, token=credentials)
        except HTTPError as http_err:
            print(f'HTTP error occurred: {http_err}')
        except Exception as err:
            print(f'Other error occurred: {err}')
        else:
            # NOTE(review): each entry stores a (time, distance) tuple in one cell - confirm this is intended
            geo_data_df['walkTime'+str(index)] = walkTime, walkDistance
            geo_data_df['pt_time'+str(index)] = pt_time, pt_walk_distance
    return geo_data_df

In [None]:
# Add walking / public transport travel data to the nearest stations, row by row
geo_data_df = geo_data_df.apply(time_taken_to_station, axis=1)
# Display the result
geo_data_df

In [None]:
# Scratch cell kept for reference: everything below is one string literal, so none of it is executed.
# It holds an earlier inline version of the walk / public transport routing calls.
'''routeType = ['walk', 'pt', 'drive', 'cycle']

start = "1.36622707120636,103.850085858983"
end = "1.36993284962262,103.84955809232"

for num in range(2):
    try:
        if num == 0:
            response = requests.get(f"https://developers.onemap.sg/privateapi/routingsvc/route?start={start}&end={end}&routeType={routeType[num]}&token={credentials}")
        else:
            date = '01-26-2023'
            time = '07:35:00'
            mode = 'TRANSIT'
            maxWalkDistance = 1000
            numItineraries = 1
            response = requests.get(f"https://developers.onemap.sg/privateapi/routingsvc/route?start={start}&end={end}&routeType={routeType[num]}&token={credentials}&date={date}&time={time}&mode={mode}&maxWalkDistance={maxWalkDistance}&numItineraries={numItineraries}")
        response.raise_for_status()
        data = response.json()
        # pprint(data)
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
        print(response.json())
    except Exception as err:
        print(f'Other error occurred: {err}')
    
    if num == 0:
        # walking will only have 1 best route
        pprint(data)
        walk_time = data['route_summary']['total_time']
        walk_distance = data['route_summary']['total_distance']
        print(f'Walk time: {walk_time}')
        print(f'Walk distance: {walk_distance}')
        print('-'*50)
    else:
        pprint(data)
        summary = {'walkTime': data['plan']['itineraries'][0]['walkTime'],
                   'transitTime': data['plan']['itineraries'][0]['transitTime'],
                   'waitingTime': data['plan']['itineraries'][0]['waitingTime']
                   }
        total_time = sum(summary.values())
        walk_distance = data['plan']['itineraries'][0]['walkDistance']
        print(f'Total time: {total_time}')
        print(f'Walk distance: {walk_distance}')'''

In [22]:
# Ask before writing to disk; only an exact 'y' saves the cleaned dataframe as csv
save = input('Save? y/n')
if save == 'y':
    df.to_csv('static/2023_APR.csv')