In [107]:
import requests
from requests.exceptions import HTTPError
import pandas as pd
from time import sleep
from pprint import pprint
from datetime import date

# add headers
# cache calls

def datagovsg_api_call(url, sort = 'month desc', limit = 10000, years=["2023"]):
    '''
    Function to build the API call and construct the pandas dataframe
    Inputs:
        url: url for API, with resource_id parameters
        sort: field, by ascending/desc
        limit: maximum entries
        years: list of years data required
    Returns a pandas dataframe of the data
    '''
    month_dict = '{"month":['
    for year in years:
        for month in range(1,13):
            month_dict = month_dict + f'"{year}-{str(month).zfill(2)}", '
    month_dict = month_dict[:-2] 
    month_dict = month_dict + ']}'
    url = url+f'&sort={sort}&filters={month_dict}'
    if limit:
        print(f'Call limit : {limit}')
        url = url+f'&limit={limit}'
    print(f'API call = {url}')
    try:
        response = requests.get(url)
        response.raise_for_status()
        data = response.json()
        df = pd.DataFrame(data['result']['records'])
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'Other error occurred: {err}')
    else:
        return df

df = datagovsg_api_call('https://data.gov.sg/api/action/datastore_search?resource_id=f1765b54-a209-4718-8d38-a39237f502b3')
df

Call limit : 10000
API call = https://data.gov.sg/api/action/datastore_search?resource_id=f1765b54-a209-4718-8d38-a39237f502b3&sort=month desc&filters={"month":["2023-01", "2023-02", "2023-03", "2023-04", "2023-05", "2023-06", "2023-07", "2023-08", "2023-09", "2023-10", "2023-11", "2023-12"]}&limit=10000


Unnamed: 0,town,flat_type,flat_model,floor_area_sqm,street_name,resale_price,month,remaining_lease,lease_commence_date,storey_range,_id,block
0,BEDOK,4 ROOM,Simplified,84,BEDOK RESERVOIR RD,470888,2023-04,63 years 01 month,1987,07 TO 09,150191,149
1,BEDOK,5 ROOM,Improved,121,BEDOK RESERVOIR RD,595000,2023-04,56 years 08 months,1980,07 TO 09,150208,710
2,BEDOK,5 ROOM,Improved,123,BEDOK RESERVOIR RD,595000,2023-04,58 years 02 months,1982,10 TO 12,150207,604
3,BEDOK,4 ROOM,New Generation,96,BEDOK RESERVOIR RD,490000,2023-04,61 years 03 months,1985,04 TO 06,150190,113
4,BEDOK,4 ROOM,Simplified,84,BEDOK RESERVOIR RD,520000,2023-04,62 years 06 months,1986,10 TO 12,150189,140
...,...,...,...,...,...,...,...,...,...,...,...,...
8359,ANG MO KIO,3 ROOM,New Generation,67,ANG MO KIO AVE 4,375000,2023-01,54 years 08 months,1978,10 TO 12,143442,110
8360,ANG MO KIO,3 ROOM,New Generation,68,ANG MO KIO AVE 4,375000,2023-01,57 years,1980,07 TO 09,143441,607
8361,ANG MO KIO,3 ROOM,New Generation,69,ANG MO KIO AVE 4,346800,2023-01,56 years 11 months,1980,01 TO 03,143440,163
8362,ANG MO KIO,2 ROOM,Improved,49,ANG MO KIO AVE 3,300000,2023-01,53 years 06 months,1977,04 TO 06,143399,323


In [108]:
# from dataprep.eda import create_report
# create_report(df).show()

Data wrangling and feature engineering

1. Town, keep but add new field (region) - str
2. Room types, change to numbers. Executive 4.5, multigeneration 6 - float
3. Examine flat model if similar to room types
4. Floor area keep - int - bin
5. Street name and block - api call into latitude longitude
6. Month - date
7. Storey range - int (every 3 stories is 1)
8. Lease commence - date, calculate remaining
9. _id change into index

In [109]:
def clean_df(df):
    '''
    function to clean the raw dataframe
    '''
    # Start
    # set index to overall id
    df.set_index('_id', inplace=True)
        
    # Create feature "rooms", "max_storey"
    def categorise_rooms(flat_type):
        '''
        Helper function for categorising number of rooms
        '''
        if flat_type[0] == 'E':
            return 4.5
        elif flat_type[0] == 'M':
            return 6.0
        else:
            return float(flat_type[0])
        
    df['rooms'] = df['flat_type'].apply(categorise_rooms)
    df['max_storey'] = df['storey_range'].apply(lambda x: int(x[-2:]))

    # Change dtypes
    df['lease_commence_date'] = df['lease_commence_date'].astype('int')
    df['resale_price'] = df['resale_price'].astype('float')
    df['floor_area_sqm'] = df['floor_area_sqm'].astype('float')
    df['month'] = pd.to_datetime(df['month'], format="%Y-%m-%d")
    
    # Calculate remaining_lease
    def year_month_to_year(remaining_lease):
        '''
        Helper function to change year & months, into years (float)
        '''
        remaining_lease = remaining_lease.split(' ')
        if len(remaining_lease) > 2:
            year = float(remaining_lease[0]) + float(remaining_lease[2])/12
        else:
            year = float(remaining_lease[0])
        return year
    
    df['remaining_lease'] = df['remaining_lease'].apply(year_month_to_year)

    # Change capitalization of strings
    for column in df.columns:
        if df[column].dtype == 'O':
            df[column] = df[column].str.title()
    
    # Update address abbreviations for onemap API call
    df['original_street_name'] = df['street_name']
    abbreviations = {'Sth':'South', 'St ':'Street ', 'St.':'Saint', 'Nth':'North', 'Ave':'Avenue', 'Dr':'Drive', 'Rd':'Road'}
    for abbreviation, full in abbreviations.items():
        df['street_name'] = df['street_name'].str.replace(abbreviation, full)
    
    # Categorise town regions
    town_regions = {'Sembawang' : 'North',
                'Woodlands' : 'North',
                'Yishun' : 'North',
                'Ang Mo Kio' : 'North-East',
                'Hougang' : 'North-East',
                'Punggol' : 'North-East',
                'Sengkang' : 'North-East',
                'Serangoon' : 'North-East',
                'Bedok' : 'East',
                'Pasir Ris' : 'East',
                'Tampines' : 'East',
                'Bukit Batok' : 'West',
                'Bukit Panjang' : 'West',
                'Choa Chu Kang' : 'West',
                'Clementi' : 'West',
                'Jurong East' : 'West',
                'Jurong West' : 'West',
                'Tengah' : 'West',
                'Bishan' : 'Central',
                'Bukit Merah' : 'Central',
                'Bukit Timah' : 'Central',
                'Central Area' : 'Central',
                'Geylang' : 'Central',
                'Kallang/Whampoa' : 'Central',
                'Marine Parade' : 'Central',
                'Queenstown' : 'Central',
                'Toa Payoh' : 'Central'}      
    df['region'] = df['town'].apply(lambda x: town_regions[x])

    # Getting latitude, longitude, postal code
    def get_lat_long(df):
        '''
        API call to get latitude, longitude, and postal code
        Incorporates 0.25 sleep time to have a max of 250 calls per min
        '''
        sleep(0.24)
        address = df['block'] + ', ' + df['street_name']
        try:
            call = f'https://developers.onemap.sg/commonapi/search?searchVal={address}&returnGeom=Y&getAddrDetails=Y'
            response = requests.get(call)
            response.raise_for_status()
            data = response.json()
            return data['results'][0]['LATITUDE'] + ',' + data['results'][0]['LONGITUDE'] + ' ' + data['results'][0]['POSTAL']
        except HTTPError as http_err:
            print(f'HTTP error occurred during get_lat_long: {http_err}')
        except Exception as err:
            print(f'Error occurred during get_lat_long: {err} on the following call:')
            pprint(call)

    df['position'] = df.apply(get_lat_long, axis=1)
    try:
        df['postal_code'] = df['position'].apply(lambda x: x.split()[1])
        df['lat_long'] = df['position'].apply(lambda x: x.split()[0])
    except Exception as err:
        print(f'Error splitting postal_code from lat_long: {err}')

    # Reorder columns
    df = df[['resale_price', 'month', 'region', 'town', 'rooms', 'max_storey', 'floor_area_sqm', 'remaining_lease',
             'lat_long', 'postal_code']]
    
             # Unused columns - 'block', 'street_name', 'original_street_name', 'lease_commence_date', 'flat_model', 'storey_range', 'flat_type'

    return df

In [None]:
df = clean_df(df)
df.dtypes

In [94]:
df.to_csv('check.csv')

In [None]:
start = df.loc[,'lat_long']
routeType = ['walk', 'drive', 'pt', 'cycle']
try:
    response = requests.get(f"https://developers.onemap.sg/privateapi/routingsvc/route?start={start}&end={end}&routeType={routeType}&token={token}")
    response.raise_for_status()
    data = response.json()
    pprint(data)
except HTTPError as http_err:
    print(f'HTTP error occurred: {http_err}')
except Exception as err:
    print(f'Other error occurred: {err}')

In [117]:
mrt_stations = ['Admiralty MRT', 'Aljunied MRT', 'Ang Mo Kio MRT', 'Bakau LRT', 'Bangkit LRT', 'Bartley MRT', 'Bayfront MRT',
                'Bayshore MRT', 'Beauty World MRT', 'Bedok MRT', 'Bedok North MRT', 'Bedok Reservoir MRT', 'Bencoolen MRT',
                'Bendemeer MRT', 'Bishan MRT', 'Boon Keng MRT', 'Boon Lay MRT', 'Botanic Gardens MRT', 'Braddell MRT',
                'Bras Basah MRT', 'Buangkok MRT', 'Bugis MRT', 'Bukit Batok MRT', 'Bukit Brown MRT', 'Bukit Gombak MRT',
                'Bukit Panjang MRT', 'Buona Vista MRT', 'Caldecott MRT', 'Cashew MRT', 'Changi Airport MRT',
                'Chinatown MRT', 'Chinese Garden MRT', 'Choa Chu Kang MRT', 'City Hall MRT', 'Clarke Quay MRT',
                'Clementi MRT', 'Commonwealth MRT', 'Compassvale LRT', 'Cove LRT', 'Dakota MRT', 'Dhoby Ghaut MRT',
                'Downtown MRT', 'Xilin MRT', 'Tampines East MRT', 'Mayflower MRT', 'Upper Thomson MRT',
                'Lentor MRT', 'Woodlands North MRT', 'Woodlands South MRT', 'Esplanade MRT', 'Eunos MRT',
                'Expo MRT', 'Fajar LRT', 'Farmway LRT', 'Farrer Park MRT', 'Fort Canning MRT', 'Gali Batu LRT',
                'Gardens by the Bay MRT', 'Geylang Bahru MRT', 'HarbourFront MRT', 'Haw Par Villa MRT', 'Hillview MRT',
                'Holland Village MRT', 'Hougang MRT', 'Jalan Besar MRT', 'Joo Koon MRT', 'Jurong East MRT',
                'Jurong West MRT', 'Kadaloor LRT', 'Kaki Bukit MRT', 'Kallang MRT', 'Kembangan MRT', 'Keppel MRT',
                'King Albert Park MRT', 'Kovan MRT', 'Kranji MRT', 'Labrador Park MRT', 'Lakeside MRT', 'Lavender MRT',
                'Layar LRT', 'Little India MRT', 'Lorong Chuan MRT', 'MacPherson MRT', 'Marina Bay MRT', 'Marina South Pier MRT',
                'Marsiling MRT', 'Marymount MRT', 'Mattar MRT', 'Meridian LRT', 'Mount Faber LRT', 'Mountbatten MRT',
                'Newton MRT', 'Nibong LRT', 'Nicoll Highway MRT', 'Novena MRT', 'Oasis LRT', 'One-North MRT', 'Orchard MRT',
                'Outram Park MRT', 'Paya Lebar MRT', 'Pasir Ris MRT', 'Paya Lebar MRT', 'Pasir Ris MRT', 'Paya Lebar MRT', 'Pasir Ris MRT', 
                'Pioneer MRT', 'Potong Pasir MRT', 'Promenade MRT', 'Punggol MRT', 'Queenstown MRT', 'Raffles Place MRT', 'Redhill MRT',
                  'Riviera LRT', 'Rochor MRT', 'Sembawang MRT', 'Sengkang MRT', 'Serangoon MRT', 'Simei MRT', 'Sixth Avenue MRT', 
                  'Somerset MRT', 'Springleaf MRT', 'Stadium MRT', 'Stevens MRT', 'Sumang LRT', 'Tai Seng MRT', 'Tampines MRT', 
                  'Tampines East MRT', 'Tampines West MRT', 'Tanah Merah MRT', 'Tanjong Pagar MRT', 'Tanjong Rhu MRT', 'Teck Lee LRT', 
                  'Telok Ayer MRT', 'Telok Blangah MRT', 'Tengah MRT', 'Thanggam LRT', 'Tiong Bahru MRT', 'Toa Payoh MRT', 
                  'Tampines North LRT', 'Tuas Crescent MRT', 'Tuas Link MRT', 'Tuas West Road MRT', 'Ubi MRT', 'Upper Changi MRT', 
                  'Woodlands MRT', 'Woodlands South MRT', 'Woodlands North MRT', 'Yew Tee MRT', 'Yio Chu Kang MRT', 'Yishun MRT']

mrt_coordinates = {}

for mrt in mrt_stations:
    try:
        response = requests.get(f"https://developers.onemap.sg/commonapi/search?searchVal={mrt}&returnGeom=Y&getAddrDetails=Y")
        response.raise_for_status()
        data = response.json()
        mrt_coordinates[f"{data['results'][0]['LATITUDE']},{data['results'][0]['LONGITUDE']}"] = mrt
    except HTTPError as http_err:
        print(f'HTTP error occurred: {http_err}')
    except Exception as err:
        print(f'Other error occurred: {err}')
        print(mrt, data)
pprint(mrt_coordinates)

Other error occurred: list index out of range
Gali Batu LRT {'found': 0, 'totalNumPages': 0, 'pageNum': 1, 'results': []}
Other error occurred: list index out of range
Mount Faber LRT {'found': 0, 'totalNumPages': 0, 'pageNum': 1, 'results': []}
Other error occurred: list index out of range
Tengah MRT {'found': 0, 'totalNumPages': 0, 'pageNum': 1, 'results': []}
Other error occurred: list index out of range
Tampines North LRT {'found': 0, 'totalNumPages': 0, 'pageNum': 1, 'results': []}
{'1.26538938374901,103.821530157095': 'HarbourFront MRT',
 '1.26977220124441,103.830031459524': 'Keppel MRT',
 '1.27070647717762,103.809761611219': 'Telok Blangah MRT',
 '1.27102703612006,103.862447515736': 'Marina South Pier MRT',
 '1.27225417749656,103.802631578766': 'Labrador Park MRT',
 '1.276410298755,103.854595522263': 'Marina Bay MRT',
 '1.27656131737246,103.845725186759': 'Tanjong Pagar MRT',
 '1.27848725876303,103.867454664421': 'Gardens by the Bay MRT',
 '1.27944638178916,103.852840829581': 'D