# Get Raw data
> This notebook collects raw data on the factors that people typically consider when buying an HDB flat.

In [None]:
import pandas as pd
import requests
import os
from datetime import datetime
import time

In [None]:
# Today's date as DD_MM_YYYY; used to stamp the resale-price output file name
date_today = f"{datetime.today():%d_%m_%Y}"

# Resale flat prices
> Source: https://data.gov.sg/dataset/resale-flat-prices

In [None]:
# data.gov.sg datastore search endpoint; every data.gov.sg request in this
# notebook sends a `resource_id` and a `limit` to this URL
data_gov_url = 'https://data.gov.sg' + '/api/action/datastore_search'

In [None]:
# The resale price history is split across several data.gov.sg resources,
# one per period; they are listed here in chronological order.
resale_flat_prices_resource_ids = [
    # 2000 - Feb 2012
    '8c00bf08-9124-479e-aeca-7cc411d884c4',
    # Mar 2012 - Dec 2014
    '83b2fc37-ce8c-4df4-968b-370fd818138b',
    # Jan 2015 - Dec 2016
    '1b702208-44bf-4829-b620-4615ee19b57c',
    # Jan 2017 - Today
    '42ff9cfe-abe5-4b54-beda-c88f9bb438ee',
]

In [None]:
# Download every resale-price resource from data.gov.sg and stack the
# results into a single DataFrame.
results = []

for resource_id in resale_flat_prices_resource_ids:

    params_data = {
        'resource_id': resource_id,
        # Very large limit; presumably meant to fetch the whole resource in
        # one response instead of paging
        'limit': 10000000,
    }

    # (connect, read) timeout so a stalled connection cannot hang the notebook
    res = requests.get(data_gov_url, params=params_data, timeout=(10, 300))
    # Surface HTTP errors here rather than as a confusing KeyError/JSON
    # decoding error on the next line
    res.raise_for_status()
    records = res.json()['result']['records']

    df = pd.DataFrame(records)
    results.append(df)

# ignore_index gives the combined frame one continuous index instead of
# repeating each resource's own row labels
resale_flat_prices_df = pd.concat(results, ignore_index=True)

In [None]:
# Save the resale prices; the file name carries the download date
file_path = os.path.join('..', 'Raw Data', f'resale flat prices ({date_today}).csv')
# to_csv does not create missing folders, so make sure the output directory exists
os.makedirs(os.path.dirname(file_path), exist_ok=True)
resale_flat_prices_df.to_csv(file_path, index=False)

# HDB Property information
> Source: https://data.gov.sg/dataset/hdb-property-information

In [None]:
# Download the HDB property information resource from data.gov.sg
params_data = {
    'resource_id': '482bfa14-2977-4035-9c61-c85f871daf4e',
    # Very large limit; presumably meant to fetch the whole resource in one response
    'limit': 10000000,
}

# (connect, read) timeout so a stalled connection cannot hang the notebook
res = requests.get(data_gov_url, params=params_data, timeout=(10, 300))
# Surface HTTP errors here rather than as a confusing KeyError/JSON error below
res.raise_for_status()
records = res.json()['result']['records']

hdb_property_info_df = pd.DataFrame(records)

In [None]:
# Save HDB property information
file_path = os.path.join('..', 'Raw Data', 'HDB property information.csv')
# to_csv does not create missing folders, so make sure the output directory exists
os.makedirs(os.path.dirname(file_path), exist_ok=True)
hdb_property_info_df.to_csv(file_path, index=False)

# MRT & Bus information
> Sources:
> - https://github.com/hxchua/datadoubleconfirm/blob/master/datasets/mrtsg.csv
> - https://datamall.lta.gov.sg/content/datamall/en/dynamic-data.html

In [None]:
# Get MRT information
MRT_url  = 'https://raw.githubusercontent.com/hxchua/datadoubleconfirm/master/datasets/mrtsg.csv'
mrt_df = pd.read_csv(MRT_url)

In [None]:
# Save MRT information
file_path = os.path.join('..', 'Raw Data', 'MRT information.csv')
mrt_df.to_csv(file_path, index=False)

In [None]:
# Get Bus stops information

# Prepare API key to be sent through headers
# The LTA API key is read from the LTA_API_KEY environment variable
# (a KeyError is raised here if it is not set) ...
LTA_API_KEY = os.environ["LTA_API_KEY"]

# ... and sent with each bus data request as the 'AccountKey' header
headers_data = dict(AccountKey=LTA_API_KEY)

In [None]:
# Download all bus stops, one page per request, collecting each page as a DataFrame
bus_stop_dfs = []

bus_stops_url = 'http://datamall2.mytransport.sg/ltaodataservice/BusStops'
params_data = {'$skip': 0}

# The API can only return 500 records at a time
page_size = 500

while True:
    # Timeout so a stalled connection cannot hang the loop forever
    response = requests.get(bus_stops_url, headers=headers_data, params=params_data, timeout=30)
    # Stop on HTTP errors instead of failing with a KeyError on 'value' below
    response.raise_for_status()
    bus_stop_df = pd.DataFrame(response.json()['value'])
    bus_stop_dfs.append(bus_stop_df)

    # A page with fewer records than the page size means all data has been collected
    if len(bus_stop_df) < page_size:
        break

    # Get data for the next page of records
    params_data['$skip'] += page_size

    # Pause between requests: DataMall keeps track of API usage
    time.sleep(1)

In [None]:
# Save bus stop information
# Combine the downloaded pages; ignore_index gives one continuous index
# instead of repeating each page's own row labels
bus_stops_df = pd.concat(bus_stop_dfs, ignore_index=True)

file_path = os.path.join('..', 'Raw Data', 'Bus Stops information.csv')
# to_csv does not create missing folders, so make sure the output directory exists
os.makedirs(os.path.dirname(file_path), exist_ok=True)
bus_stops_df.to_csv(file_path, index=False)

In [None]:
# Download all bus service records, one page per request, collecting each page as a DataFrame
bus_services_dfs = []

# NOTE(review): this queries the BusRoutes endpoint although the variables and
# the output file are named "bus services" - confirm this is the intended dataset
bus_services_url = 'http://datamall2.mytransport.sg/ltaodataservice/BusRoutes'
params_data = {'$skip': 0}

# The API can only return 500 records at a time
page_size = 500

while True:
    # Timeout so a stalled connection cannot hang the loop forever
    response = requests.get(bus_services_url, headers=headers_data, params=params_data, timeout=30)
    # Stop on HTTP errors instead of failing with a KeyError on 'value' below
    response.raise_for_status()
    bus_services_df = pd.DataFrame(response.json()['value'])
    bus_services_dfs.append(bus_services_df)

    # A page with fewer records than the page size means all data has been collected
    if len(bus_services_df) < page_size:
        break

    # Get data for the next page of records
    params_data['$skip'] += page_size

    # Pause between requests: DataMall keeps track of API usage
    time.sleep(1)

In [None]:
# Save bus services information
# Combine the downloaded pages (this rebinds bus_services_df from the last
# page to the full table); ignore_index gives one continuous index
bus_services_df = pd.concat(bus_services_dfs, ignore_index=True)

file_path = os.path.join('..', 'Raw Data', 'Bus Services information.csv')
# to_csv does not create missing folders, so make sure the output directory exists
os.makedirs(os.path.dirname(file_path), exist_ok=True)
bus_services_df.to_csv(file_path, index=False)

# School information
> Source: https://data.gov.sg/dataset/school-directory-and-information

In [None]:
# Download the school directory resource from data.gov.sg
params_data = {
    'resource_id': 'ede26d32-01af-4228-b1ed-f05c45a1d8ee',
    # Very large limit; presumably meant to fetch the whole resource in one response
    'limit': 10000000,
}
# (connect, read) timeout so a stalled connection cannot hang the notebook
res = requests.get(data_gov_url, params=params_data, timeout=(10, 300))
# Surface HTTP errors here rather than as a confusing KeyError/JSON error below
res.raise_for_status()
records = res.json()['result']['records']

school_info_df = pd.DataFrame(records)

In [None]:
# Save school information
file_path = os.path.join('..', 'Raw Data', 'Schools information.csv')
# to_csv does not create missing folders, so make sure the output directory exists
os.makedirs(os.path.dirname(file_path), exist_ok=True)
school_info_df.to_csv(file_path, index=False)

# Supermarket information
> Source: https://data.gov.sg/dataset/listing-of-licensed-supermarkets

In [None]:
# Download the licensed supermarkets resource from data.gov.sg
params_data = {
    'resource_id': 'df586152-d00f-4b15-b667-9e268f1b60df',
    # Very large limit; presumably meant to fetch the whole resource in one response
    'limit': 10000000,
}
# (connect, read) timeout so a stalled connection cannot hang the notebook
res = requests.get(data_gov_url, params=params_data, timeout=(10, 300))
# Surface HTTP errors here rather than as a confusing KeyError/JSON error below
res.raise_for_status()
records = res.json()['result']['records']

supermarket_info_df = pd.DataFrame(records)

In [None]:
# Save supermarket information
file_path = os.path.join('..', 'Raw Data', 'supermarkets information.csv')
# to_csv does not create missing folders, so make sure the output directory exists
os.makedirs(os.path.dirname(file_path), exist_ok=True)
supermarket_info_df.to_csv(file_path, index=False)

# Parks information
> Source: https://data.gov.sg/dataset/parkssg?resource_id=42662ff5-321c-423e-a0fc-1282bf32e2b5

<br>
Note: This dataset was downloaded manually. As far as I can tell, it cannot be retrieved through an API. (If you are reading this and know otherwise, please let me know! I would love to collect this data automatically.)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=8affb79e-2cc6-4186-86e6-a88dc9ffb80e' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>