Use virtual environment: ors_env

# <font color = 'purple'>HDB Properties Preprocessing
HDB is the Housing & Development Board of Singapore - a government body which develops public housing. HDB properties comprise the vast majority of apartment buildings in Singapore, and they are much more affordable than private apartments. Hence, we will focus our apartment search on HDB properties.

The [HDB Property Information dataset on data.gov.sg](https://data.gov.sg/dataset/hdb-property-information) provides the address of HDB properties across Singapore, as well as other details about each property. In this notebook, we preprocess this dataset to facilitate further analysis in subsequent notebooks:   
- Extract properties with residential units for sale (HDB also owns non-residential properties, and rental apartments)
- Estimate number of residents in each apartment block
- Calculate remaining lease for each apartment block (new HDB flats are sold on a 99-year lease, after which ownership goes back to the government to facilitate redevelopment. The remaining lease is a key consideration when purchasing a resale flat)
- Geocoding (get longitude and latitude of each apartment block to facilitate geospatial analysis)
- Identify town which each apartment block is located in

In [1]:
import os
os.chdir(r"C:\Users\sharo\Documents\Postgrad\My Data Science Portfolio\Transforming the Way We Search For Flats")

In [2]:
import folium
import geopandas as gpd
import time
import pandas as pd
import numpy as np
import math
import requests
from tqdm import tqdm

## <font color = 'blue'> Import HDB Property Data

In [3]:
fp = r"HDB Property Info\hdb-property-information.csv"
hdb_properties = pd.read_csv(fp)
hdb_properties.head()

Unnamed: 0,blk_no,street,max_floor_lvl,year_completed,residential,commercial,market_hawker,miscellaneous,multistorey_carpark,precinct_pavilion,...,3room_sold,4room_sold,5room_sold,exec_sold,multigen_sold,studio_apartment_sold,1room_rental,2room_rental,3room_rental,other_room_rental
0,1,BEACH RD,16,1970,Y,Y,N,N,N,N,...,138,1,2,0,0,0,0,0,0,0
1,1,BEDOK STH AVE 1,14,1975,Y,N,N,Y,N,N,...,204,0,2,0,0,0,0,0,0,0
2,1,CANTONMENT RD,2,2010,N,Y,N,N,N,N,...,0,0,0,0,0,0,0,0,0,0
3,1,CHAI CHEE RD,15,1982,Y,N,N,N,N,N,...,0,10,92,0,0,0,0,0,0,0
4,1,CHANGI VILLAGE RD,4,1975,Y,Y,N,N,N,N,...,54,0,1,0,0,0,0,0,0,0


In [4]:
hdb_properties.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12380 entries, 0 to 12379
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   blk_no                 12380 non-null  object
 1   street                 12380 non-null  object
 2   max_floor_lvl          12380 non-null  int64 
 3   year_completed         12380 non-null  int64 
 4   residential            12380 non-null  object
 5   commercial             12380 non-null  object
 6   market_hawker          12380 non-null  object
 7   miscellaneous          12380 non-null  object
 8   multistorey_carpark    12380 non-null  object
 9   precinct_pavilion      12380 non-null  object
 10  bldg_contract_town     12380 non-null  object
 11  total_dwelling_units   12380 non-null  int64 
 12  1room_sold             12380 non-null  int64 
 13  2room_sold             12380 non-null  int64 
 14  3room_sold             12380 non-null  int64 
 15  4room_sold         

In [5]:
#check for duplicates
hdb_properties.duplicated(subset=['blk_no','street'],keep=False).values.any()

False

In [6]:
#drop columns we don't need
hdb_properties.drop(columns=['commercial', 'market_hawker', 'miscellaneous','precinct_pavilion'],inplace=True)
hdb_properties.head()

Unnamed: 0,blk_no,street,max_floor_lvl,year_completed,residential,multistorey_carpark,bldg_contract_town,total_dwelling_units,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,exec_sold,multigen_sold,studio_apartment_sold,1room_rental,2room_rental,3room_rental,other_room_rental
0,1,BEACH RD,16,1970,Y,N,KWN,142,0,1,138,1,2,0,0,0,0,0,0,0
1,1,BEDOK STH AVE 1,14,1975,Y,N,BD,206,0,0,204,0,2,0,0,0,0,0,0,0
2,1,CANTONMENT RD,2,2010,N,N,CT,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,CHAI CHEE RD,15,1982,Y,N,BD,102,0,0,0,10,92,0,0,0,0,0,0,0
4,1,CHANGI VILLAGE RD,4,1975,Y,N,PRC,55,0,0,54,0,1,0,0,0,0,0,0,0


## <font color = 'blue'> Extract Properties with Residential Sold Flats

In [7]:
#extract resi blocks
resi_blks = hdb_properties[hdb_properties['residential']=='Y'].drop(columns=['residential','multistorey_carpark'])

#calculate total number of sold and rental units in the block
resi_blks['n_sold'] = resi_blks.loc[:,['1room_sold', '2room_sold', '3room_sold', '4room_sold', '5room_sold',
       'exec_sold', 'multigen_sold', 'studio_apartment_sold']].sum(skipna=True,axis=1)

resi_blks['n_rental'] = resi_blks.loc[:,['1room_rental','2room_rental', '3room_rental', 
                                         'other_room_rental']].sum(skipna=True,axis=1)
#extract only blocks with sold units
resi_blks = resi_blks[resi_blks['n_sold']>0]
resi_blks.reset_index(drop=True,inplace=True)

resi_blks.head()

Unnamed: 0,blk_no,street,max_floor_lvl,year_completed,bldg_contract_town,total_dwelling_units,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,exec_sold,multigen_sold,studio_apartment_sold,1room_rental,2room_rental,3room_rental,other_room_rental,n_sold,n_rental
0,1,BEACH RD,16,1970,KWN,142,0,1,138,1,2,0,0,0,0,0,0,0,142,0
1,1,BEDOK STH AVE 1,14,1975,BD,206,0,0,204,0,2,0,0,0,0,0,0,0,206,0
2,1,CHAI CHEE RD,15,1982,BD,102,0,0,0,10,92,0,0,0,0,0,0,0,102,0
3,1,CHANGI VILLAGE RD,4,1975,PRC,55,0,0,54,0,1,0,0,0,0,0,0,0,55,0
4,1,DELTA AVE,25,1982,BM,96,0,0,0,0,96,0,0,0,0,0,0,0,96,0


## <font color = 'blue'> Estimated number of residents in each apartment block

In [8]:
# Mean household size by flat type - from HDB 2018 Sample Household Survey
household_size = pd.DataFrame.from_dict({'flat_type':['1room_sold', '2room_sold', '3room_sold', '4room_sold', '5room_sold','exec_sold'],
                                        'n_pax':[1.9,2.5,2.5,3.3,3.5,3.8]}).pivot_table(values=['n_pax'],columns=['flat_type'])

#add estimates for flat types not in SHS
household_size['multigen_sold'] = household_size['5room_sold']
household_size['studio_apartment_sold'] = np.mean([household_size['1room_sold'],household_size['2room_sold']]) #SHS report mentioned that 1-room includes 1-room SAs while 2-room includes 2-room SAs
household_size['1room_rental'] = household_size['1room_sold']
household_size['2room_rental'] = household_size['2room_sold']
household_size['3room_rental'] = household_size['3room_sold']
household_size['other_room_rental'] = 3.1 #overall mean household size from SHS

household_size = household_size.melt(value_vars=list(household_size.columns),var_name='flat_type',value_name='n_pax',ignore_index=True)

household_size

Unnamed: 0,flat_type,n_pax
0,1room_sold,1.9
1,2room_sold,2.5
2,3room_sold,2.5
3,4room_sold,3.3
4,5room_sold,3.5
5,exec_sold,3.8
6,multigen_sold,3.5
7,studio_apartment_sold,2.2
8,1room_rental,1.9
9,2room_rental,2.5


In [9]:
resi_blks_npax = resi_blks.loc[:,['blk_no', 'street','1room_sold', '2room_sold', '3room_sold', '4room_sold', '5room_sold',
       'exec_sold', 'multigen_sold', 'studio_apartment_sold', '1room_rental',
       '2room_rental', '3room_rental', 'other_room_rental']]

resi_blks_npax = resi_blks_npax.melt(id_vars=['blk_no','street'],
                                     value_vars=['1room_sold', '2room_sold', '3room_sold', '4room_sold', '5room_sold',
                                               'exec_sold', 'multigen_sold', 'studio_apartment_sold', '1room_rental',
                                               '2room_rental', '3room_rental', 'other_room_rental'],
                                     var_name='flat_type',value_name='n_units'
                                     ).sort_values(['blk_no','street'])

#merge with household size estimates
resi_blks_npax = resi_blks_npax.merge(household_size, how="left", on='flat_type')

#total number of residents by flat type
resi_blks_npax['n_residents'] = resi_blks_npax['n_units']*resi_blks_npax['n_pax']

resi_blks_npax.head(15)

Unnamed: 0,blk_no,street,flat_type,n_units,n_pax,n_residents
0,1,BEACH RD,1room_sold,0,1.9,0.0
1,1,BEACH RD,2room_sold,1,2.5,2.5
2,1,BEACH RD,3room_sold,138,2.5,345.0
3,1,BEACH RD,4room_sold,1,3.3,3.3
4,1,BEACH RD,5room_sold,2,3.5,7.0
5,1,BEACH RD,exec_sold,0,3.8,0.0
6,1,BEACH RD,multigen_sold,0,3.5,0.0
7,1,BEACH RD,studio_apartment_sold,0,2.2,0.0
8,1,BEACH RD,1room_rental,0,1.9,0.0
9,1,BEACH RD,2room_rental,0,2.5,0.0


In [10]:
#sum up for each block
resi_blks_npax2=resi_blks_npax.drop(columns = ['flat_type','n_units','n_pax']).groupby(['blk_no','street'],as_index=False).sum()
resi_blks_npax2.head()

Unnamed: 0,blk_no,street,n_residents
0,1,BEACH RD,357.8
1,1,BEDOK STH AVE 1,517.0
2,1,CHAI CHEE RD,355.0
3,1,CHANGI VILLAGE RD,138.5
4,1,DELTA AVE,336.0


In [11]:
resi_blks_npax2.isnull().values.any() #check for nulls

False

In [12]:
#merge back with resi_blks dataframe
resi_blks2 = resi_blks.merge(resi_blks_npax2, how='left', on=['blk_no','street'])

resi_blks2.head()

Unnamed: 0,blk_no,street,max_floor_lvl,year_completed,bldg_contract_town,total_dwelling_units,1room_sold,2room_sold,3room_sold,4room_sold,...,exec_sold,multigen_sold,studio_apartment_sold,1room_rental,2room_rental,3room_rental,other_room_rental,n_sold,n_rental,n_residents
0,1,BEACH RD,16,1970,KWN,142,0,1,138,1,...,0,0,0,0,0,0,0,142,0,357.8
1,1,BEDOK STH AVE 1,14,1975,BD,206,0,0,204,0,...,0,0,0,0,0,0,0,206,0,517.0
2,1,CHAI CHEE RD,15,1982,BD,102,0,0,0,10,...,0,0,0,0,0,0,0,102,0,355.0
3,1,CHANGI VILLAGE RD,4,1975,PRC,55,0,0,54,0,...,0,0,0,0,0,0,0,55,0,138.5
4,1,DELTA AVE,25,1982,BM,96,0,0,0,0,...,0,0,0,0,0,0,0,96,0,336.0


In [13]:
resi_blks2['n_residents'].isnull().values.any() #ensure all blocks matched

False

## <font color = 'blue'> Remaining lease for each apartment block

In [14]:
resi_blks3 = resi_blks2.copy()
resi_blks3['remaining_lease'] = 99-(2021-resi_blks3['year_completed'])
resi_blks3.drop(columns=['year_completed'],inplace=True)
resi_blks3.head()

Unnamed: 0,blk_no,street,max_floor_lvl,bldg_contract_town,total_dwelling_units,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,...,multigen_sold,studio_apartment_sold,1room_rental,2room_rental,3room_rental,other_room_rental,n_sold,n_rental,n_residents,remaining_lease
0,1,BEACH RD,16,KWN,142,0,1,138,1,2,...,0,0,0,0,0,0,142,0,357.8,48
1,1,BEDOK STH AVE 1,14,BD,206,0,0,204,0,2,...,0,0,0,0,0,0,206,0,517.0,53
2,1,CHAI CHEE RD,15,BD,102,0,0,0,10,92,...,0,0,0,0,0,0,102,0,355.0,60
3,1,CHANGI VILLAGE RD,4,PRC,55,0,0,54,0,1,...,0,0,0,0,0,0,55,0,138.5,53
4,1,DELTA AVE,25,BM,96,0,0,0,0,96,...,0,0,0,0,0,0,96,0,336.0,60


## <font color = 'blue'> Geocoding

In [15]:
#form the address to search for
resi_addresses = resi_blks3.loc[:,['blk_no','street']]

resi_addresses['addr'] = resi_addresses['blk_no']+" "+resi_addresses['street']

resi_addresses.head()

Unnamed: 0,blk_no,street,addr
0,1,BEACH RD,1 BEACH RD
1,1,BEDOK STH AVE 1,1 BEDOK STH AVE 1
2,1,CHAI CHEE RD,1 CHAI CHEE RD
3,1,CHANGI VILLAGE RD,1 CHANGI VILLAGE RD
4,1,DELTA AVE,1 DELTA AVE


In [16]:
#function to get coordinates using onemap API
def getcoordinates(address):
    req = requests.get('https://developers.onemap.sg/commonapi/search?searchVal='+address+'&returnGeom=Y&getAddrDetails=Y&pageNum=1')
    resultsdict = eval(req.text) #evaluate as python expression
    #keep only those with results
    if len(resultsdict['results'])>0:
        return resultsdict['results'][0]['LATITUDE'], resultsdict['results'][0]['LONGITUDE'], resultsdict['results'][0]['POSTAL'] #i believe the '[0]' returns the top entry
    else:
        pass

**Test on first few properties**

In [17]:
test_addresses = resi_addresses.head(5)
test_addresses

Unnamed: 0,blk_no,street,addr
0,1,BEACH RD,1 BEACH RD
1,1,BEDOK STH AVE 1,1 BEDOK STH AVE 1
2,1,CHAI CHEE RD,1 CHAI CHEE RD
3,1,CHANGI VILLAGE RD,1 CHANGI VILLAGE RD
4,1,DELTA AVE,1 DELTA AVE


In [18]:
coordinateslist= []
failed_count = 0
addresslist=test_addresses['addr']
print(f"Total no. of addresses: {len(addresslist)}")
for address in tqdm(addresslist, desc="Geocoding..."):
    try: #try executing the following
        coordinates = getcoordinates(address)
        if len(coordinates)>0:
            coordinateslist.append(coordinates)
            
    except: #if you receive an error, do this          
        failed_count += 1
        print(f"No. of addresses with no coordinates (cummulative): {failed_count}")
        coordinateslist.append(None)

Total no. of addresses: 5


Geocoding...: 100%|██████████████████████████████████████████████████████████████████████| 5/5 [05:21<00:00, 64.32s/it]


**Execute for full dataset**

In [None]:
coordinateslist= []
failed_count = 0
addresslist = resi_addresses['addr']
print(f"Total no. of addresses: {len(addresslist)}")
for address in tqdm(addresslist, desc="Geocoding..."):
    try: #try executing the following
        coordinates = getcoordinates(address)
        if len(coordinates)>0:
            coordinateslist.append(coordinates)
            
    except: #if you receive an error, do this          
        failed_count += 1
        print(f"No. of addresses with no coordinates (cummulative): {failed_count}")
        coordinateslist.append(None)

In [None]:
df_coordinates = pd.DataFrame(coordinateslist)
df_combined = resi_addresses.join(df_coordinates)
df_combined  = df_combined .rename(columns={0:'Latitude', 1:'Longitude', 2:'Postcode'}).drop(columns = ['addr'])

In [22]:
df_combined.head()

Unnamed: 0,blk_no,street,Latitude,Longitude,Postcode
0,1,BEACH RD,1.303671,103.864479,190001
1,1,BEDOK STH AVE 1,1.320852,103.933721,460001
2,1,CHAI CHEE RD,1.327969,103.922716,461001
3,1,CHANGI VILLAGE RD,1.38861,103.988093,500001
4,1,DELTA AVE,1.292075,103.828584,160001


In [24]:
#merge with resi_blks
resi_blks4 = resi_blks3.merge(df_combined,how="left",on=['blk_no','street'])
resi_blks4.head()

Unnamed: 0,blk_no,street,max_floor_lvl,bldg_contract_town,total_dwelling_units,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,...,2room_rental,3room_rental,other_room_rental,n_sold,n_rental,n_residents,remaining_lease,Latitude,Longitude,Postcode
0,1,BEACH RD,16,KWN,142,0,1,138,1,2,...,0,0,0,142,0,357.8,48,1.303671,103.864479,190001
1,1,BEDOK STH AVE 1,14,BD,206,0,0,204,0,2,...,0,0,0,206,0,517.0,53,1.320852,103.933721,460001
2,1,CHAI CHEE RD,15,BD,102,0,0,0,10,92,...,0,0,0,102,0,355.0,60,1.327969,103.922716,461001
3,1,CHANGI VILLAGE RD,4,PRC,55,0,0,54,0,1,...,0,0,0,55,0,138.5,53,1.38861,103.988093,500001
4,1,DELTA AVE,25,BM,96,0,0,0,0,96,...,0,0,0,96,0,336.0,60,1.292075,103.828584,160001


### Fix Issues
**Dealing with nulls** - where geocoding was unsuccessful

In [25]:
#Extract rows where geocoding results (i.e. Lat, Long, PC) are null
resi_blks4_nulls = resi_blks4[(resi_blks4.Latitude.isnull()) | (resi_blks4.Longitude.isnull()) | 
                              (resi_blks4.Postcode.isnull()) | (resi_blks4.Postcode=="NIL")]
resi_blks4_nulls

Unnamed: 0,blk_no,street,max_floor_lvl,bldg_contract_town,total_dwelling_units,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,...,2room_rental,3room_rental,other_room_rental,n_sold,n_rental,n_residents,remaining_lease,Latitude,Longitude,Postcode
18,1,ST. GEORGE'S RD,12,KWN,154,0,0,0,154,0,...,0,0,0,154,0,508.2,53,,,
44,10,ST. GEORGE'S RD,4,KWN,12,0,0,0,12,0,...,0,0,0,12,0,39.6,62,,,
374,11,ST. GEORGE'S RD,12,KWN,114,0,0,0,114,0,...,0,0,0,114,0,376.2,62,,,
685,12,ST. GEORGE'S RD,4,KWN,10,0,0,0,10,0,...,0,0,0,10,0,33.0,62,,,
955,13,ST. GEORGE'S RD,25,KWN,96,0,0,0,0,96,...,0,0,0,96,0,336.0,62,,,
1159,14,ST. GEORGE'S RD,12,KWN,150,0,0,123,22,2,...,0,1,0,149,1,397.2,62,,,
1329,15,ST. GEORGE'S RD,12,KWN,88,0,0,0,88,0,...,0,0,0,88,0,290.4,62,,,
1492,16,ST. GEORGE'S RD,4,KWN,24,0,0,8,16,0,...,0,0,0,24,0,72.8,62,,,
1657,17,ST. GEORGE'S RD,4,KWN,16,0,0,0,16,0,...,0,0,0,16,0,52.8,62,,,
1791,18,ST. GEORGE'S RD,12,KWN,88,0,0,0,88,0,...,0,0,0,88,0,290.4,62,,,


In [26]:
#noticed that almost all the null entries were at St. George's Road. On OneMap website, Saint is spelt out in full. 
resi_blks4_nulls['street'] = resi_blks4_nulls['street'].str.replace(pat="ST.", repl="SAINT",case=False,regex=False)

#For the remaining blocks, CENTRAL is spelled as CTRL
resi_blks4_nulls['street'] = resi_blks4_nulls['street'].str.replace(pat="CTRL", repl="CENTRAL",case=False,regex=False)

resi_blks4_nulls

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resi_blks4_nulls['street'] = resi_blks4_nulls['street'].str.replace(pat="ST.", repl="SAINT",case=False,regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  resi_blks4_nulls['street'] = resi_blks4_nulls['street'].str.replace(pat="CTRL", repl="CENTRAL",case=False,regex=False)


Unnamed: 0,blk_no,street,max_floor_lvl,bldg_contract_town,total_dwelling_units,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,...,2room_rental,3room_rental,other_room_rental,n_sold,n_rental,n_residents,remaining_lease,Latitude,Longitude,Postcode
18,1,SAINT GEORGE'S RD,12,KWN,154,0,0,0,154,0,...,0,0,0,154,0,508.2,53,,,
44,10,SAINT GEORGE'S RD,4,KWN,12,0,0,0,12,0,...,0,0,0,12,0,39.6,62,,,
374,11,SAINT GEORGE'S RD,12,KWN,114,0,0,0,114,0,...,0,0,0,114,0,376.2,62,,,
685,12,SAINT GEORGE'S RD,4,KWN,10,0,0,0,10,0,...,0,0,0,10,0,33.0,62,,,
955,13,SAINT GEORGE'S RD,25,KWN,96,0,0,0,0,96,...,0,0,0,96,0,336.0,62,,,
1159,14,SAINT GEORGE'S RD,12,KWN,150,0,0,123,22,2,...,0,1,0,149,1,397.2,62,,,
1329,15,SAINT GEORGE'S RD,12,KWN,88,0,0,0,88,0,...,0,0,0,88,0,290.4,62,,,
1492,16,SAINT GEORGE'S RD,4,KWN,24,0,0,8,16,0,...,0,0,0,24,0,72.8,62,,,
1657,17,SAINT GEORGE'S RD,4,KWN,16,0,0,0,16,0,...,0,0,0,16,0,52.8,62,,,
1791,18,SAINT GEORGE'S RD,12,KWN,88,0,0,0,88,0,...,0,0,0,88,0,290.4,62,,,


In [27]:
#repeat geocoding

#form the address to search for
new_addresses = resi_blks4_nulls.loc[:,['blk_no','street']]
new_addresses['addr'] = new_addresses['blk_no']+" "+new_addresses['street']

coordinateslist= []
failed_count = 0
addresslist = new_addresses['addr']
print(f"Total no. of addresses: {len(addresslist)}")
for address in tqdm(addresslist, desc="Geocoding..."):
    try: #try executing the following
        coordinates = getcoordinates(address)
        if len(coordinates)>0:
            coordinateslist.append(coordinates)
            
    except: #if you receive an error, do this          
        failed_count += 1
        print(f"No. of addresses with no coordinates (cummulative): {failed_count}")
        coordinateslist.append(None)

Total no. of addresses: 28


Geocoding...: 100%|████████████████████████████████████████████████████████████████████| 28/28 [29:54<00:00, 64.07s/it]


Replace these entries in resi_blks4 with new geocoding results

In [28]:
#Match coordinates with address
df_newcoordinates = pd.DataFrame(coordinateslist)
df_newcoordinates = new_addresses.reset_index(drop=True).join(df_newcoordinates)
df_newcoordinates  = df_newcoordinates.rename(columns={0:'Latitude', 1:'Longitude', 2:'Postcode'}).drop(columns = ['addr'])
df_newcoordinates.head()

Unnamed: 0,blk_no,street,Latitude,Longitude,Postcode
0,1,SAINT GEORGE'S RD,1.32340263694069,103.86163080083,320001
1,10,SAINT GEORGE'S RD,1.32374648815059,103.863412150847,320010
2,11,SAINT GEORGE'S RD,1.32292326712558,103.863230452849,320011
3,12,SAINT GEORGE'S RD,1.32293079450577,103.862889951139,320012
4,13,SAINT GEORGE'S RD,1.32323455697139,103.862732454087,320013


In [29]:
#remove the original lat long and pc columns
resi_blks4_fixednulls = resi_blks4_nulls.drop(columns=['Latitude','Longitude','Postcode'])

#add new ones
resi_blks4_fixednulls = resi_blks4_fixednulls.merge(df_newcoordinates,how="left",on=['blk_no','street'])

resi_blks4_fixednulls.head()

Unnamed: 0,blk_no,street,max_floor_lvl,bldg_contract_town,total_dwelling_units,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,...,2room_rental,3room_rental,other_room_rental,n_sold,n_rental,n_residents,remaining_lease,Latitude,Longitude,Postcode
0,1,SAINT GEORGE'S RD,12,KWN,154,0,0,0,154,0,...,0,0,0,154,0,508.2,53,1.32340263694069,103.86163080083,320001
1,10,SAINT GEORGE'S RD,4,KWN,12,0,0,0,12,0,...,0,0,0,12,0,39.6,62,1.32374648815059,103.863412150847,320010
2,11,SAINT GEORGE'S RD,12,KWN,114,0,0,0,114,0,...,0,0,0,114,0,376.2,62,1.32292326712558,103.863230452849,320011
3,12,SAINT GEORGE'S RD,4,KWN,10,0,0,0,10,0,...,0,0,0,10,0,33.0,62,1.32293079450577,103.862889951139,320012
4,13,SAINT GEORGE'S RD,25,KWN,96,0,0,0,0,96,...,0,0,0,96,0,336.0,62,1.32323455697139,103.862732454087,320013


In [30]:
resi_blks4_fixednulls.isnull().values.any()

False

In [31]:
#extract the rows that originally did not have nulls
resi_blks4_notnulls = resi_blks4[(resi_blks4.Latitude.notnull()) & (resi_blks4.Longitude.notnull()) & 
                                 (resi_blks4.Postcode.notnull()) & (resi_blks4.Postcode!="NIL")]

#append to this those rows with fixednulls
resi_blks5 = resi_blks4_notnulls.append(resi_blks4_fixednulls, ignore_index=True)

resi_blks5.head()

Unnamed: 0,blk_no,street,max_floor_lvl,bldg_contract_town,total_dwelling_units,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,...,2room_rental,3room_rental,other_room_rental,n_sold,n_rental,n_residents,remaining_lease,Latitude,Longitude,Postcode
0,1,BEACH RD,16,KWN,142,0,1,138,1,2,...,0,0,0,142,0,357.8,48,1.303671,103.864479,190001
1,1,BEDOK STH AVE 1,14,BD,206,0,0,204,0,2,...,0,0,0,206,0,517.0,53,1.320852,103.933721,460001
2,1,CHAI CHEE RD,15,BD,102,0,0,0,10,92,...,0,0,0,102,0,355.0,60,1.327969,103.922716,461001
3,1,CHANGI VILLAGE RD,4,PRC,55,0,0,54,0,1,...,0,0,0,55,0,138.5,53,1.38861,103.988093,500001
4,1,DELTA AVE,25,BM,96,0,0,0,0,96,...,0,0,0,96,0,336.0,60,1.292075,103.828584,160001


In [32]:
resi_blks5.isnull().values.any() #ensure no more nulls!

False

In [33]:
resi_blks5[resi_blks5['Postcode']=='NIL']

Unnamed: 0,blk_no,street,max_floor_lvl,bldg_contract_town,total_dwelling_units,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,...,2room_rental,3room_rental,other_room_rental,n_sold,n_rental,n_residents,remaining_lease,Latitude,Longitude,Postcode
9858,215,CHOA CHU KANG CENTRAL,11,CCK,90,0,0,0,90,0,...,0,0,0,90,0,297.0,67,1.38308302434129,103.747076627693,NIL
9859,216,CHOA CHU KANG CENTRAL,4,CCK,16,0,0,0,16,0,...,0,0,0,16,0,52.8,67,1.38308302434129,103.747076627693,NIL


In [43]:
#no choice but to key these in manually:
resi_blks6 = resi_blks5.copy()
resi_blks6.loc[9858,'Postcode'] = '680215'
resi_blks6.loc[9859,'Postcode'] = '680216'

resi_blks6[resi_blks6['street']=="CHOA CHU KANG CENTRAL"]

Unnamed: 0,blk_no,street,max_floor_lvl,bldg_contract_town,total_dwelling_units,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,...,2room_rental,3room_rental,other_room_rental,n_sold,n_rental,n_residents,remaining_lease,Latitude,Longitude,Postcode
9858,215,CHOA CHU KANG CENTRAL,11,CCK,90,0,0,0,90,0,...,0,0,0,90,0,297.0,67,1.38308302434129,103.747076627693,680215
9859,216,CHOA CHU KANG CENTRAL,4,CCK,16,0,0,0,16,0,...,0,0,0,16,0,52.8,67,1.38308302434129,103.747076627693,680216


**Check for duplicates**

In [44]:
duplicates = resi_blks6[resi_blks6.duplicated(subset=['Latitude','Longitude','Postcode'],keep=False)]
duplicates

Unnamed: 0,blk_no,street,max_floor_lvl,bldg_contract_town,total_dwelling_units,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,...,2room_rental,3room_rental,other_room_rental,n_sold,n_rental,n_residents,remaining_lease,Latitude,Longitude,Postcode
2252,21,HOUGANG AVE 3,12,HG,130,0,0,106,22,1,...,0,0,0,130,0,344.9,55,1.359257,103.887676,530211
2284,211,HOUGANG ST 21,4,HG,34,0,0,34,0,0,...,0,0,0,34,0,85.0,62,1.359257,103.887676,530211


In [46]:
#fix this manually
resi_blks6.loc[2252,'Postcode'] = '530021'
resi_blks6.loc[2252,'Latitude'] = '1.364246'
resi_blks6.loc[2252,'Longitude'] = '103.891478'
resi_blks6[resi_blks6['street']=="HOUGANG AVE 3"]

Unnamed: 0,blk_no,street,max_floor_lvl,bldg_contract_town,total_dwelling_units,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,...,2room_rental,3room_rental,other_room_rental,n_sold,n_rental,n_residents,remaining_lease,Latitude,Longitude,Postcode
9,1,HOUGANG AVE 3,14,HG,204,0,0,200,0,4,...,0,0,0,204,0,514.0,52,1.364131,103.893001,530001
1319,15,HOUGANG AVE 3,16,HG,180,0,0,0,180,0,...,0,0,0,180,0,594.0,53,1.362658,103.891922,530015
1480,16,HOUGANG AVE 3,20,HG,76,0,0,0,0,76,...,0,0,0,76,0,266.0,53,1.362822,103.891047,530016
1642,17,HOUGANG AVE 3,12,HG,132,0,0,110,22,0,...,0,0,0,132,0,347.6,55,1.363434,103.890978,530017
1776,18,HOUGANG AVE 3,12,HG,132,0,0,0,132,0,...,0,0,0,132,0,435.6,55,1.363217,103.89193,530018
1926,19,HOUGANG AVE 3,12,HG,132,0,0,110,22,0,...,0,0,0,132,0,347.6,55,1.36379,103.891813,530019
2015,2,HOUGANG AVE 3,13,HG,236,0,0,231,0,4,...,0,1,0,235,1,594.0,52,1.363137,103.893185,530002
2044,20,HOUGANG AVE 3,12,HG,132,0,0,110,22,0,...,0,0,0,132,0,347.6,55,1.363832,103.890738,530020
2252,21,HOUGANG AVE 3,12,HG,130,0,0,106,22,1,...,0,0,0,130,0,344.9,55,1.364246,103.891478,530021
2446,22,HOUGANG AVE 3,12,HG,132,0,0,110,22,0,...,0,0,0,132,0,347.6,55,1.364285,103.890468,530022


In [37]:
resi_blks6[resi_blks6.duplicated(subset=['Latitude','Longitude','Postcode'],keep=False)]

Unnamed: 0,blk_no,street,max_floor_lvl,bldg_contract_town,total_dwelling_units,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,...,2room_rental,3room_rental,other_room_rental,n_sold,n_rental,n_residents,remaining_lease,Latitude,Longitude,Postcode


## <font color = 'blue'> Identify Town

In [49]:
# import lookup table
fp = r"HDB Property Info\hdb town legend.csv"
town_legend = pd.read_csv(fp,header=None).rename(columns={0:'legend'})
town_legend = town_legend['legend'].str.split(" - ",expand=True).rename(columns={0:'bldg_contract_town',1:'hdb_town'})

town_legend.head()

Unnamed: 0,bldg_contract_town,hdb_town
0,AMK,ANG MO KIO
1,BB,BUKIT BATOK
2,BD,BEDOK
3,BH,BISHAN
4,BM,BUKIT MERAH


In [50]:
# merge with main dataframe
resi_blks7 = resi_blks6.merge(town_legend,how='left',on='bldg_contract_town')
resi_blks7.head()

Unnamed: 0,blk_no,street,max_floor_lvl,bldg_contract_town,total_dwelling_units,1room_sold,2room_sold,3room_sold,4room_sold,5room_sold,...,3room_rental,other_room_rental,n_sold,n_rental,n_residents,remaining_lease,Latitude,Longitude,Postcode,hdb_town
0,1,BEACH RD,16,KWN,142,0,1,138,1,2,...,0,0,142,0,357.8,48,1.303671,103.864479,190001,KALLANG/WHAMPOA
1,1,BEDOK STH AVE 1,14,BD,206,0,0,204,0,2,...,0,0,206,0,517.0,53,1.320852,103.933721,460001,BEDOK
2,1,CHAI CHEE RD,15,BD,102,0,0,0,10,92,...,0,0,102,0,355.0,60,1.327969,103.922716,461001,BEDOK
3,1,CHANGI VILLAGE RD,4,PRC,55,0,0,54,0,1,...,0,0,55,0,138.5,53,1.38861,103.988093,500001,PASIR RIS
4,1,DELTA AVE,25,BM,96,0,0,0,0,96,...,0,0,96,0,336.0,60,1.292075,103.828584,160001,BUKIT MERAH


In [51]:
resi_blks7['hdb_town'].isnull().values.any() #ensure all matched

False

## <font color = 'blue'> Save

In [54]:
fp=r"Clean Datasets\resi_blks2.csv"
print('Check Filepath To Avoid Accidental Overwrite')
confirm = input("Proceed to Save File? [Y/N]: ")
if confirm =='Y':
    resi_blks7.to_csv(fp,index=False)
    print('Saved!')
else:
    print('Not Saved.')

Check Filepath To Avoid Accidental Overwrite
Proceed to Save File? [Y/N]: Y
Saved!
