# Distance to Amenities Using GeoPy


Most people who purchase houses in Singapore want them to be located near amenities such as MRT (Mass Rapid Transit) stations, malls and/or schools. Since Singapore is a financial hub, some might even want to live near the Central Business District.

We have utilized the package [GeoPy](https://geopy.readthedocs.io/en/stable/) to calculate the distance of a property from various amenities.

<div class="alert alert-block alert-info">
<b>Note:</b> Each cell prints the time it takes, because computing the distances is slow. This notebook was created separately so that it only needs to be run once, on the clean data set produced by the EDA notebook (-EDA.ipynb). </div>

In [1]:
#Import libraries
import pandas as pd
import numpy as np 
import warnings
warnings.filterwarnings(action='ignore')
from geopy import geocoders, distance
from geopy.distance import geodesic
import requests
import json
from datetime import datetime

In [2]:
# Load the cleaned train/test property listings.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("clean_train_df.csv", "clean_test_df.csv")
)

# Load the auxiliary amenity tables used for the distance features below.
commercial_df, mrt_df, primary_school_df, secondary_school_df, shopmall_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "auxiliary-data/sg-commerical-centres.csv",
        "auxiliary-data/sg-mrt-stations.csv",
        "auxiliary-data/sg-primary-schools.csv",
        "auxiliary-data/sg-secondary-schools.csv",
        "auxiliary-data/sg-shopping-malls.csv",
    )
)

# Quick sanity check on the number of rows/columns that were read.
print(train_df.shape, test_df.shape)

(20145, 11) (6966, 10)


## Distance from Changi Airport

In [3]:
# Calculate distance from the Changi airport

# Look up the coordinates of Changi Airport with the Nominatim geocoder.
changi = geocoders.Nominatim(user_agent = 'kaggle_ds').geocode('Changi Airport')
# geocode() returns None when nothing matches; stop with a clear message
# instead of an AttributeError on the attribute access below.
if changi is None:
    raise ValueError("Nominatim returned no result for 'Changi Airport'")
changi_latlon = (changi.latitude, changi.longitude)

start_time = datetime.now()
print("Start Time: {}" .format(start_time))

# (lat, lng) pair for every listing. Read column-wise instead of through
# iterrows(), which builds a full Series per row just to pick two values.
# These lists are reused by the CBD distance cell further down.
flats_latlon_train = list(zip(train_df['lat'], train_df['lng']))
flats_latlon_test = list(zip(test_df['lat'], test_df['lng']))

# Distance in km from the airport to each listing, rounded to 2 decimals.
train_df['dist_from_changi_airport'] = np.round(
    [distance.distance(changi_latlon, x).km for x in flats_latlon_train], 2
)
test_df['dist_from_changi_airport'] = np.round(
    [distance.distance(changi_latlon, x).km for x in flats_latlon_test], 2
)

end_time = datetime.now()
print("End Time: {}" .format(end_time))
# '\033[1m' / '\033[0m' switch bold on and off in the printed output.
print('Total time taken:' + '\033[1m' + str(end_time - start_time) + '\033[0m')

Start Time: 2022-11-06 07:40:33.044581
End Time: 2022-11-06 07:40:39.312933
Total time taken:[1m0:00:06.268352[0m


In [4]:
print(changi)

Changi Airport, 70, T1 Boulevard, Changi, Singapore, Southeast, 819661, Singapore


## Distance from the Central Business District (Raffles Place MRT)

In [5]:
# List the MRT stations whose name contains 'aff' to locate the Raffles Place rows
mrt_df[mrt_df['name'].str.contains('aff')]

Unnamed: 0,code,line,name,opening_year,lat,lng,subzone,planning_area
79,ew14,ew,raffles place,1987,1.283933,103.851463,raffles place,downtown core
139,ns26,ns,raffles place,1987,1.283933,103.851463,raffles place,downtown core


In [6]:
# Longitude of the first 'raffles place' row in mrt_df
mrt_df[mrt_df['name']=='raffles place'].lng.values[0]

103.851463066212

In [7]:
# Get the location of Raffles Place MRT, used here as the reference point for the CBD.
# The station is listed once per line with the same coordinates, so the first
# matching row is used. The filter is evaluated once and shared by lat and lng.
raffles_rows = mrt_df[mrt_df['name']=='raffles place']
rp_lat = raffles_rows.lat.values[0]
rp_lng = raffles_rows.lng.values[0]
raffles_latlon = (rp_lat, rp_lng)

start_time = datetime.now()
print("Start Time: {}" .format(start_time))

# flats_latlon_train / flats_latlon_test are the (lat, lng) lists built in the
# Changi Airport cell; that cell has to be run before this one.
train_df['dist_from_cbd'] = np.round(
    [distance.distance(raffles_latlon, x).km for x in flats_latlon_train], 2
)
test_df['dist_from_cbd'] = np.round(
    [distance.distance(raffles_latlon, x).km for x in flats_latlon_test], 2
)

end_time = datetime.now()
print("End Time: {}" .format(end_time))
# '\033[1m' / '\033[0m' switch bold on and off in the printed output.
print('Total time taken:' + '\033[1m' + str(end_time - start_time) + '\033[0m')

Start Time: 2022-11-06 07:40:39.476929
End Time: 2022-11-06 07:40:44.784040
Total time taken:[1m0:00:05.307111[0m


In [8]:
# Display the (lat, lng) tuple used as the CBD reference point
raffles_latlon

(1.28393326234538, 103.851463066212)

## Distance from the nearest shopping mall

In [9]:
#function to compute min distance from various amenities
def compute_distance(flat, aux):
    """Return the distance in km from one flat to its closest amenity.

    Parameters
    ----------
    flat : row-like
        Must provide 'lat' and 'lng' entries for the property.
    aux : table-like
        Must provide 'lat' and 'lng' columns, one row per amenity.

    Returns
    -------
    The smallest geodesic distance in kilometres, rounded to 2 decimals.
    The result never exceeds the 9999.0 starting value, which is also what
    comes back when ``aux`` has no rows.
    """
    origin = (flat['lat'], flat['lng'])
    # Seed the candidates with the 9999.0 ceiling so that min() performs the
    # same "replace only when strictly smaller" scan over the amenities.
    candidates_km = [9999.0]
    candidates_km.extend(
        geodesic(origin, amenity).km for amenity in zip(aux['lat'], aux['lng'])
    )
    return np.round(min(candidates_km), 2)

In [10]:
# Nearest shopping mall: keep only name and coordinates, without repeated rows
shopmall_df = shopmall_df[['name','lat','lng']].drop_duplicates()

start_time = datetime.now()
print("Start Time: {}" .format(start_time))

# Row-wise apply: compute_distance gets each listing plus the mall table
for listings in (train_df, test_df):
    listings["dist_from_nearest_mall"] = listings.apply(
        compute_distance, axis=1, aux=shopmall_df
    )

end_time = datetime.now()
print("End Time: {}" .format(end_time))
print('Total time taken:' + '\033[1m' + str(end_time - start_time) + '\033[0m')

Start Time: 2022-11-06 07:40:44.887039
End Time: 2022-11-06 07:55:23.836491
Total time taken:[1m0:14:38.949452[0m


## Distance from the nearest secondary school

In [11]:
# Nearest secondary school: keep only name and coordinates, without repeated rows
secondary_school_df = secondary_school_df[['name','lat','lng']].drop_duplicates()

start_time = datetime.now()
print("Start Time: {}" .format(start_time))

# Row-wise apply: compute_distance gets each listing plus the school table
for listings in (train_df, test_df):
    listings["dist_from_nearest_sschool"] = listings.apply(
        compute_distance, axis=1, aux=secondary_school_df
    )

end_time = datetime.now()
print("End Time: {}" .format(end_time))
print('Total time taken:' + '\033[1m' + str(end_time - start_time) + '\033[0m')

Start Time: 2022-11-06 07:55:23.849494
End Time: 2022-11-06 08:09:00.902092
Total time taken:[1m0:13:37.052598[0m


## Distance from the nearest primary school

In [12]:
# Nearest primary school: keep only name and coordinates, without repeated rows
primary_school_df = primary_school_df[['name','lat','lng']].drop_duplicates()

start_time = datetime.now()
print("Start Time: {}" .format(start_time))

# Row-wise apply: compute_distance gets each listing plus the school table
for listings in (train_df, test_df):
    listings["dist_from_nearest_pschool"] = listings.apply(
        compute_distance, axis=1, aux=primary_school_df
    )

end_time = datetime.now()
print("End Time: {}" .format(end_time))
print('Total time taken:' + '\033[1m' + str(end_time - start_time) + '\033[0m')

Start Time: 2022-11-06 08:09:00.915094
End Time: 2022-11-06 08:25:32.632526
Total time taken:[1m0:16:31.717432[0m


## Distance from the nearest commercial centre

In [13]:
# Nearest commercial centre: keep only name and coordinates, without repeated rows
commercial_df = commercial_df[['name','lat','lng']].drop_duplicates()

start_time = datetime.now()
print("Start Time: {}" .format(start_time))

# Row-wise apply: compute_distance gets each listing plus the commercial table
for listings in (train_df, test_df):
    listings["dist_from_nearest_comm"] = listings.apply(
        compute_distance, axis=1, aux=commercial_df
    )

end_time = datetime.now()
print("End Time: {}" .format(end_time))
print('Total time taken:' + '\033[1m' + str(end_time - start_time) + '\033[0m')

Start Time: 2022-11-06 08:25:32.645527
End Time: 2022-11-06 08:28:56.565743
Total time taken:[1m0:03:23.920216[0m


## Distance from the nearest MRT station

In [14]:
# Nearest MRT station (distance only): keep name and coordinates, without repeated rows
mrt_df = mrt_df[['name','lat','lng']].drop_duplicates()

start_time = datetime.now()
print("Start Time: {}" .format(start_time))

# Row-wise apply: compute_distance gets each listing plus the station table
for listings in (train_df, test_df):
    listings["dist_from_nearest_mrt"] = listings.apply(
        compute_distance, axis=1, aux=mrt_df
    )

end_time = datetime.now()
print("End Time: {}" .format(end_time))
print('Total time taken:' + '\033[1m' + str(end_time - start_time) + '\033[0m')

Start Time: 2022-11-06 08:28:56.578745
End Time: 2022-11-06 08:40:19.364333
Total time taken:[1m0:11:22.785588[0m


In [15]:
# Count the null values per column of train_df (only the training frame is checked here)
train_df.isnull().sum()

property_type                0
num_beds                     0
size_sqft                    0
floor_level                  0
furnishing                   0
lat                          0
lng                          0
subzone                      0
planning_area                0
price                        0
years_left                   0
dist_from_changi_airport     0
dist_from_cbd                0
dist_from_nearest_mall       0
dist_from_nearest_sschool    0
dist_from_nearest_pschool    0
dist_from_nearest_comm       0
dist_from_nearest_mrt        0
dtype: int64

In [16]:
# Preview the first rows of train_df, including the new distance columns
train_df.head()

Unnamed: 0,property_type,num_beds,size_sqft,floor_level,furnishing,lat,lng,subzone,planning_area,price,years_left,dist_from_changi_airport,dist_from_cbd,dist_from_nearest_mall,dist_from_nearest_sschool,dist_from_nearest_pschool,dist_from_nearest_comm,dist_from_nearest_mrt
0,hdb,3,1115,unknown,unspecified,1.414399,103.837196,yishun south,yishun,514500.0,65,17.97,14.51,0.62,0.18,0.27,3.34,0.57
1,hdb,4,1575,unknown,unspecified,1.372597,103.875625,serangoon north,serangoon,995400.0,69,12.67,10.17,0.55,0.29,0.12,2.39,1.73
2,condo,4,3070,low,partial,1.298773,103.895798,mountbatten,marine parade,8485000.0,999,12.19,5.2,0.82,0.89,0.89,2.16,1.32
3,condo,3,958,unknown,partial,1.312364,103.803271,farrer court,bukit timah,2626000.0,1000,21.21,6.22,0.91,1.1,1.09,1.61,0.72
4,condo,2,732,unknown,unspecified,1.273959,103.843635,anson,downtown core,1764000.0,103,18.58,1.41,0.43,1.54,0.46,1.86,0.37


In [17]:
# Preview the first rows of test_df, including the new distance columns
test_df.head()

Unnamed: 0,property_type,num_beds,size_sqft,floor_level,furnishing,lat,lng,subzone,planning_area,years_left,dist_from_changi_airport,dist_from_cbd,dist_from_nearest_mall,dist_from_nearest_sschool,dist_from_nearest_pschool,dist_from_nearest_comm,dist_from_nearest_mrt
0,condo,1,463.0,unknown,unfurnished,1.344334,103.87869,upper paya lebar,serangoon,999,12.3,7.33,0.99,0.34,0.34,1.04,0.26
1,condo,3,1033.0,high,unspecified,1.380281,103.943878,pasir ris west,pasir ris,94,5.56,14.81,0.28,1.4,1.08,1.44,1.0
2,condo,1,570.0,unknown,fully,1.294668,103.850074,bras basah,museum,84,16.9,1.2,0.21,0.56,0.34,0.16,0.25
3,hdb,3,1216.0,unknown,unspecified,1.37312,103.746094,keat hong,choa chu kang,94,27.03,15.32,0.5,1.06,0.7,4.5,1.37
4,hdb,3,936.0,unknown,unspecified,1.341468,103.849047,braddell,toa payoh,50,15.62,6.37,1.01,0.36,0.42,0.99,0.27


In [18]:
#Store the data in csv for further preprocessing, feature engineering and modeling
#NOTE: the two writes below are active and rewrite the CSV files on every run;
#comment them out if the previously generated files should be kept
train_df.to_csv('train_df_for_model.csv',index=False)
test_df.to_csv('test_df_for_model.csv',index=False)

# References

1. https://medium.com/@michael.wy.ong/web-scrape-geospatial-data-analyse-singapores-property-price-part-i-276caba320b
2. https://towardsdatascience.com/geocoding-singapore-coordinates-onemap-api-3e1542bf26f7
3. https://medium.com/@seinchyi/machine-learning-for-singapore-resale-hdb-pt-1-data-preparation-8cdc2df8e24f

# Appendix

The code below (which still has some fixable bugs) can be used to calculate the walking time to the nearest MRT station or to the CBD. Due to time constraints, this is left for future work.

This task can be done using the [OneMap API](https://www.onemap.gov.sg/docs/#search). We generated a token for it and used it to calculate walking time.

In [57]:
# token ='eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOjk0NDIsInVzZXJfaWQiOjk0NDIsImVtYWlsIjoiZTA3MjQyNzBAdS5udXMuZWR1IiwiZm9yZXZlciI6ZmFsc2UsImlzcyI6Imh0dHA6XC9cL29tMi5kZmUub25lbWFwLnNnXC9hcGlcL3YyXC91c2VyXC9zZXNzaW9uIiwiaWF0IjoxNjY3NDU0NjczLCJleHAiOjE2Njc4ODY2NzMsIm5iZiI6MTY2NzQ1NDY3MywianRpIjoiMDBhNmNmNjgwMDhjMWE0ZDcwNzZmOWYwMDhhYmZjNTQifQ.rM_cVzU_0ll19fyzpiSgEzcODccZum0YTIn52dQRUyE'

# NOTE(review): the `token` in the commented line above is a hard-coded OneMap
# credential. If this cell is revived, load the token from an environment
# variable or an untracked config file instead of keeping it in the notebook.
# Single-route check: listing 1012 -> Raffles Place (rp_lat, rp_lng).
# start = "{},{}".format(train_df.loc[1012,"lat"],train_df.loc[1012,"lng"])
# print(start)
# end = "{},{}".format(rp_lat,rp_lng)
# print(end)
# url= "https://developers.onemap.sg/privateapi/routingsvc/route?start={}&end={}&routeType=walk&date=2022-11-03&time=07:35:00&mode=TRANSIT&maxWalkDistance=1000&numItineraries=3&token={}".format(start,end,token)
# response = requests.get(url)
# print(response)
# data = json.loads(response.text) 
# print(data)
# NOTE(review): `index` is not assigned anywhere in this cell (it appears only
# as a loop variable in the later cells), so this print looks like one of the
# "fixable bugs" mentioned in the appendix text.
# print(index)
# print(data['route_summary']['total_time'])


1.3271094,103.7964042
1.28393326234538,103.851463066212
<Response [200]>
{'status_message': 'Found route between points', 'route_geometry': 'oebGqvoxRBI?GOEKB{@QQE_@KKCm@O}@WIASEqA]MC[Ks@Q_AUOCmA[MCi@OSGGAGIAE?C?GNm@BGJSHYBOJc@DWBEFQ@[Pq@NMD@BKBINi@BIFS@A@QGECKZkABIH[DORu@DEFS@ABIBK@KBIBI@KDM@E@EDSBK@CBKH]J_@AAJYR}@AALWBKDMBGNDDQF@FQIAV}@nGoKb@u@Ve@j@eAVe@@g@~@{A\\_@BGHK?AR]BE@E@AFOFO@Cd@w@BEBEZi@FMb@s@FKBE??P[HQLSJOPWJOfAcBNEHMDS?@FDB?v@qAR]DMHOBEHKBEHAT_@Xe@DA?ECIDM\\aA@IHUDc@@I@QB]Dc@@[D_@Fi@DEFGLIPCNAL@`@Dj@HVDLB@CDOVo@DKZHD@?CRUHKDEFGBEBEHKAGFKJETiB@EN?BCD@H@vAPdAYfB{@JEh@UDCZOFC^OHEtAo@l@Ij@A\\BDENAf@Ax@E`@CTBf@Hd@Px@JzA\\\\BHCH?X?\\?\\@H?N@P?RENGTU@CBI@U@_@PYLUPQDABAHO?CDKBCBEBEBE?A@ECK?CBCPMD?L?LAB?AAEKAEAC?KDOBGHIDCF?F?HAFCFEDCDEDCREDAh@GIYDO^Dl@DnA?x@Ct@M^MbAe@b@]NIXOZG`@GfBGz@Ed@Gv@Qj@UZSVG?GLGZQ\\MTIFA@@RIFEVGPEXGBAj@KTCZCHAHA`@?H@HDBFBCDE@G?YBC`ABL?F]Fi@DS?C?AFq@Da@@ABE?EDORc@L[JOXm@\\w@FMTs@J]@EFK@CBEVo@BIBIPo@@I\\q@JCB??E?GBE?QX]JI??L?ZFHe@n@c@PMZMZGVGDAp@CJ?F@F?D@^DJCn@

In [None]:
# start_time = datetime.now()
# print("Start Time: {}" .format(start_time))

# token ='eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOjk0NDIsInVzZXJfaWQiOjk0NDIsImVtYWlsIjoiZTA3MjQyNzBAdS5udXMuZWR1IiwiZm9yZXZlciI6ZmFsc2UsImlzcyI6Imh0dHA6XC9cL29tMi5kZmUub25lbWFwLnNnXC9hcGlcL3YyXC91c2VyXC9zZXNzaW9uIiwiaWF0IjoxNjY3NDU0NjczLCJleHAiOjE2Njc4ODY2NzMsIm5iZiI6MTY2NzQ1NDY3MywianRpIjoiMDBhNmNmNjgwMDhjMWE0ZDcwNzZmOWYwMDhhYmZjNTQifQ.rM_cVzU_0ll19fyzpiSgEzcODccZum0YTIn52dQRUyE'


# NOTE(review): the `token` in the commented line above is a hard-coded OneMap
# credential; load it from an environment variable or untracked config if revived.
# NOTE(review): `i` and `row` are never used in the loop body; iterating over
# train_df.index would be enough. The request date here (2019-05-17) differs from
# the 2022-11-03 used in the other appendix cells.
# for i, (index, row) in enumerate(train_df.iterrows()): 
#     start = "{},{}".format(train_df.loc[index,"lat"],train_df.loc[index,"lng"])
#     end = "{},{}".format(rp_lat,rp_lng)
#     url= "https://developers.onemap.sg/privateapi/routingsvc/route?start={}&end={}&routeType=walk&date=2019-05-17&time=07:35:00&mode=TRANSIT&maxWalkDistance=1000&numItineraries=3&token={}".format(start,end,token)
#     response = requests.get(url)
#     data = json.loads(response.text) 
#     #print(index)
#     NOTE(review): the bare `except:` stores None for any failure of the lookup
#     below; `except (KeyError, TypeError):` would be narrower. Request and JSON
#     errors are raised before the `try` and are not handled at all.
#     try:
#         train_df.loc[index,"walking_time_to_cbd"]=data['route_summary']['total_time']
#     except:
#         train_df.loc[index,"walking_time_to_cbd"]=None
        
# end_time = datetime.now() 
# print("End Time: {}" .format(end_time))
# print('Total time taken:' + '\033[1m' + str(end_time - start_time) + '\033[0m')

In [27]:
# def compute_dist_nearst_mrt_cor(flat, aux):
#     flat_lat, flat_lng = flat[['lat','lng']]
#     min_distance = 9999.0
#     for aux_lat, aux_lng in zip(aux['lat'], aux['lng']):
#         distance = geodesic((flat_lat, flat_lng), (aux_lat, aux_lng)).km
#         if distance < min_distance:
#             min_distance = distance
#             flat['dist_from_nearest_mrt'] =np.round(min_distance,2)
#             flat['nearest_mrt_lat'] = aux_lat
#             flat['nearest_mrt_lng'] = aux_lng
#     return flat

In [None]:
# token ='eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJzdWIiOjk0NDIsInVzZXJfaWQiOjk0NDIsImVtYWlsIjoiZTA3MjQyNzBAdS5udXMuZWR1IiwiZm9yZXZlciI6ZmFsc2UsImlzcyI6Imh0dHA6XC9cL29tMi5kZmUub25lbWFwLnNnXC9hcGlcL3YyXC91c2VyXC9zZXNzaW9uIiwiaWF0IjoxNjY3NDU0NjczLCJleHAiOjE2Njc4ODY2NzMsIm5iZiI6MTY2NzQ1NDY3MywianRpIjoiMDBhNmNmNjgwMDhjMWE0ZDcwNzZmOWYwMDhhYmZjNTQifQ.rM_cVzU_0ll19fyzpiSgEzcODccZum0YTIn52dQRUyE'

# NOTE(review): the `token` in the commented line above is a hard-coded OneMap
# credential; load it from an environment variable or untracked config if revived.
# NOTE(review): this loop reads the 'nearest_mrt_lat' / 'nearest_mrt_lng' columns,
# which are only set by the commented-out compute_dist_nearst_mrt_cor helper.
# The collected list is not written back to train_df in the code shown here.
# nearest_stn_walking_time=[]
# for i, (index, row) in enumerate(train_df.iterrows()): 
#     start = "{},{}".format(train_df.loc[index,"lat"],train_df.loc[index,"lng"])
#     end = "{},{}".format(train_df.loc[index,"nearest_mrt_lat"],train_df.loc[index,"nearest_mrt_lng"])
#     url= "https://developers.onemap.sg/privateapi/routingsvc/route?start={}&end={}&routeType=walk&date=2022-11-03&time=07:35:00&mode=TRANSIT&maxWalkDistance=1000&numItineraries=3&token={}".format(start,end,token)
#     response = requests.get(url)
#     data = json.loads(response.text) 
#     print(index)
#     #print(data['route_summary']['total_time'])
#     #train.loc[index,"nearest_stn_walking_time"]=data['route_summary']['total_time']
#     NOTE(review): the bare `except:` appends None for any failure of the lookup
#     below; `except (KeyError, TypeError):` would be narrower.
#     try:
#         nearest_stn_walking_time.append(data['route_summary']['total_time'])
#     except:
#         nearest_stn_walking_time.append(None)
  