# Feature Engineering

In [2]:
import pandas as pd
import numpy as np

In [3]:
mrt_coordinates_df = pd.read_csv('mrt_coordinates.csv', index_col = 0)

In [4]:
mrt_coordinates_df

Unnamed: 0,Station,latitude,longitude
0,Jurong East,1.333295,103.742154
1,Bukit Batok,1.351989,103.765416
2,Bukit Gombak,1.358730,103.751913
3,Choa Chu Kang,1.385335,103.744352
4,Yew Tee,1.397129,103.747330
...,...,...,...
140,Upper Changi,1.369787,103.983271
141,Expo,1.335383,103.962375
142,Woodlands North,1.447782,103.785136
143,Woodlands,1.430788,103.762362


In [5]:
hdb_coordinates_df = pd.read_csv('hdb_coordinates.csv', index_col=0)
hdb_coordinates_df

Unnamed: 0,address,latitude,longitude
0,406 ANG MO KIO AVE 10,1.362005,103.853880
1,108 ANG MO KIO AVE 4,1.370943,103.837975
2,602 ANG MO KIO AVE 5,1.380709,103.835368
3,465 ANG MO KIO AVE 10,1.366201,103.857201
4,601 ANG MO KIO AVE 5,1.381041,103.835132
...,...,...,...
9423,676A YISHUN RING RD,1.421452,103.843328
9424,187B BEDOK NTH ST 4,1.330499,103.939996
9425,450B BT BATOK WEST AVE 6,1.352358,103.744396
9426,451B BT BATOK WEST AVE 6,1.352484,103.743415


#### Distance from nearest MRT in km

Use geopy package to calculate distance between two places, using their respective latitude and longitude values.

In [6]:
pip install geopy

Collecting geopy
  Downloading geopy-2.3.0-py3-none-any.whl (119 kB)
Collecting geographiclib<3,>=1.52
  Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-2.0 geopy-2.3.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\raych\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [7]:
from geopy import distance

In [8]:
hdb_coordinates_df
mrt_coordinates_df

Unnamed: 0,Station,latitude,longitude
0,Jurong East,1.333295,103.742154
1,Bukit Batok,1.351989,103.765416
2,Bukit Gombak,1.358730,103.751913
3,Choa Chu Kang,1.385335,103.744352
4,Yew Tee,1.397129,103.747330
...,...,...,...
140,Upper Changi,1.369787,103.983271
141,Expo,1.335383,103.962375
142,Woodlands North,1.447782,103.785136
143,Woodlands,1.430788,103.762362


In [81]:
mrt_coordinates_df.rename(columns = {'Station': 'Name'}, inplace=True)

In [82]:
hdb_coordinates_dict = hdb_coordinates_df.to_dict('records')
mrt_coordinates_dict = mrt_coordinates_df.to_dict('records')

In [83]:
mrt_coordinates_dict[0]

{'Name': 'Jurong East',
 'latitude': 1.33329506563598,
 'longitude': 103.742153884191}

In [84]:
# hdb is the dictionary of a hdb's specific coordinates
# places is a list of places, each place being a dictionary with its specific coordinates (mrt/mall)
def get_shortest_distance(hdb, places):
    hdb_lat_long = (hdb['latitude'], hdb['longitude'])
    shortest_distance = 100
    current_place = ""

    for place in places:
        place_lat_long = (place['latitude'], place['longitude'])
        current_distance = distance.distance(hdb_lat_long, place_lat_long).km
        if current_distance < shortest_distance:
            shortest_distance = current_distance
            current_place = place['Name']
    
    return (shortest_distance, current_place)

In [85]:
for hdb in hdb_coordinates_dict:
    shortest_distance = get_shortest_distance(hdb, mrt_coordinates_dict)
    hdb['mrt_shortest_dist'] = shortest_distance[0]
    hdb['mrt_shortest_dist_name'] = shortest_distance[1]
    

In [86]:
hdb_coordinates_df = pd.DataFrame(hdb_coordinates_dict)
hdb_coordinates_df

Unnamed: 0,address,latitude,longitude,mrt_shortest_dist,mall_shortest_dist,mrt_shortest_dist_name
0,406 ANG MO KIO AVE 10,1.362005,103.853880,0.957270,1.013992,Ang Mo Kio
1,108 ANG MO KIO AVE 4,1.370943,103.837975,1.288554,0.894266,Ang Mo Kio
2,602 ANG MO KIO AVE 5,1.380709,103.835368,1.056960,1.525573,Yio Chu Kang
3,465 ANG MO KIO AVE 10,1.366201,103.857201,0.932964,0.893796,Ang Mo Kio
4,601 ANG MO KIO AVE 5,1.381041,103.835132,1.079288,1.569306,Yio Chu Kang
...,...,...,...,...,...,...
9423,676A YISHUN RING RD,1.421452,103.843328,0.541079,0.443476,Yishun
9424,187B BEDOK NTH ST 4,1.330499,103.939996,0.843049,1.258851,Tanah Merah
9425,450B BT BATOK WEST AVE 6,1.352358,103.744396,1.093790,0.569993,Bukit Gombak
9426,451B BT BATOK WEST AVE 6,1.352484,103.743415,1.171116,0.676788,Bukit Gombak


Sanity Check

Lets check the first row of hdb_coordinates_df, to see if the mrt_shortest dist is calculated correctly.

The nearest mrt to 406 Ang Mo Kio ave 10 is very clearly Ang Mo Kio Station. Lets see if the calculations match.

In [88]:
ang_mo_kio_coordinates = mrt_coordinates_df[mrt_coordinates_df['Name'] == 'Ang Mo Kio'][['latitude', 'longitude']].values[0]

In [89]:
# convert list to tuple
ang_mo_kio_coordinates = (ang_mo_kio_coordinates[0], ang_mo_kio_coordinates[1])
ang_mo_kio_coordinates

(1.36942855699191, 103.849455226442)

In [90]:
dist_from_amk = distance.distance((hdb_coordinates_dict[0]['latitude'], hdb_coordinates_dict[0]['longitude']), 
                  (ang_mo_kio_coordinates)).km
dist_from_amk

0.9572697821512559

In [91]:
hdb_coordinates_df.loc[0,'mrt_shortest_dist'] == dist_from_amk

True

#### Distance from nearest Mall in km

In [92]:
mall_coordinates_df = pd.read_csv('shopping_mall_coordinates.csv', index_col=0)
mall_coordinates_df

Unnamed: 0,Mall,latitude,longitude
0,100 AM,1.274683,103.843488
1,313@Somerset,1.301014,103.838361
2,Aperia,1.309711,103.864326
3,Balestier Hill Shopping Centre,1.325596,103.842572
4,Bugis Cube,1.298141,103.855635
...,...,...,...
150,Gek Poh Shopping Centre,1.348744,103.697732
151,Rochester Mall,1.305408,103.788447
152,Taman Jurong Shopping Centre,1.334845,103.720462
153,West Coast Plaza,1.303697,103.766131


In [93]:
mall_coordinates_df.rename(columns = {'Mall': 'Name'}, inplace = True)

In [94]:
mall_coordinates_dict = mall_coordinates_df.to_dict('records')
mall_coordinates_dict

[{'Name': '100 AM',
  'latitude': 1.27468281482263,
  'longitude': 103.843488359469},
 {'Name': '313@Somerset',
  'latitude': 1.30101436404056,
  'longitude': 103.838360664485},
 {'Name': 'Aperia',
  'latitude': 1.3097112065077,
  'longitude': 103.864326436447},
 {'Name': 'Balestier Hill Shopping Centre',
  'latitude': 1.32559594839311,
  'longitude': 103.842571612968},
 {'Name': 'Bugis Cube',
  'latitude': 1.2981408343975,
  'longitude': 103.855635339249},
 {'Name': 'Bugis Junction',
  'latitude': 1.2991371723215,
  'longitude': 103.855450325604},
 {'Name': 'Bugis+',
  'latitude': 1.30095171530648,
  'longitude': 103.855172625542},
 {'Name': 'Capitol Piazza',
  'latitude': 1.29307884763132,
  'longitude': 103.851261982149},
 {'Name': 'Cathay Cineleisure Orchard',
  'latitude': 1.30152101873533,
  'longitude': 103.836429655016},
 {'Name': 'The Centrepoint',
  'latitude': 1.30145045537088,
  'longitude': 103.840034074858},
 {'Name': 'City Square Mall',
  'latitude': 1.31138865009152,
  

In [95]:
for hdb in hdb_coordinates_dict:
    shortest_distance = get_shortest_distance(hdb, mall_coordinates_dict)
    hdb['mall_shortest_dist'] = shortest_distance[0]
    hdb['mall_shortest_dist_name'] = shortest_distance[1]

In [96]:
hdb_coordinates_df = pd.DataFrame(hdb_coordinates_dict)
hdb_coordinates_df

Unnamed: 0,address,latitude,longitude,mrt_shortest_dist,mall_shortest_dist,mrt_shortest_dist_name,mall_shortest_dist_name
0,406 ANG MO KIO AVE 10,1.362005,103.853880,0.957270,1.013992,Ang Mo Kio,AMK Hub
1,108 ANG MO KIO AVE 4,1.370943,103.837975,1.288554,0.894266,Ang Mo Kio,Broadway Plaza
2,602 ANG MO KIO AVE 5,1.380709,103.835368,1.056960,1.525573,Yio Chu Kang,Broadway Plaza
3,465 ANG MO KIO AVE 10,1.366201,103.857201,0.932964,0.893796,Ang Mo Kio,myVillage At Serangoon Garden
4,601 ANG MO KIO AVE 5,1.381041,103.835132,1.079288,1.569306,Yio Chu Kang,Broadway Plaza
...,...,...,...,...,...,...,...
9423,676A YISHUN RING RD,1.421452,103.843328,0.541079,0.443476,Yishun,Wisteria Mall
9424,187B BEDOK NTH ST 4,1.330499,103.939996,0.843049,1.258851,Tanah Merah,Bedok Mall
9425,450B BT BATOK WEST AVE 6,1.352358,103.744396,1.093790,0.569993,Bukit Gombak,West Mall
9426,451B BT BATOK WEST AVE 6,1.352484,103.743415,1.171116,0.676788,Bukit Gombak,West Mall


In [97]:
# Save progress first
hdb_coordinates_df.to_csv('hdb_added_features.csv')