In [1]:
import pandas as pd

In [2]:
df = pd.read_json("../amenities-vancouver.json", lines=True)

In [3]:
# value_counts to dataframe from https://stackoverflow.com/a/53869812
d = pd.DataFrame(df["amenity"].value_counts(ascending=True))
amenity_count = d.reset_index()
amenity_count.columns = ["amenity", "count"]
amenity_count

Unnamed: 0,amenity,count
0,money_transfer,1
1,biergarten,1
2,meditation_centre,1
3,juice_bar,1
4,gym,1
...,...,...
133,waste_basket,1137
134,fast_food,1147
135,bicycle_parking,1423
136,restaurant,2472


In [4]:
# maybe it is more interesting to ask for amenities that are more common on the map
# for example if there is only 1 luggage_locker on the entire map, 
# then when I request for loggage_locker in area A, it will either return no results or the result near the luggage_locker. This isn't very interesting.
df = df.merge(amenity_count, on="amenity")
df = df[df["count"] > 30]
df

Unnamed: 0,lat,lon,timestamp,amenity,name,tags,count
0,49.260812,-123.125736,2020-03-20T18:22:12.000-07:00,cafe,Starbucks,"{'brand:wikidata': 'Q37158', 'official_name': ...",1066
1,49.193580,-123.180788,2019-09-14T05:00:22.000-07:00,cafe,Tim Hortons,"{'brand:wikidata': 'Q175106', 'level': '0', 'b...",1066
2,49.228400,-122.848383,2019-11-20T11:48:37.000-08:00,cafe,Tim Hortons,"{'brand:wikidata': 'Q175106', 'addr:housenumbe...",1066
3,49.212449,-122.919749,2019-07-02T01:10:30.000-07:00,cafe,Tim Hortons,"{'brand:wikidata': 'Q175106', 'website': 'http...",1066
4,49.212659,-122.919347,2019-07-02T01:10:30.000-07:00,cafe,Starbucks,"{'brand:wikidata': 'Q37158', 'official_name': ...",1066
...,...,...,...,...,...,...,...
17343,49.049702,-122.291780,2018-06-20T15:11:12.000-07:00,social_facility,BC Schizophrenia Society,{'website': 'http://www.bcss.org/branches/lowe...,50
17344,49.241639,-123.121114,2019-09-13T13:57:05.000-07:00,social_facility,St. Vincent's Heather Campus of Care,"{'addr:housenumber': '4875', 'social_facility:...",50
17345,49.241730,-123.121108,2019-09-13T13:57:05.000-07:00,social_facility,Tapestry Foundation for Health Care,"{'addr:housenumber': '4865', 'social_facility:...",50
17346,49.198466,-122.844950,2018-03-26T17:06:23.000-07:00,social_facility,Surrey Urban Outreach Society,"{'addr:housenumber': '10776', 'surrey:addrid':...",50


In [5]:
# We see that there are many occurances of certain amenities, and these could be interesting inputs for us to search for on the map
df["amenity"].value_counts(ascending=False)[:20]

bench               3738
restaurant          2472
bicycle_parking     1423
fast_food           1147
waste_basket        1137
cafe                1066
post_box             972
toilets              552
bank                 460
drinking_water       322
pharmacy             311
parking              307
parking_entrance     274
dentist              248
bicycle_rental       202
fuel                 202
pub                  189
post_office          188
bar                  177
recycling            139
Name: amenity, dtype: int64

In [6]:
# flattening the tags to extract useful information for training our data
df["city"] = pd.json_normalize(df["tags"])["addr:city"]
df["postcode"] = pd.json_normalize(df["tags"])["addr:postcode"]
df["street"] = pd.json_normalize(df["tags"])["addr:street"]

In [36]:
# We can see that we have some data points that are misspelled or similar
# We can either remove these data points or fix them up and not discard them
df["city"].value_counts()

Vancouver          1307
Richmond            250
Surrey              240
Burnaby             219
Langley             126
North Vancouver      88
Maple Ridge          75
Coquitlam            49
New Westminster      38
Port Coquitlam       38
West Vancouver       36
Delta                34
Port Moody           24
White Rock           19
Abbotsford           13
Mission               6
Pitt Meadows          3
Bowen Island          2
Deroche               2
Langley               1
Name: city, dtype: int64

In [37]:
# Manually replace them with the correct ones
# For the simplicity for the training model, we don't want cities with too little datapoints since this will just be "discarded" during training
# therefore, we group them with the closest municipal/city
df = df.replace(["vancouver","Vancovuer", "Vancouver, BC, Canada"], "Vancouver")
df = df.replace(["North Vancouver City", "District of North Vancouver"], "North Vancouver")
df = df.replace("Abbosford", "Abbotsford")
df = df.replace("Langley", "Langley")
df = df.replace(["Hatzic", "Lake Errock"], "Mission")
df = df.replace(["Township of Langley", "City of Langley/Township of Langley Border", "Langley Township", "Fort Langley", "Aldergrove"], "Langley")

df.to_json("../filtered-vancouver-all.json", orient="records", lines=True)

In [7]:
# create json with features that seem useful 
training_df = df[df["city"].notna()]
training_df = training_df[["lon", "lat", "amenity", "name", "city", "postcode", "street"]]

training_df.to_json("../filtered-vancouver-training.json", orient="records", lines=True)

In [8]:
# create json with testing data 
training_df = df[df["city"].notna() == False]
training_df = training_df[["lon", "lat", "amenity", "name", "city", "postcode", "street"]]

training_df.to_json("../filtered-vancouver-testing.json", orient="records", lines=True)