In [None]:
import pandas as pd

In [None]:
# Load the raw amenity records; the file is line-delimited JSON (one record per line).
df = pd.read_json(
    "../amenities-vancouver.json",
    lines=True,
)

In [None]:
# Tally how many rows each amenity type has (rarest first) and expose the tally
# as a two-column frame: "amenity" and "count".
# value_counts -> DataFrame recipe adapted from https://stackoverflow.com/a/53869812
amenity_count = (
    df["amenity"]
    .value_counts(ascending=True)
    .rename_axis("amenity")
    .reset_index(name="count")
)
amenity_count

In [None]:
# It is more interesting to ask for amenities that are common on the map.
# For example, if there is only 1 luggage_locker on the entire map, a request
# for luggage_locker in area A either returns nothing or just the spot next to
# that single luggage_locker, which isn't very interesting.
MIN_MAP_OCCURRENCES = 30  # keep amenity types that occur more often than this

# Drop any "count" column left over from a previous run of this cell before
# merging; otherwise the merge yields count_x / count_y and the filter below
# raises a KeyError when the cell is re-executed.
df = df.drop(columns="count", errors="ignore").merge(amenity_count, on="amenity")

# Renumber the rows after filtering so the index is contiguous again. Later
# cells assign columns built from a fresh 0..n-1 index, and pandas aligns such
# assignments on index labels, not on row position.
df = df[df["count"] > MIN_MAP_OCCURRENCES].reset_index(drop=True)
df

In [None]:
# Some amenities have a large number of occurrences; these are the interesting
# candidates to search for on the map. Show the 20 most frequent ones.
df["amenity"].value_counts().head(20)

In [None]:
# Flatten the "tags" dictionaries once and pull out the address fields that are
# used as features/labels for training.
tag_fields = pd.json_normalize(df["tags"].tolist())

# json_normalize numbers its rows 0..n-1, while df may carry a filtered,
# non-contiguous index. Column assignment aligns on index labels, so give the
# flattened frame df's own index to keep every address on the row it came from.
tag_fields.index = df.index

df = df.assign(
    city=tag_fields["addr:city"],
    postcode=tag_fields["addr:postcode"],
    street=tag_fields["addr:street"],
)

In [None]:
# Inspect the distinct city labels: several are misspelled or are variants of
# the same place. Such rows can either be dropped or repaired and kept.
df.loc[:, "city"].value_counts()

In [None]:
# Manually map misspelled / variant city labels to the correct ones.
# To keep the training model simple we do not want cities with very few data
# points (they would effectively be "discarded" during training), so small
# localities are grouped with the closest municipality/city.
CITY_FIXES = {
    "vancouver": "Vancouver",
    "Vancovuer": "Vancouver",
    "Vancouver, BC, Canada": "Vancouver",
    "North Vancouver City": "North Vancouver",
    "District of North Vancouver": "North Vancouver",
    "Abbosford": "Abbotsford",
    "Hatzic": "Mission",
    "Lake Errock": "Mission",
    "Township of Langley": "Langley",
    "City of Langley/Township of Langley Border": "Langley",
    "Langley Township": "Langley",
    "Fort Langley": "Langley",
    "Aldergrove": "Langley",
}

# Rewrite the "city" column only. A frame-wide DataFrame.replace would also
# rewrite any name/street/amenity cell that happens to equal one of these
# strings (e.g. a place simply named "Aldergrove").
df = df.assign(city=df["city"].replace(CITY_FIXES))

df.to_json("../filtered-vancouver-all.json", orient="records", lines=True)
df["city"].value_counts()

In [None]:
# Training set: rows that have a city label, restricted to the columns that
# look useful as features, written out as line-delimited JSON records.
feature_columns = ["lon", "lat", "amenity", "name", "city", "postcode", "street"]
has_city = df["city"].notna()
training_df = df.loc[has_city, feature_columns]
training_df.to_json(
    "../filtered-vancouver-training.json",
    orient="records",
    lines=True,
)
training_df


In [None]:
# Testing set: rows that have no city label, with the same feature columns as
# the training set. Stored under its own name so the training frame created in
# the previous cell is not overwritten by the unlabeled rows.
testing_df = df[df["city"].isna()]
testing_df = testing_df[["lon", "lat", "amenity", "name", "city", "postcode", "street"]]

testing_df.to_json("../filtered-vancouver-testing.json", orient="records", lines=True)
testing_df


In [None]:
# Perhaps there are too few data points per region for the training to work.
# Since we cannot simply get more data points, reduce the number of regions by
# merging neighbouring ones so that each remaining region has more data points.
REGION_MERGES = {
    "North Vancouver": "Vancouver",
    "West Vancouver": "Vancouver",
    "Port Coquitlam": "Coquitlam",
    "Pitt Meadows": "Maple Ridge",
    "Langley": "Surrey",
    "Delta": "Surrey",
    "New Westminster": "Burnaby",
}
# NOTE(review): the output file name says "6-category" but only five cities are
# kept here; "Maple Ridge" (the target of the Pitt Meadows merge) is not in the
# list. Confirm whether it should be included.
KEPT_CITIES = ["Vancouver", "Surrey", "Burnaby", "Richmond", "Coquitlam"]

training_df_small = df[df["city"].notna()]
training_df_small = training_df_small[["lon", "lat", "amenity", "name", "city", "postcode", "street"]]

# Merge regions in the "city" column only; a frame-wide replace would also
# rewrite other columns whose value equals a city name (e.g. a place named
# "Delta" would have its name turned into "Surrey").
training_df_small = training_df_small.assign(
    city=training_df_small["city"].replace(REGION_MERGES)
)
training_df_small = training_df_small[training_df_small["city"].isin(KEPT_CITIES)]

training_df_small.to_json("../filtered-vancouver-training-6-category.json", orient="records", lines=True)
training_df_small["city"].value_counts()


In [None]:
# It may also help to look at amenity occurrences within the reduced training
# set and discard the amenity types that do not occur often.
MIN_TRAINING_OCCURRENCES = 10  # keep amenity types with at least this many rows

# Per-row frequency of that row's amenity type within training_df_small.
# Filtering on this Series means no helper "count" column is ever added to the
# frame, so none ends up in the exported JSON, and the notebook-level
# amenity_count table from the earlier cell is left untouched.
amenity_frequency = training_df_small["amenity"].map(
    training_df_small["amenity"].value_counts()
)
training_df_small_remove_amenity = training_df_small[
    amenity_frequency >= MIN_TRAINING_OCCURRENCES
].copy()

training_df_small_remove_amenity.to_json("../filtered-vancouver-training-amenity-removed.json", orient="records", lines=True)