# This notebook is for "Segmenting and Clustering Toronto Neighbourhoods"

In [1]:
#Import libraries
import pandas as pd

### Read HTML file into a Pandas Data Frame

In [2]:
# Parse the HTML tables on the Wikipedia page; header=0 takes the column names from the first row
a = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', header=0)

# Create the initial dataframe from the first parsed table.
# rename() returns a new frame, so the parsed table held in `a` is not modified
# and re-running this cell always starts from the same input.
df = a[0].rename(columns={"Postal Code": "PostalCode"})

# Fail early with a clear message if the page layout has changed, instead of
# hitting an attribute/key error in a later cell that uses these columns.
expected_columns = {"PostalCode", "Borough", "Neighbourhood"}
assert expected_columns.issubset(df.columns), (
    "Unexpected table layout; found columns: {}".format(list(df.columns))
)
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Ignore rows where Borough is "Not assigned"

In [3]:
# Keep only the rows whose Borough is a real value; rows carrying the
# "Not assigned" placeholder are dropped.
df = df.loc[df["Borough"] != "Not assigned"]
df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Combine all Neighbourhoods for each Postal Code

In [4]:
# One row per (PostalCode, Borough) pair, with that pair's neighbourhood
# names merged into a single comma-separated string.
df_grp = (
    df.groupby(["PostalCode", "Borough"])["Neighbourhood"]
    .apply(", ".join)
    .reset_index()
)
df_grp.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Assign Borough to Neighbourhood wherever Neighbourhood is "Not assigned"

In [7]:
# Where the neighbourhood is the "Not assigned" placeholder, fall back to the
# borough name from the same row.
df_grp["Neighbourhood"] = df_grp["Neighbourhood"].mask(
    df_grp["Neighbourhood"] == "Not assigned", df_grp["Borough"]
)
# Sanity check: no rows should be displayed
df_grp.loc[df_grp["Neighbourhood"] == "Not assigned"].head()

Unnamed: 0,PostalCode,Borough,Neighbourhood


In [8]:
# Preview the first rows of the grouped frame
df_grp.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Number of Rows in Final Data Frame

In [9]:
# (number of rows, number of columns) of the grouped frame
df_grp.shape

(103, 3)

# Fetch Latitude And Longitude Coordinates 

### Let us first try to fetch the coordinates using the geocoder package

In [8]:
# Install geocoder if required (uncomment the next line; the log below is from an earlier run)
#!conda install -c conda-forge geocoder --yes

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geocoder


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ratelim-0.1.6              |             py_2           6 KB  conda-forge
    geocoder-1.38.1            |             py_1          53 KB  conda-forge
    python_abi-3.6             |          1_cp36m           4 KB  conda-forge
    openssl-1.1.1h             |       h516909a_0         2.1 MB  conda-forge
    ca-certificates-2020.6.20  |       hecda079_0         145 KB  conda-forge
    certifi-2020.6.20          |   py36h9880bd3_2         151 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geocoder:        1.38.1-py_1              conda-forge
    python_a

In [13]:
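# Get latitude and longitude using geocoder
# NOTE: this cell is kept commented out because the geocoder lookups did not
# return coordinates; the CSV file loaded in a later cell is used instead.

#import geocoder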

#df_grp_ll = df_grp.copy()

#postal_codes = df_grp['PostalCode'].tolist()

# Search for Latitude and Longitude of Postal Codes
#for postal_code in postal_codes:
#    lat_lng_coords = None

    # Retry until geocoder returns coordinates (NOTE: no retry limit, so this loops forever if lookups keep failing)
#    print('{}, Toronto, Ontario'.format(postal_code))
#    while(lat_lng_coords is None):
#      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
#      lat_lng_coords = g.latlng

#    latitude = lat_lng_coords[0]
#    longitude = lat_lng_coords[1]
    
#    df_grp_ll.loc[df_grp_ll.PostalCode == postal_code, 'Latitude'] = latitude
#    df_grp_ll.loc[df_grp_ll.PostalCode == postal_code, 'Longitude'] = longitude

# Pickle Data Frame to Avoid Multiple Calls
#df_grp_ll.to_pickle('./Toronto_Coordinates.pkl')

# Read from Pickle File
#df_grp_ll = pd.read_pickle('./Toronto_Coordinates.pkl')

### Unable to fetch coordinates using geocoder, so the coordinates are read from a CSV file in the cell below

In [10]:
# Load the postal-code coordinates and rename the key column to "PostalCode"
# so it can be matched against the PostalCode column of the grouped frame.
df_coords = (
    pd.read_csv("http://cocl.us/Geospatial_data/Geospatial_Coordinates.csv")
    .rename(columns={"Postal Code": "PostalCode"})
)

In [11]:
# Show the coordinates table (PostalCode, Latitude, Longitude)
df_coords

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


### Join Coordinates Data With Original Data Frame 

In [14]:
# Index the coordinates by PostalCode, then attach Latitude/Longitude to each
# row of the grouped frame by looking up its PostalCode value.
coords_by_code = df_coords.set_index("PostalCode")
df_join = df_grp.join(coords_by_code, on="PostalCode")

### Finally, Display Updated Data Frame

In [15]:
# Display the joined frame: PostalCode, Borough, Neighbourhood, Latitude, Longitude
df_join

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
