In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2019.11.28 |       hecc5488_0         145 KB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    certifi-2019.11.28         |           py36_0         149 KB  conda-forge
    openssl-1.1.1d             |       h516909a_0         2.1 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following packages will be UPDATED:

    ca-

## Query wikipedia link
Parse the data

In [40]:
import requests
import lxml.html as lh
import pandas as pd
import numpy as np

url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
#Create a handle, page, to handle the contents of the website
page = requests.get(url)
#Store the contents of the website under doc
doc = lh.fromstring(page.content)
#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

In [41]:
#Check the length of the first 10 rows ensuring we have three elements in each
[len(T) for T in tr_elements[:10]]


[3, 3, 3, 3, 3, 3, 3, 3, 3, 3]

In [42]:
#Since out first row is the header, data is stored on the second row onwards
# define the dataframe columns
 
col= [('PostalCode', []), ('Borough', []), ('Neighborhood', [])]

for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 10, the //tr data is not from our table 
    if len(T)!=3:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        #if i>0:
        #Convert any numerical value to integers
            #try:
                #data=int(data)
            #except:
                #pass
        #Append the data to the empty list of the i'th column
        if i==2:
            data=data.strip()#remove newline term from each data
            #print (data)
        col[i][1].append(data)
        #Increment i for the next column
        i+=1
        


In [43]:
#checking all columns have same number of items
[len(C) for (title,C) in col]

[287, 287, 287]

## Converting into dataframe and cleaning
dropping rows where Borough is Not assigned

In [44]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)
#df.drop(['Postcode'],axis=1,inplace = True)
df.drop(df[df['Borough'] =='Not assigned'].index, inplace = True) 


df.dtypes

PostalCode      object
Borough         object
Neighborhood    object
dtype: object

In [45]:
df.head(20)

Unnamed: 0,PostalCode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor
7,M7A,Downtown Toronto,Queen's Park
9,M9A,Queen's Park,Not assigned
10,M1B,Scarborough,Rouge
11,M1B,Scarborough,Malvern
13,M3B,North York,Don Mills North


### combining neighborhoods under same Postalcode separated by comma ','

In [46]:

df2 = (df[['PostalCode','Borough','Neighborhood']].groupby(['PostalCode','Borough'])['Neighborhood']
       .apply(lambda x: ','.join(set(x)))
       .reset_index())
df2.head()


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Port Union,Rouge Hill"
2,M1E,Scarborough,"West Hill,Morningside,Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Checking if any item is 'Not assigned' in Neighborhood
#### if so assign corresponding Borough Value

In [47]:
print(df2[df2['Neighborhood']=='Not assigned'])

   PostalCode       Borough  Neighborhood
93        M9A  Queen's Park  Not assigned


In [48]:
# using numpy to replace the Not assigned Neighborhood with corresponding Borough value
df2['Neighborhood'] = np.where((df2.Neighborhood == 'Not assigned'),df2.Borough,df2.Neighborhood)


In [49]:
print(df2[df2['Neighborhood']=='Not assigned'])

Empty DataFrame
Columns: [PostalCode, Borough, Neighborhood]
Index: []


In [50]:
print(df2[df2['PostalCode']=='M9A'])

   PostalCode       Borough  Neighborhood
93        M9A  Queen's Park  Queen's Park


In [51]:
print (df2.shape)

(103, 3)


### Since geocoder did not work, using the csv file to load long lat.

In [52]:

import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
client_xxxxxxxx = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='_',
    ibm_auth_endpoint="https://iam.ng.bluemix.net/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

body = client_xxxxxxxxx.get_object(Bucket='machinelearningexercises-donotdelete-pr-ojtrxbzvrd3udm',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

df_data_1 = pd.read_csv(body)
df_data_1.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Aligning column name of above with df2 (postalcode table with neighborhoods)

In [53]:
df_data_1.rename(columns={'Postal Code':'PostalCode',
                          }, 
                 inplace=True)
df_data_1.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [54]:
df2.head()
df2.to_csv('PostalcodetoNeighbourhood.csv')

In [55]:

df_nbh=pd.merge(df2,df_data_1,on='PostalCode', how='outer')
df_nbh.head(100)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Port Union,Rouge Hill",43.784535,-79.160497
2,M1E,Scarborough,"West Hill,Morningside,Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park,Ionview,East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea,Oakridge,Golden Mile",43.711112,-79.284577
8,M1M,Scarborough,"Scarborough Village West,Cliffside,Cliffcrest",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff,Cliffside West",43.692657,-79.264848


In [57]:
print(df_nbh.shape)

(103, 5)
