## Importing libraries 

In [8]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans
!pip install folium
import folium # map rendering library"

print("Libraries imported.")

Collecting folium
[?25l  Downloading https://files.pythonhosted.org/packages/a4/f0/44e69d50519880287cc41e7c8a6acc58daa9a9acf5f6afc52bcc70f69a6d/folium-0.11.0-py2.py3-none-any.whl (93kB)
[K     |████████████████████████████████| 102kB 11.2MB/s ta 0:00:01
Collecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/13/fb/9eacc24ba3216510c6b59a4ea1cd53d87f25ba76237d7f4393abeaf4c94e/branca-0.4.1-py3-none-any.whl
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.11.0
Libraries imported.


# extracting data from wikipedia using beautiful soup

In [9]:
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [13]:
soup = BeautifulSoup(data, 'html.parser')
postalCodeList = []
boroughList = []
neighborhoodList = []

In [14]:
soup.find('table').find_all('tr')

# find all the rows of the table
soup.find('table').find_all('tr')

# for each row of the table, find all the table data
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')

In [19]:
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        postalCodeList.append(cells[0].text)
        boroughList.append(cells[1].text)
        neighborhoodList.append(cells[2].text.rstrip('\n'))

In [46]:
#dataframe is created
toronto_df = pd.DataFrame({"PostalCode": postalCodeList,
                           "Borough": boroughList,
                           "Neighborhood": neighborhoodList})

toronto_df.head()


#droppin the rows with borough = not assigned

toronto_df_dropna = toronto_df[toronto_df.Borough != "Not assigned\n"].reset_index(drop=True)
toronto_df_dropna.head()
toronto_df_dropna['PostalCode']=toronto_df_dropna['PostalCode'].str.replace('\n','',regex = False)
toronto_df_dropna['Borough'] = toronto_df_dropna['Borough'].str.replace('\n','')
df = toronto_df_dropna.copy()
df


Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [54]:
#grouping the repeating values

df_grouped = df.groupby(['PostalCode','Borough'],as_index = False).agg(lambda x: ",".join(x))
df_grouped['Neighborhood'] = df_grouped['Neighborhood'].str.replace('\n','')
df_grouped

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge,Malvern, Rouge,Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek,Rouge H..."
2,M1E,Scarborough,"Guildwood, Morningside, West Hill,Guildwood, M..."
3,M1G,Scarborough,"Woburn,Woburn,Woburn"
4,M1H,Scarborough,"Cedarbrae,Cedarbrae,Cedarbrae"
5,M1J,Scarborough,"Scarborough Village,Scarborough Village,Scarbo..."
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park,Ke..."
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge,Golden Mile, C..."
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village Wes..."
9,M1N,Scarborough,"Birch Cliff, Cliffside West,Birch Cliff, Cliff..."


# assigning borough to neighbourhood if it is Not assigned

In [50]:

for index, row in df_grouped.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]
df_grouped

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge,Malvern, Rouge,Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek,Rouge H..."
2,M1E,Scarborough,"Guildwood, Morningside, West Hill,Guildwood, M..."
3,M1G,Scarborough,"Woburn,Woburn,Woburn"
4,M1H,Scarborough,"Cedarbrae,Cedarbrae,Cedarbrae"
5,M1J,Scarborough,"Scarborough Village,Scarborough Village,Scarbo..."
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park,Ke..."
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge,Golden Mile, C..."
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village Wes..."
9,M1N,Scarborough,"Birch Cliff, Cliffside West,Birch Cliff, Cliff..."


# testing whether the dataframe is correct or not

In [51]:
column_names = ["PostalCode", "Borough", "Neighborhood"]
test_df = pd.DataFrame(columns=column_names)

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

for postcode in test_list:
    test_df = test_df.append(df_grouped[df_grouped["PostalCode"]==postcode], ignore_index=True)
    
test_df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M5G,Downtown Toronto,"Central Bay Street,Central Bay Street,Central ..."
1,M2H,North York,"Hillcrest Village,Hillcrest Village,Hillcrest ..."
2,M4B,East York,"Parkview Hill, Woodbine Gardens,Parkview Hill,..."
3,M1J,Scarborough,"Scarborough Village,Scarborough Village,Scarbo..."
4,M4G,East York,"Leaside,Leaside,Leaside"
5,M4M,East Toronto,"Studio District,Studio District,Studio District"
6,M1R,Scarborough,"Wexford, Maryvale,Wexford, Maryvale,Wexford, M..."
7,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."
8,M9L,North York,"Humber Summit,Humber Summit,Humber Summit"
9,M5V,Downtown Toronto,"CN Tower, King and Spadina, Railway Lands, Har..."


# after scraping the data and cleaning the data , the length of data

In [53]:
df_grouped.shape

(103, 3)

# part 2 
### laoding geospatial_coordinates csv

In [119]:

import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
client_27c32e3df09740d3a607f4f8e98d5f5b = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='PteNW_djTdtxPXliBgv-kCli5YecECdz8TxpFGy_sfCf',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3.ap-geo.objectstorage.service.networklayer.com')

body = client_27c32e3df09740d3a607f4f8e98d5f5b.get_object(Bucket='capstone-donotdelete-pr-oiulrdxbqmes1c',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

coordinates = pd.read_csv(body)
coordinates.head()
coordinates.rename(columns={'Postal Code':'PostalCode'},inplace = True)

In [120]:
df_new = df_grouped.merge(coordinates, on="PostalCode", how="left")
df_new.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge,Malvern, Rouge,Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek,Rouge H...",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill,Guildwood, M...",43.763573,-79.188711
3,M1G,Scarborough,"Woburn,Woburn,Woburn",43.770992,-79.216917
4,M1H,Scarborough,"Cedarbrae,Cedarbrae,Cedarbrae",43.773136,-79.239476


In [121]:

column_names = ["PostalCode", "Borough", "Neighborhood"]
test_df = pd.DataFrame(columns=column_names)

test_list = ["M5G", "M2H", "M4B", "M1J", "M4G", "M4M", "M1R", "M9V", "M9L", "M5V", "M1B", "M5A"]

for postcode in test_list:
    test_df = test_df.append(df_new[df_grouped["PostalCode"]==postcode], ignore_index=True)
    
test_df

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,Borough,Latitude,Longitude,Neighborhood,PostalCode
0,Downtown Toronto,43.657952,-79.387383,"Central Bay Street,Central Bay Street,Central ...",M5G
1,North York,43.803762,-79.363452,"Hillcrest Village,Hillcrest Village,Hillcrest ...",M2H
2,East York,43.706397,-79.309937,"Parkview Hill, Woodbine Gardens,Parkview Hill,...",M4B
3,Scarborough,43.744734,-79.239476,"Scarborough Village,Scarborough Village,Scarbo...",M1J
4,East York,43.70906,-79.363452,"Leaside,Leaside,Leaside",M4G
5,East Toronto,43.659526,-79.340923,"Studio District,Studio District,Studio District",M4M
6,Scarborough,43.750072,-79.295849,"Wexford, Maryvale,Wexford, Maryvale,Wexford, M...",M1R
7,Etobicoke,43.739416,-79.588437,"South Steeles, Silverstone, Humbergate, Jamest...",M9V
8,North York,43.756303,-79.565963,"Humber Summit,Humber Summit,Humber Summit",M9L
9,Downtown Toronto,43.628947,-79.39442,"CN Tower, King and Spadina, Railway Lands, Har...",M5V
