# This notebook is used for the Coursera Data Science Capstone Project
## Author: Priya Kulandaivelu
### Segmenting and Clustering Neighborhoods in Toronto
### Creating Toronto Neighborhood Dataset 

In [64]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [65]:
# Wikipedia page listing the Canadian postal codes that begin with "M"; this is the page scraped below.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

In [66]:
# Download the page HTML. The timeout keeps the cell from hanging forever on a stalled
# connection, and raise_for_status() surfaces an HTTP error here instead of letting an
# error page be handed to the parser further down.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text
# Accumulates the records extracted from the page, one dict per kept table cell.
table_data = []

In [67]:
# Parse the downloaded HTML with the html5lib parser and keep the first <table> element found.
bsoup = BeautifulSoup(data, features='html5lib')
table = bsoup.find(name='table')

In [68]:
# Each <td> is expected to hold a postal code followed by a <span> whose text is
# formatted as <Borough>(<Neighborhood> / <Neighborhood> ...).
for row in table.findAll('td'):
    span_text = row.span.text

    # Ignore cells that carry no borough/neighborhood info.
    if span_text == 'Not assigned':
        continue

    cell = {
        # The cell text starts with a newline, so the 3-character postal code is at [1:4].
        'Postal Code': row.text[1:4],
        # The borough is everything in the span before the first '('.
        'Borough': span_text.split('(')[0],
    }

    if '(' in span_text:
        # Neighborhood clean-up, applied step by step:
        #   1. take the text after the first '(' and trim ')' from both ends
        #   2. turn ' /' separators into commas
        #   3. blank out any ')' left inside the text
        #   4. trim surrounding spaces
        neighborhoods = span_text.split('(')[1]
        neighborhoods = neighborhoods.strip(')')
        neighborhoods = neighborhoods.replace(' /', ',')
        neighborhoods = neighborhoods.replace(')', ' ')
        cell['Neighborhood'] = neighborhoods.strip(' ')

    # Rows without parentheses are still kept; they simply have no 'Neighborhood' key.
    table_data.append(cell)

#print(table_data)

In [69]:
# Build the data frame from the scraped records.
df = pd.DataFrame(table_data)

# A few scraped borough values have the borough name fused with trailing text;
# map each of those exact values to the label used in the rest of the notebook.
borough_fixes = {
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown TorontoStn A',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto Business',
    'EtobicokeNorthwest': 'Etobicoke Northwest',
    'East YorkEast Toronto': 'East York/East Toronto',
    'MississaugaCanada Post Gateway Processing Centre': 'Mississauga',
}
df['Borough'] = df['Borough'].replace(borough_fixes)

In [70]:
# Sanity check: dimensions of the cleaned table as (rows, columns).
df.shape

(103, 3)

In [71]:
# Preview the first rows to confirm the postal code, borough and neighborhood columns look right.
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


In [75]:
#Add latitude and longitude to the dataset
#load csv file with coordinates into a df
import os, types
import pandas as pd
from botocore.client import Config
import ibm_boto3

def __iter__(self):
    """Stand-in ``__iter__`` bound onto the storage response body when it lacks one.

    It only exists so that pandas accepts the body as a file-like object;
    it simply returns 0.
    """
    return 0

# @hidden_cell
# The following code accesses a file in IBM Cloud Object Storage.
# The API key is read from the IBM_COS_API_KEY environment variable so that no
# credential is stored in (or shared with) the notebook.
# NOTE(review): a key was previously hardcoded in this cell in plain text; treat it
# as compromised and rotate it.
ibm_cos_api_key = os.environ.get('IBM_COS_API_KEY')
if not ibm_cos_api_key:
    raise RuntimeError(
        "Set the IBM_COS_API_KEY environment variable to an IBM Cloud API key "
        "with read access to the bucket before running this cell."
    )

# Use the public endpoint when running outside IBM Cloud, the private one otherwise.
if os.environ.get('RUNTIME_ENV_LOCATION_TYPE') == 'external':
    endpoint_290bf376803544bda5c2ef450923d33a = 'https://s3.us.cloud-object-storage.appdomain.cloud'
else:
    endpoint_290bf376803544bda5c2ef450923d33a = 'https://s3.private.us.cloud-object-storage.appdomain.cloud'

client_290bf376803544bda5c2ef450923d33a = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=ibm_cos_api_key,
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url=endpoint_290bf376803544bda5c2ef450923d33a)

body = client_290bf376803544bda5c2ef450923d33a.get_object(Bucket='datasciencecapstone-donotdelete-pr-g6d1mr8teorwp2',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# Load the postal code -> latitude/longitude lookup table and preview it.
latlon_df = pd.read_csv(body)
latlon_df.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [76]:
#Join the two data frames on postal code and store the result back in df, so that df
#itself carries the latitude and longitude values (DataFrame.join returns a new frame;
#it does not modify df in place).
#Latitude/Longitude columns left over from an earlier run of this cell are dropped first,
#so re-running the cell does not fail on overlapping column names.
df = df.drop(columns=['Latitude', 'Longitude'], errors='ignore').join(latlon_df.set_index('Postal Code'), on='Postal Code')
df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Queen's Park,Ontario Provincial Government,43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto Business,Enclave of M4L,43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509
