# Peer-graded Assignment: Segmenting and Clustering Neighborhoods in Toronto
Explore, segment, and cluster the neighborhoods in the city of Toronto based on postal code and borough information.

In [145]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
print('Ready')

Ready


## Part 1: Scraping the web page, creating and cleaning the Data Frame

### Scrape the web page

In [146]:
# Download the Wikipedia page listing the Canadian postal codes that start with "M".
# The timeout keeps the cell from hanging indefinitely on an unresponsive server.
html_doc = requests.get(
    'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M',
    timeout=30,
)
try:
    html_doc.raise_for_status()
except requests.exceptions.HTTPError as exc:
    print(f'There was a problem: {exc}')
    # Stop here: continuing would parse an error page instead of the postal-code table.
    raise

print('Ready')

Ready


Parse the scraped web page using BeautifulSoup

In [147]:
# Build the parse tree from the downloaded page text with Python's built-in HTML parser.
page_markup = html_doc.text
soup = BeautifulSoup(page_markup, features='html.parser')

### Create DataFrame
Select the table rows that hold the Postal Code, Borough and Neighbourhood values. Iterate through each row, collect the text of its cells into a new list, and append that list to the list of rows.

In [148]:
# All <tr> elements that sit directly inside the <tbody> of a "wikitable".
rows = soup.select('.wikitable > tbody > tr')

# One list of cell texts per table row; rows without any <th>/<td> cell are skipped.
df_rows = []

for table_row in rows:
    cells = table_row.find_all(['th', 'td'])
    # Remove newline characters from the cell text so the values are clean strings.
    df_row = [cell.text.replace('\n', '') for cell in cells]
    if df_row:
        df_rows.append(df_row)

Create data frame using ```df_rows``` list.

In [149]:
# The first scraped row supplies the column names; every following row is a record.
header = df_rows[0]
records = df_rows[1:]
df = pd.DataFrame(data=records, columns=header)
df.head(n=15)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


### Data Wrangling

Remove rows if the ```df['Borough']``` is 'Not assigned' and reset index

In [150]:
# Keep only the rows whose borough is assigned. Filtering builds a new frame
# instead of mutating the existing one in place, and the index is renumbered from 0.
has_borough = df['Borough'] != 'Not assigned'
df = df.loc[has_borough].reset_index(drop=True)
df.iloc[0:10]

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


If ```df['Neighbourhood']``` has the value 'Not assigned', replace it with the borough's name.

In [151]:
# Wherever the neighbourhood is 'Not assigned', fall back to the borough name of the same row.
neighbourhood_missing = df['Neighbourhood'] == 'Not assigned'
df['Neighbourhood'] = df['Neighbourhood'].mask(neighbourhood_missing, df['Borough'])

In [152]:
df.shape

(103, 3)

## Part 2: Finding the latitude and the longitude coordinates using the postal code 

### Get coordinates using geocoder
**Note:** This method took too long to run and was interrupted before it finished processing.

In [191]:
#!pip install geocoder

In [None]:
#import geocoder

#df2=df.copy()
#df2.head()
#df2['Latitude'] = float('nan')
#df2['Longitude'] = float('nan')
#df2.head()

#for i in df2.index:
#    lat_lng_coords = None
#    while(lat_lng_coords is None):
#        temp = df2.iloc[i]['Postal Code']
#        g = geocoder.google(f'{temp}, Toronto, Ontario')
#        lat_lng_coords = g.latlng
#
#    df2.loc[i, ['Latitude']] = lat_lng_coords[0]
#    df2.loc[i, ['Longitude']] = lat_lng_coords[1]

#print('Ready')

### Get coordinates using downloaded .csv document

Create data frame from Geospatial_Coordinates.csv file

In [235]:
import os
import types
from botocore.client import Config
import ibm_boto3

def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage.
# The API key is read from an environment variable so that the secret is never
# stored in the notebook itself. Any key that was previously saved in a shared
# copy of this notebook should be revoked and replaced.
COS_API_KEY_ENV = 'IBM_COS_API_KEY_ID'
ibm_api_key_id = os.environ.get(COS_API_KEY_ENV)
if not ibm_api_key_id:
    raise RuntimeError(
        f'Set the {COS_API_KEY_ENV} environment variable to your IBM Cloud Object Storage API key.'
    )

client_c2efb76b4b6b42258510492597901f5f = ibm_boto3.client(service_name='s3',
    ibm_api_key_id=ibm_api_key_id,
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

# Streaming body of the CSV object stored in the project bucket.
body = client_c2efb76b4b6b42258510492597901f5f.get_object(Bucket='datasciencecourseracapstone-donotdelete-pr-woppyhzlezhae6',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

coordinates = pd.read_csv(body)
coordinates.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [236]:
coordinates.shape

(103, 3)

In [237]:
# Work on an independent deep copy so the cleaned frame stays untouched.
df2 = df.copy(deep=True)
df2.head(5)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


Merge data frames on Postal Code

In [243]:
# Inner join: attach latitude/longitude to every row whose postal code appears in both frames.
df3 = df2.merge(coordinates, how='inner', on='Postal Code')
df3.head(n=15)

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [239]:
df3.shape

(103, 5)

Verify that the data frames are merged correctly. If the ```assert``` raises an error, the data frame merge is not correct.

In [244]:
# Postal codes present in both inputs; an inner join keeps exactly these keys,
# so this first check only guards against an unexpected join type.
common_postal_code = set(df2['Postal Code']) & set(coordinates['Postal Code'])
assert set(df3['Postal Code']) == common_postal_code

# Checks that can actually fail: no neighbourhood row may lose its coordinates,
# and the join must not duplicate rows through repeated postal codes.
missing_postal_codes = set(df2['Postal Code']) - set(df3['Postal Code'])
assert not missing_postal_codes, f'No coordinates found for: {sorted(missing_postal_codes)}'
assert len(df3) == len(df2), f'Expected {len(df2)} rows after the merge, got {len(df3)}'

Check if there are missing values in Latitude and Longitude columns.

In [245]:
# Rows without a latitude value; an empty result means nothing is missing.
df3.loc[df3['Latitude'].isna()]

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude


In [246]:
# Rows without a longitude value; an empty result means nothing is missing.
df3.loc[df3['Longitude'].isna()]

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude


## Part 3: Explore and cluster the neighborhoods in Toronto