# IBM Applied Data Science Capstone project

In [55]:
import pandas as pd
import numpy as np

In [23]:
import requests
import lxml
from bs4 import BeautifulSoup

In [218]:
from geopy.geocoders import Nominatim 

## Table of Contents

<div class="alert alert-block alert-info" style="margin-top: 20px">

<font size = 3>

1. <a href="#item1">Download and Explore Dataset</a>

2. <a href="#item2">Explore Neighborhoods in New York City</a>

3. <a href="#item3">Analyze Each Neighborhood</a>

4. <a href="#item4">Cluster Neighborhoods</a>

5. <a href="#item5">Examine Clusters</a>    
</font>
</div>

# Part1. Create DataFrame with Postal Codes

### Get data into data frame

In [13]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
page = requests.get(url)
page_soup = BeautifulSoup(page.content, 'html.parser')     

In [17]:
table_raw = page_soup.table

In [45]:
tbl_header = []
for l in table_raw.find_all('th'):
    tbl_header.append(l.string.strip('\n'))

In [46]:
tbl_header

['Postal code', 'Borough', 'Neighborhood']

In [50]:
tbl_content = []
for l in table_raw.find_all('td'):
    tbl_content.append(l.string.strip('\n'))

In [52]:
len(tbl_content)

540

In [60]:
n_cols = len(tbl_header)
tbl_content_split = [tbl_content[x:x+n_cols] for x in range(0, len(tbl_content), n_cols)]

In [64]:
toronto_postal_codes_raw = pd.DataFrame(tbl_content_split, columns=tbl_header)

### Cleanup data

In [194]:
new_df = toronto_postal_codes_raw[toronto_postal_codes_raw['Borough'] != 'Not assigned']

In [195]:
# There are no duplicates unlike have been told in the exercise
new_df[new_df.duplicated('Postal code')]

Unnamed: 0,Postal code,Borough,Neighborhood


In [196]:
# There are no Neighborhood with 'Not assigned' or empty 
new_df[(new_df['Neighborhood'] == 'Not assigned') | (new_df['Neighborhood'] == '')]

Unnamed: 0,Postal code,Borough,Neighborhood


In [197]:
# Replace / in Neighborhood with ,
new_df['Neighborhood'].replace(' /', ',', regex=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [198]:
toronto_postal_codes = new_df.reset_index(drop=True)

In [199]:
toronto_postal_codes.rename(columns={'Postal code': 'Postal Code'}, inplace=True)

In [200]:
toronto_postal_codes.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [201]:
toronto_postal_codes.tail()

Unnamed: 0,Postal Code,Borough,Neighborhood
98,M8X,Etobicoke,"The Kingsway, Montgomery Road , Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,Business reply mail Processing CentrE
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."
102,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [202]:
toronto_postal_codes.shape

(103, 3)

# Part2. Add Latitude and Longitude to Postal Codes

In [164]:
import geocoder

In [181]:
geocoder.google('{}, Toronto, Ontario'.format('M8X'))

<[REQUEST_DENIED] Google - Geocode [empty]>

In [179]:
print(g.latlng)

None


In [177]:
def get_location(postal_code):

    # initialize your variable to None
    lat_lng_coords = None

    # loop until you get the coordinates
    while(lat_lng_coords is None):
      g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
      lat_lng_coords = g.latlng

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    return latitude, longitude

Ok, lets simply add csv with coordinates from Coursera

In [182]:
url = 'https://cocl.us/Geospatial_data'

In [183]:
df_lat_lon = pd.read_csv(url)

In [186]:
df_lat_lon.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [187]:
df_lat_lon.shape

(103, 3)

In [203]:
toronto_postal_codes_w_coords = pd.merge(
    toronto_postal_codes,
    df_lat_lon,
    on='Postal Code')

In [204]:
toronto_postal_codes_w_coords.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Part3. Explore neighborhoods

In [206]:
import folium

### Create map

In [209]:
latitude, longitude = 43.753259, -79.329656
# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(
    toronto_postal_codes_w_coords['Latitude'],
    toronto_postal_codes_w_coords['Longitude'], 
    toronto_postal_codes_w_coords['Borough'], 
    toronto_postal_codes_w_coords['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

As a virtual guy is planning to move to East York, lets explore and cluster East York area

In [214]:
east_york = toronto_postal_codes_w_coords[toronto_postal_codes_w_coords['Borough'] == 'East York']
east_york.reset_index(inplace=True, drop=True)

In [217]:
east_york.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
1,M4C,East York,Woodbine Heights,43.695344,-79.318389
2,M4G,East York,Leaside,43.70906,-79.363452
3,M4H,East York,Thorncliffe Park,43.705369,-79.349372
4,M4J,East York,East Toronto,43.685347,-79.338106


In [220]:
# Lets get Borough coordinates
address = 'East York, Toronto'

geolocator = Nominatim(user_agent="explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of {} are {}, {}.'.format(address, latitude, longitude))

The geograpical coordinate of East York, Toronto are 43.699971000000005, -79.33251996261595.


In [222]:
# We visualize it
# create map of Manhattan using latitude and longitude values
map_borough = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, label in zip(
    east_york['Latitude'], 
    east_york['Longitude'], 
    east_york['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_borough)  
    
map_borough

# Part3. Clustering