### This notebook will be mainly used for the capstone project.

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [11]:
#!pip install Scrapy

#### Web Scraping using Beautiful Soup

In [12]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [13]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html = urlopen(url)

In [14]:
soup = BeautifulSoup(html, 'lxml')
type(soup)

bs4.BeautifulSoup

In [17]:
# Print the first 2 rows for sanity check
rows = soup.find_all('tr')
print(rows[:2])

[<tr>
<th>Postal code
</th>
<th>Borough
</th>
<th>Neighborhood
</th></tr>, <tr>
<td>M1A
</td>
<td>Not assigned
</td>
<td>
</td></tr>]


In [19]:
for row in rows:
    row_td = row.find_all('td')
#print(row_td)
type(row_td)

bs4.element.ResultSet

In [21]:
str_cells = str(row_td)
cleantext = BeautifulSoup(str_cells, "lxml").get_text()
#print(cleantext)

In [23]:
import re

list_rows = []
for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    clean = re.compile('<.*?>')
    clean2 = (re.sub(clean, '',str_cells))
    list_rows.append(clean2)
#print(clean2)
type(clean2)

str

In [25]:
df = pd.DataFrame(list_rows)
df.head(2)

Unnamed: 0,0
0,[]
1,"[M1A\n, Not assigned\n, \n]"


#### Split the data column with comma

In [38]:
df1 = df[0].str.split(',', expand=True)
df1.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,[],,,,,,,,,,...,,,,,,,,,,
1,[M1A\n,Not assigned\n,\n],,,,,,,,...,,,,,,,,,,
2,[M2A\n,Not assigned\n,\n],,,,,,,,...,,,,,,,,,,
3,[M3A\n,North York\n,Parkwoods\n],,,,,,,,...,,,,,,,,,,
4,[M4A\n,North York\n,Victoria Village\n],,,,,,,,...,,,,,,,,,,
5,[M5A\n,Downtown Toronto\n,Regent Park / Harbourfront\n],,,,,,,,...,,,,,,,,,,
6,[M6A\n,North York\n,Lawrence Manor / Lawrence Heights\n],,,,,,,,...,,,,,,,,,,
7,[M7A\n,Downtown Toronto\n,Queen's Park / Ontario Provincial Government\n],,,,,,,,...,,,,,,,,,,
8,[M8A\n,Not assigned\n,\n],,,,,,,,...,,,,,,,,,,
9,[M9A\n,Etobicoke\n,Islington Avenue\n],,,,,,,,...,,,,,,,,,,


#### Cleaning values for first 3 coumns

In [35]:
# Remove '[',']' on Column 0 and Column 1
df1[0] = df1[0].str.strip('[')
df1[2] = df1[2].str.strip(']')

# Remove \n on first 3 columns
for i in range(3):
    df1[i] = df1[i].str.strip('\n')

df1.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,30
0,,,,,,,,,,,...,,,,,,,,,,
1,M1A,Not assigned,,,,,,,,,...,,,,,,,,,,
2,M2A,Not assigned,,,,,,,,,...,,,,,,,,,,
3,M3A,North York,Parkwoods,,,,,,,,...,,,,,,,,,,
4,M4A,North York,Victoria Village,,,,,,,,...,,,,,,,,,,
5,M5A,Downtown Toronto,Regent Park / Harbourfront,,,,,,,,...,,,,,,,,,,
6,M6A,North York,Lawrence Manor / Lawrence Heights,,,,,,,,...,,,,,,,,,,
7,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government,,,,,,,,...,,,,,,,,,,
8,M8A,Not assigned,,,,,,,,,...,,,,,,,,,,
9,M9A,Etobicoke,Islington Avenue,,,,,,,,...,,,,,,,,,,


#### Drop other columns except the first 3 columns

In [37]:
# Keep first three columns
df2 = df1.iloc[:, : 3]
df2.head(5)

Unnamed: 0,0,1,2
0,,,
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village


#### Get Table Headers

In [29]:
# Get table headers
col_labels = soup.find_all('th')
all_header = []
col_str = str(col_labels)
cleantext2 = BeautifulSoup(col_str, "lxml").get_text()
all_header.append(cleantext2)
print(all_header)

['[Postal code\n, Borough\n, Neighborhood\n, Canadian postal codes\n]']


In [105]:
# Replace headers
columns = {0:'Postal code', 1:'Borough', 2:'Neighborhood'}
df3 = df2.rename(columns=columns)
df3.head(11)

Unnamed: 0,Postal code,Borough,Neighborhood
0,,,
1,M1A,Not assigned,
2,M2A,Not assigned,
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park / Harbourfront
6,M6A,North York,Lawrence Manor / Lawrence Heights
7,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
8,M8A,Not assigned,
9,M9A,Etobicoke,Islington Avenue


#### Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.

In [106]:
#df3['Borough']

In [127]:
# Remove rows contain 'Not assigned','NS','NL','B', and 'None' in Borough column
df4 = df3[~df3['Borough'].str.contains("Not assigned|NS|NL|B", na=True)]

# Remove rows contain 'None' in Neighborhood column
df5 = df4[~df4['Neighborhood'].str.contains("N/A", na=True)]
df5

Unnamed: 0,Postal code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Regent Park / Harbourfront
6,M6A,North York,Lawrence Manor / Lawrence Heights
7,M7A,Downtown Toronto,Queen's Park / Ontario Provincial Government
9,M9A,Etobicoke,Islington Avenue
10,M1B,Scarborough,Malvern / Rouge
12,M3B,North York,Don Mills
13,M4B,East York,Parkview Hill / Woodbine Gardens
14,M5B,Downtown Toronto,Garden District


In [130]:
# Replace / with , in Neighborhood column
df5['Neighborhood'] = df5['Neighborhood'].str.replace(' /',',')
df5.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,Postal code,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"
6,M6A,North York,"Lawrence Manor, Lawrence Heights"
7,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


In [131]:
df6 = df5[df5['Postal code'].str.contains("M5A", na=True)]
df6

Unnamed: 0,Postal code,Borough,Neighborhood
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe.

In [133]:
df5.shape

(103, 3)

Now that you have built a dataframe of the postal code of each neighborhood along with the borough name and neighborhood name, in order to utilize the Foursquare location data, we need to get the latitude and the longitude coordinates of each neighborhood.

In an older version of this course, we were leveraging the Google Maps Geocoding API to get the latitude and the longitude coordinates of each neighborhood. However, recently Google started charging for their API: http://geoawesomeness.com/developers-up-in-arms-over-google-maps-api-insane-price-hike/, so we will use the Geocoder Python package instead: https://geocoder.readthedocs.io/index.html.

The problem with this Package is you have to be persistent sometimes in order to get the geographical coordinates of a given postal code. So you can make a call to get the latitude and longitude coordinates of a given postal code and the result would be None, and then make the call again and you would get the coordinates. So, in order to make sure that you get the coordinates for all of our neighborhoods, you can run a while loop for each postal code. Taking postal code M5G as an example, your code would look something like this:

In [137]:
#!pip install geocoder

In [141]:
df5['Postal code'].head(5)

3    M3A
4    M4A
5    M5A
6    M6A
7    M7A
Name: Postal code, dtype: object

#### Use geocoder to get latitude and longitude for each 'Postal code' in dataframe

import geocoder # import geocoder

##### initialize your variable to None
lat_lng_coords = None

##### loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format(df5['Postal code']))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]

#### Create two new dataframes for latitude and longitude

df_lat = pd.DataFrame(latitude) 
df_lon = pd.DataFrame(longitude) 
df_lat.head()

### Use the csv file to create the dataframe

Given that this package can be very unreliable, in case you are not able to get the geographical coordinates of the neighborhoods using the Geocoder package, here is a link to a csv file that has the geographical coordinates of each postal code: http://cocl.us/Geospatial_data

In [145]:
df_geocoder = pd.read_csv("Geospatial_Coordinates.csv")
df_geocoder.head()

Unnamed: 0,Postal code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Merge Geocode columns with the dataframe

In [148]:
df_geocode = pd.merge(df5, df_geocoder, on='Postal code', how='outer')
df_geocode.head(12)

Unnamed: 0,Postal code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,Islington Avenue,43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,Garden District,43.657162,-79.378937


#### Get Geocode of Toronto

In [154]:
#!pip install geopy

In [156]:
from geopy.geocoders import Nominatim 

city = 'Toronto, ON, Canada'
geolocator = Nominatim(user_agent="to_explorer")
location = geolocator.geocode(city)
latitude = location.latitude
longitude = location.longitude
print('Geograpical coordinate of Toronto, ON, Canada are {}, {}.'.format(latitude, longitude))

Geograpical coordinate of Toronto, ON, Canada are 43.6534817, -79.3839347.


Explore and cluster the neighborhoods in Toronto. You can decide to work with only boroughs that contain the word Toronto and then replicate the same analysis we did to the New York City data. It is up to you.

Just make sure:

1. to add enough Markdown cells to explain what you decided to do and to report any observations you make.
2. to generate maps to visualize your neighborhoods and how they cluster together.

#### Create map of Toronto using latitude and longitude values

1. Use folium library to render map
2. Use CircleMarker with blue color to represent the neighborhood of boroughs from the dataframe

In [162]:
import folium 

map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add markers to map
for lat, lng, borough, neighborhood in zip(df_geocode['Latitude'], df_geocode['Longitude'], 
    df_geocode['Borough'], df_geocode['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker([lat, lng],radius=5,popup=label,color='blue',fill=True,fill_color='#3186cc',fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto