In [None]:
!pip install geopy
!pip install folium

## Coursera's Applied Data Science Capstone
1. Web scraping the Toronto postal-code data from Wikipedia:


https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M 

#### We are going to use the Beautiful Soup package for our web scraping because it makes it easy to parse the page's HTML


In [325]:
# Import all the libraries used in this notebook.
import matplotlib.colors as colors
from bs4 import BeautifulSoup
import numpy as np
import urllib.request
import pandas as pd 
from geopy.geocoders import Nominatim # geocoder used below to convert an address into latitude and longitude values
import folium
from sklearn.cluster import KMeans 

In [326]:
# Download the Wikipedia page that lists the "M" postal codes and parse it.
import urllib.request
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# Open the page inside a `with` block so the HTTP response is closed as soon as
# the document has been parsed.
with urllib.request.urlopen(url) as page:
    # Parse the HTML with BeautifulSoup, using the lxml parser.
    file = BeautifulSoup(page, "lxml")
# To inspect the HTML of the web page, evaluate file.prettify() in a scratch cell.


#### In the HTML scraped from the Wikipedia page, the data we are interested in is in
####  `<table class="wikitable sortable">`

In [327]:
# Locate the table that carries the classes "wikitable sortable".
Toranto_Table = file.find('table', class_='wikitable sortable')
if Toranto_Table is None:
    # Fail with a clear message instead of an AttributeError on None below.
    raise ValueError("No <table class='wikitable sortable'> found in the scraped page")

# One list per output column; `l` holds a placeholder value (9) per row that is
# stored in the 'labels' column when the data frame is built.
postal_code, bourough, neighborhood, l = [], [], [], []
for row in Toranto_Table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly three <td> cells are kept; every other row is skipped.
    if len(cells) != 3:
        continue
    # get_text() joins every text node in the cell, so a cell that mixes plain
    # text and links is read in full, and an empty cell yields "" instead of None.
    p, b, n = (cell.get_text().strip() for cell in cells)
    # If the neighborhood is not assigned, fall back to the borough name.
    if n == "Not assigned":
        n = b
    l.append(9)
    postal_code.append(p)
    bourough.append(b)
    neighborhood.append(n)
        
## 


#### Now that we have sorted through our HTML file, let's create the pandas data frame, as it will be easiest to work with our data there



In [328]:
# Assemble the scraped lists into a single data frame, one column per list.
toranto_df = pd.DataFrame({
    'Postal Code': postal_code,
    'bourough': bourough,
    'neighborhood': neighborhood,
    'labels': l,
})
print("tada! Data frame is made!")

tada! Data frame is made!


### Now, with our data frame built, we need to remove the rows whose borough is "Not assigned"

In [329]:
# Keep only the rows whose borough is assigned and renumber the index from zero.
assigned_mask = toranto_df['bourough'].ne('Not assigned')
clean_df = toranto_df.loc[assigned_mask].reset_index(drop=True)

# Report how many rows were removed and the size of what is left.
shape = clean_df.shape
number_of_dropped_rows = toranto_df.shape[0] - shape[0]
print("Number of dropped rows:   " , number_of_dropped_rows)
print( "Shape of cleaned data frame:  ", shape)


Number of dropped rows:    77
Shape of cleaned data frame:   (103, 4)


### Now, with the data provided in "Geospatial_Coordinates.csv", we are going to make a data frame of those coordinates
2. Now we merge the two data frames together 


In [330]:
# Read the coordinates file into its own data frame.
document = "Geospatial_Coordinates.csv"
geo_data = pd.read_csv(document)

# Attach the coordinates to the cleaned table, matching rows on the postal code
# (inner join: only postal codes present in both frames are kept).
final_df = clean_df.merge(geo_data, how='inner', on='Postal Code')

# Number of distinct boroughs in the merged frame (displayed as the cell output).
final_df['bourough'].nunique()




10

## Now let's make the map to visualize the data clusters 

In [337]:
address = 'Toronto, Ca'

# Use the geopy Nominatim geocoder to get the latitude and longitude of the city.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
# geocode() yields None when nothing matches; stop with a clear message instead
# of an AttributeError on the attribute reads below.
if location is None:
    raise ValueError("Could not geocode address: {}".format(address))
lat = location.latitude
long = location.longitude
print('The geograpical coordinate of Toronto Canada are {}, {}.'.format(lat, long))

# Make the map, centred on the geocoded city.
toronto_map = folium.Map(location=[lat, long], zoom_start=11)

# Palette of hex colours for the boroughs; only c[3] (yellow) is used as the
# marker fill in the loop below.
c = [colors.rgb2hex(colour_name) for colour_name in (
    'red', 'blue', 'green', 'yellow', 'brown',
    'tomato', 'brown', 'green', 'yellow', 'brown')]

# Add one circle marker per row, with the borough name as popup. The loop
# variables are deliberately not called lat/long, so the city-centre coordinates
# stored above are not overwritten by the last row's coordinates.
for row_lat, row_long, lab in zip(final_df['Latitude'], final_df['Longitude'], final_df['bourough']):
    folium.CircleMarker(
        [row_lat, row_long],
        radius=4,
        popup=lab,
        color='grey',
        fill=True,
        fill_color=c[3],
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)

    
#toronto_map

The geograpical coordinate of Toronto Canada are 43.6534817, -79.3839347.


## Using k-means clustering with two clusters, we see the data splits half and half


In [332]:
# Cast the coordinate columns to float and the name columns to str in one call.
final_df = final_df.astype({
    'Latitude': float,
    'Longitude': float,
    'bourough': str,
    'neighborhood': str,
})

# The two coordinate columns, which are the input passed to k-means.
lat_long = final_df.loc[:, ['Latitude', 'Longitude']]

                                                     


In [333]:
num_clusters = 2

# Cluster the rows by their coordinates. The fixed random_state makes the
# centroid initialisation repeatable, so the 0/1 labels (and therefore the
# marker colours that index on them) are the same on every run.
k_means = KMeans(init="k-means++", n_clusters=num_clusters, n_init=12, random_state=0)
k_means.fit(lat_long)
a = k_means.labels_
# Store the cluster index of every row in the 'labels' column.
final_df['labels'] = a




In [334]:
# Centre the map on the geocoded city. The centre is read from `location`
# rather than from `lat`/`long`, because loops over the rows can rebind those
# two names to the coordinates of the last row.
toronto_map = folium.Map(location=[location.latitude, location.longitude], zoom_start=11)
# Fill colours indexed by cluster label: label 0 -> red, label 1 -> blue.
c = [colors.rgb2hex('red'), colors.rgb2hex('blue')]
# One circle marker per row; loop names are kept distinct from lat/long so the
# map centre is never overwritten.
for row_lat, row_long, lab, cluster in zip(final_df['Latitude'], final_df['Longitude'], final_df['bourough'], final_df['labels']):
    folium.CircleMarker(
        [row_lat, row_long],
        radius=4,
        popup=lab,
        color='grey',
        fill=True,
        fill_color=c[cluster],
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)


In [335]:
# Bare last expression: the notebook renders the folium map as the cell output.
toronto_map

## Remake the map: boroughs with the string `Toronto` in the name are black, the other boroughs are red

In [322]:
import re
# Flag every row whose borough name contains "Toronto" (case-insensitive).
# na=False makes a missing name count as "no match" rather than NaN.
bourough_list = final_df['bourough'].str.contains('Toronto', flags=re.IGNORECASE, na=False)
# Centre the map on the geocoded city. The centre is read from `location`
# rather than from `lat`/`long`, because loops over the rows can rebind those
# two names to the coordinates of the last row.
toronto_map = folium.Map(location=[location.latitude, location.longitude], zoom_start=11)
# Index 0 (red) is for the other boroughs, index 1 (black) for "Toronto" boroughs.
c = [colors.rgb2hex('red'), colors.rgb2hex('black')]
for row_lat, row_long, lab, is_toronto in zip(final_df['Latitude'], final_df['Longitude'], final_df['bourough'], bourough_list):
    folium.CircleMarker(
        [row_lat, row_long],
        radius=4,
        popup=lab,
        color='grey',
        fill=True,
        # The boolean flag doubles as the palette index (False -> 0, True -> 1).
        fill_color=c[int(is_toronto)],
        fill_opacity=0.7,
        parse_html=False).add_to(toronto_map)
toronto_map