# Sara Toronto Neighborhood DS Project

## Part 1

In [1]:
# importing necessary libraries
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import pandas as pd
import numpy as np
import folium
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

# Download the Wikipedia page listing the "M" postal codes and parse it with lxml.
wiki_url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
with urlopen(wiki_url) as response:
    soup = BeautifulSoup(response, 'lxml')

In [2]:
# Parallel lists filled by the scraper: exactly one entry per table cell,
# appended in the same order, so they can later become dataframe columns.
postal = []
borough = []
neighborhood = []

# Walk every cell of the first table on the page. Each cell is expected to
# carry the postal code in <p><b>...</b></p> and the area text in a <span>.
for row in soup.table.tbody.find_all('tr'):
    for cell in row.find_all('td'):
        code_tag = cell.p.b if cell.p is not None else None
        # Only the first span is read, so each cell contributes one borough and
        # one neighborhood and the three lists always stay the same length.
        area_tag = cell.find('span')
        if code_tag is None or area_tag is None:
            # Cell lacks the expected markup: skip it entirely rather than
            # appending to some lists and not others.
            continue

        area_text = area_tag.text
        postal.append(code_tag.text)

        if area_text == 'Not assigned':
            borough.append(area_text)
            neighborhood.append('')
        else:
            # The text appears to be "Borough(Neighborhood A / Neighborhood B)".
            pieces = area_text.split('(')
            borough.append(pieces[0])
            # Last piece; when the text has no '(' this is the whole text.
            # Strip the closing parenthesis and turn ' /' separators into commas.
            neighborhood.append(pieces[-1].replace(')', '').replace(' /', ','))

In [3]:
# Assemble the scraped lists into a single dataframe, one column per list.
df = pd.DataFrame({
    'Postal Code': postal,
    'Borough': borough,
    'Neighborhood': neighborhood,
})

# Discard the rows whose borough is 'Not assigned'.
unassigned_rows = df.index[df['Borough'] == 'Not assigned']
df = df.drop(index=unassigned_rows)

In [4]:
# Normalize borough names that came out of the scrape with extra text attached.
borough_fixes = {
    'EtobicokeNorthwest': 'Etobicoke',
    'Downtown TorontoStn A PO Boxes25 The Esplanade': 'Downtown Toronto',
    'East TorontoBusiness reply mail Processing Centre969 Eastern': 'East Toronto',
    # park is in Downtown Toronto Borough
    'Queen\'s Park / Ontario Provincial Government': 'Downtown Toronto',
}
df['Borough'] = df['Borough'].replace(borough_fixes)

# Remove the Mississauga entry: it is not in Toronto.
not_toronto = df['Borough'] == 'MississaugaCanada Post Gateway Processing Centre'
df = df.drop(index=df.index[not_toronto])
df

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
160,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
165,M4Y,Downtown Toronto,Church and Wellesley
168,M7Y,East Toronto,Enclave of M4L
169,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [5]:
# Show the cleaned dataframe's dimensions as a (rows, columns) tuple.
df.shape

(102, 3)

## Part 2

In [6]:
# Fetch the location CSV from the web and load it into a dataframe.
geo_url = 'http://cocl.us/Geospatial_data'
with urlopen(geo_url) as response:
    geo_df = pd.read_csv(response)

In [7]:
# Attach the location columns to each neighborhood row by matching on
# 'Postal Code'. With no `how` given, pandas performs an inner join, so only
# postal codes present in both frames are kept.
fdf = df.merge(geo_df, on='Postal Code')
fdf

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
97,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
98,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
99,M7Y,East Toronto,Enclave of M4L,43.662744,-79.321558
100,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


## Part 3

In [8]:
# Visualize all the Toronto neighborhoods: one circle marker per row of `fdf`.
map_toronto = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

# The loop names are deliberately different from the `borough` and
# `neighborhood` lists built by the scraping cell, so that this loop does not
# overwrite them and the earlier cells remain re-runnable.
for lat, lng, boro, hood in zip(fdf['Latitude'], fdf['Longitude'], fdf['Borough'], fdf['Neighborhood']):
    # Popup text is "<neighborhood>, <borough>"; parse_html=True is set on the
    # Popup itself, which is where that option belongs.
    label = folium.Popup('{}, {}'.format(hood, boro), parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='green',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_toronto)
map_toronto

In [9]:
# Cluster the postal-code areas of all Toronto boroughs (unlike the example,
# every borough is considered) using only their coordinates.
k = 5  # number of clusters

# Select the feature columns explicitly. This avoids the positional `axis`
# argument of DataFrame.drop (rejected by recent pandas) and keeps an existing
# 'Cluster Labels' column out of the features when the cell is run again.
toronto_clustering = fdf[['Latitude', 'Longitude']]
kmeans = KMeans(n_clusters=k, random_state=0).fit(toronto_clustering)

# Put the labels in the first column. DataFrame.insert raises if the column
# already exists, so on a re-run the existing column is overwritten instead.
if 'Cluster Labels' in fdf.columns:
    fdf['Cluster Labels'] = kmeans.labels_
else:
    fdf.insert(0, 'Cluster Labels', kmeans.labels_)
fdf

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,0,M3A,North York,Parkwoods,43.753259,-79.329656
1,0,M4A,North York,Victoria Village,43.725882,-79.315572
2,4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...,...
97,1,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
98,4,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
99,4,M7Y,East Toronto,Enclave of M4L,43.662744,-79.321558
100,1,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


In [10]:
# create map
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

# set color scheme for the clusters: one hex colour per cluster, taken at
# evenly spaced points of matplotlib's rainbow colormap
rainbow = [colors.rgb2hex(rgba) for rgba in cm.rainbow(np.linspace(0, 1, k))]

# add markers to the map, one per row, coloured by the row's cluster label
for lat, lon, cluster in zip(fdf['Latitude'], fdf['Longitude'], fdf['Cluster Labels']):
    # Labels start at 0, so they index the colour list directly; the former
    # `cluster-1` sent cluster 0 to index -1 (the last colour) by wrap-around.
    cluster_color = rainbow[cluster]
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7,
    ).add_to(map_clusters)
map_clusters