## Neighborhoods in Toronto ##

In [19]:
#install Beautifulsoup4 and other packages
!conda install -c conda-forge beautifulsoup4 --yes
!conda install -c conda-forge lxml --yes
!conda install -c conda-forge html5lib --yes
!conda install -c conda-forge geocoder --yes
!conda install -c conda-forge folium=0.5.0 --yes

Fetching package metadata .............
Solving package specifications: .

Package plan for installation in environment /opt/conda/envs/DSX-Python35:

The following packages will be DOWNGRADED:

    folium: 0.7.0-py_0 conda-forge --> 0.5.0-py_0 conda-forge

folium-0.5.0-p 100% |################################| Time: 0:00:00   3.23 MB/s


In [20]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
from geopy.geocoders import Nominatim

### Scrape wiki page

In [6]:
source=requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup=BeautifulSoup(source, 'lxml')
#print(soup)
#print(table.prettify())

In [7]:
#read all neighourhood of Toronto from xml and load into dataframe
table=soup.find('table', attrs={'class':'wikitable sortable'})
table_rows=table.find_all('tr')
res=[]
i=0
for tr in table_rows:
     td=tr.find_all('td')
     row=[tr.text.strip() for tr in td if tr.text.strip()]
     if row:
            res.append(row)

df=pd.DataFrame(res,columns=["Postcode","Borough","Neighbourhood"])
df

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


In [8]:
df1=df[df['Borough']!= 'Not assigned']
df1.shape

(212, 3)

In [9]:
print("Before: ", df1[df1['Neighbourhood'] == 'Not assigned'].shape)
df1['Neighbourhood'].replace(regex='Not assigned', value=df1['Borough'], inplace=True)
print("After: ", df1[df1['Neighbourhood'] == 'Not assigned'].shape)

Before:  (1, 3)
After:  (0, 3)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [10]:
#group all neighbourhood by postcode
#df2=df1.groupby(['Postcode','Borough'])['Neighbourhood'].apply(list)
df2=df1.groupby(['Postcode','Borough'])['Neighbourhood'].agg(lambda x: ', '.join(x))
df2=pd.DataFrame(df2.reset_index())
df2

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [7]:
df2.shape

(103, 3)

In [24]:
#import geocoder # to get lat & log for postal code
#lat_lng_coords = None
#g = geocoder.google('M1C')
#g.latlng

### Since above package not giving desire result....hence using CSV file

In [11]:
#download and read lat & lng of Troronto postal code data
!wget -q -O 'Geospatial_Coordinates.csv' http://cocl.us/Geospatial_data

In [12]:
latlng=pd.read_csv("Geospatial_Coordinates.csv")
latlng.columns=["Postcode", "Latitude", "Longitude"]
print ('Neighbourhood df: ', df2.shape)
print ('post code lat n lng df: ', latlng.shape)

Neighbourhood df:  (103, 3)
post code lat n lng df:  (103, 3)


In [13]:
#join two dataframes
df3=pd.merge(df2, latlng, on='Postcode')
df3

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## Cluster neighbourhood in Toronto

In [16]:
#get lat n lng value of Toronto city
address = 'Toronto'

geolocator = Nominatim()
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Lat n lng of Toronto city are {},{}'.format(latitude, longitude))

Lat n lng of Toronto city are 43.653963,-79.387207


In [22]:
# create map of Manhattan using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to map
for lat, lng, borough, neighbourhood in zip(df3['Latitude'], df3['Longitude'], df3['Borough'], df3['Neighbourhood']):
    label='{}, {}'.format(borough, neighbourhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
#        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [52]:
# set number of clusters
kclusters = 5

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(df3)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

ValueError: could not convert string to float: 'Northwest'