# Segmenting and Clustering Neighborhoods in Toronto - Assignment

Retrieving the Wikipedia page with the `requests` library and parsing its HTML with `lxml.html` to read the table data

In [25]:
import requests
import lxml.html as lh

# Wikipedia page holding the table of Canadian postal codes starting with "M".
url1 = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were data.
page = requests.get(url1, timeout=30)
page.raise_for_status()

# Parse the raw HTML and collect every table row (<tr>) in the document.
doc = lh.fromstring(page.content)
tr_elements = doc.xpath('//tr')


Retrieving the columns and values of the table using the HTML tags

In [26]:
# Build one (header text, empty value list) pair per cell of the first table
# row, echoing each header with its 1-based position.
col = []
for position, header_cell in enumerate(tr_elements[0], start=1):
    header_text = header_cell.text_content()
    print('%d:"%s"' % (position, header_text))
    col.append((header_text, []))


1:"Postcode"
2:"Borough"
3:"Neighbourhood
"


In [27]:
# Walk the rows after the header (row 0) and append each cell's text to the
# matching column list in `col`.
for j in range(1, len(tr_elements)):
    # T is our j'th row
    T = tr_elements[j]
    # Stop at the first row that does not have exactly 3 cells
    # (presumably the end of the postal-code table).
    if len(T) != 3:
        break
    for i, t in enumerate(T.iterchildren()):
        data = t.text_content()
        # Every column except the first is stored as an int when its text
        # parses as one; anything else is kept as the original string.
        if i > 0:
            try:
                data = int(data)
            except ValueError:
                # Non-numeric text: keep it unchanged. Only ValueError is
                # caught so interrupts and real errors are not swallowed.
                pass
        col[i][1].append(data)
# Number of values collected per column.
[len(C) for (title, C) in col]

[289, 289, 289]

Using Pandas to convert the data into a table format

In [28]:
import pandas as pd

# Turn the (title, values) pairs into a title -> values mapping.
My_table = dict(col)

# The scraped header of the last column carries a trailing newline; give it
# a clean name, then fix the column order.
df = pd.DataFrame(My_table).rename(columns={'Neighbourhood\n': 'Neighbourhood'})
df = df[['Postcode', 'Borough', 'Neighbourhood']]



In [29]:
# Remove every newline character found inside the cell values.
df = df.replace('\n', '', regex=True)
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


Dropping the rows whose Neighbourhood is 'Not assigned'

In [30]:
# Keep only the rows whose Neighbourhood value is something other than
# the placeholder 'Not assigned'.
has_neighbourhood = df['Neighbourhood'] != 'Not assigned'
df = df[has_neighbourhood]
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights


Applied the groupby function to combine the neighbourhoods that share a postcode and borough into a single comma-separated row

In [31]:
# Collapse rows sharing a (Postcode, Borough) pair into one row whose
# Neighbourhood is the comma-separated list of the group's names.
names_by_area = df.groupby(['Postcode', 'Borough'])['Neighbourhood']
df = names_by_area.apply(lambda names: ', '.join(names)).reset_index()
df[['Postcode', 'Borough', 'Neighbourhood']].head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


Retrieving the shape of the DataFrame.

In [32]:
# Report the number of distinct boroughs and the number of rows.
# NOTE(review): after the groupby each row is one postcode, so the second
# figure counts postcode rows rather than individual neighbourhoods.
borough_count = len(df['Borough'].unique())
row_count = df.shape[0]
summary = 'The dataframe has {} boroughs and {} neighborhoods.'.format(borough_count, row_count)
print(summary)

The dataframe has 10 boroughs and 102 neighborhoods.


Importing the latitude and longitude coordinates from Geospatial_data.csv

In [33]:
# Location of the CSV that maps each postal code to its coordinates.
url_geo = 'http://cocl.us/Geospatial_data'

# pandas reads the CSV directly from the URL.
geo_df = pd.read_csv(filepath_or_buffer=url_geo)
geo_df.head()



Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [34]:
# Give the key column the same name as in `df`.
geo_df = geo_df.rename(columns={'Postal Code': 'Postcode'})


In [35]:
# Preview the first five rows to confirm the renamed column.
geo_df.head(n=5)

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


Merging the two tables based on the Postal codes.

In [36]:
# Attach coordinates to each postcode row; only postcodes present in both
# frames are kept (default inner join).
df_new = df.merge(geo_df, on='Postcode')
df_new.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


Importing the required libraries to explore the dataset

In [37]:
!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
!conda install -c conda-forge folium=0.5.0 --yes
import folium

Solving environment: done

# All requested packages already installed.

Solving environment: done

# All requested packages already installed.



Getting the Latitude & Longitude of Toronto, Canada

In [38]:
address = 'Toronto, CA'

# Nominatim requires callers to identify themselves: geopy deprecated the
# default user agent and newer releases refuse to build the geocoder without
# an explicit one.
geolocator = Nominatim(user_agent='toronto_explorer')
location = geolocator.geocode(address)

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))



The geograpical coordinate of Toronto are 43.653963, -79.387207.


Creating a Map of Toronto with Neighbourhoods marked.

In [39]:
# Base map centred on the geocoded Toronto coordinates.
toronto_map = folium.Map(location=[latitude, longitude], zoom_start=10)

# One circle marker per merged row, with a "<neighbourhood>, <borough>" popup.
marker_rows = zip(
    df_new['Latitude'],
    df_new['Longitude'],
    df_new['Borough'],
    df_new['Neighbourhood'],
)
for lat, lng, borough, neighborhood in marker_rows:
    popup_text = '{}, {}'.format(neighborhood, borough)
    popup = folium.Popup(popup_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(toronto_map)

# The last expression renders the map in the notebook.
toronto_map