## Importing libraries for loading table data into dataframe

In [71]:
import requests
import pandas as pd
import numpy as np
import lxml.html as lh
from bs4 import BeautifulSoup

In [72]:
url='https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

### Creating an empty list of canada in order to append it with the table data such as borogh, neighbour, etc

In [73]:
canada=[]

html=requests.get(url).text
soup=BeautifulSoup(html,'html.parser')

In [74]:
post_table=soup.find('table',{'class':'wikitable sortable'})

In [75]:
print(post_table)

<table class="wikitable sortable">
<tbody><tr>
<th>Postcode</th>
<th>Borough</th>
<th>Neighbourhood
</th></tr>
<tr>
<td>M1A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M2A</td>
<td>Not assigned</td>
<td>Not assigned
</td></tr>
<tr>
<td>M3A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Parkwoods" title="Parkwoods">Parkwoods</a>
</td></tr>
<tr>
<td>M4A</td>
<td><a href="/wiki/North_York" title="North York">North York</a></td>
<td><a href="/wiki/Victoria_Village" title="Victoria Village">Victoria Village</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Harbourfront_(Toronto)" title="Harbourfront (Toronto)">Harbourfront</a>
</td></tr>
<tr>
<td>M5A</td>
<td><a href="/wiki/Downtown_Toronto" title="Downtown Toronto">Downtown Toronto</a></td>
<td><a href="/wiki/Regent_Park" title="Regent Park">Regent Park</a>
</td></tr>
<tr>
<td>M6A</td>

### finding table rows and then their respective columns and populating them

In [76]:
try:
    for rows in post_table.find_all('tr'):
        cols=rows.find_all('td')
        if len(cols)==3:
            canada.append((cols[0].text.strip(),cols[1].text.strip(),cols[2].text.strip()))
except:pass

In [77]:
canada

[('M1A', 'Not assigned', 'Not assigned'),
 ('M2A', 'Not assigned', 'Not assigned'),
 ('M3A', 'North York', 'Parkwoods'),
 ('M4A', 'North York', 'Victoria Village'),
 ('M5A', 'Downtown Toronto', 'Harbourfront'),
 ('M5A', 'Downtown Toronto', 'Regent Park'),
 ('M6A', 'North York', 'Lawrence Heights'),
 ('M6A', 'North York', 'Lawrence Manor'),
 ('M7A', "Queen's Park", 'Not assigned'),
 ('M8A', 'Not assigned', 'Not assigned'),
 ('M9A', 'Etobicoke', 'Islington Avenue'),
 ('M1B', 'Scarborough', 'Rouge'),
 ('M1B', 'Scarborough', 'Malvern'),
 ('M2B', 'Not assigned', 'Not assigned'),
 ('M3B', 'North York', 'Don Mills North'),
 ('M4B', 'East York', 'Woodbine Gardens'),
 ('M4B', 'East York', 'Parkview Hill'),
 ('M5B', 'Downtown Toronto', 'Ryerson'),
 ('M5B', 'Downtown Toronto', 'Garden District'),
 ('M6B', 'North York', 'Glencairn'),
 ('M7B', 'Not assigned', 'Not assigned'),
 ('M8B', 'Not assigned', 'Not assigned'),
 ('M9B', 'Etobicoke', 'Cloverdale'),
 ('M9B', 'Etobicoke', 'Islington'),
 ('M9B', 

In [78]:
canada_array=np.asarray(canada)

In [79]:
len(canada_array)

288

In [80]:
len(canada)

288

### COnverting the canda list into dataframe

In [81]:
df=pd.DataFrame(canada_array)
df.columns=['PostalCode', 'Borough' ,'Neighborhood']

In [82]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### removing borough from df which are not assigned

In [83]:
df=df[~df['Borough'].isin(['Not assigned'])]

In [84]:
df.shape

(211, 3)

### Group by Postcode and Borough to combine Neighbourhood data in one column with comma

In [85]:
df=df.groupby(['PostalCode','Borough'])['Neighborhood'].apply(','.join).reset_index()

In [86]:
df.shape

(103, 3)

### Replace "Not assigned" data in Neighbourhood with Borough data

In [88]:
df.Neighborhood.replace("Not assigned",df.Borough,inplace=True)
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park,Ionview,Kennedy Park"
7,M1L,Scarborough,"Clairlea,Golden Mile,Oakridge"
8,M1M,Scarborough,"Cliffcrest,Cliffside,Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff,Cliffside West"


### checking the shape of dataframe

In [89]:
df.shape

(103, 3)

## q2

### Read csv file to get Latitude & Longitude datas

In [90]:
df2=pd.read_csv("http://cocl.us/Geospatial_data")

In [91]:
df2.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### removing space from postal code in order to merge it

In [92]:
df2.rename(columns={'Postal Code':'PostalCode'},inplace=True)

In [94]:
df2.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### combinging wikipedia df and csv file df2 on PostalCode

In [95]:
df3=df.merge(df2,on='PostalCode',how='left')
df3.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [96]:
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium 

In [99]:
df_toronto=df3[df3['Borough'].str.contains('Toronto')]


In [100]:
df_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [101]:
toronto_map = folium.Map(location=[43.65, -79.4], zoom_start=12)

X = df_toronto['Latitude']
Y = df_toronto['Longitude']
Z = np.stack((X, Y), axis=1)

kmeans = KMeans(n_clusters=4, random_state=0).fit(Z)

clusters = kmeans.labels_
colors = ['red', 'green', 'blue', 'yellow']
df_toronto['Cluster'] = clusters

for latitude, longitude, borough, cluster in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Cluster']):
    label = folium.Popup(borough, parse_html=True)
    folium.CircleMarker(
        [latitude, longitude],
        radius=5,
        popup=label,
        color='black',
        fill=True,
        fill_color=colors[cluster],
        fill_opacity=0.7).add_to(toronto_map)  

toronto_map

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
