# Peer-graded Assignment: Segmenting and Clustering

### 1. Import libraries

In [2]:
!pip install folium==0.5.0
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import os
from sklearn.cluster import KMeans
import folium 
from geopy.geocoders import Nominatim 
import matplotlib.cm as cm
import matplotlib.colors as colors

Collecting folium==0.5.0
[?25l  Downloading https://files.pythonhosted.org/packages/07/37/456fb3699ed23caa0011f8b90d9cad94445eddc656b601e6268090de35f5/folium-0.5.0.tar.gz (79kB)
[K     |████████████████████████████████| 81kB 7.6MB/s eta 0:00:011
[?25hCollecting branca (from folium==0.5.0)
  Downloading https://files.pythonhosted.org/packages/13/fb/9eacc24ba3216510c6b59a4ea1cd53d87f25ba76237d7f4393abeaf4c94e/branca-0.4.1-py3-none-any.whl
Building wheels for collected packages: folium
  Building wheel for folium (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/dsxuser/.cache/pip/wheels/f8/98/ff/954791afc47740d554f0d9e5885fa09dd60c2265d42578e665
Successfully built folium
Installing collected packages: branca, folium
Successfully installed branca-0.4.1 folium-0.5.0


### 2. Scrape from Wiki URL

In [11]:
# Download the Wikipedia page listing Toronto ("M") postal codes.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page.
response.raise_for_status()
source = response.text

In [34]:
# The page is HTML, so parse it with lxml's HTML parser rather than the XML one
# (the 'xml' feature already depends on lxml, so no extra package is needed).
soup = BeautifulSoup(source, 'lxml')

# The first <table> on the page is taken as the postal-code table.
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found in the downloaded page')

# Target schema for the scraped rows.
column_names = ['Postalcode', 'Borough', 'Neighborhood']
df = pd.DataFrame(columns=column_names)

In [35]:
# Collect every table row that has exactly one cell per expected column,
# then build the DataFrame once. Building it in one go (instead of appending
# with df.loc[len(df)]) is linear-time and makes the cell safe to re-run:
# the frame is rebuilt from scratch rather than growing duplicate rows.
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    # Header rows (<th> only) and malformed rows yield a different cell count.
    if len(row_data) == len(column_names):
        rows.append(row_data)

df = pd.DataFrame(rows, columns=column_names)
df.head()

### 2.1 Remove "Not Assigned" Rows

In [36]:
# Keep only the rows that have a real Borough value.
has_borough = df['Borough'] != 'Not assigned'
df1 = df.loc[has_borough]

# Collapse rows sharing a (Postalcode, Borough) pair into one row whose
# Neighborhood is the comma-separated list of the originals, keeping input order.
df_2 = (
    df1
    .groupby(['Postalcode', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)

# Where the Neighborhood is still 'Not assigned', fall back to the Borough name.
df_2['Neighborhood'] = df_2['Neighborhood'].where(
    df_2['Neighborhood'] != 'Not assigned',
    df_2['Borough'],
)

### 3. Dataframe Shape

In [37]:
# Show the (rows, columns) dimensions of the cleaned, grouped dataframe.
df_2.shape

(103, 3)

### 4. Import latitudes and longitudes from CSV file

In [53]:
# Read the CSV of coordinates from the given URL and preview it.
geo_csv_url = 'https://cocl.us/Geospatial_data'
df_lati_long = pd.read_csv(geo_csv_url)
df_lati_long.head(5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### 4.1 Merge with the original dataframe

In [55]:
# Align the key column name with df_2 so both frames can be joined on it.
df_lati_long = df_lati_long.rename(columns={'Postal Code': 'Postalcode'})

# Default (inner) join: attach Latitude/Longitude to each postal code row.
df_3 = df_2.merge(df_lati_long, on='Postalcode')

In [56]:
# Preview the first rows of the merged dataframe.
df_3.head()

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


### 4.2 Extract the rows whose Borough contains "Toronto"

In [57]:
# Literal (non-regex) substring match on the Borough name.
is_toronto_borough = df_3['Borough'].str.contains('Toronto', regex=False)
df_map = df_3[is_toronto_borough]
df_map

Unnamed: 0,Postalcode,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


### 5. Create Map

In [82]:
# Base map centred on the given [latitude, longitude] pair.
toronto_center = [43.651070, -79.347015]
mapview_toronto = folium.Map(location=toronto_center, zoom_start=12)

### 5.1 Plot the latitude and longitude of each neighborhood from the dataframe (map preview in the README file)

In [83]:
# Add one small green circle per row of df_map; clicking a circle shows
# "<Neighborhood>, <Borough>".
for lat, lng, borough, neighborhood in zip(
        df_map['Latitude'], df_map['Longitude'],
        df_map['Borough'], df_map['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    # parse_html=True makes folium escape the label text (e.g. apostrophes).
    label = folium.Popup(label, parse_html=True)
    # Note: parse_html belongs to Popup only; it is not a CircleMarker option,
    # so it is no longer passed to the marker.
    folium.CircleMarker(
        [lat, lng],
        radius=3,
        popup=label,
        color='Green',
        fill=True,
        fill_color='#003100',
        fill_opacity=0.5).add_to(mapview_toronto)
mapview_toronto

### 6. K-means Clustering for visualizing neighborhoods

In [74]:
# Number of clusters to fit.
k = 6

# Cluster on the coordinates only. Selecting the two columns explicitly
# (instead of dropping the others with a positional axis argument, which
# pandas 2.0 no longer accepts) also keeps 'Cluster Labels' out of the
# features if this cell is re-run after the labels have been added to df_map.
map_clustering = df_map[['Latitude', 'Longitude']]

# n_init is pinned because its default differs between scikit-learn versions;
# random_state fixes the centroid initialisation for reproducibility.
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(map_clustering)
kmeans.labels_

array([0, 0, 0, 0, 3, 0, 0, 5, 0, 1, 0, 5, 4, 0, 5, 3, 0, 3, 2, 2, 2, 2,
       1, 2, 5, 1, 2, 5, 1, 4, 5, 2, 0, 4, 0, 4, 0, 4, 3], dtype=int32)

### 6.1 Insert K-means labels into the dataframe

In [75]:
# Put the cluster label of each row in the first column of df_map.
# Dropping any existing 'Cluster Labels' column first makes the cell safe to
# re-run: DataFrame.insert raises ValueError if the column already exists.
df_map = df_map.drop(columns='Cluster Labels', errors='ignore')
df_map.insert(0, 'Cluster Labels', kmeans.labels_)

In [76]:
# Display df_map to inspect its contents, including the 'Cluster Labels' column.
df_map

Unnamed: 0,Cluster Labels,Postalcode,Borough,Neighborhood,Latitude,Longitude
2,0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,0,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,0,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,0,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,3,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,0,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,5,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,0,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,1,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


### 6.2 Plot the clustered neighborhoods on the map (map preview in the README file)

In [81]:
mapview_toronto_clustered = folium.Map(location=[43.651070, -79.347015], zoom_start=12)

# Build one hex colour per cluster by sampling the rainbow colormap at k
# evenly spaced points.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one marker per row, coloured by its cluster label.
for lat, lon, cluster in zip(df_map['Latitude'], df_map['Longitude'], df_map['Cluster Labels']):
    label = folium.Popup(' Cluster ' + str(cluster), parse_html=True)
    # K-means labels are 0-based, so they index the colour list directly
    # (cluster - 1 only worked for label 0 through negative-index wraparound).
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.8).add_to(mapview_toronto_clustered)

mapview_toronto_clustered