# KMEANS Clustering

## part 1 : scrape using beautifulsoup into pandas df

In [54]:
#!pip install beautifulsoup4

In [1]:
from bs4 import BeautifulSoup as sp
import pandas as pd
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans
import folium

#### this is the start of scraping process

In [2]:
import requests
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
page

<Response [200]>

In [3]:
soup = sp(page.content, 'html.parser')

In [4]:
table = soup.find(class_="wikitable sortable")

In [5]:
print(table.prettify())

<table class="wikitable sortable">
 <tbody>
  <tr>
   <th>
    Postal Code
   </th>
   <th>
    Borough
   </th>
   <th>
    Neighbourhood
   </th>
  </tr>
  <tr>
   <td>
    M1A
   </td>
   <td>
    Not assigned
   </td>
   <td>
    Not assigned
   </td>
  </tr>
  <tr>
   <td>
    M2A
   </td>
   <td>
    Not assigned
   </td>
   <td>
    Not assigned
   </td>
  </tr>
  <tr>
   <td>
    M3A
   </td>
   <td>
    North York
   </td>
   <td>
    Parkwoods
   </td>
  </tr>
  <tr>
   <td>
    M4A
   </td>
   <td>
    North York
   </td>
   <td>
    Victoria Village
   </td>
  </tr>
  <tr>
   <td>
    M5A
   </td>
   <td>
    Downtown Toronto
   </td>
   <td>
    Regent Park, Harbourfront
   </td>
  </tr>
  <tr>
   <td>
    M6A
   </td>
   <td>
    North York
   </td>
   <td>
    Lawrence Manor, Lawrence Heights
   </td>
  </tr>
  <tr>
   <td>
    M7A
   </td>
   <td>
    Downtown Toronto
   </td>
   <td>
    Queen's Park, Ontario Provincial Government
   </td>
  </tr>
  <tr>
   <td>
    M8

#### this is the where we tidy the table and make it into structured table

In [6]:
table_rows = table.find_all('tr')

res = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [tr.text.strip() for tr in td if tr.text.strip()]
    if row:
        res.append(row)

In [7]:
res

[['M1A', 'Not assigned', 'Not assigned'],
 ['M2A', 'Not assigned', 'Not assigned'],
 ['M3A', 'North York', 'Parkwoods'],
 ['M4A', 'North York', 'Victoria Village'],
 ['M5A', 'Downtown Toronto', 'Regent Park, Harbourfront'],
 ['M6A', 'North York', 'Lawrence Manor, Lawrence Heights'],
 ['M7A', 'Downtown Toronto', "Queen's Park, Ontario Provincial Government"],
 ['M8A', 'Not assigned', 'Not assigned'],
 ['M9A', 'Etobicoke', 'Islington Avenue, Humber Valley Village'],
 ['M1B', 'Scarborough', 'Malvern, Rouge'],
 ['M2B', 'Not assigned', 'Not assigned'],
 ['M3B', 'North York', 'Don Mills'],
 ['M4B', 'East York', 'Parkview Hill, Woodbine Gardens'],
 ['M5B', 'Downtown Toronto', 'Garden District, Ryerson'],
 ['M6B', 'North York', 'Glencairn'],
 ['M7B', 'Not assigned', 'Not assigned'],
 ['M8B', 'Not assigned', 'Not assigned'],
 ['M9B',
  'Etobicoke',
  'West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale'],
 ['M1C', 'Scarborough', 'Rouge Hill, Port Union, Highland Creek'],
 ['M

#### the first dataframe from scraping process before

In [8]:
df = pd.DataFrame(res, columns=["Postal Code", "Borough", "Neighborhood"])
df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
7,M8A,Not assigned,Not assigned
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"


#### we drop the not assigned value

In [9]:
dfdrop = df[df.Borough != 'Not assigned']
dfdrop2 = df[df.Borough != 'Not assigned']
dfdrop2['Neighborhood'] = np.where(dfdrop2['Neighborhood'] == "Not assigned", dfdrop2['Borough'], dfdrop2['Neighborhood'])

dfdrop2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
8,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
9,M1B,Scarborough,"Malvern, Rouge"
11,M3B,North York,Don Mills
12,M4B,East York,"Parkview Hill, Woodbine Gardens"
13,M5B,Downtown Toronto,"Garden District, Ryerson"


#### joining the same postal code

In [10]:
dffinal = dfdrop2.groupby('Postal Code').agg({'Borough':'first',
                               'Neighborhood': ', '.join}).reset_index()
dffinal

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


#### final task of part 1, print shape

In [11]:
dffinal.shape

(103, 3)

## part 2 : geocoder package

In [12]:
geo = pd.read_csv("https://cocl.us/Geospatial_data")
geo

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
5,M1J,43.744734,-79.239476
6,M1K,43.727929,-79.262029
7,M1L,43.711112,-79.284577
8,M1M,43.716316,-79.239476
9,M1N,43.692657,-79.264848


In [13]:
merge = pd.merge(dffinal, geo, on='Postal Code')

In [14]:
merge

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


## part 3 cluster the neighborhood

#### im going to cluster neighborhood on borough that contains the word toronto only, which are east toronto, central toronto, downtown toronto, and west toronto

In [15]:
toronto = merge[merge['Borough'].str.contains("Toronto")]
toronto

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


In [16]:
toronto.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39 entries, 37 to 87
Data columns (total 5 columns):
Postal Code     39 non-null object
Borough         39 non-null object
Neighborhood    39 non-null object
Latitude        39 non-null float64
Longitude       39 non-null float64
dtypes: float64(2), object(3)
memory usage: 1.8+ KB


In [17]:
toronto3 = toronto.copy()
toronto3

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,M4M,East Toronto,Studio District,43.659526,-79.340923
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
47,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


#### i clustered it using kmeans clustering algorithm only using latitude and longitude value with assumption the algorithm is going to cluster based on how similar and close the neighborhood distance is

In [18]:
toronto_cluster3 = toronto3.iloc[:,3:5]
toronto_cluster3

Unnamed: 0,Latitude,Longitude
37,43.676357,-79.293031
41,43.679557,-79.352188
42,43.668999,-79.315572
43,43.659526,-79.340923
44,43.72802,-79.38879
45,43.712751,-79.390197
46,43.715383,-79.405678
47,43.704324,-79.38879
48,43.689574,-79.38316
49,43.686412,-79.400049


In [19]:
# set number of clusters
kclusters3 = 5

# run k-means clustering
kmeans3 = KMeans(n_clusters=kclusters3, random_state=0).fit(toronto_cluster3)

# check cluster labels generated for each row in the dataframe
kmeans3.labels_[0:10] 

array([4, 4, 4, 4, 2, 2, 2, 2, 2, 2])

In [20]:
torontolabel3 = toronto3.copy()
torontolabel3.insert(0, 'Cluster Labels', kmeans3.labels_)
torontolabel3

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
37,4,M4E,East Toronto,The Beaches,43.676357,-79.293031
41,4,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188
42,4,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572
43,4,M4M,East Toronto,Studio District,43.659526,-79.340923
44,2,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879
45,2,M4P,Central Toronto,Davisville North,43.712751,-79.390197
46,2,M4R,Central Toronto,"North Toronto West, Lawrence Park",43.715383,-79.405678
47,2,M4S,Central Toronto,Davisville,43.704324,-79.38879
48,2,M4T,Central Toronto,"Moore Park, Summerhill East",43.689574,-79.38316
49,2,M4V,Central Toronto,"Summerhill West, Rathnelly, South Hill, Forest...",43.686412,-79.400049


#### here is the map of the clustered neighborhood

In [21]:
# create map
map_clusters = folium.Map(location=[43.651070, -79.347015], zoom_start=11)

# set color scheme for the clusters
x = np.arange(kclusters3)
ys = [i + x + (i*x)**2 for i in range(kclusters3)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(torontolabel3['Latitude'], torontolabel3['Longitude'], torontolabel3['Neighborhood'], torontolabel3['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [22]:
torontolabel3.loc[torontolabel3['Cluster Labels'] == 0, torontolabel3.columns[[1] + list(range(5, torontolabel3.shape[1]))]]

Unnamed: 0,Postal Code,Longitude
76,M6H,-79.442259
82,M6P,-79.464763
83,M6R,-79.456325
84,M6S,-79.48445


In [23]:
torontolabel3.loc[torontolabel3['Cluster Labels'] == 1, torontolabel3.columns[[1] + list(range(5, torontolabel3.shape[1]))]]

Unnamed: 0,Postal Code,Longitude
50,M4W,-79.377529
51,M4X,-79.367675
52,M4Y,-79.38316
53,M5A,-79.360636
54,M5B,-79.378937
55,M5C,-79.375418
56,M5E,-79.373306
57,M5G,-79.387383
58,M5H,-79.384568
59,M5J,-79.381752


In [24]:
torontolabel3.loc[torontolabel3['Cluster Labels'] == 2, torontolabel3.columns[[1] + list(range(5, torontolabel3.shape[1]))]]

Unnamed: 0,Postal Code,Longitude
44,M4N,-79.38879
45,M4P,-79.390197
46,M4R,-79.405678
47,M4S,-79.38879
48,M4T,-79.38316
49,M4V,-79.400049
63,M5N,-79.416936
64,M5P,-79.411307


In [25]:
torontolabel3.loc[torontolabel3['Cluster Labels'] == 3, torontolabel3.columns[[1] + list(range(5, torontolabel3.shape[1]))]]

Unnamed: 0,Postal Code,Longitude
65,M5R,-79.405678
66,M5S,-79.400049
67,M5T,-79.400049
75,M6G,-79.422564
77,M6J,-79.41975
78,M6K,-79.428191


In [26]:
torontolabel3.loc[torontolabel3['Cluster Labels'] == 4, torontolabel3.columns[[1] + list(range(5, torontolabel3.shape[1]))]]

Unnamed: 0,Postal Code,Longitude
37,M4E,-79.293031
41,M4K,-79.352188
42,M4L,-79.315572
43,M4M,-79.340923
87,M7Y,-79.321558
