# Part 1
## This assignment scrapes the data from the given Wikipedia link
### The output DataFrame should meet the following requirements:
* The dataframe will consist of three columns: PostalCode, Borough, and Neighborhood
* Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.
* More than one neighborhood can exist in one postal code area
* If a cell has a borough but a Not assigned  neighborhood, then the neighborhood will be the same as the borough.
* Clean your Notebook and add Markdown cells to explain your work and any assumptions you are making.
* In the last cell of your notebook, use the .shape method to print the number of rows of your dataframe

To obtain the data, the link is here:
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

### The approach to this assignment
1. Get the data through the link.
2. Create a dataframe and store the extracted data from step1.
3. Process the data: rows whose "Borough" is 'Not assigned' are ignored by removing the entire row. Then group the data by postal code.
4. Show the size after data is processed. 

In [55]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as BS
import requests
from sklearn.cluster import KMeans
!pip install folium
import folium

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors



In [2]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
# A timeout keeps the cell from hanging forever on a stalled connection.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were data.
page.raise_for_status()

# Parse the raw HTML so the postal-code table can be located in the next cell.
soup = BS(page.content, 'html.parser')

In [3]:
# Locate the first table on the page and collect every row of its body.
first_table_body = soup.find('table').find('tbody')
table = first_table_body.find_all('tr')

# The first row holds the header (<th>) cells, which supply the column names.
header_cells = table[0].find_all('th')
name = [header.text.strip() for header in header_cells]

# Display the column names.
name

['Postal Code', 'Borough', 'Neighbourhood']

In [4]:
# Collect one record per table row, then build the dataframe in a single step.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
values = []
for row in table[1:]:
    # Text of every <td> cell in this row, stripped of surrounding whitespace.
    cell = [td.text.strip() for td in row.find_all('td')]
    # Pair the first three cells with the header names taken from the first row.
    values.append({name[0]: cell[0], name[1]: cell[1], name[2]: cell[2]})

# columns=name fixes the column order (and keeps the columns if no rows were found).
postal_code = pd.DataFrame(values, columns=name)

# Have a look at the dataframe.
postal_code

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
...,...,...,...
175,M5Z,Not assigned,Not assigned
176,M6Z,Not assigned,Not assigned
177,M7Z,Not assigned,Not assigned
178,M8Z,Etobicoke,"Mimico NW, The Queensway West, South of Bloor,..."


In [5]:
# Mark boroughs recorded as 'Not assigned' as missing so they can be counted and dropped.
# The result is assigned back to the column: calling replace(..., inplace=True) on a
# selected column is chained in-place mutation, which pandas deprecates and which
# does not update the parent frame under copy-on-write.
postal_code['Borough'] = postal_code['Borough'].replace('Not assigned', np.nan)
# Number of missing values per column.
postal_code.isna().sum()

Postal Code       0
Borough          77
Neighbourhood     0
dtype: int64

In [6]:
# Rows whose Borough is NaN can be ignored, so remove every row holding a missing value.
postal_code.dropna(axis=0, how='any', inplace=True)

# Report the remaining size, then confirm that no missing values are left.
remaining_shape = postal_code.shape
print("After removing \'Not asigned\' in the Borough, the data shape is  :", remaining_shape)
missing_per_column = postal_code.isna().sum()
print(missing_per_column)

After removing 'Not asigned' in the Borough, the data shape is  : (103, 3)
Postal Code      0
Borough          0
Neighbourhood    0
dtype: int64


In [7]:
# Rows that have a borough but whose neighbourhood is still 'Not assigned'.
unassigned = postal_code['Neighbourhood'] == 'Not assigned'
# Report how many such rows exist (counted before they are filled in).
print(unassigned.sum())
# Per the assignment rules, such a neighbourhood takes the name of its borough.
# With no matching rows this assignment is a no-op.
postal_code.loc[unassigned, 'Neighbourhood'] = postal_code.loc[unassigned, 'Borough']

0


In [8]:
# Combine the neighbourhoods that share a postal code into one comma-separated string.
# Grouping by column name (rather than by the array returned by .unique(), which only
# lines up with the rows when every postal code is already unique) stays correct when
# a postal code appears on several rows, and ', '.join keeps the names separated
# instead of gluing them together the way string .sum() does.
df = (
    postal_code
    .groupby(['Postal Code', 'Borough'], as_index=False)['Neighbourhood']
    .agg(', '.join)
)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


In [9]:
# Size of the cleaned, grouped dataframe as (rows, columns).
df.shape

(103, 3)

# Part2
## Update the dataframe df with the latitude and longitude of each postal code
the link is given here: http://cocl.us/Geospatial_data

In [10]:
# Download the geospatial CSV (postal code -> latitude/longitude) and load it.
read = requests.get("http://cocl.us/Geospatial_data", allow_redirects=True, timeout=30)
# Stop on an HTTP error rather than saving an error page as data.csv.
read.raise_for_status()
# The context manager closes the file handle, so the bytes are flushed to disk
# before pandas reads the file back.
with open('data.csv', 'wb') as csv_file:
    csv_file.write(read.content)
data = pd.read_csv('data.csv')
data

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [11]:
# Attach the latitude/longitude of each postal code to df.
# Any coordinate columns left over from a previous run of this cell are dropped first,
# so the cell can be re-run safely; merging twice would otherwise create suffixed
# duplicates (Latitude_x / Latitude_y) and break the later cells that read
# 'Latitude' and 'Longitude'.
df = (
    df
    .drop(columns=['Latitude', 'Longitude'], errors='ignore')
    .merge(data, on='Postal Code')
)
df

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


In [15]:
# List the distinct borough names present in the merged dataframe.
df['Borough'].unique()

array(['Scarborough', 'North York', 'East York', 'East Toronto',
       'Central Toronto', 'Downtown Toronto', 'York', 'West Toronto',
       'Mississauga', 'Etobicoke'], dtype=object)

In [16]:
# Check the data type of every column in df.
df.dtypes

Postal Code       object
Borough           object
Neighbourhood     object
Latitude         float64
Longitude        float64
dtype: object

# Part 3
## Apply KMeans and show the data on the map with different color clusters

In [17]:
# Count the non-missing entries in each column, per borough.
df.groupby('Borough').count()

Unnamed: 0_level_0,Postal Code,Neighbourhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,19,19,19,19
East Toronto,5,5,5,5
East York,5,5,5,5
Etobicoke,12,12,12,12
Mississauga,1,1,1,1
North York,24,24,24,24
Scarborough,17,17,17,17
West Toronto,6,6,6,6
York,5,5,5,5


In [50]:
# Remove the borough and postal code columns, keeping every other column of df.
unused_columns = ['Borough', 'Postal Code']
X = df.drop(columns=unused_columns)

In [51]:
# Preview the first rows of X.
X.head()

Unnamed: 0,Neighbourhood,Latitude,Longitude
0,"Malvern, Rouge",43.806686,-79.194353
1,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,Woburn,43.770992,-79.216917
4,Cedarbrae,43.773136,-79.239476


In [52]:
# Confirm which columns remain in X.
X.columns

Index(['Neighbourhood', 'Latitude', 'Longitude'], dtype='object')

In [53]:
# Number of clusters: 10, chosen to match the number of boroughs in the data.
k = 10
# Cluster on the coordinates only. Selecting the two columns by name (instead of
# dropping 'Neighbourhood') keeps 'Cluster Labels' out of the features if this cell
# is re-run after the labels have been added to X.
df_clustering = X[['Latitude', 'Longitude']]
# n_init is pinned explicitly because scikit-learn's default for it changed between
# releases; together with random_state this keeps the labels reproducible.
kmeans = KMeans(n_clusters=k, random_state=1, n_init=10).fit(df_clustering)

# Check the first labels KMeans generated.
kmeans.labels_[:15]

array([0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 3, 8, 3, 3], dtype=int32)

In [54]:
# Store the cluster assigned to each neighbourhood next to its coordinates.
# Plain column assignment appends the column on the first run and overwrites it on
# later runs, whereas X.insert raises ValueError once the column already exists.
X['Cluster Labels'] = kmeans.labels_
X.head()

Unnamed: 0,Neighbourhood,Latitude,Longitude,Cluster Labels
0,"Malvern, Rouge",43.806686,-79.194353,0
1,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,0
2,"Guildwood, Morningside, West Hill",43.763573,-79.188711,0
3,Woburn,43.770992,-79.216917,0
4,Cedarbrae,43.773136,-79.239476,8


In [66]:
# Create the map, centred on the first neighbourhood in X.
# .iloc picks the first row by position, so this does not depend on an index label 0.
map_clusters = folium.Map(location=[X['Latitude'].iloc[0], X['Longitude'].iloc[0]])

# Colour scheme: one distinct colour per cluster, evenly spaced along the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# Add one circle marker per neighbourhood, coloured by its cluster.
for lat, lon, poi, cluster in zip(X['Latitude'], X['Longitude'], X['Neighbourhood'], X['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels run from 0 to k-1, so they index the palette directly; the previous
    # `cluster - 1` sent cluster 0 to the last colour via negative indexing.
    cluster_colour = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=cluster_colour,
        fill=True,
        fill_color=cluster_colour,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters