# Segmenting and Clustering Neighborhoods in Toronto
#### _(Data Science Capstone)_

## 1. Scraping the Wikipedia Web page to get the list of postal codes for Toronto and its neighborhoods
***

In [3]:
# Import librairies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

#### > Use of Beautifulsoup to scrape the page and wrangle the data : 
> - Open the page URL with the Beautifulsoup HTML Parser (lxml)
> - Find the objects (tr, td) related to the data sheet we want to download.
> - Use of the Python module "re" (regular expression) to find the elements in the table (get list_rows)
> - Load data in a panda dataframe df (One columne 0 where data are loaded by line)

In [5]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
html = urlopen(url)

In [6]:
soup = BeautifulSoup(html, 'lxml')
type(soup)

bs4.BeautifulSoup

In [7]:
rows = soup.find_all('tr')
#print(rows[:10])

In [8]:
for row in rows:
    row_td = row.find_all('td')
#print(row_td)
type(row_td)

bs4.element.ResultSet

In [9]:
str_cells = str(row_td)
cleantext = BeautifulSoup(str_cells, "lxml").get_text()
#print(cleantext)

In [10]:
import re

list_rows = []
for row in rows:
    cells = row.find_all('td')
    str_cells = str(cells)
    clean = re.compile('<.*?>')
    clean2 = (re.sub(clean, '',str_cells))
    list_rows.append(clean2)
#print(clean2)
#type(clean2)

In [11]:
df = pd.DataFrame(list_rows)
df.head()

Unnamed: 0,0
0,[]
1,"[M1A, Not assigned, Not assigned\n]"
2,"[M2A, Not assigned, Not assigned\n]"
3,"[M3A, North York, Parkwoods\n]"
4,"[M4A, North York, Victoria Village\n]"


#### > Data manipulation and cleanup : 
> - Split each line in the dataframe into columns.
> - Drop unnecessary column.
> - Cleanup data : 
    > - delete rows with value 'None' in each column.
    > - update the Postal code column in deleting the '[' character and the Neighborhood column in deleting extra ']\n' character
    > - rename the columns names
    > - delete rows where Borough == 'Not Assigned'
    > - Update rows where Neighborhood == 'Not Assigned' with the related Borough value 

In [12]:
df1 = df[0].str.split(',', expand=True)
#df1.head()

In [13]:

#for i in range (4,26):
#    print (i)
list = np.array (range (3,31))
df2=df1.drop (list, axis=1) 

In [14]:
df2.head()

Unnamed: 0,0,1,2
0,[],,
1,[M1A,Not assigned,Not assigned\n]
2,[M2A,Not assigned,Not assigned\n]
3,[M3A,North York,Parkwoods\n]
4,[M4A,North York,Victoria Village\n]


In [15]:
df2[0] = df2[0].str.strip('[')
df2[2] = df2[2].str.strip('\n]')
df2 = df2.drop ([0], axis=0)
#df2.head()

In [16]:
df2.rename (columns = { 0 : 'PostalCode', 1 : 'Borough', 2 : 'Neighborhood'}, inplace=True)

In [17]:
df2.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


In [18]:
#print (df2 (df2["Borough"].strip() == "Not assigned"))
df3 = df2[~df2.Borough.str.contains ("Not assigned", na=False)]

In [19]:
df3.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


In [20]:
df3.loc[df3.Neighborhood.str.contains ("Not assigned", na=False), 'Neighborhood'] =df3['Borough']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
df3.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Queen's Park
11,M9A,Etobicoke,Islington Avenue
12,M1B,Scarborough,Rouge
13,M1B,Scarborough,Malvern


#### > Group data by Postal Code and Borough in merging the neighborhoods names to a list separated by a comma.

In [22]:
df3 ['Neighborhood']= df3 ["Neighborhood"].astype (str)
#df3.info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [23]:
df4 = df3.groupby(['PostalCode', 'Borough']).agg({'Neighborhood': \
                                  ",".join}).reset_index() 

In [24]:
df4.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,\n,\n],
1,\n\n\nNL\n\nNS\n\nPE\n\nNB\n\nQC\n\nON\n\nMB\n...,NL\n,NS
2,A\n,B\n,C
3,M1B,Scarborough,"Rouge, Malvern"
4,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"


In [25]:
df4 = df4[~df4.PostalCode.str.contains ("\n", na=False)]
df4 = df4.reset_index()
df4 = df4.drop (['index'], axis=1)

In [26]:
df4.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


Export the dataframe to csv file for future use : File Toronto_neigh.csv 

In [27]:
df4.to_csv ('Toronto_neigh.csv')

In [28]:
df4.shape

(103, 3)