## Web scraping using python

#### Importing libraries

In [10]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

#### Using BeautifulSoup for scrape with HTTP Request and HTML Parser

In [12]:
page = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M")
soup = BeautifulSoup(page.content, 'html.parser')

#### Finding table body and create new data frame for postcode of Toronto

In [13]:

tableBody = soup.find_all('tbody', limit=1)[0]

dfPostalToronto = pd.DataFrame(columns=['Postcode', 'Borough', 'Neighborhood'])

#### Parsing with rows of html table and add item to data frame

In [14]:
for tag in tableBody.find_all('tr'):
    postcod = tag.find_next()
    borough = postcod.find_next()
    neighborhood = borough.find_next()
    
    str_borough = borough.string     
    # if exists tag "a" get content tag "a" and go to next tag
    if borough.a is not None :
      str_borough =  borough.a.string.strip()
      neighborhood = neighborhood.find_next()
        
    str_neighborhood = neighborhood.string   
    # if exists tag "a" get content tag "a" and go to next tag
    if neighborhood.a is not None :
      str_neighborhood =  neighborhood.a.string.strip()
    
    dfPostalToronto = dfPostalToronto.append({
                        'Postcode'     : postcod.string.strip(), 
                        'Borough'      : str_borough, 
                        'Neighborhood': str_neighborhood.strip()}, ignore_index=True)

#### Preparing data and exclude boroughs not assigned

In [15]:
#remove header of table
dfPostalToronto = dfPostalToronto.iloc[1:]

#exclude Boroughs Not assigned
dfPostalToronto = dfPostalToronto.loc[dfPostalToronto['Borough']  != 'Not assigned']

#change Neighbourhood Not assigned to Borough
nbAux = np.where(dfPostalToronto.Neighborhood.eq('Not assigned'), dfPostalToronto.Borough, dfPostalToronto.Neighborhood)
dfPostalToronto.Neighborhood = nbAux

#### Grouping data by Postcode and Borough

In [16]:

dfPostalToronto = dfPostalToronto.groupby(['Postcode', 'Borough'])['Neighborhood'].agg([('Neighborhood', ', '.join)])
dfPostalToronto = dfPostalToronto.reset_index()

#### Display number of rows after agregate

In [17]:
dfPostalToronto.shape

(180, 3)

## Step 2: Geo Location 

#### Importing CSV

In [50]:
pdGeoToronto = pd.read_csv('Geospatial_Coordinates.csv')

#### join dataframes by Postcode

In [52]:
dfPostalToronto

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude_x,Longitude_x,Latitude_y,Longitude_y
0,M1B,Scarborough\n,"Malvern, Rouge",43.806686,-79.194353,43.806686,-79.194353
1,M1C,Scarborough\n,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,43.784535,-79.160497
2,M1E,Scarborough\n,"Guildwood, Morningside, West Hill",43.763573,-79.188711,43.763573,-79.188711
3,M1G,Scarborough\n,Woburn,43.770992,-79.216917,43.770992,-79.216917
4,M1H,Scarborough\n,Cedarbrae,43.773136,-79.239476,43.773136,-79.239476
5,M1J,Scarborough\n,Scarborough Village,43.744734,-79.239476,43.744734,-79.239476
6,M1K,Scarborough\n,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029,43.727929,-79.262029
7,M1L,Scarborough\n,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577,43.711112,-79.284577
8,M1M,Scarborough\n,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476,43.716316,-79.239476
9,M1N,Scarborough\n,"Birch Cliff, Cliffside West",43.692657,-79.264848,43.692657,-79.264848


## Step 3 Explore and cluster the neighborhoods in Toronto

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude_x,Longitude_x,Latitude_y,Longitude_y,Latitude,Longitude
0,M1B,Scarborough\n,"Malvern, Rouge",43.806686,-79.194353,43.806686,-79.194353,43.806686,-79.194353
1,M1C,Scarborough\n,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497,43.784535,-79.160497,43.784535,-79.160497
2,M1E,Scarborough\n,"Guildwood, Morningside, West Hill",43.763573,-79.188711,43.763573,-79.188711,43.763573,-79.188711
3,M1G,Scarborough\n,Woburn,43.770992,-79.216917,43.770992,-79.216917,43.770992,-79.216917
4,M1H,Scarborough\n,Cedarbrae,43.773136,-79.239476,43.773136,-79.239476,43.773136,-79.239476
5,M1J,Scarborough\n,Scarborough Village,43.744734,-79.239476,43.744734,-79.239476,43.744734,-79.239476
6,M1K,Scarborough\n,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029,43.727929,-79.262029,43.727929,-79.262029
7,M1L,Scarborough\n,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577,43.711112,-79.284577,43.711112,-79.284577
8,M1M,Scarborough\n,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476,43.716316,-79.239476,43.716316,-79.239476
9,M1N,Scarborough\n,"Birch Cliff, Cliffside West",43.692657,-79.264848,43.692657,-79.264848,43.692657,-79.264848


32

(180, 3)

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A\r\n,Not assigned\r\n,Not assigned\r\n
1,M2A\r\n,Not assigned\r\n,Not assigned\r\n
2,M3A\r\n,North York\r\n,Parkwoods
3,M4A\r\n,North York\r\n,Victoria Village
4,M5A\r\n,Downtown Toronto\r\n,"Regent Park, Harbourfront"


(180, 3)