## Scraping Toronto neighbourhoods and postal codes from Wikipedia with BeautifulSoup4 and pandas

### Import pandas and BeautifulSoup for scraping an HTML table into a pd.DataFrame

In [58]:
from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

Define the URL, fetch the page, and parse the HTML with BeautifulSoup4 (using the 'lxml' parser) into a bs object

In [59]:
# Wikipedia article listing the Canadian postal codes that start with 'M'.
url = r'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
req = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as if it were the article.
req.raise_for_status()
# Parse the raw response bytes with the 'lxml' parser into a BeautifulSoup object.
page = bs(req.content, 'lxml')

## Step by step walkthrough
- Find the table in the bs4 object
- Read the found table with Pandas straight into a DataFrame with correct column headers
- Drop all rows that have a 'Borough' with value 'Not assigned'
- Join all Neighbourhoods per Postcode
- Reset the index so the result returns to an integer-indexed DataFrame
- Verification method for 'M5A'-case
- Replace all Neighbourhood == 'Not assigned' with the value of the Borough column of that row
- Verify this alteration of the DataFrame
- Show the header of the resulting DataFrame

In [60]:
# Take the first <table> element found on the parsed page.
table = page.find('table')
# Wrap the markup in StringIO: recent pandas versions deprecate passing a
# literal HTML string to read_html. header=0 uses the first row as column names.
postcodes_raw = pd.read_html(StringIO(str(table)), header=0)[0]
# Keep only the rows whose Borough has been assigned.
postcodes_assigned = postcodes_raw[postcodes_raw['Borough'] != 'Not assigned']
# One row per (Postal Code, Borough); neighbourhood names are joined with ', '.
# reset_index turns the group keys back into ordinary columns.
df = (
    postcodes_assigned
    .groupby(['Postal Code', 'Borough'])['Neighborhood']
    .apply(', '.join)
    .reset_index()
)
print('verify that the \'M5A\'-case is correct : \n{}\n '.format(df[df['Postal Code'] == 'M5A']))
# Use a single .loc assignment instead of chained indexing
# (df.Neighborhood[mask] = ...), which triggers SettingWithCopyWarning and may
# write to a temporary copy rather than to df itself.
unassigned = df['Neighborhood'] == 'Not assigned'
df.loc[unassigned, 'Neighborhood'] = df.loc[unassigned, 'Borough']
print('verify the Neighborhood \'Not assigned\' method:\n{}\n'.format(df[df.Borough == 'Queen\'s Park']))
df.head(12)

verify that the 'M5A'-case is correct : 
   Postal Code           Borough               Neighborhood
53         M5A  Downtown Toronto  Regent Park, Harbourfront
 
verify the Neighborhood 'Not assigned' method:
Empty DataFrame
Columns: [Postal Code, Borough, Neighborhood]
Index: []



Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park"
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge"
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [61]:
# Count the distinct postal codes and boroughs, then report both in one message.
n_postcodes = df['Postal Code'].unique().size
n_boroughs = df.Borough.unique().size
summary_template = (
    'Toronto summary : \n There are {} unique Postcodes '
    'and \n {} Boroughs in the final DataFrame'
)
print(summary_template.format(n_postcodes, n_boroughs))

Toronto summary : 
 There are 103 unique Postcodes and 
 10 Boroughs in the final DataFrame


In [67]:
# Dimensions of df as a (rows, columns) tuple.
df.shape

(103, 3)

# df is a DataFrame consisting of postal code, borough and neighborhood

In [62]:
!pip install geocoder
import geocoder



# Using both methods (the provided CSV and geocoder) to create a DataFrame of Toronto neighborhoods

## Converting the geospatial data into a DataFrame named Postcode using the given CSV link


In [63]:
# Location of the CSV with one latitude/longitude pair per postal code.
GEOSPATIAL_CSV_URL = 'http://cocl.us/Geospatial_data'
# Load the CSV straight from the URL into a DataFrame and display it.
Postcode = pd.read_csv(GEOSPATIAL_CSV_URL)
Postcode

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [73]:

# Attach the CSV coordinates to the scraped table by joining on the shared
# 'Postal Code' column (default inner join).
csvtor = df.merge(Postcode, on='Postal Code')
csvtor

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...,...
98,M9N,York,Weston,43.706876,-79.518188
99,M9P,Etobicoke,Westmount,43.696319,-79.532242
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


In [83]:
# Write the merged table to a CSV file at a path relative to the working directory.
# No index argument is passed, so to_csv also writes the row index as the first column.
csvtor.to_csv('toronto_csv_postcodes.csv')

## Also using geocoder to look up the latitudes and longitudes

In [68]:
def getLatLong(row, max_attempts=10):
    """Look up the coordinates of a Toronto location with the ArcGIS geocoder.

    Parameters
    ----------
    row : object
        Value interpolated into the search query '<row>, Toronto, Ontario'.
        The notebook passes postal-code strings.
    max_attempts : int, optional
        Upper bound on the number of geocoding requests. The lookup used to
        retry without limit, which hangs the notebook when the service never
        returns coordinates (e.g. offline or for an unresolvable query).

    Returns
    -------
    list
        [latitude, longitude] from the geocoder, or [0.0, 0.0] when no
        coordinates were obtained (in which case 'BACKUP' is printed).
    """
    search_query = '{}, Toronto, Ontario'.format(row)
    lat_lng_coords = None
    try:
        # Retry until the geocoder yields coordinates or the attempts run out.
        for _ in range(max_attempts):
            lat_lng_coords = geocoder.arcgis(search_query).latlng
            if lat_lng_coords:
                break
    except IndexError:
        lat_lng_coords = None

    # Treat both None and an empty result as a failed lookup so the indexing
    # below can never raise.
    if not lat_lng_coords:
        print('BACKUP')
        return [0.0, 0.0]

    latitude = lat_lng_coords[0]
    longitude = lat_lng_coords[1]
    print(latitude, longitude)
    return [latitude, longitude]

In [70]:
# Geocode every postal code in order, collecting one [latitude, longitude] pair per row.
coords_list = [getLatLong(postal_code) for postal_code in df['Postal Code']]

43.80862623100006 -79.18991284599997
43.78577865700004 -79.15736763799998
43.76580607300008 -79.18528434099994
43.77154467100007 -79.21813521299998
43.76879106300004 -79.23881306799996
43.74420268600005 -79.22872456899995
43.72688089800005 -79.26569363099998
43.71334044300005 -79.28494163099998
43.72353760300007 -79.22835287899994
43.696447877000026 -79.26564232899995
43.76131000000004 -79.26993999999996
43.75004250000006 -79.30053749999996
43.79392980200004 -79.26569360999997
43.78490240700006 -79.30472546399994
43.81799836300007 -79.28088739199995
43.80053000000004 -79.32182999999998
43.83476822500006 -79.20410074399996
43.802556149000054 -79.35656576499997
43.78030600000005 -79.34868699999998
43.780606885000054 -79.37692081799997
43.74936000000008 -79.38068999999996
43.79180026600005 -79.40642782199996
43.769076780000034 -79.41369502399994
43.750260000000026 -79.39835499999998
43.77991227600006 -79.44522868999996
43.75293455500008 -79.33564142299997
43.74890000000005 -79.35721999999

In [81]:
# Turn the geocoded pairs into a two-column frame, then attach the postal codes
# from df as a third column (values are matched on the row index).
tor1 = (
    pd.DataFrame(coords_list, columns=['Latitude', 'Longitude'])
    .assign(**{'Postal Code': df['Postal Code']})
)
tor1

Unnamed: 0,Latitude,Longitude,Postal Code
0,43.808626,-79.189913,M1B
1,43.785779,-79.157368,M1C
2,43.765806,-79.185284,M1E
3,43.771545,-79.218135,M1G
4,43.768791,-79.238813,M1H
...,...,...,...
98,43.705496,-79.520370,M9N
99,43.696296,-79.533126,M9P
100,43.686887,-79.565507,M9R
101,43.744055,-79.581203,M9V


In [82]:
# Attach the geocoder coordinates to the scraped table by joining on the shared
# 'Postal Code' column (default inner join).
geotor = df.merge(tor1, on='Postal Code')
geotor

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.808626,-79.189913
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.785779,-79.157368
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.765806,-79.185284
3,M1G,Scarborough,Woburn,43.771545,-79.218135
4,M1H,Scarborough,Cedarbrae,43.768791,-79.238813
...,...,...,...,...,...
98,M9N,York,Weston,43.705496,-79.520370
99,M9P,Etobicoke,Westmount,43.696296,-79.533126
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.686887,-79.565507
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.744055,-79.581203
