# Creation a new Notebook




In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

# Use the Notebook to build the code to scrape the following Wikipedia page
###  https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [9]:
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
can_ps = requests.get(url)

### Reading the data from the url and creating the dataframe

In [60]:
df_toronto = pd.DataFrame()

toronto_soup = bs(can_ps.text, 'lxml')

toronto_tab = toronto_soup.find_all('table')[0] 

row_marker = 0

for row in toronto_tab.find_all('tr'):
    
    column_marker = 0
    
    columns = row.find_all('td')
    
    for column in columns:
        
        df_toronto.loc[row_marker,column_marker] = column.get_text()
        
        column_marker += 1
        
    row_marker += 1
    
df_toronto.head()
df_toronto.shape

(180, 3)

### Cleaning the data set 

In [62]:
#The column name is not as per the standard. Replace the column name 
#the dataframe will consist of three columns: PostalCode, Borough, and Neighborhood(One of the condition in assignment)
df_toronto.rename(columns={0:'PostalCode',1:'Borough',2:'Neighborhood'}, inplace=True)

#Cleaning new line character from the dataframe
df_toronto = df_toronto.replace('\n','', regex=True)

#Before removing the row where Bourough equals "Not assigned", check how many such value are there. 
df_toronto_Borough_na = df_toronto[df_toronto["Borough"] == "Not assigned" ]

print("number of rows where 'Borough' is not assigned is %d" %(df_toronto_Borough_na.shape[0]))
print("Remaining rows count is %d" %(df_toronto.shape[0] - df_toronto_Borough_na.shape[0]))


df_toronto.head()

number of rows where 'Borough' is not assigned is 77
Remaining rows count is 103


Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Sub query:- Only process the cells that have an assigned borough. Ignore cells with a borough that is Not assigned.(One of the condition in assignment)

In [63]:
#Cleaning the data set and removing the not assigned value
df_toronto = df_toronto[df_toronto["Borough"] !='Not assigned']

#Setting the index again after removal of the row 
df_toronto.reset_index(drop=True,inplace=True)

In [64]:
# Expectation is there will be no record with df_toronto["Borough"] = "Not assigned"
df_toronto[df_toronto["Borough"] == "Not assigned"]

Unnamed: 0,PostalCode,Borough,Neighborhood



## If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough

## More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park. These two rows will be combined into one row with the neighborhoods separated with a comma as shown in row 11 in the above tab

In [69]:
# Check how many neighborhood are there where it is duplicate 
df_toronto_nei = df_toronto[df_toronto['Neighborhood'] == 'Not assigned']
df_toronto_nei

Unnamed: 0,PostalCode,Borough,Neighborhood


In [29]:
# Creating new dataframe for question 1 
df_toronto_Q1= pd.DataFrame(columns = ['PostalCode','Borough','Neighborhood'])


for ii in range(df_toronto.shape[0]):
    
    df_borough = df_toronto.loc[ii,'Borough']
    df_postalcode = df_toronto.loc[ii,'PostalCode']
    df_neighborhood = df_toronto.loc[ii,'Neighborhood'][:-1]
    
    # If a cell has a borough but a Not assigned neighborhood, then the neighborhood will be the same as the borough
    # ( One of the condition in the assignment )
    if df_neighborhood == 'Not assigned':
        df_neighborhood = borough
    
    #More than one neighborhood can exist in one postal code area. For example, in the table on the Wikipedia page, 
    #you will notice that M5A is listed twice and has two neighborhoods: Harbourfront and Regent Park.
    #These two rows will be combined into one row with the neighborhoods separated with 
    #a comma as shown in row 11 in the above table
    
    if df_postalcode in df_toronto_Q1.PostalCode.values:
        df_old_neighborhood = df_toronto_Q1[df_toronto_Q1.PostalCode == df_postalcode].Neighborhood
        new_value = (old_neighborhood + ',' + df_neighborhood)
        df_toronto_Q1.loc[df_toronto_Q1.PostalCode == postalcode,'Neighborhood'] = new_value 
    
    else:
        df_toronto_Q1 = df_toronto_Q1.append({'PostalCode': df_postalcode,
                                                'Borough': df_borough,
                                                'Neighborhood': df_neighborhood,
                                               }, ignore_index=True)



df_toronto_Q1

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwood
1,M4A,North York,Victoria Villag
2,M5A,Downtown Toronto,"Regent Park, Harbourfron"
3,M6A,North York,"Lawrence Manor, Lawrence Height"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Governmen"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Villag"
6,M1B,Scarborough,"Malvern, Roug"
7,M3B,North York,Don Mill
8,M4B,East York,"Parkview Hill, Woodbine Garden"
9,M5B,Downtown Toronto,"Garden District, Ryerso"


## use the .shape method to print the number of rows of your dataframe.

In [70]:
df_toronto_Q1.shape

(103, 3)

In [83]:
!pip install pgeocode

Collecting pgeocode
  Downloading https://files.pythonhosted.org/packages/86/44/519e3db3db84acdeb29e24f2e65991960f13464279b61bde5e9e96909c9d/pgeocode-0.2.1-py2.py3-none-any.whl
Installing collected packages: pgeocode
Successfully installed pgeocode-0.2.1


You are using pip version 18.0, however version 20.2b1 is available.
You should consider upgrading via the 'python -m pip install --upgrade pip' command.


## Get the latitude and the longitude coordinates of each neighborhood
## Create dataframe with latitude and the longitude

In [85]:
from geopy.geocoders import Nominatim  # import geocoder
import pgeocode

nomi = pgeocode.Nominatim('ca')

for zip_cd in df_toronto_Q1["PostalCode"]:
    # loop until you get the coordinates
    
    #geolocator = Nominatim(user_agent="Toronto_Exp")
    #location = geolocator.geocode({"postalcode": 'M1T' , 'countryRegion': 'Canada'})
    location = nomi.query_postal_code(zip_cd)
    
    latitude = location.latitude
    longitude = location.longitude
    
    df_toronto_Q1.loc[df_toronto_Q1["PostalCode"] == zip_cd, "latitude" ] = latitude
    df_toronto_Q1.loc[df_toronto_Q1["PostalCode"] == zip_cd, "longitude" ] = longitude
    
    #print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))
    
df_toronto_Q1.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwood,43.7545,-79.33
1,M4A,North York,Victoria Villag,43.7276,-79.3148
2,M5A,Downtown Toronto,"Regent Park, Harbourfron",43.6555,-79.3626
3,M6A,North York,"Lawrence Manor, Lawrence Height",43.7223,-79.4504
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Governmen",43.6641,-79.3889


In [97]:
df_toronto_Q1 = df_toronto_Q1.dropna()
df_toronto_Q1 = df_toronto_Q1.reset_index(drop=True)

## Mapping neighbourhood present in the dataframe with Folium

In [98]:
import numpy as np
df_toronto_Q1[df_toronto_Q1['longitude'] == np.nan]

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude


In [100]:
import folium
import matplotlib.cm as cm
import matplotlib.colors as colors

# create map of New York using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto_Q1['latitude'], df_toronto_Q1['longitude'], df_toronto_Q1['Borough'], df_toronto_Q1['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

In [102]:
NorthYork_data = df_toronto_Q1[df_toronto_Q1['Borough'] == 'North York'].reset_index(drop=True)
NorthYork_data.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,latitude,longitude
0,M3A,North York,Parkwood,43.7545,-79.33
1,M4A,North York,Victoria Villag,43.7276,-79.3148
2,M6A,North York,"Lawrence Manor, Lawrence Height",43.7223,-79.4504
3,M3B,North York,Don Mill,43.745,-79.359
4,M6B,North York,Glencair,43.7081,-79.4479
