# CAPSTONE PROJECT 2

This project is part of the IBM Data Science Professional Certificate's Capstone Project.

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
# Download the Wikipedia page listing the postal codes that start with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"
response = requests.get(url, timeout=30)  # bounded wait so a stalled connection cannot hang the kernel
response.raise_for_status()               # fail loudly on an HTTP error instead of parsing an error page
source = response.text

soup = BeautifulSoup(source, 'lxml')

# Only the first <table> element on the page is parsed
table = soup.find('table')
if table is None:
    raise ValueError("No <table> element found at " + url)

# Setting the Column Names for the Dataframe
column_names = ['PostalCode','Borough','Neighborhood']

# Collect the stripped text of every <td> cell, one list per <tr>. Rows whose
# cell count does not match the number of columns (for example the header
# row, which has no <td> cells) are skipped.
rows = []
for tr_cell in table.find_all('tr'):
    row_data = [td_cell.text.strip() for td_cell in tr_cell.find_all('td')]
    if len(row_data) == len(column_names):
        rows.append(row_data)

# Build the frame once from the collected rows rather than appending to it
# row by row, which re-allocates the frame on every insert.
df_raw = pd.DataFrame(rows, columns=column_names)

df_raw.head() # checking the data in the dataframe

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


In [3]:
# Drop every row whose Borough is the placeholder 'Not assigned'
df_raw = df_raw.loc[df_raw['Borough'].ne('Not assigned')]

# One row per (PostalCode, Borough) pair; the Neighborhood values of each
# group are concatenated into a single comma-separated string
df_groupby = (
    df_raw
    .groupby(['PostalCode', 'Borough'])['Neighborhood']
    .agg(', '.join)
    .reset_index()
)

# Show the first rows and the dimensions of the grouped frame

print("Head of Grouped Data\n", df_groupby.head())

print("\n\nShape of Grouped Data :", df_groupby.shape)

Head of Grouped Data
   PostalCode      Borough                            Neighborhood
0        M1B  Scarborough                          Rouge, Malvern
1        M1C  Scarborough  Highland Creek, Rouge Hill, Port Union
2        M1E  Scarborough       Guildwood, Morningside, West Hill
3        M1G  Scarborough                                  Woburn
4        M1H  Scarborough                               Cedarbrae


Shape of Grouped Data : (103, 3)


In [5]:
# Reading coordinated from provided csv file
df_coordinates = pd.read_csv('Geospatial_Coordinates.csv')
df_coordinates.rename(columns={'Postal Code':'PostalCode'}, inplace=True) # Renaming Column to match key

# Merging Groupby Data and Coordinate Data into single frame

df_groupby_with_coordinates = pd.merge(df_groupby, df_coordinates, on='PostalCode')

# Printing Data and shape of Dataframe

print("Head of Grouped with Coordinate Data\n",df_groupby_with_coordinates.head())

print("\n\nShape of Grouped with Coordinate Data :",df_groupby_with_coordinates.shape)

Head of Grouped with Coordinate Data
   PostalCode      Borough                            Neighborhood   Latitude  \
0        M1B  Scarborough                          Rouge, Malvern  43.806686   
1        M1C  Scarborough  Highland Creek, Rouge Hill, Port Union  43.784535   
2        M1E  Scarborough       Guildwood, Morningside, West Hill  43.763573   
3        M1G  Scarborough                                  Woburn  43.770992   
4        M1H  Scarborough                               Cedarbrae  43.773136   

   Longitude  
0 -79.194353  
1 -79.160497  
2 -79.188711  
3 -79.216917  
4 -79.239476  


Shape of Grouped with Coordinate Data : (103, 5)
