In [13]:
import random  # library for random number generation
from io import StringIO  # wraps HTML text in a file-like object for pd.read_html

import lxml
import numpy as np  # library to handle data
import pandas as pd  # library for data analysis
import requests  # library to handle requests
from bs4 import BeautifulSoup
from IPython.display import display_html

In [24]:
# perform web scrapping, use BeautifulSoup to parse and beautify the raw xml data
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
xml_data=BeautifulSoup(source,'lxml')
#print(soup.title)
from IPython.display import display_html
data_table = str(xml_data.table)
display_html(data_table,raw=True)

Postal Code,Borough,Neighborhood
M1A,Not assigned,Not assigned
M2A,Not assigned,Not assigned
M3A,North York,Parkwoods
M4A,North York,Victoria Village
M5A,Downtown Toronto,"Regent Park, Harbourfront"
M6A,North York,"Lawrence Manor, Lawrence Heights"
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
M8A,Not assigned,Not assigned
M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
M1B,Scarborough,"Malvern, Rouge"


In [25]:
# Convert the scraped HTML table into a DataFrame so that we can clean and preprocess it.
# pd.read_html returns a list of DataFrames (one per table it parses); element 0 is the
# postal-code table extracted earlier into data_table.
# The markup is wrapped in StringIO because passing a literal HTML string to
# pd.read_html is deprecated (pandas >= 2.1); a file-like object works on all versions.
toronto_df = pd.read_html(StringIO(data_table))
codes_df = toronto_df[0]
codes_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [22]:
# Keep only the postal codes that have actually been assigned to a borough
has_borough = codes_df['Borough'] != 'Not assigned'
borough_df = codes_df[has_borough]

# Collapse rows that share a postal code and borough into a single row,
# joining the remaining text values (the neighbourhood names) with ', '
pc_borough_df = (
    borough_df
    .groupby(['Postal Code', 'Borough'], sort=False)
    .agg(', '.join)
    .reset_index()
)

# Where the neighbourhood is still 'Not assigned', fall back to the borough name
neighbourhood_missing = pc_borough_df['Neighborhood'] == 'Not assigned'
pc_borough_df['Neighborhood'] = np.where(
    neighbourhood_missing,
    pc_borough_df['Borough'],
    pc_borough_df['Neighborhood'],
)

pc_borough_df

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


In [26]:
# Let's check the shape of the cleaned DataFrame as a (rows, columns) tuple
pc_borough_df.shape

(103, 3)

**This completes part 1 of the project.**

**Part 2 - get the latitude and the longitude coordinates of each neighborhood**

***I will be using the Geospatial data CSV to read the latitude and longitude information.***

In [32]:
# Load the per-postal-code latitude/longitude table directly from the hosted CSV
GEOSPATIAL_CSV_URL = 'https://cocl.us/Geospatial_data'
lat_long = pd.read_csv(GEOSPATIAL_CSV_URL)
lat_long.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [35]:
# Attach coordinates to every postal code by joining the two frames on 'Postal Code'.
# The join is an inner join (the pandas default, spelled out here), so only postal
# codes present in both frames are kept.
merged_df = pc_borough_df.merge(lat_long, how='inner', on='Postal Code')
merged_df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [36]:
# Select the rows whose Borough name includes the literal text 'Toronto'
# (regex=False makes this a plain substring match)
is_toronto = merged_df['Borough'].str.contains('Toronto', regex=False)
new_toronto_df = merged_df.loc[is_toronto]
new_toronto_df

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031
20,M5E,Downtown Toronto,Berczy Park,43.644771,-79.373306
24,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
25,M6G,Downtown Toronto,Christie,43.669542,-79.422564
30,M5H,Downtown Toronto,"Richmond, Adelaide, King",43.650571,-79.384568
31,M6H,West Toronto,"Dufferin, Dovercourt Village",43.669005,-79.442259


**End of part 2**

**Begin part 3 - Generate Toronto maps. Note: Since GitHub cannot display the maps, I have included a screenshot of the map in the Readme.md file.**

In [42]:
from geopy.geocoders import Nominatim  # module to convert an address into latitude and longitude values
from IPython.display import Image  # libraries for displaying images
from IPython.core.display import HTML
import folium  # plotting library

# Base map centred on the given Toronto coordinates
map_toronto = folium.Map(location=[43.651070, -79.347015], zoom_start=10)

# Draw one circle marker per row, with a "<neighbourhood>, <borough>" popup
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighborhood']
for lat, lng, borough, neighbourhood in zip(*(new_toronto_df[col] for col in marker_columns)):
    label_text = '{}, {}'.format(neighbourhood, borough)
    label = folium.Popup(label_text, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        # NOTE(review): parse_html does not look like a documented CircleMarker option
        # and is presumably ignored - confirm against the installed folium before removing.
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto