# <div align="center">Capstone Assignment Notebook</div>

<div align="center">-Sanjeet Manchanda</div>

#    Assignment - Part 1

In [1]:
#Import urllib library to webscrap
import urllib.request
# import the BeautifulSoup library so we can parse HTML and XML documents
from bs4 import BeautifulSoup

In [2]:
# Preparing for scraping: the Wikipedia page listing Canadian postal codes starting with "M"
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# Open the url with urllib.request inside a context manager so the HTTP
# response is closed as soon as parsing finishes (the original left it open).
with urllib.request.urlopen(url) as page:
    # parse the HTML from our URL into the BeautifulSoup parse tree format
    soup = BeautifulSoup(page, "lxml")


In [3]:
# Use 'find_all' to bring back every 'table' tag in the HTML and keep them in
# 'all_tables' for inspection.
all_tables = soup.find_all("table")

# Per the original note, our table is the only sortable one, so isolate it by
# its 'wikitable sortable' class string.
right_table = soup.find('table', class_='wikitable sortable')

# soup.find returns None when nothing matches; fail here with a clear message
# instead of an AttributeError in the row-extraction cell further down.
if right_table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")


In [4]:
# Temporary placeholders holding the values of each column:
# A -> first cell, B -> second cell, C -> third cell of every table row.
A = []
B = []
C = []

# find_all is the current spelling of the legacy findAll alias (and matches the
# call used when the tables were collected); string=True is the current name of
# the older text=True keyword. Both select exactly what the original did.
for row in right_table.find_all('tr'):
    cells = row.find_all('td')
    # Only rows with exactly three <td> cells are data rows; header rows
    # (which have no <td> cells) are skipped.
    if len(cells) != 3:
        continue
    # Keep the first text node of each cell, as in the original code.
    A.append(cells[0].find(string=True))
    B.append(cells[1].find(string=True))
    C.append(cells[2].find(string=True))
        
        


In [5]:
# Import pandas and assemble the scraped columns into a single DataFrame
import pandas as pd

# One dict, one constructor call: keys become the column names in this order.
scraped_columns = {
    'PostalCode': A,
    'Borough': B,
    'Neighbourhood': C,
}
df = pd.DataFrame(scraped_columns)

df

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A\n,Not assigned\n,Not assigned\n
1,M2A\n,Not assigned\n,Not assigned\n
2,M3A\n,North York\n,Parkwoods\n
3,M4A\n,North York\n,Victoria Village\n
4,M5A\n,Downtown Toronto\n,"Regent Park, Harbourfront\n"
...,...,...,...
175,M5Z\n,Not assigned\n,Not assigned\n
176,M6Z\n,Not assigned\n,Not assigned\n
177,M7Z\n,Not assigned\n,Not assigned\n
178,M8Z\n,Etobicoke\n,"Mimico NW, The Queensway West, South of Bloor,..."


In [6]:
# Cleaning up the data for garbage characters in all columns

def _strip_garbage(value):
    """Return `value` without leading '+'/'-' characters and trailing newlines."""
    return value.lstrip('+-').rstrip('\n')

# DataFrame.applymap is deprecated in recent pandas releases; applying
# Series.map column by column performs the same element-wise cleanup and works
# on both old and new pandas versions.
df = df.apply(lambda column: column.map(_strip_garbage))

df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [7]:
# Rows without any neighbourhood assigned are of no use to us, so remove every
# row whose Neighbourhood is 'Not assigned'.
unassigned_labels = df.index[df['Neighbourhood'] == 'Not assigned']
df.drop(index=unassigned_labels, inplace=True)

df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


# End of Assignment Part 1

# Assignment Part 2 - Start

In [8]:
# Download the geospatial CSV with urllib (imported in the first cell) instead
# of shelling out to wget: this works on platforms without wget, and a failed
# download raises an exception instead of still printing the success message.
geo_data_url = 'http://cocl.us/Geospatial_data'
with urllib.request.urlopen(geo_data_url) as response, open('geo_data.csv', 'wb') as geo_file:
    geo_file.write(response.read())
print('Data downloaded!')


    
    

Data downloaded!


In [9]:
import pandas as pd

In [10]:
# Load the downloaded postal-code coordinates into a DataFrame
data = pd.read_csv(filepath_or_buffer="geo_data.csv")

In [11]:
# Preview the first five rows of the coordinates table
data.head(n=5)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [12]:
# Rename the postal-code column header so it matches the one in our original
# data frame and the two can be compared/joined on it.
column_renames = {"Postal Code": "PostalCode"}
data = data.rename(columns=column_renames)

data.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [13]:
# Combine the two data frames on their shared PostalCode column to get the
# final data to work on (inner join, which is the merge default).
tor_df = df.merge(data, how='inner', on='PostalCode')

tor_df.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Let's do a High Level Analysis of our final data

In [14]:


# Count distinct boroughs (NaN, if present, counts once, like len(unique()))
# and the number of rows in the merged frame, then report both.
borough_count = tor_df['Borough'].nunique(dropna=False)
row_count = len(tor_df)
summary_template = 'The Datafram has {} boroughs and {} neighborhoods.'
print(summary_template.format(borough_count, row_count))

The Datafram has 10 boroughs and 103 neighborhoods.


# Install Libraries for Maps

In [17]:
!pip install folium

import folium

print('Folium installed and imported!')


Folium installed and imported!


# Greater Toronto with Neighbourhoods superimposed on a Map

In [18]:
# Toronto coordinates, used to centre the map
latitude, longitude = 43.651070, -79.347015

coordinates_message = 'By Googles courtsey, we know The geograpical coordinate of GTA are {}, {}.'
print(coordinates_message.format(latitude, longitude))

By Googles courtsey, we know The geograpical coordinate of GTA are 43.65107, -79.347015.


In [19]:
# Create a map of the GTA centred on the latitude and longitude values
map_GTA = folium.Map(location=[latitude, longitude], zoom_start=11)

# Add one circle marker per row of tor_df, with the neighbourhood name(s) as popup text
for lat, lng, label in zip(tor_df['Latitude'], tor_df['Longitude'], tor_df['Neighbourhood']):
    # parse_html=True is kept from the original Popup call.
    popup = folium.Popup(label, parse_html=True)
    # The original also passed parse_html=False to CircleMarker; that keyword
    # belongs to Popup, not CircleMarker, so it is dropped here.
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=popup,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_GTA)

map_GTA