<a href="https://colab.research.google.com/github/sahil239681/Coursera_Capstone/blob/master/Toronto_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Import Libraries**

In [0]:
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
import json # library to handle JSON files

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import requests # library to handle requests
from bs4 import BeautifulSoup # library to parse HTML and XML documents

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import folium # map rendering library

Scrap data from Wikipedia page into a DataFrame

In [0]:
# send the GET request
data = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text

In [0]:
# parse data from the html into a beautifulsoup object
from bs4 import BeautifulSoup
soup = BeautifulSoup(website_url,'html.parser')

In [0]:
# create three lists to store table data
postalCodeList = []
boroughList = []
neighborhoodList = []

In [0]:
# find the table
soup.find('table').find_all('tr')

In [0]:
# find all the rows of the table
soup.find('table').find_all('tr')

# for each row of the table, find all the table data
for row in soup.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        postalCodeList.append(cells[0].text.rstrip('\n'))
        boroughList.append(cells[1].text.rstrip('\n'))
        neighborhoodList.append(cells[2].text.rstrip('\n')) # avoid new lines in neighborhood cell

In [0]:
toronto_df = pd.DataFrame({"PostalCode": postalCodeList,
                           "Borough": boroughList,
                           "Neighborhood": neighborhoodList})


**Drop cells with a borough that is "Not assigned"**

In [0]:
toronto_df_dropna = toronto_df[toronto_df.Borough != "Not assigned"]

**Group neighborhoods in the same borough**

In [0]:
toronto_df_grouped = toronto_df_dropna.groupby(["PostalCode", "Borough"]).agg(lambda x: ', '.join(x)).reset_index()

**For Neighborhood="Not assigned", make the value the same as Borough**

In [0]:
toronto_df_grouped['Neighborhood'] = list(map(lambda x,y: y if x == "Not assigned" else x  ,toronto_df_grouped['Neighborhood'],toronto_df_grouped['Borough']))

**Print the number of rows of the cleaned dataframe**

In [83]:
toronto_df_grouped.shape

(103, 3)

**Load the coordinates from the csv file on Coursera**

In [0]:
file = pd.read_csv('/content/Geospatial_Coordinates.csv', sep = ',')

In [0]:
file.rename(columns = {'Postal Code':'PostalCode'}, inplace = True)

**Merge two tables to get the coordinates**

In [0]:
Final_Df = pd.merge(toronto_df_grouped, file,on = 'PostalCode', how = 'left')

In [0]:
Final_Df['Neighborhood'] = Final_Df['Neighborhood'].apply(lambda x: x.replace("/",","))

**Use geopy library to get the latitude and longitude values of Toronto**

In [104]:
address = 'Toronto'

geolocator = Nominatim(user_agent="my-application")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


**Create a map of Toronto with neighborhoods superimposed on top**

In [0]:
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(Final_Df['Latitude'], Final_Df['Longitude'], Final_Df['Borough'], Final_Df['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

**Filter only boroughs that contain the word Toronto**

In [109]:
# filter borough names that contain the word Toronto
borough_names = list(Final_Df.Borough.unique())

borough_with_toronto = []

for x in borough_names:
    if "toronto" in x.lower():
        borough_with_toronto.append(x)
        
borough_with_toronto

['East Toronto', 'Central Toronto', 'Downtown Toronto', 'West Toronto']

In [111]:
# create a new DataFrame with only boroughs that contain the word Toronto
toronto_df_new = Final_Df[Final_Df['Borough'].isin(borough_with_toronto)].reset_index(drop=True)
print(toronto_df_new.shape)
toronto_df_new.head()

(39, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M4E,East Toronto,The Beaches,43.676357,-79.293031
1,M4K,East Toronto,"The Danforth West , Riverdale",43.679557,-79.352188
2,M4L,East Toronto,"India Bazaar , The Beaches West",43.668999,-79.315572
3,M4M,East Toronto,Studio District,43.659526,-79.340923
4,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879


In [0]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(toronto_df_new['Latitude'], toronto_df_new['Longitude'], toronto_df_new['Borough'], toronto_df_new['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto