# IBM Applied Datascience Capstone By Coursera - Week 3

## Objectives Part 1:

1) Scrape the data from wikipedia

2) Clean and group the data in a pandas dataframe

### 1) Import libraries

In [51]:
import pandas as pd
import numpy as np
import requests
from lxml import html
from bs4 import BeautifulSoup
import json
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

### 2) Scrape data from wikipedia into a dataframe 

In [2]:
# sending request
raw_html = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

In [3]:
# parse the html into a beautifulsoup object
parsed_html = BeautifulSoup(raw_html.content, 'html.parser')

In [21]:
# creating 3 different lists
postalCodeList = []
boroughList = []
neighborhoodList = []

In [22]:
# sorting the data in lists
# using beautifulsoup to first find the table then finding all the rows in it and then getting the data out
for row in parsed_html.find('table').find_all('tr'):
    cells = row.find_all('td')
    if(len(cells) > 0):
        postalCodeList.append(cells[0].text.strip('\n'))
        boroughList.append(cells[1].text.strip('\n'))
        neighborhoodList.append(cells[2].text.strip('\n'))

In [28]:
# create the desired dataframe
df = pd.DataFrame({"PostalCode": postalCodeList,
                           "Borough": boroughList,
                           "Neighborhood": neighborhoodList})

In [31]:
# display top 5 entries of the dataframe
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


##### As seen above, the neighborhoods for the same PostalCode are already grouped

### 3) Clean data by dopping entries having Not Assigned Borough 

In [32]:
# dropping the rows with Borough not assigned
df_cleaned = df[df.Borough != "Not assigned"].reset_index(drop=True)

In [33]:
# display top 5 entries of the dataframe
df_cleaned

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### 4) Assign the values to Not Assigned Neighborhood having Borough with value same as Borough 

In [35]:
# for entries having Neighborhood empty but have a Borough, make the value of Neighborhood the same as Borough
for index, row in df_cleaned.iterrows():
    if row["Neighborhood"] == "Not assigned":
        row["Neighborhood"] = row["Borough"]

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North"
99,M4Y,Downtown Toronto,Church and Wellesley
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C..."
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu..."


### 5) Number of rows of the dataframe

In [38]:
df_cleaned.shape

(103, 3)

## Objectives Part 2:

1) Get the get the longitude latitude data in a dataframe

2) Merge the scraped data of postal codes, borough, and neighborhoods with coordinates data from the CSV file based on postal code 

### 6) Loading postal codes and latlong CSV

In [40]:
# load the latlong from the CSV and rename the column
latlong = pd.read_csv("Geospatial_Coordinates.csv")
latlong.rename(columns={"Postal Code": "PostalCode"}, inplace=True)
latlong.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### 7) Get the final dataframe with latitude and longitude with postal code

In [46]:
# combine the tables based on PostalCode to get the desired dataframe
df_toronto = df_cleaned.merge(latlong, on="PostalCode", how="left")
df_toronto

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.654260,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
...,...,...,...,...,...
98,M8X,Etobicoke,"The Kingsway, Montgomery Road, Old Mill North",43.653654,-79.506944
99,M4Y,Downtown Toronto,Church and Wellesley,43.665860,-79.383160
100,M7Y,East Toronto,"Business reply mail Processing Centre, South C...",43.662744,-79.321558
101,M8Y,Etobicoke,"Old Mill South, King's Mill Park, Sunnylea, Hu...",43.636258,-79.498509


## Objectives Part 3:

1) Build a dataframe with Toronto's postal codes, borough names and neighborhood names.

2) Get the geographical coordinates of the neighborhoods in Toronto.

3) Cluster the neighborhoods in Toronto.

### 8) Use geopy library to get the latitude and longitude values of Toronto

In [53]:
address = 'Toronto'

geolocator = Nominatim(user_agent="m.salikkhan95@gmail.com")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('Geograpical coordinate of Toronto are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Toronto are 43.6534817, -79.3839347.


### 9) Map of Toronoto with neighborhoods marked

In [55]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

### 10) Create a dataframe with only Toronto's Boroughs

In [58]:
borough_names = list(df_toronto.Borough.unique())

toronto_borough = []

for x in borough_names:
    if "toronto" in x.lower():
        toronto_borough.append(x)

print(toronto_borough)

# create a DataFrame with only boroughs that contain the word Toronto
toronto_df_new = df_toronto[df_toronto['Borough'].isin(toronto_borough)].reset_index(drop=True)
print(df_toronto.shape)
df_toronto.head()

['Downtown Toronto', 'East Toronto', 'West Toronto', 'Central Toronto']
(103, 5)


Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


In [61]:
# create map of Toronto using latitude and longitude values
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    
map_toronto

### 11) Neighborhoods exploration using the Foursquare API