# IBM Data Science Professional Certificate
## Applied Data Science Capstone


# PART 1: Scraping Wikipedia page to obtain the table of postal codes
https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M

In [1]:
from bs4 import BeautifulSoup
import requests

## 1. Load page content

In [2]:
targetPage = r'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as if it were the article.
response = requests.get(targetPage, timeout=30)
response.raise_for_status()
source = response.text

## 2. Parse HTML
### 2.1 Create soup

In [3]:
soup = BeautifulSoup(source, 'lxml')

### 2.2 Extract table tag

In [4]:
# find() returns only the first <table> element, or None when the page has no table at all.
firstTable = soup.find('table')
if firstTable is None:
    raise ValueError('No <table> element found in the downloaded page')
tableStr = firstTable.prettify() # The HTML table tag as a string

## 3. Convert HTML table to pandas dataframe
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_html.html

In [5]:
import io

import pandas as pd

# read_html returns a list of dataframes, one per table tag found in the input.
# - The markup is wrapped in StringIO because passing a literal HTML string is deprecated in recent pandas.
# - header=0 denotes that the first row contains the column labels.
# - No `match` argument: it is a regex applied to the table text, and the previous value 'str'
#   was just the parameter's type name, so the table only matched by coincidence.
df = pd.read_html(io.StringIO(tableStr), header=0)
postalcodes = df[0] # First dataframe in the list
postalcodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 289 entries, 0 to 288
Data columns (total 3 columns):
Postcode         289 non-null object
Borough          289 non-null object
Neighbourhood    289 non-null object
dtypes: object(3)
memory usage: 6.9+ KB


In [6]:
postalcodes.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


## 4. Pre-Processing
### 4.1 Filter out the records that do not have an assigned borough

In [7]:
# Keep only the rows whose borough is assigned, then renumber the index from 0.
hasBorough = postalcodes['Borough'] != 'Not assigned'
postalcodes = postalcodes[hasBorough].reset_index(drop=True)
postalcodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212 entries, 0 to 211
Data columns (total 3 columns):
Postcode         212 non-null object
Borough          212 non-null object
Neighbourhood    212 non-null object
dtypes: object(3)
memory usage: 5.0+ KB


In [8]:
postalcodes.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


### 4.2 Collapse records so that each postal code has one row (more than one neighbourhood can exist in one postal code area)

In [9]:
# One row per (Postcode, Borough) pair: the neighbourhood names of each group
# are concatenated into a single comma-separated string.
neighbourhoodsByCode = postalcodes.groupby(['Postcode', 'Borough'])['Neighbourhood']
postalcodes = neighbourhoodsByCode.apply(','.join).reset_index()
postalcodes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
Postcode         103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
dtypes: object(3)
memory usage: 2.5+ KB


In [10]:
# Rename by label rather than overwriting .columns positionally: a positional list silently
# mislabels the data if the column order ever changes, whereas rename() only touches
# 'Postcode' and is a no-op when the cell is re-run.
postalcodes = postalcodes.rename(columns={'Postcode': 'PostalCode'})
postalcodes.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### 4.3 Assign Neighbourhood=Borough for the records with Neighbourhood='Not assigned'

In [11]:
postalcodes.loc[postalcodes.Neighbourhood=='Not assigned'].reset_index(drop=True)

Unnamed: 0,PostalCode,Borough,Neighbourhood
0,M7A,Queen's Park,Not assigned


In [12]:
# Where no neighbourhood is assigned, fall back to the borough name of the same row.
isUnassigned = postalcodes['Neighbourhood'] == 'Not assigned'
postalcodes.loc[isUnassigned, 'Neighbourhood'] = postalcodes['Borough']

In [13]:
postalcodes.loc[postalcodes.Neighbourhood=='Not assigned'].reset_index(drop=True) # Sanity check: no rows should satisfy this condition any more

Unnamed: 0,PostalCode,Borough,Neighbourhood


## 5. Number of rows in the dataframe

In [14]:
# The number of rows equals the length of the dataframe's index.
rowCount = len(postalcodes.index)
print('Number of Rows =', rowCount)

Number of Rows = 103


<hr>

# PART 2: Map Postal Codes to Geographical Coordinates

In [15]:
#!conda install -c conda-forge geocoder --yes
import geocoder # import geocoder

In [16]:
# Function that converts a postal code to coordinates (it didn't work as expected, so the
# notebook loads the coordinates from a CSV file instead).
# Source: Assignment description (https://www.coursera.org/learn/applied-data-science-capstone/peer/I1bDq/segmenting-and-clustering-neighborhoods-in-toronto/submit)

def PostalCodesToCoordinates(postalCode, max_attempts=10, retry_delay=1.0):
    """Look up the latitude and longitude of a Toronto postal code.

    The geocoder is queried with '<postalCode>, Toronto, Ontario'. A lookup that
    yields no coordinates is retried, but only up to ``max_attempts`` times, so a
    service that never answers cannot hang the notebook.

    Parameters
    ----------
    postalCode : str
        Postal code to look up, e.g. 'M5G'.
    max_attempts : int, optional
        Maximum number of geocoder queries before giving up (must be >= 1).
    retry_delay : float, optional
        Seconds to wait between two consecutive attempts.

    Returns
    -------
    tuple
        ``(latitude, longitude)`` as returned by the geocoder.

    Raises
    ------
    ValueError
        If ``max_attempts`` is smaller than 1.
    RuntimeError
        If no coordinates were obtained after ``max_attempts`` queries.
    """
    import time

    if max_attempts < 1:
        raise ValueError('max_attempts must be at least 1')

    query = '{}, Toronto, Ontario'.format(postalCode)

    for attempt in range(1, max_attempts + 1):
        g = geocoder.google(query)
        lat_lng_coords = g.latlng

        # latlng is empty/None when the lookup failed; anything else is [lat, lng].
        if lat_lng_coords:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude

        # Pause before retrying, but not after the final attempt.
        if attempt < max_attempts:
            time.sleep(retry_delay)

    raise RuntimeError(
        'No coordinates found for {!r} after {} attempts'.format(query, max_attempts))

## 1. Load Geographical Coordinates Dataset

In [17]:
# Read the geospatial CSV into a dataframe and summarise its columns.
geospatialUrl = r'https://cocl.us/Geospatial_data'
gc = pd.read_csv(geospatialUrl)
gc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103 entries, 0 to 102
Data columns (total 3 columns):
Postal Code    103 non-null object
Latitude       103 non-null float64
Longitude      103 non-null float64
dtypes: float64(2), object(1)
memory usage: 2.5+ KB


In [18]:
gc.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


## 2. Merge Geographical Coordinates Dataset with the Postal Code dataset

In [19]:
# Left join: every row of postalcodes is kept and matched on its postal code.
# validate='one_to_one' makes pandas raise a MergeError if either key column contains
# duplicates, which would otherwise silently multiply rows in the result.
# Both key columns ('PostalCode' and 'Postal Code') remain in the merged dataframe.
postalCodesCoord = postalcodes.merge(
    gc,
    left_on='PostalCode',
    right_on='Postal Code',
    how='left',
    validate='one_to_one',
)
postalCodesCoord.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 103 entries, 0 to 102
Data columns (total 6 columns):
PostalCode       103 non-null object
Borough          103 non-null object
Neighbourhood    103 non-null object
Postal Code      103 non-null object
Latitude         103 non-null float64
Longitude        103 non-null float64
dtypes: float64(2), object(4)
memory usage: 5.6+ KB


In [20]:
postalCodesCoord.head()

Unnamed: 0,PostalCode,Borough,Neighbourhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",M1C,43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",M1E,43.763573,-79.188711
3,M1G,Scarborough,Woburn,M1G,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,M1H,43.773136,-79.239476


In [21]:
# Look up a single postal code in the cleaned postal-code table.
postalCode = 'M5G'
isRequestedCode = postalcodes['PostalCode'] == postalCode
postalcodes[isRequestedCode]

Unnamed: 0,PostalCode,Borough,Neighbourhood
57,M5G,Downtown Toronto,Central Bay Street


# PART 3: Explore and Cluster Neighbourhoods in Toronto

In [22]:
#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium 

## 1. Create a map of Toronto using latitude and longitude values

In [32]:
# Coordinates on which the map is initially centred.
latitude, longitude = 43.6487, -79.38544

# Alias for the merged dataframe (the same object, not a copy), used by the plotting code.
neighborhoods = postalCodesCoord

mapToronto = folium.Map(zoom_start=10, location=[latitude, longitude])

## 2. Add markers to map

In [33]:
# Source: DP0701EN-3-3-2-Neighborhoods-New-York-py-v1.0.ipynb (https://www.coursera.org/learn/applied-data-science-capstone/ungradedLti/f0QY7/segmenting-and-clustering-neighborhoods-in-new-york-city)

# Appearance shared by every marker; defined once instead of on each iteration.
markerStyle = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)

# Columns consumed by the loop, in the order in which they are unpacked.
markerColumns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood']

# One circle marker per row, with a "<neighbourhood>, <borough>" popup.
for lat, lng, borough, neighborhood in zip(*(neighborhoods[column] for column in markerColumns)):
    label = folium.Popup('{}, {}'.format(neighborhood, borough), parse_html=True)
    folium.CircleMarker([lat, lng], popup=label, **markerStyle).add_to(mapToronto)

mapToronto

In [36]:
import json
import requests 
from pandas.io.json import json_normalize

In [44]:
# The code was removed by Watson Studio for sharing.

In [45]:
# The code was removed by Watson Studio for sharing.

## Get request

In [46]:
# requests has no default timeout, so without one a stalled connection would block this cell forever.
# The HTTP status is deliberately not checked here: the decoded JSON is displayed below,
# so its 'meta' section stays available for inspection.
results = requests.get(url, timeout=30).json()
results

{'meta': {'code': 200, 'requestId': '5c0de9421ed2194a781afb6e'},
 'response': {'groups': [{'items': [{'reasons': {'count': 0,
       'items': [{'reasonName': 'globalInteractionReason',
         'summary': 'This spot is popular',
         'type': 'general'}]},
      'referralId': 'e-0-50322b6ae4b09116a296568c-0',
      'venue': {'categories': [{'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/nightlife/secretbar_',
          'suffix': '.png'},
         'id': '4bf58dd8d48988d1d4941735',
         'name': 'Speakeasy',
         'pluralName': 'Speakeasies',
         'primary': True,
         'shortName': 'Speakeasy'}],
       'id': '50322b6ae4b09116a296568c',
       'location': {'address': '192 Adelaide St W',
        'cc': 'CA',
        'city': 'Toronto',
        'country': 'Canada',
        'crossStreet': 'at Simcoe St.',
        'distance': 88,
        'formattedAddress': ['192 Adelaide St W (at Simcoe St.)',
         'Toronto ON M5H 0A4',
         'Canada'],
        'labeledLat

In [25]:
import numpy as np

# Set pandas display settings (None removes the limit on displayed columns / rows)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)



#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values



# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans