# IBM Data Science Capstone Notebook

This Jupyter Notebook contains the work for the Capstone section of the IBM Data Science Professional Certificate course.

In [102]:
from io import StringIO

import numpy as np
import pandas as pd

In [103]:
# Smoke test: confirms the kernel is running and executing cells.
greeting = "Hello Capstone Project Course!"
print(greeting)

Hello Capstone Project Course!


## Part 1.1: Segmenting and Clustering Neighborhoods in Toronto

In [104]:
import requests
from bs4 import BeautifulSoup

### Use requests to call url

In [105]:
# Wikipedia page listing the Canadian postal codes that start with "M" (Toronto).
url = "https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M"

# A timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed as if it were the postal-code table.
res = requests.get(url, timeout=30)
res.raise_for_status()

### Use soup to scrape and find table

In [106]:
# Parse the downloaded page and take the first <table> element on it.
soup = BeautifulSoup(res.content, 'lxml')
tables = soup.find_all('table')
if not tables:
    # Fail with a clear message rather than an IndexError on tables[0].
    raise ValueError("No <table> element found in the fetched page")
table = tables[0]

# Add Table to dataframe
# read_html returns a *list* of DataFrames (one per table parsed), so `df` is a
# list and the postal-code frame is df[0]. The markup is wrapped in StringIO
# because passing literal HTML text directly is deprecated in recent pandas.
df = pd.read_html(StringIO(str(table)))

### Convert html->json, Read from json

In [107]:
# Round-trip the scraped table through JSON to build the working frame.
# The JSON text is wrapped in StringIO because passing a literal JSON string to
# read_json is deprecated in recent pandas; sort_index() restores row order
# after the round trip.
df_can = pd.read_json(StringIO(df[0].to_json())).sort_index()
df_can.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront


### Drop 'Not assigned' Boroughs

In [108]:
# Row labels of postal codes whose borough is the placeholder 'Not assigned'.
df_filter = df_can.loc[df_can['Borough'] == 'Not assigned'].index

# Rebind to the filtered frame instead of dropping in place, so no previously
# displayed object is silently mutated and the cell stays safe to re-run.
df_can = df_can.drop(index=df_filter)
df_can.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M6A,North York,Lawrence Heights
6,M6A,North York,Lawrence Manor


### Find 'Not assigned' Neighbourhoods

In [109]:
# Show any rows whose neighbourhood is still the 'Not assigned' placeholder.
df_can.query("Neighbourhood == 'Not assigned'")

Unnamed: 0,Postcode,Borough,Neighbourhood


In [110]:
# Where the neighbourhood is the 'Not assigned' placeholder, fall back to that
# row's borough name. mask() pairs each row with its own borough by index, and
# assigning the result back to the column updates df_can itself; a chained
# in-place replace on a selected column is not guaranteed to write through.
df_can['Neighbourhood'] = df_can['Neighbourhood'].mask(
    df_can['Neighbourhood'] == 'Not assigned',
    df_can['Borough'],
)

# Spot check: display the rows whose borough is "Queen's Park".
df_can[df_can['Borough'] == "Queen's Park"]

Unnamed: 0,Postcode,Borough,Neighbourhood


### Duplicate Handling

In [111]:
def report_postcode_duplicates(frame):
    """Print whether any 'Postcode' value appears in more than one row of `frame`."""
    if frame['Postcode'].duplicated().any():
        print("Duplicates Exist!")
    else:
        print("No duplicates!")


# Checking for duplicates
report_postcode_duplicates(df_can)

# Handling duplicate rows: collapse to one row per postcode, keeping the first
# borough seen and joining all of that postcode's neighbourhoods with ', '.
df_dupe = df_can.groupby(['Postcode']).agg({'Borough': 'first', 'Neighbourhood': ', '.join})
# groupby moved 'Postcode' into the index; bring it back as a regular column.
df_can = df_dupe.reset_index()

# Checking for duplicates
report_postcode_duplicates(df_can)

Duplicates Exist!
No duplicates!


In [112]:
# Preview the first five rows of the de-duplicated table.
df_can.iloc[:5]

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [113]:
# (rows, columns) of the de-duplicated table.
(len(df_can.index), len(df_can.columns))

(103, 3)

## Part 1.2: Getting Latitude and Longitude From Postal Codes

### Using Geocoder

_EDIT: The geocoder module could not be imported in this Jupyter Notebook, so the code below is left disabled._

In [114]:
"""
import geocoder # import geocoder

# initialize your variable to None
lat_lng_coords = None

# loop until you get the coordinates
while(lat_lng_coords is None):
  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))
  lat_lng_coords = g.latlng

latitude = lat_lng_coords[0]
longitude = lat_lng_coords[1]
"""

"\nimport geocoder # import geocoder\n\n# initialize your variable to None\nlat_lng_coords = None\n\n# loop until you get the coordinates\nwhile(lat_lng_coords is None):\n  g = geocoder.google('{}, Toronto, Ontario'.format(postal_code))\n  lat_lng_coords = g.latlng\n\nlatitude = lat_lng_coords[0]\nlongitude = lat_lng_coords[1]\n"

### Using Provided csv

In [115]:
# Read the provided coordinates CSV and rename its key column to 'Postcode' so
# it matches the column name used in df_can.
df_latlong = (
    pd.read_csv('http://cocl.us/Geospatial_data')
    .rename(columns={'Postal Code': 'Postcode'})
)
df_latlong.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [118]:
# Attach Latitude/Longitude to each postal code by looking up 'Postcode' in the
# coordinates table. Coordinate columns left over from an earlier run of this
# cell are dropped first, so re-running it does not fail on overlapping column
# names.
df_can = (
    df_can
    .drop(columns=df_latlong.columns.drop('Postcode'), errors='ignore')
    .join(df_latlong.set_index('Postcode'), on='Postcode')
)
df_can.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


In [119]:
# (rows, columns) after the coordinate columns were added.
(len(df_can.index), len(df_can.columns))

(103, 5)