# Applied Data Science Capstone Project

## Toronto Neighborhoods

### Part 1. Data Loading, Preparation and Wrangling

*Importing all necessary libraries*

In [1]:
import urllib
import urllib.request

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

*Downloading the page and parsing it with BeautifulSoup*

In [2]:
# Wikipedia page whose first table is scraped below for postcode, borough and
# neighborhood values.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
# Read the whole page into memory and close the HTTP response right away.
# Keeping the raw bytes (instead of the one-shot response stream) means the
# parsing cell can be re-run without downloading the page again.
with urllib.request.urlopen(url) as response:
    html_doc = response.read()

In [4]:
# Parse the downloaded page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=html_doc, features='html.parser')

*Converting data to a dataframe*

In [5]:
# The first <table> on the page holds the data; collect all of its rows.
table = soup.find('table')
table_rows = table.find_all('tr')

# One list of stripped cell texts per <tr>. Rows without any <td> cells
# produce an empty list and are left out.
cell_texts = ([cell.text.strip() for cell in tr.find_all('td')] for tr in table_rows)
l = [texts for texts in cell_texts if texts]

neigh = pd.DataFrame(l, columns=['Postcode', 'Borough', 'Neighborhood'])
neigh.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
9,M8A,Not assigned,Not assigned


*Removing rows not assigned to any borough*

In [6]:
# Borough values that mark a postcode as not assigned to any borough.
dirty = ['Not assigned']

# Keep only rows whose borough is assigned. The explicit .copy() makes the
# result an independent frame, so column assignments made on it afterwards are
# not treated by pandas as writes to a slice of the scraped table.
neigh = neigh[~neigh['Borough'].isin(dirty)].copy()
neigh.head(10)

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,Harbourfront
5,M5A,Downtown Toronto,Regent Park
6,M6A,North York,Lawrence Heights
7,M6A,North York,Lawrence Manor
8,M7A,Queen's Park,Not assigned
10,M9A,Etobicoke,Islington Avenue
11,M1B,Scarborough,Rouge
12,M1B,Scarborough,Malvern


*Replacing 'Not assigned' neighborhoods with borough names*

In [7]:
# Keep the neighborhood where one is given; where it is 'Not assigned', fall
# back to the borough name of the same row.
# assign() returns a new frame. An in-place where() on the selected column is a
# chained update that pandas warns about and that, with Copy-on-Write enabled,
# leaves the frame unchanged.
neigh = neigh.assign(
    Neighborhood=neigh['Neighborhood'].where(
        neigh['Neighborhood'] != 'Not assigned', neigh['Borough']
    )
)
neigh.shape

(211, 3)

*Grouping rows by postcode and borough, and combining each group's neighborhoods into one comma-separated string*

In [8]:
# Join the neighborhoods of every (Postcode, Borough) group into one
# comma-separated string and broadcast it back to each row of the group.
# After that, all rows of a group are identical, so drop_duplicates() leaves a
# single row per group. assign() builds a new frame instead of writing into
# the existing one.
# A comma-separated string is used rather than a Python list per group: the
# list variant was tried and was more trouble than it is worth for this task.
neigh = neigh.assign(
    Neighborhood=neigh.groupby(['Postcode', 'Borough'])['Neighborhood'].transform(', '.join)
).drop_duplicates()
neigh.head()

Unnamed: 0,Postcode,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Harbourfront, Regent Park"
6,M6A,North York,"Lawrence Heights, Lawrence Manor"
8,M7A,Queen's Park,Queen's Park


In [9]:
# (rows, columns) of the frame after grouping and de-duplication.
neigh.shape

(103, 3)

### Part 2: Getting neighborhood geodata

*Importing libraries* 

In [10]:
import geocoder

In [11]:
# Add placeholder Latitude / Longitude columns (filled with 0) after the
# existing columns. assign() returns a new frame and simply overwrites the
# columns when they already exist, so this cell can be re-run;
# DataFrame.insert raises ValueError for a column that is already present.
neigh = neigh.assign(Latitude=0, Longitude=0)
neigh.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,0,0
3,M4A,North York,Victoria Village,0,0
4,M5A,Downtown Toronto,"Harbourfront, Regent Park",0,0
6,M6A,North York,"Lawrence Heights, Lawrence Manor",0,0
8,M7A,Queen's Park,Queen's Park,0,0


*Defining the geocoding function that takes a postal code and returns its latitude and longitude*

In [12]:
def geolocator(postal_code, max_attempts=3):
    """Look up the coordinates of a postal code with the ArcGIS geocoder.

    Parameters
    ----------
    postal_code : str
        Postal code to look up. It is queried as
        ``"<postal_code>, Toronto, Ontario"``.
    max_attempts : int, optional
        Maximum number of geocoder requests made before giving up
        (default 3).

    Returns
    -------
    tuple
        ``(latitude, longitude)`` taken from the first response that carries
        coordinates. When none of the attempts yields coordinates,
        ``(nan, nan)`` is returned so that one failed lookup does not abort a
        whole ``apply`` over the dataframe.
    """
    query = '{}, Toronto, Ontario'.format(postal_code)

    for _ in range(max_attempts):
        lat_lng_coords = geocoder.arcgis(query).latlng
        # A lookup without a result has no usable latlng (None or empty);
        # indexing it would raise, so try again instead.
        if lat_lng_coords:
            latitude = lat_lng_coords[0]
            longitude = lat_lng_coords[1]
            return latitude, longitude

    # Every attempt failed: report the coordinates as missing.
    return float('nan'), float('nan')

*Geocoding every postal code in the dataframe*

In [13]:
# Work on an independent (deep) copy so the geocoding steps leave neigh as is.
nbhood = neigh.copy(deep=True)
nbhood.head()


Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,0,0
3,M4A,North York,Victoria Village,0,0
4,M5A,Downtown Toronto,"Harbourfront, Regent Park",0,0
6,M6A,North York,"Lawrence Heights, Lawrence Manor",0,0
8,M7A,Queen's Park,Queen's Park,0,0


In [14]:
# Geocode each postcode; every result is a (latitude, longitude) tuple.
# The postcodes are read from nbhood itself rather than from neigh, so the
# cell does not depend on the two frames still sharing the same index.
nbhood['Coordinates'] = nbhood['Postcode'].apply(geolocator)

In [15]:
# Preview the first rows of nbhood.
nbhood.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude,Coordinates
2,M3A,North York,Parkwoods,0,0,"(43.752440000000036, -79.32927072599995)"
3,M4A,North York,Victoria Village,0,0,"(43.730420577000075, -79.31331999999998)"
4,M5A,Downtown Toronto,"Harbourfront, Regent Park",0,0,"(43.65512000000007, -79.36263979699999)"
6,M6A,North York,"Lawrence Heights, Lawrence Manor",0,0,"(43.72312500000004, -79.45158914699994)"
8,M7A,Queen's Park,Queen's Park,0,0,"(43.66110229800006, -79.39103499999999)"


*Separating coordinates into latitude and longitude in the dataframe*

In [16]:
# Expand every coordinate pair into two columns, aligned on nbhood's index,
# and store them in Latitude / Longitude.
nbhood[['Latitude', 'Longitude']] = pd.DataFrame(
    list(nbhood['Coordinates']),
    index=nbhood.index,
)
# The combined column is redundant once it has been split.
nbhood = nbhood.drop(columns='Coordinates')

In [17]:
# Preview the first rows of nbhood.
nbhood.head()

Unnamed: 0,Postcode,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.75244,-79.329271
3,M4A,North York,Victoria Village,43.730421,-79.31332
4,M5A,Downtown Toronto,"Harbourfront, Regent Park",43.65512,-79.36264
6,M6A,North York,"Lawrence Heights, Lawrence Manor",43.723125,-79.451589
8,M7A,Queen's Park,Queen's Park,43.661102,-79.391035
