# Applied Data Science Capstone

### Segmenting and Clustering Neighborhoods in Toronto 

## Exercise 1

#### Imports

In [4]:
# imports of libraries
import pandas as pd

# imports for pulling data out of HTML
from bs4 import BeautifulSoup
import requests

#### Get the data from Wikipedia's website

In [10]:
# get the postal code table of Toronto from wikipedia
# get the html document; the timeout keeps the cell from hanging forever and
# raise_for_status() stops here on an HTTP error instead of parsing an error page
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
html_toronto = response.text

# get the rows of the first table in the html document
soup = BeautifulSoup(html_toronto, 'html.parser')
rows = soup.table.find_all('tr')



#### Data cleaning

In [11]:
# Build the postal code dataframe from the scraped table rows.
# The records are collected in a plain list and the dataframe is created once:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# copied the whole frame on every iteration.
columns = ['PostalCode', 'Borough', 'Neighborhood']
records = []

# for each row in table
for row in rows:
    # get the text of the row and split it on line breaks;
    # the three values are read from positions 1, 3 and 5 of the split text
    data_list = row.get_text().split('\n')

    # skip rows that do not contain all three positions instead of raising IndexError
    if len(data_list) < 6:
        continue

    records.append([data_list[1], data_list[3], data_list[5]])

df = pd.DataFrame(records, columns=columns)

# drop not assigned borough from dataframe
df = df[df['Borough'] != 'Not assigned']

# drop first row with headers of html table
df = df.iloc[1:]

# reset index of dataframe (reassigned rather than modified in place)
df = df.reset_index(drop=True)

df.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


In [12]:
df.shape

(103, 3)

## Exercise 2

#### Downloading csv file with coordinates

In [20]:
# get the file with requests (already imported above) instead of shelling out to wget,
# so the cell also works where wget is not installed; redirects are followed by default
response = requests.get('https://cocl.us/Geospatial_data', timeout=30)

# stop on an HTTP error so the success message below is only printed for a real download
response.raise_for_status()

# save the raw bytes under the file name the next cell reads
with open('coordinates.csv', 'wb') as csv_file:
    csv_file.write(response.content)

print('Data downloaded!')

#### Convert the file to a dataframe

In [33]:
# load the downloaded csv and, in the same step, rename its "Postal Code"
# column to "PostalCode" so it can serve as the merge key
df_coord = pd.read_csv('coordinates.csv').rename(columns={'Postal Code': 'PostalCode'})
df_coord.head()

Unnamed: 0,PostalCode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


#### Join the dataframes by PostalCode

In [36]:
# Merge the dataframes using PostalCode as key (inner join, pandas' default)
df_result = pd.merge(df, df_coord, on='PostalCode')

# an inner join silently drops postal codes that have no coordinates,
# so list any that went missing instead of losing them unnoticed
unmatched = sorted(set(df['PostalCode']) - set(df_result['PostalCode']))
if unmatched:
    print('Postal codes without coordinates:', unmatched)

# check the result with example; display() is required here because a notebook
# cell only renders its last expression, so a bare expression would show nothing
display(df_result[df_result['PostalCode'] == 'M5G'])

# head of result dataframe
df_result.head(15)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
6,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
7,M3B,North York,Don Mills,43.745906,-79.352188
8,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
