# Captone Project - Segmenting and Clustering Neighborhoods in Toronto

## PART 1

### Import Necessary Library

In [32]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

#!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim

from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

### Scrap data from Wikipedia Page

In [33]:
#Get website data

import requests
url = requests.get("https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M").text 

from bs4 import BeautifulSoup
soup = BeautifulSoup(url,"lxml")

#Find table
my_table = soup.find('table',{'class':'wikitable sortable'})

### Store data into list, then convert the list into a pandas dataframe

In [34]:
table_data = [] #Create empty list for data

#Obtain value from table and store them to "table_data"
for row in my_table.findAll('tr'):
    cell = row.findAll('td')
    cell_clean = [i.text.replace('\n','') for i in cell] #remove the <td></td>
    if cell_clean != []:
        table_data.append(cell_clean)

#Convert list into pandas dataframe
df = pd.DataFrame(table_data, columns = ['Postal Code','Borough','Neighborhood'])
df.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Clean Data (i.e. remove unassigned borough, replace unassigned neighborhood with borough)

In [35]:
# Remove rows without assigned Borough
df_clean = df.loc[df['Borough'] != "Not assigned"]
df_clean.head()

# Replace Neighborhood with Borough if not assigned
for i in range(0, df_clean.shape[0]):
    if (df_clean.iloc[i,2]) == "Not assigned":
        df_clean.iloc[i,2] = df_clean.iloc[i,1]

df_clean.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"
5,M6A,North York,"Lawrence Manor, Lawrence Heights"
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### Number of rows in the dataframe

In [36]:
print('Number of rows in the dataframe is: ', df_clean.shape[0])

Number of rows in the dataframe is:  103


## PART 2

### Setup geocoder, create a list of latitude and longitude

In [53]:
!pip install geocoder #install geocoder
import geocoder

#initialize your variable to None
lat_lng_coords = None

#create empty lists for latitude and longitude
lat_list = []
lng_list = []

#for loop to append the latitude and longitude of Postal Code into lists
for postal_code in df_clean['Postal Code']:
    g = geocoder.arcgis('{}, Toronto, Ontario'.format(postal_code))
    lat_lng_coords = g.latlng
    if lat_lng_coords != []:
        lat_list.append(lat_lng_coords[0])
        lng_list.append(lat_lng_coords[1])

print('done')

done


### Add Latitude and Longitude into DataFrame

In [52]:
#Add lat_list and lng_list into df_clean
df_clean['Latitude'], df_clean['Longitude'] = lat_list, lng_list
df_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
2,M3A,North York,Parkwoods,43.75188,-79.33036
3,M4A,North York,Victoria Village,43.73042,-79.31282
4,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65514,-79.36265
5,M6A,North York,"Lawrence Manor, Lawrence Heights",43.72321,-79.45141
6,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66449,-79.39302
