### Segmenting and Clustering Neighborhoods in Toronto

In [16]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
from pandas.io.json import json_normalize
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import folium

### Part 1

##### Project Part 1: Text Scraping with BeautifulSoup!

1. Start a new notebook for this project.
2. Build the code to scrape the following Wikipedia page.
3. Also create a dataframe according to the requirements in the assignment.

### Get HTML data from wikipedia

In [2]:
# Download the Wikipedia page that lists the Toronto ("M") postal codes.
# The timeout keeps this cell from hanging forever on a stalled connection, and
# raise_for_status() stops the run on an HTTP error instead of letting an error
# page be parsed later as if it were the postal-code table.
response = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', timeout=30)
response.raise_for_status()
Canada_M = response.text

### Scrape PostalCode, Borough, Neighborhood from HTML using BeautifulSoup

In [3]:
# Parse the downloaded HTML with the lxml backend so the table rows can be queried.
# (For an offline run, a locally saved copy of the page can be opened and parsed the same way.)
soup = BeautifulSoup(markup=Canada_M, features="lxml")

In [4]:
# Collect the three table columns (PostalCode, Borough, Neighborhood) as parallel lists
PostalCode = []
Borough = []
Neighborhood = []

# The first <tr> is skipped (it is not a data row).
for row in soup.tbody.find_all('tr')[1:]:
    cells = row.find_all('td')
    if len(cells) < 3:
        # Row without the three expected cells - nothing to record.
        continue
    # strip() removes the surrounding whitespace/newline without cutting off a
    # real character when a cell's text does not end with a newline.
    PostalCode.append(cells[0].get_text().strip())
    Borough.append(cells[1].get_text().strip())
    Neighborhood.append(cells[2].get_text().strip())

In [5]:
# Keep only the postal codes that have a borough; rows marked "Not assigned" are discarded.

data = dict(PostalCode=PostalCode,
            Borough=Borough,
            Neighborhood=Neighborhood)
df = pd.DataFrame(data)

# Filter out the unassigned boroughs and renumber the remaining rows from 0
has_borough = df['Borough'] != 'Not assigned'
df = df.loc[has_borough].reset_index(drop=True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"


### More than one neighborhood can exist in one postal code area; such rows are combined into one row with the neighborhoods separated by commas

In [6]:
# Merge consecutive rows that share a postal code into a single row whose
# Neighborhood is the ", "-separated list of the merged neighborhoods.
# The rows are rebuilt in one pass instead of dropping a row and re-indexing
# the whole frame on every merge, and the result no longer depends on df
# having a fresh 0..n-1 index.
merged_rows = []
for postal_code, borough, neighborhood in zip(df['PostalCode'], df['Borough'], df['Neighborhood']):
    if merged_rows and merged_rows[-1][0] == postal_code:
        # Same postal code as the previous row: append this neighborhood to it
        # (the borough of the first row in the run is kept).
        merged_rows[-1][2] = merged_rows[-1][2] + ', ' + neighborhood
    else:
        merged_rows.append([postal_code, borough, neighborhood])
df = pd.DataFrame(merged_rows, columns=['PostalCode', 'Borough', 'Neighborhood'])

In [7]:
# Collapse every (PostalCode, Borough) pair into one row, joining its neighborhoods with ", "
neighborhoods_by_code = df.groupby(['PostalCode', 'Borough'])['Neighborhood']
df = neighborhoods_by_code.apply(', '.join).reset_index()
df

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
...,...,...,...
98,M9N,York,Weston
99,M9P,Etobicoke,Westmount
100,M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
101,M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


### Print the number of rows of your dataframe by using .shape

In [8]:
# Show the (rows, columns) dimensions of the dataframe
df.shape

(103, 3)

###### Save the dataframe to a CSV file named Torontopart1df.csv

In [9]:
# Write the dataframe to disk; index=False leaves the row index out of the file
df.to_csv('Torontopart1df.csv',index=False)

### Part 2

### Obtain Latitude and Longitude of each PostalCode by using the csv file to create the following dataframe:

In [10]:
# Reload the Part 1 result and key it by postal code so it can be joined on that key
TorontoPostalCodes = pd.read_csv("Torontopart1df.csv")
TorontoPostalCodes = TorontoPostalCodes.set_index("PostalCode")
TorontoPostalCodes

Unnamed: 0_level_0,Borough,Neighborhood
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,Scarborough,"Malvern, Rouge"
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
M1E,Scarborough,"Guildwood, Morningside, West Hill"
M1G,Scarborough,Woburn
M1H,Scarborough,Cedarbrae
...,...,...
M9N,York,Weston
M9P,Etobicoke,Westmount
M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ..."
M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest..."


To make sure that we get the coordinates for all of our neighborhoods, we use a prepared CSV file to retrieve them. Load the CSV with the Toronto geographical coordinates into a dataframe.

In [11]:
# Load the prepared latitude/longitude table and index it by postal code
geocsv_data = pd.read_csv("Geospatial_Coordinates.csv")
geocsv_data = geocsv_data.set_index(keys="PostalCode")
geocsv_data

Unnamed: 0_level_0,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476
...,...,...
M9N,43.706876,-79.518188
M9P,43.696319,-79.532242
M9R,43.688905,-79.554724
M9V,43.739416,-79.588437


In [12]:
# Attach the coordinates to each postal code. Both frames are indexed by PostalCode,
# and the left join keeps every row of TorontoPostalCodes.
Toronto_neighborhoods = TorontoPostalCodes.join(other=geocsv_data, how="left")
Toronto_neighborhoods

Unnamed: 0_level_0,Borough,Neighborhood,Latitude,Longitude
PostalCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
M1G,Scarborough,Woburn,43.770992,-79.216917
M1H,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...
M9N,York,Weston,43.706876,-79.518188
M9P,Etobicoke,Westmount,43.696319,-79.532242
M9R,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
M9V,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437


In [13]:
# PostalCode is the index of Toronto_neighborhoods, so writing with index=False
# alone would silently drop it from the file. reset_index() turns it back into
# an ordinary column first, so the saved CSV keeps the postal codes.
Toronto_neighborhoods.reset_index().to_csv('Torontopart2df.csv', index=False)

In [14]:
# Reload the Part 2 result and look up the venues around each neighborhood.
# The call uses the frame loaded on the next line and its 'Neighborhood' column
# (the spelling used for that column everywhere else in this notebook).
# NOTE(review): getNearbyVenues is not defined in the cells before this one -
# make sure its definition has been run before executing this cell.
Toronto_neighborhoods = pd.read_csv("Torontopart2df.csv")
toronto_venues = getNearbyVenues(names=Toronto_neighborhoods['Neighborhood'],
                                 latitudes=Toronto_neighborhoods['Latitude'],
                                 longitudes=Toronto_neighborhoods['Longitude']
                                )

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,Scarborough,Woburn,43.770992,-79.216917
4,Scarborough,Cedarbrae,43.773136,-79.239476
...,...,...,...,...
98,York,Weston,43.706876,-79.518188
99,Etobicoke,Westmount,43.696319,-79.532242
100,Etobicoke,"Kingsview Village, St. Phillips, Martin Grove ...",43.688905,-79.554724
101,Etobicoke,"South Steeles, Silverstone, Humbergate, Jamest...",43.739416,-79.588437
