# Notebook for Capstone Project

# Import libraries

In [125]:
import pandas as pd
import numpy as npù
!pip install bs4

from bs4 import BeautifulSoup
import requests



# Get the html from url

In [126]:
headers = {'Accept-Encoding': 'identity'}
r = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M', headers=headers)

# Use BeatifulSoup library to analyze html and show it with prettify method

In [None]:
soup = BeautifulSoup(r.text, 'html5lib')
print(soup.prettify())

# Parse the html in order to find infos as suggested and create the ngbr_toronto dataframe

In [128]:
table_contents=[]
table=soup.find('table')
for row in table.findAll('td'):
    cell = {}
    if row.span.text=='Not assigned':
        pass
    else:
        cell['PostalCode'] = row.p.text[:3]
        cell['Borough'] = (row.span.text).split('(')[0]
        cell['Neighborhood'] = (((((row.span.text).split('(')[1]).strip(')')).replace(' /',',')).replace(')',' ')).strip(' ')
        table_contents.append(cell)

# print(table_contents)
ngbr_toronto=pd.DataFrame(table_contents)
ngbr_toronto['Borough']=ngbr_toronto['Borough'].replace({'Downtown TorontoStn A PO Boxes25 The Esplanade':'Downtown Toronto Stn A',
                                             'East TorontoBusiness reply mail Processing Centre969 Eastern':'East Toronto Business',
                                             'EtobicokeNorthwest':'Etobicoke Northwest','East YorkEast Toronto':'East York/East Toronto',
                                             'MississaugaCanada Post Gateway Processing Centre':'Mississauga'})

ngbr_toronto.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Queen's Park,Ontario Provincial Government


 # Group on PostalCode (it will automatically combine on the same Neighborhood values) and than replace all 'Not Assigned' values in Neighborhood with the value of the proper Borough 

In [129]:
ngbr_toronto.set_index('PostalCode', drop=False, inplace=True)
ngbr_toronto.rename(columns={'PostalCode': 'Postal Code'}, inplace=True)
ngbr_toronto = ngbr_toronto.groupby(by='PostalCode').sum()
ngbr_toronto.reset_index(drop=True, inplace=True)
ngbr_toronto['Neighborhood'] = ngbr_toronto.apply(lambda row: row['Borough'] if row['Neighborhood'] == 'Not Assigned' else row['Neighborhood'], axis=1)

# Print the shape of the created dataframe

In [130]:
ngbr_toronto.shape

(103, 3)

# Merge Coordinates Informations

In [131]:
coordinates_df = pd.read_csv('/content/Geospatial_Coordinates.csv')
ngbr_toronto = pd.merge(ngbr_toronto, coordinates_df, on='Postal Code')
ngbr_toronto.head(10)

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park, Ionview, East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile, Clairlea, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside, Cliffcrest, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


# Create map with all coordinates

In [132]:
import folium
map_toronto = folium.Map(zoom_start=1)

# add markers to map
for lat, lng, borough, neighborhood in zip(ngbr_toronto['Latitude'], ngbr_toronto['Longitude'], ngbr_toronto['Borough'], ngbr_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

# Let's use Foursquare API to explore neighborhood in ngbr_toronto dataframe

## Foursquare access

In [133]:
CLIENT_ID = 'EWL1F4APWLEDLJEIN1YA2VUZV4VUI5ZI2XS13P2T5HTGWWRT' # your Foursquare ID
CLIENT_SECRET = 'PKB1DKE4HHLRRLEX01WESNZBQD0DGPPGIFAMHHB0KYK1B1TC' # your Foursquare Secret
ACCESS_TOKEN = 'FSWGHGNOWYZJCNZ3D1PGRW0JKHL4J21MLY3SBUYZFBBQA1NM' # your FourSquare Access Token
VERSION = '20180605' # Foursquare API version
LIMIT = 100 # A default Foursquare API limit value

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: EWL1F4APWLEDLJEIN1YA2VUZV4VUI5ZI2XS13P2T5HTGWWRT
CLIENT_SECRET:PKB1DKE4HHLRRLEX01WESNZBQD0DGPPGIFAMHHB0KYK1B1TC


## Get one neighborhood with max occurences

In [134]:
max_occurences = ngbr_toronto.loc[:, 'Neighborhood'].value_counts().idxmax()
max_occurences = max_occurences.split(',')[0]

## Get the top 50 venue for the neighborhood with max occurences

In [135]:
neighborhood_latitude = ngbr_toronto[ngbr_toronto['Neighborhood'] ==  max_occurences]['Latitude'].iloc[0]
neighborhood_longitude = ngbr_toronto[ngbr_toronto['Neighborhood'] ==  max_occurences]['Longitude'].iloc[0]

limit = 50
radius = 1000  # I choose 1Km because the are not any results in 500m

# Create the url
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    limit)

## Save the result in a json object

In [136]:
results = requests.get(url).json()['response']['groups'][0]['items']

# As it has been done for the NY project, I parse the results object and I save it into a dataframe for further analysis

In [137]:
venues = [(
    v['venue']['name'], 
    v['venue']['location']['lat'], 
    v['venue']['location']['lng'],  
    v['venue']['categories'][0]['name']) for v in results]

venues_df = pd.DataFrame([a_vanue for a_vanue in venues], columns=['Vanue Name', 'Latitude', 'Longitude', 'Type'])
venues_df

Unnamed: 0,Vanue Name,Latitude,Longitude,Type
0,Java Joe's Village Cafe,43.662461,-79.532054,Café
1,St Georges Golf and Country Club,43.674395,-79.537142,Golf Course
2,TD Canada Trust,43.662545,-79.531749,Bank
3,Shoppers Drug Mart,43.663067,-79.531753,Pharmacy
4,COBS Bread,43.66494,-79.520485,Bakery
5,Thorncrest Drug Store,43.662988,-79.531817,Pharmacy
6,Foodland - Toronto,43.662724,-79.531984,Grocery Store
7,Thorncrest Plaza,43.66262,-79.532146,Shopping Mall
8,Princess Margaret Park,43.667835,-79.539934,Playground
9,Bay Carpentry and Contracting,43.663892,-79.52621,Home Service


# Try to count each Type of vanue inside the new dataframe

In [138]:
count_venue_type = venues_df[['Type']].groupby(by='Type').size()
count_venue_type = pd.DataFrame(count_venue_type, columns=['Count'])
count_venue_type = count_venue_type.sort_values(by='Count', ascending=False)
count_venue_type

Unnamed: 0_level_0,Count
Type,Unnamed: 1_level_1
Pharmacy,2
Bakery,1
Bank,1
Café,1
Convenience Store,1
Golf Course,1
Grocery Store,1
Home Service,1
Park,1
Playground,1


# We discover that there are two Pharmacy in the nearest km, while every other km have only 1 per type. Where they are? Discover it with a map!

In [139]:
map_pharmacy = folium.Map(zoom_start=1)
venues_df_pharmacy = venues_df[venues_df['Type'] == 'Pharmacy']
venues_df_pharmacy

Unnamed: 0,Vanue Name,Latitude,Longitude,Type
3,Shoppers Drug Mart,43.663067,-79.531753,Pharmacy
5,Thorncrest Drug Store,43.662988,-79.531817,Pharmacy


In [140]:
# add markers to map
for value_name, lat, lng in zip(venues_df_pharmacy['Vanue Name'], venues_df_pharmacy['Latitude'], venues_df_pharmacy['Longitude']):
    label = '{}, {}'.format(value_name, 'Pharmacy')

    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_pharmacy)  
    
map_pharmacy

They are so close!

# Let's cluster all the points that contains the word 'Toronto' in the Neighborood feature

## So, let's take the originale ngbr_toronto dataframe, and cut it on the condition

In [141]:
ngbr_toronto_cutted = ngbr_toronto[ngbr_toronto['Neighborhood'].str.contains('Toronto')]
ngbr_toronto_cutted

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
46,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
59,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
60,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576
66,M5S,Downtown Toronto,"University of Toronto, Harbord",43.662696,-79.400049
88,M8V,Etobicoke,"New Toronto, Mimico South, Humber Bay Shores",43.605647,-79.501321


## Now prepare the clustering

## We only keep some columns

In [142]:
ngbr_toronto_cutted_for_cluster = ngbr_toronto_cutted[['Latitude', 'Longitude']]

## Apply KMeans to group in two cluster the element

In [146]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=2, random_state=0).fit(ngbr_toronto_cutted_for_cluster)
ngbr_toronto_cutted.insert(0, 'Cluster Labels', kmeans.labels_)
ngbr_toronto_cutted

Unnamed: 0,Cluster Labels,Postal Code,Borough,Neighborhood,Latitude,Longitude
46,1,M4R,Central Toronto,North Toronto West,43.715383,-79.405678
59,1,M5J,Downtown Toronto,"Harbourfront East, Union Station, Toronto Islands",43.640816,-79.381752
60,1,M5K,Downtown Toronto,"Toronto Dominion Centre, Design Exchange",43.647177,-79.381576
66,1,M5S,Downtown Toronto,"University of Toronto, Harbord",43.662696,-79.400049
88,0,M8V,Etobicoke,"New Toronto, Mimico South, Humber Bay Shores",43.605647,-79.501321


# Plot them

In [147]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors
# create map
map_clusters = folium.Map()

# set color scheme for the clusters
kclusters = 2
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(ngbr_toronto_cutted['Latitude'], ngbr_toronto_cutted['Longitude'], ngbr_toronto_cutted['Neighborhood'], ngbr_toronto_cutted['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters