# Start of Part 1

### Import required libraries

In [1]:
import os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

### Build scraper

In [2]:
# Download the Wikipedia page that lists the "M" postal codes and parse its HTML.
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
page = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
page.raise_for_status()
page_text = page.text
soup = bs(page_text, 'html.parser')

### Create DataFrame

In [3]:
# Collect every <table> element found in the parsed page.
table = soup.find_all('table')
# read_html parses the stringified tables into DataFrames; only the first one is kept.
df = pd.read_html(str(table))[0]
df.head() 

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


### Clean and Wrangle dataframe
1. Drop all postal codes without a Borough. Check to see if any postal code has multiple rows
2. Combine neighborhoods that share the same postal code
3. Any postal code that has a borough, but not a neighborhood is assigned that borough as its neighborhood


In [4]:
# Step 1: keep only the rows whose borough has been assigned.
df1 = df.loc[df['Borough'].ne('Not assigned')]
# Rows per postal code; evaluated mid-cell, so the result is not displayed.
df1['Postal Code'].value_counts()

# Step 2: collapse rows sharing a postal code/borough, joining the
# neighbourhood names with ", ".
df2 = (
    df1.groupby(['Postal Code', 'Borough'])
       .agg({'Neighbourhood': ', '.join})
       .reset_index()
)

# Step 3: where the neighbourhood is still 'Not assigned', use the borough name.
# df3 is the same object as df2 (no copy), so df2 is updated as well.
df3 = df2
df3['Neighbourhood'] = df3['Neighbourhood'].mask(
    df3['Neighbourhood'].eq('Not assigned'),
    df3['Borough'],
)

df3.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern, Rouge"
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


### Final DataFrame and shape

In [5]:
# final_df is another name for df3 (no copy is made); display its (rows, columns) shape.
final_df = df3
final_df.shape

(103, 3)

# End of Part 1

# Start of Part 2
### I am using IBM Cloud for my project. I downloaded the CSV file and uploaded it to the cloud. The code below was generated by IBM Cloud. Credentials have been removed from the shared notebook.

In [6]:
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3

# Stub iterator method; it is bound to the storage body below only when that
# object has no __iter__ of its own.
def __iter__(self): return 0

# @hidden_cell
# The following code accesses a file in your IBM Cloud Object Storage. It includes your credentials.
# You might want to remove those credentials before you share the notebook.
# NOTE(review): ibm_api_key_id is an empty string here (the notebook states credentials were
# removed), so a real key presumably has to be supplied before this cell can run.
client_d23dce32647949c19c5101de06c7024d = ibm_boto3.client(service_name='s3',
    ibm_api_key_id='',
    ibm_auth_endpoint="https://iam.cloud.ibm.com/oidc/token",
    config=Config(signature_version='oauth'),
    endpoint_url='https://s3-api.us-geo.objectstorage.service.networklayer.com')

# Fetch the stored CSV object and keep its 'Body' entry for reading.
body = client_d23dce32647949c19c5101de06c7024d.get_object(Bucket='datasciencecaptsone-donotdelete-pr-qgohmkknyciimv',Key='Geospatial_Coordinates.csv')['Body']
# add missing __iter__ method, so pandas accepts body as file-like object
if not hasattr(body, "__iter__"): body.__iter__ = types.MethodType( __iter__, body )

# Parse the CSV body into a DataFrame and preview the first rows.
df_data_1 = pd.read_csv(body)
df_data_1.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Merge two DataFrames such that longitude and latitude data are available

In [7]:
# Attach latitude/longitude to every postal code; rows are matched on the
# shared 'Postal Code' column using the default inner join.
final_df = pd.merge(final_df, df_data_1, on='Postal Code')
final_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


# End of Part 2

# Start of Part 3

### Import additional libraries for analysis and visualization
##### The folium and geopy libraries may not be installed. If the geopy import fails, uncomment line 1 to install geopy before importing
##### If the folium import fails, uncomment line 2 to install folium; if that still fails, try line 3

In [10]:
#!conda install -c conda-forge geopy --yes 
#!conda install -c conda-forge folium=0.5.0 --yes
#!pip install folium

import json
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize 
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium

print('import successful')

import successful


### Create new DataFrame for Toronto and determine longitude and latitude for Toronto (Toronto is in the borough name)
##### Note: I will be using Postal Code as my most granular data point
##### geolocator produced a good result as the coordinates for Toronto the city are very similar to the coordinates for the postal codes

In [11]:
# Keep only the postal codes whose borough name contains "Toronto".
# na=False treats a missing borough as "no match" instead of producing a NaN mask.
t_df = final_df[final_df['Borough'].str.contains('Toronto', na=False)]
print(t_df.head())

address = 'Toronto'

# Look up the city's coordinates; they are used to centre the maps drawn below.
geolocator = Nominatim(user_agent="explorer")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; stop with a clear message here
# rather than an AttributeError on the attribute access that follows.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('latitude and longitude for toronto: {}, {}.'.format(latitude, longitude))

   Postal Code          Borough                   Neighbourhood   Latitude  \
37         M4E     East Toronto                     The Beaches  43.676357   
41         M4K     East Toronto    The Danforth West, Riverdale  43.679557   
42         M4L     East Toronto  India Bazaar, The Beaches West  43.668999   
43         M4M     East Toronto                 Studio District  43.659526   
44         M4N  Central Toronto                   Lawrence Park  43.728020   

    Longitude  
37 -79.293031  
41 -79.352188  
42 -79.315572  
43 -79.340923  
44 -79.388790  
latitude and longitude for toronto: 43.6534817, -79.3839347.


### Creating a map of Toronto with postal codes on top, this will serve as a base to compare to

In [12]:
# Base map centred on the geocoded Toronto coordinates.
map_toronto = folium.Map(location=[latitude, longitude], zoom_start=12)

# Draw one circle marker per postal code; the popup shows "<postal code>: <neighbourhoods>".
marker_columns = ['Latitude', 'Longitude', 'Borough', 'Neighbourhood', 'Postal Code']
for lat, lng, borough, neighborhood, postal in zip(*(t_df[col] for col in marker_columns)):
    label = folium.Popup('{}: {}'.format(postal, neighborhood), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_toronto)

map_toronto

### Define Foursquare credentials (the credentials are removed when the notebook is shared)

In [13]:
# Foursquare API settings. The secrets are read from environment variables so
# they never have to be typed into (and later scrubbed from) the notebook. Each
# one falls back to the previous empty-string placeholder when the variable is
# not set.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')
ACCESS_TOKEN = os.environ.get('FOURSQUARE_ACCESS_TOKEN', '')
# Sent as the `v` query parameter of every request.
# NOTE(review): presumably a YYYYMMDD version date - confirm against the Foursquare docs.
VERSION = os.environ.get('FOURSQUARE_API_VERSION', '')
# Sent as the `limit` query parameter of every explore request.
LIMIT = 50

### Define function to get nearby venues for each postal code
##### Function comes from lab: DS0701EN-3-3-2-Neighborhoods-New-York-py-v1.0 and will be modified to fit this example

In [14]:
def getVenues(names, neighborhood, latitudes, longitudes, radius):
    """Collect the venues Foursquare returns around each postal code.

    One request is sent to the ``venues/explore`` endpoint per
    (name, neighborhood, latitude, longitude) tuple, using the module-level
    CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT settings.

    Parameters
    ----------
    names : iterable
        Postal codes; echoed into the 'Postal Code' column.
    neighborhood : iterable
        Neighborhood label for each postal code.
    latitudes, longitudes : iterable
        Coordinates used as the search centre (the ``ll`` query parameter).
    radius : int
        Value sent as the ``radius`` query parameter.

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Postal Code',
        'Neighborhood', 'Postal Code Latitude', 'Postal Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude', 'Venue Distance' and
        'Venue Category'. The frame is empty (but keeps these columns) when no
        venue is returned at all.

    Raises
    ------
    requests.HTTPError
        If Foursquare answers a request with an error status.
    """
    columns = ['Postal Code',
               'Neighborhood',
               'Postal Code Latitude',
               'Postal Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Distance',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, nhood, lat, lng in zip(names, neighborhood, latitudes, longitudes):
        print(name,': ',nhood)

        # Let requests build and escape the query string instead of formatting
        # the URL by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }
        response = requests.get(endpoint, params=params, timeout=30)
        # Surface API errors here rather than as a KeyError while parsing.
        response.raise_for_status()

        groups = response.json().get('response', {}).get('groups') or []
        items = groups[0].get('items', []) if groups else []

        for item in items:
            venue = item['venue']
            # A venue without categories gets None instead of raising IndexError.
            categories = venue.get('categories') or []
            rows.append((
                name,
                nhood,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                venue['location']['distance'],
                categories[0]['name'] if categories else None))

    # Passing the column names to the constructor keeps the schema even when no
    # rows were collected (assigning .columns afterwards fails on an empty frame).
    nearby_venues = pd.DataFrame(rows, columns=columns)

    return(nearby_venues)

### This generates a list of venues within a defined radius of 10000 m

In [15]:
# Search radius (in metres) around each postal code's coordinates.
radius = 10000
toronto_venues = getVenues(
    names=t_df['Postal Code'],
    neighborhood=t_df['Neighbourhood'],
    latitudes=t_df['Latitude'],
    longitudes=t_df['Longitude'],
    radius=radius,
)

M4E :  The Beaches
M4K :  The Danforth West, Riverdale
M4L :  India Bazaar, The Beaches West
M4M :  Studio District
M4N :  Lawrence Park
M4P :  Davisville North
M4R :  North Toronto West, Lawrence Park
M4S :  Davisville
M4T :  Moore Park, Summerhill East
M4V :  Summerhill West, Rathnelly, South Hill, Forest Hill SE, Deer Park
M4W :  Rosedale
M4X :  St. James Town, Cabbagetown
M4Y :  Church and Wellesley
M5A :  Regent Park, Harbourfront
M5B :  Garden District, Ryerson
M5C :  St. James Town
M5E :  Berczy Park
M5G :  Central Bay Street
M5H :  Richmond, Adelaide, King
M5J :  Harbourfront East, Union Station, Toronto Islands
M5K :  Toronto Dominion Centre, Design Exchange
M5L :  Commerce Court, Victoria Hotel
M5N :  Roselawn
M5P :  Forest Hill North & West, Forest Hill Road Park
M5R :  The Annex, North Midtown, Yorkville
M5S :  University of Toronto, Harbord
M5T :  Kensington Market, Chinatown, Grange Park
M5V :  CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, S

### Display DataFrame Head and shape to begin analysis

In [16]:
# Print the (rows, columns) shape, then display the first five rows.
print(toronto_venues.shape)
toronto_venues.head()

(1950, 9)


Unnamed: 0,Postal Code,Neighborhood,Postal Code Latitude,Postal Longitude,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category
0,M4E,The Beaches,43.676357,-79.293031,The Fox Theatre,43.672801,-79.287272,609,Indie Movie Theater
1,M4E,The Beaches,43.676357,-79.293031,Kew Gardens,43.669038,-79.298538,927,Park
2,M4E,The Beaches,43.676357,-79.293031,I'm On The Beach,43.670364,-79.284696,946,Beach
3,M4E,The Beaches,43.676357,-79.293031,Kew-Balmy Beach,43.667372,-79.295312,1016,Beach
4,M4E,The Beaches,43.676357,-79.293031,Buds Coffee Bar,43.669375,-79.303218,1129,Coffee Shop


### Get counts for Venue Category

In [17]:
# Number of venues per category, most frequent first (sorting is the default).
toronto_venues['Venue Category'].value_counts()

Coffee Shop           175
Park                  172
Café                  169
Bakery                 78
Farmers Market         64
                     ... 
Ramen Restaurant        2
Bike Shop               2
Bagel Shop              2
Steakhouse              1
Mexican Restaurant      1
Name: Venue Category, Length: 95, dtype: int64

### Create new column to categorize travel distance

In [18]:
# Bucket each venue by its distance from the postal code:
# under 1000 -> 'Short', over 6000 -> 'Long', everything else -> 'Medium'.
toronto_venues['Travel Distance'] = np.select(
    [toronto_venues['Venue Distance'] < 1000, toronto_venues['Venue Distance'] > 6000],
    ['Short', 'Long'],
    default='Medium',
)

toronto_venues.head()


Unnamed: 0,Postal Code,Neighborhood,Postal Code Latitude,Postal Longitude,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category,Travel Distance
0,M4E,The Beaches,43.676357,-79.293031,The Fox Theatre,43.672801,-79.287272,609,Indie Movie Theater,Short
1,M4E,The Beaches,43.676357,-79.293031,Kew Gardens,43.669038,-79.298538,927,Park,Short
2,M4E,The Beaches,43.676357,-79.293031,I'm On The Beach,43.670364,-79.284696,946,Beach,Short
3,M4E,The Beaches,43.676357,-79.293031,Kew-Balmy Beach,43.667372,-79.295312,1016,Beach,Medium
4,M4E,The Beaches,43.676357,-79.293031,Buds Coffee Bar,43.669375,-79.303218,1129,Coffee Shop,Medium


### Getting counts of venues based on travel distance from a Postal Code

In [19]:
# For every (postal code, neighborhood, travel distance) group, count the
# non-null entries in each of the remaining columns.
tv_df = toronto_venues.groupby(['Postal Code', 'Neighborhood','Travel Distance']).count()
tv_df


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Postal Code Latitude,Postal Longitude,Venue,Venue Latitude,Venue Longitude,Venue Distance,Venue Category
Postal Code,Neighborhood,Travel Distance,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
M4E,The Beaches,Long,8,8,8,8,8,8,8
M4E,The Beaches,Medium,38,38,38,38,38,38,38
M4E,The Beaches,Short,4,4,4,4,4,4,4
M4K,"The Danforth West, Riverdale",Medium,46,46,46,46,46,46,46
M4K,"The Danforth West, Riverdale",Short,4,4,4,4,4,4,4
...,...,...,...,...,...,...,...,...,...
M6S,"Runnymede, Swansea",Medium,48,48,48,48,48,48,48
M7A,"Queen's Park, Ontario Provincial Government",Medium,38,38,38,38,38,38,38
M7A,"Queen's Park, Ontario Provincial Government",Short,12,12,12,12,12,12,12
M7Y,"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",Medium,47,47,47,47,47,47,47


### One-hot encoding using Venue Category and Travel Distance. The travel distance is included to give a sense of how far people may have to travel to reach certain venues
#### This will give frequency of venues and how far someone might have to travel to get to them

In [20]:
# One indicator column per venue category and per travel-distance bucket, with
# the postal code and neighborhood of each venue appended as the last two columns.
t_ohe = pd.get_dummies(toronto_venues[['Venue Category','Travel Distance']]).assign(
    **{
        'Postal Code': toronto_venues['Postal Code'],
        'Neighborhood': toronto_venues['Neighborhood'],
    }
)

print(t_ohe.shape)
t_ohe.head()

(1950, 100)


Unnamed: 0,Venue Category_American Restaurant,Venue Category_Art Gallery,Venue Category_Arts & Crafts Store,Venue Category_Asian Restaurant,Venue Category_Athletics & Sports,Venue Category_BBQ Joint,Venue Category_Bagel Shop,Venue Category_Bakery,Venue Category_Bar,Venue Category_Basketball Stadium,...,Venue Category_Thai Restaurant,Venue Category_Theater,Venue Category_Trail,Venue Category_Train Station,Venue Category_Vietnamese Restaurant,Travel Distance_Long,Travel Distance_Medium,Travel Distance_Short,Postal Code,Neighborhood
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,M4E,The Beaches
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,M4E,The Beaches
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,M4E,The Beaches
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,M4E,The Beaches
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,M4E,The Beaches


### Grouping by postal code and determining frequency

In [21]:
# Mean of every indicator column per postal code/neighborhood, i.e. the share
# of that area's venues falling in each category and distance bucket. The
# grouping keys are kept as ordinary columns.
t_group = t_ohe.groupby(['Postal Code', 'Neighborhood'], as_index=False).mean()
t_group

Unnamed: 0,Postal Code,Neighborhood,Venue Category_American Restaurant,Venue Category_Art Gallery,Venue Category_Arts & Crafts Store,Venue Category_Asian Restaurant,Venue Category_Athletics & Sports,Venue Category_BBQ Joint,Venue Category_Bagel Shop,Venue Category_Bakery,...,Venue Category_Sushi Restaurant,Venue Category_Tapas Restaurant,Venue Category_Thai Restaurant,Venue Category_Theater,Venue Category_Trail,Venue Category_Train Station,Venue Category_Vietnamese Restaurant,Travel Distance_Long,Travel Distance_Medium,Travel Distance_Short
0,M4E,The Beaches,0.02,0.0,0.0,0.02,0.02,0.02,0.0,0.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16,0.76,0.08
1,M4K,"The Danforth West, Riverdale",0.02,0.0,0.0,0.02,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.92,0.08
2,M4L,"India Bazaar, The Beaches West",0.02,0.0,0.0,0.02,0.02,0.0,0.0,0.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.92,0.08
3,M4M,Studio District,0.02,0.0,0.0,0.02,0.02,0.0,0.0,0.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84,0.16
4,M4N,Lawrence Park,0.0,0.0,0.0,0.0,0.0,0.04,0.02,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.96,0.0
5,M4P,Davisville North,0.0,0.0,0.0,0.0,0.02,0.04,0.0,0.06,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.86,0.14
6,M4R,"North Toronto West, Lawrence Park",0.0,0.0,0.0,0.0,0.0,0.04,0.02,0.04,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.9,0.1
7,M4S,Davisville,0.0,0.0,0.0,0.0,0.02,0.04,0.0,0.06,...,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.94,0.06
8,M4T,"Moore Park, Summerhill East",0.0,0.02,0.02,0.0,0.0,0.04,0.0,0.02,...,0.0,0.0,0.0,0.02,0.0,0.0,0.0,0.0,1.0,0.0
9,M4V,"Summerhill West, Rathnelly, South Hill, Forest...",0.0,0.02,0.02,0.0,0.0,0.02,0.0,0.02,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.98,0.02


### Fit a K-Means clustering model and add the attributes back in

In [24]:
# Number of clusters to fit.
k = 5
# Feature matrix: everything except the identifier columns. `columns=` replaces
# the positional axis argument, which pandas 2.0 no longer accepts. A
# 'Cluster Labels' column left over from an earlier run of this cell is dropped
# too, so old labels are never fed back in as a feature.
t_model = (
    t_group
    .drop(columns=['Postal Code', 'Neighborhood'])
    .drop(columns='Cluster Labels', errors='ignore')
)

# n_init is pinned to 10 (the long-standing default) so results do not shift
# with the scikit-learn version; random_state keeps the run reproducible.
kmeans = KMeans(n_clusters=k, random_state=0, n_init=10).fit(t_model)

# Store the labels as the first column of t_group. On a re-run the column
# already exists, so it is overwritten instead of inserted (insert would raise).
if 'Cluster Labels' in t_group.columns:
    t_group['Cluster Labels'] = kmeans.labels_
else:
    t_group.insert(0, 'Cluster Labels', kmeans.labels_)

# Left-join the clustered frequencies onto the Toronto postal codes. A postal
# code with no venues would get NaN in the joined columns.
map_t_df = t_df.join(t_group.set_index('Postal Code'), on='Postal Code')
map_t_df.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,Neighborhood,Venue Category_American Restaurant,Venue Category_Art Gallery,Venue Category_Arts & Crafts Store,...,Venue Category_Sushi Restaurant,Venue Category_Tapas Restaurant,Venue Category_Thai Restaurant,Venue Category_Theater,Venue Category_Trail,Venue Category_Train Station,Venue Category_Vietnamese Restaurant,Travel Distance_Long,Travel Distance_Medium,Travel Distance_Short
37,M4E,East Toronto,The Beaches,43.676357,-79.293031,3,The Beaches,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16,0.76,0.08
41,M4K,East Toronto,"The Danforth West, Riverdale",43.679557,-79.352188,3,"The Danforth West, Riverdale",0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.92,0.08
42,M4L,East Toronto,"India Bazaar, The Beaches West",43.668999,-79.315572,3,"India Bazaar, The Beaches West",0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.92,0.08
43,M4M,East Toronto,Studio District,43.659526,-79.340923,3,Studio District,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84,0.16
44,M4N,Central Toronto,Lawrence Park,43.72802,-79.38879,1,Lawrence Park,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.96,0.0


### Create Map

In [25]:
# Map centred on the geocoded Toronto coordinates.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters: k evenly spaced colours from the rainbow
# colormap, converted to hex strings.
colors_array = cm.rainbow(np.linspace(0, 1, k))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
for lat, lon, poi, cluster in zip(map_t_df['Latitude'], map_t_df['Longitude'], map_t_df['Neighborhood'], map_t_df['Cluster Labels']):
    # Rows that found no match in the join carry NaN instead of a label; skip
    # them rather than crash on list indexing.
    if pd.isna(cluster):
        continue
    cluster = int(cluster)
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # Labels are 0-based, so they index the colour list directly. The previous
    # `cluster-1` only worked for cluster 0 through negative-index wraparound.
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster],
        fill=True,
        fill_color=rainbow[cluster],
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

# End of Part 3