# Neighbourhood Clustering in Canada

# Part 1 : Cleaning and Transforming of Data

In [1]:
import pandas as pd

In [2]:
import json , requests
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'
html = requests.get(url).content
df=pd.read_html(html)
df2=df[0]
print(df2[0:5])

  Postal Code           Borough              Neighbourhood
0         M1A      Not assigned               Not assigned
1         M2A      Not assigned               Not assigned
2         M3A        North York                  Parkwoods
3         M4A        North York           Victoria Village
4         M5A  Downtown Toronto  Regent Park, Harbourfront


In [3]:
df2.to_csv("toronto.csv")

In [4]:
tor = pd.read_csv('toronto.csv')
df_tor = pd.DataFrame(data=tor)

In [5]:
df_tor.head()

Unnamed: 0.1,Unnamed: 0,Postal Code,Borough,Neighbourhood
0,0,M1A,Not assigned,Not assigned
1,1,M2A,Not assigned,Not assigned
2,2,M3A,North York,Parkwoods
3,3,M4A,North York,Victoria Village
4,4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [6]:
df_tor.drop('Unnamed: 0',axis=1,inplace=True)
df_tor.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M1A,Not assigned,Not assigned
1,M2A,Not assigned,Not assigned
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


In [7]:
df3 = df_tor[df_tor['Borough'] != 'Not assigned']
df3.reset_index(inplace=True)
df3.drop('index',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [8]:
df3.head(11)

Unnamed: 0,Postal Code,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Regent Park, Harbourfront"
3,M6A,North York,"Lawrence Manor, Lawrence Heights"
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government"
5,M9A,Etobicoke,"Islington Avenue, Humber Valley Village"
6,M1B,Scarborough,"Malvern, Rouge"
7,M3B,North York,Don Mills
8,M4B,East York,"Parkview Hill, Woodbine Gardens"
9,M5B,Downtown Toronto,"Garden District, Ryerson"


First, i extract content from the given url and read with pandas thus converting it into csv file.Then, i convert csv file into dataframe. A another Dataframe with values excluding 'Not assigned' values.Thus, finally after some cleaning and proper indexing formed above Dataframe. 

In [9]:
df3.shape

(103, 3)

# Part 2 : Adding more Data

In [10]:
#!conda install -c conda-forge geopy --yes

In [11]:
df5 = pd.read_csv('http://cocl.us/Geospatial_data')
df5

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476
...,...,...,...
98,M9N,43.706876,-79.518188
99,M9P,43.696319,-79.532242
100,M9R,43.688905,-79.554724
101,M9V,43.739416,-79.588437


In [12]:
df3.set_index('Postal Code')
df5.set_index('Postal Code',inplace=True)
df5.head()

Unnamed: 0_level_0,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1
M1B,43.806686,-79.194353
M1C,43.784535,-79.160497
M1E,43.763573,-79.188711
M1G,43.770992,-79.216917
M1H,43.773136,-79.239476


In [13]:
df3.set_index('Postal Code',inplace=True)

In [14]:
df5=df3.join(df5)
df5.head(11)

Unnamed: 0_level_0,Borough,Neighbourhood,Latitude,Longitude
Postal Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
M3A,North York,Parkwoods,43.753259,-79.329656
M4A,North York,Victoria Village,43.725882,-79.315572
M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
M9A,Etobicoke,"Islington Avenue, Humber Valley Village",43.667856,-79.532242
M1B,Scarborough,"Malvern, Rouge",43.806686,-79.194353
M3B,North York,Don Mills,43.745906,-79.352188
M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937


In [15]:
df5.reset_index(inplace=True)
df5.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
3,M6A,North York,"Lawrence Manor, Lawrence Heights",43.718518,-79.464763
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494


# Part 3 : Exploring , Analyzing and Visualizing of Data

In [16]:
df5.groupby('Borough').count()

Unnamed: 0_level_0,Postal Code,Neighbourhood,Latitude,Longitude
Borough,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Central Toronto,9,9,9,9
Downtown Toronto,19,19,19,19
East Toronto,5,5,5,5
East York,5,5,5,5
Etobicoke,12,12,12,12
Mississauga,1,1,1,1
North York,24,24,24,24
Scarborough,17,17,17,17
West Toronto,6,6,6,6
York,5,5,5,5


In [17]:
df6 = df5[df5['Borough'].str.contains('Toronto')]
df6.head()

Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
2,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
4,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
9,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
15,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
19,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [18]:
df6.reset_index(inplace=True)

In [19]:
df6.drop('index',axis=1,inplace=True)
df6.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,Postal Code,Borough,Neighbourhood,Latitude,Longitude
0,M5A,Downtown Toronto,"Regent Park, Harbourfront",43.65426,-79.360636
1,M7A,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.662301,-79.389494
2,M5B,Downtown Toronto,"Garden District, Ryerson",43.657162,-79.378937
3,M5C,Downtown Toronto,St. James Town,43.651494,-79.375418
4,M4E,East Toronto,The Beaches,43.676357,-79.293031


In [20]:
!pip install folium
df6.shape



(39, 5)

In [21]:
import folium

map_toronto = folium.Map(location= [43.728020,-79.388790],zoom_start = 11)
 

for lat,lng,label in zip(df6['Latitude'],df6['Longitude'],df6['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
    [lat, lng],
        radius=10,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)

map_toronto

In [22]:
CLIENT_ID = 'RXPKK0ZD2T2XXYV43LRX4JSPDS3RQEZ25A4GWVMLU4IZJCY3' 
CLIENT_SECRET = 'ASPCAQSPCWS0J52VNAWGCAQTADA13IZ2CWBJNS4QMHTBLGON' 
VERSION = '20180605' 
LIMIT = 100 


In [23]:
neigh_lat = df6.loc[0 , 'Latitude']
neigh_long = df6.loc[0, 'Longitude']
neigh_name = df6.loc[0,'Borough']

LIMIT=100
radius=500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neigh_lat, 
    neigh_long, 
    radius, 
    LIMIT)

print('Exploring Around' , df6.loc[0,'Borough'])

Exploring Around Downtown Toronto


In [24]:
results= requests.get(url).json()

In [25]:
def get_category_type(row):
    try:
        category = row['categories']
    except:
        category = row['venue.categories']
        
    if len(category) == 0:
        return None
    else:
        return category[0]['name']

In [26]:
from pandas.io.json import json_normalize

venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) 

filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()



Unnamed: 0,name,categories,lat,lng
0,Roselle Desserts,Bakery,43.653447,-79.362017
1,Tandem Coffee,Coffee Shop,43.653559,-79.361809
2,Cooper Koo Family YMCA,Distribution Center,43.653249,-79.358008
3,Body Blitz Spa East,Spa,43.654735,-79.359874
4,Impact Kitchen,Restaurant,43.656369,-79.35698


In [27]:
nearby_venues.shape

(44, 4)

In [28]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
            
       
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)    

In [29]:
toronto_venues = getNearbyVenues(df6['Neighbourhood'] , df6['Latitude'] , df6['Longitude'] , radius = 100)
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,"Regent Park, Harbourfront",43.65426,-79.360636,Body Blitz Spa East,43.654735,-79.359874,Spa
1,"Regent Park, Harbourfront",43.65426,-79.360636,Morning Glory Cafe,43.653947,-79.361149,Breakfast Spot
2,"Regent Park, Harbourfront",43.65426,-79.360636,Sackville Playground,43.654656,-79.359871,Park
3,"Garden District, Ryerson",43.657162,-79.378937,Ryerson Image Centre,43.657523,-79.37946,Art Gallery
4,"Garden District, Ryerson",43.657162,-79.378937,Balzac's Coffee,43.657854,-79.3792,Coffee Shop


In [30]:
toronto_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"CN Tower, King and Spadina, Railway Lands, Harbourfront West, Bathurst Quay, South Niagara, Island airport",1,1,1,1,1,1
Central Bay Street,4,4,4,4,4,4
Christie,1,1,1,1,1,1
"Commerce Court, Victoria Hotel",17,17,17,17,17,17
Davisville,9,9,9,9,9,9
"First Canadian Place, Underground city",12,12,12,12,12,12
"Garden District, Ryerson",2,2,2,2,2,2
"Harbourfront East, Union Station, Toronto Islands",2,2,2,2,2,2
"Kensington Market, Chinatown, Grange Park",4,4,4,4,4,4
"Little Portugal, Trinity",7,7,7,7,7,7


# Part 4 : Cluster Neighborhoods

In [31]:
from sklearn.cluster import KMeans 

In [32]:
df8 = pd.get_dummies(toronto_venues['Venue Category'])
df8['Neighborhood'] = toronto_venues['Neighborhood']
toronto_grouped = df8.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,American Restaurant,Art Gallery,Asian Restaurant,Bakery,Bank,Bar,Beer Store,Bookstore,Breakfast Spot,...,Supermarket,Sushi Restaurant,Taco Place,Tea Room,Thai Restaurant,Thrift / Vintage Store,Toy / Game Store,Trail,Vegetarian / Vegan Restaurant,Yoga Studio
0,"CN Tower, King and Spadina, Railway Lands, Har...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Commerce Court, Victoria Hotel",0.058824,0.058824,0.0,0.058824,0.058824,0.0,0.0,0.058824,0.0,...,0.0,0.058824,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.111111,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,0.0


In [33]:
toronto_cluster = toronto_grouped.drop('Neighborhood' , 1)

In [34]:
clusters = 5
clf = KMeans(n_clusters = clusters,random_state=0).fit(toronto_cluster)
clf.labels_

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 0, 0, 0, 2, 0, 3],
      dtype=int32)

In [35]:
df9 = toronto_venues.groupby('Neighborhood').mean().reset_index()
df9.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Latitude,Venue Longitude
0,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442,43.628947,-79.39442
1,Central Bay Street,43.657952,-79.387383,43.657705,-79.387919
2,Christie,43.669542,-79.422564,43.669272,-79.42283
3,"Commerce Court, Victoria Hotel",43.648198,-79.379817,43.647984,-79.379754
4,Davisville,43.704324,-79.38879,43.704581,-79.388565


In [36]:
toronto_merged = df9
toronto_merged.insert(0,'Cluster Labels',clf.labels_)

toronto_merged.head()

Unnamed: 0,Cluster Labels,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue Latitude,Venue Longitude
0,0,"CN Tower, King and Spadina, Railway Lands, Har...",43.628947,-79.39442,43.628947,-79.39442
1,0,Central Bay Street,43.657952,-79.387383,43.657705,-79.387919
2,1,Christie,43.669542,-79.422564,43.669272,-79.42283
3,0,"Commerce Court, Victoria Hotel",43.648198,-79.379817,43.647984,-79.379754
4,0,Davisville,43.704324,-79.38879,43.704581,-79.388565


In [37]:
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

map_clusters = folium.Map(location=[43.628947, -79.394420], zoom_start=13)

# set color scheme for the clusters
x = np.arange(clusters)
ys = [i + x + (i*x)**2 for i in range(clusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Neighborhood Latitude'], toronto_merged['Neighborhood Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=20,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.8).add_to(map_clusters)
       
map_clusters