## IBM data science capstone

## Features and crime data mapping of the neighbourhoods in Vancouver

In [2]:
#import libraries
import pandas as pd
import numpy as np
print("Hello Capstone Project Course!")

!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

!conda install -c conda-forge tabula-py

import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

import requests # library to handle requests

Hello Capstone Project Course!
Solving environment: done

## Package Plan ##

  environment location: /opt/conda/envs/Python36

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.20.0               |             py_0          57 KB  conda-forge
    certifi-2019.9.11          |           py36_0         147 KB  conda-forge
    openssl-1.1.1c             |       h516909a_0         2.1 MB  conda-forge
    geographiclib-1.50         |             py_0          34 KB  conda-forge
    ca-certificates-2019.9.11  |       hecc5488_0         144 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         2.5 MB

The following NEW packages will be INSTALLED:

    geographiclib:   1.50-py_0         conda-forge
    geopy:           1.20.0-py_0       conda-forge

The following pac

In [7]:
# import 'list of neighborhood' data from csv file stored in 'city of Vancouver' website
neighbourhood=pd.read_csv('ftp://webftp.vancouver.ca/OpenData/csv/cov_localareas.csv').drop(columns='MAPID')
neighbourhood.rename(columns={'NAME':'Neighbourhood'}, inplace=True)
neighbourhood.head()

Unnamed: 0,Neighbourhood
0,Sunset
1,Mount Pleasant
2,Riley Park
3,Downtown
4,Kitsilano


In [8]:
# adding 'Borough=Vancouver' data to the 'neighbourhood' column
neighbourhood['Borough']='Vancouver'
column=['Borough','Neighbourhood']
neighbourhood=neighbourhood[column]
neighbourhood.head()


Unnamed: 0,Borough,Neighbourhood
0,Vancouver,Sunset
1,Vancouver,Mount Pleasant
2,Vancouver,Riley Park
3,Vancouver,Downtown
4,Vancouver,Kitsilano


In [11]:
ind=neighbourhood.shape[0]
ind

22

In [12]:
# adding 'geolocation data' to the 'neighbourhood' column
geoloc=pd.DataFrame(columns=['Latitude','Longitude'])

for index in range(0,ind):
    address= '{},Vancouver, BC'.format(neighbourhood.loc[index, "Neighbourhood"])
    geolocator = Nominatim(user_agent="vancouver_explorer")
    location = geolocator.geocode(address)
    latitude = location.latitude
    longitude = location.longitude
    geoloc.loc[index, "Latitude"]=latitude
    geoloc.loc[index, "Longitude"]=longitude

In [13]:
geoloc.head()

Unnamed: 0,Latitude,Longitude
0,49.2196,-123.09
1,49.2633,-123.097
2,49.2474,-123.103
3,49.2834,-123.117
4,49.2694,-123.155


In [14]:
# append 'coordinate' information into 'neighbourhood' dataframe
neighbourhood[["Latitude", "Longitude"]]=geoloc
neighbourhood.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Vancouver,Sunset,49.2196,-123.09
1,Vancouver,Mount Pleasant,49.2633,-123.097
2,Vancouver,Riley Park,49.2474,-123.103
3,Vancouver,Downtown,49.2834,-123.117
4,Vancouver,Kitsilano,49.2694,-123.155


### Create a map of Vancouver with neighborhoods superimposed on top.

In [18]:
# create map of Vancouver using latitude and longitude values
address= 'Vancouver, BC'
geolocator = Nominatim(user_agent="vancouver_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
map_vancouver = folium.Map(location=[latitude, longitude], zoom_start=12)

# add markers to map
for lat, lng, borough, neighbour in zip(neighbourhood['Latitude'], neighbourhood['Longitude'], neighbourhood['Borough'], neighbourhood['Neighbourhood']):
    label = '{}, {}'.format(neighbour, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_vancouver)  
    
map_vancouver

# utilize the Foursquare API to explore the neighbourhoods and segment them

In [19]:
# define foursquare API credentials
CLIENT_ID = 'BG1ZXHRYFARQLA510FAXBBKEKOIBAHBQ5B13RNZ0SIDVO2MG' # your Foursquare ID
CLIENT_SECRET = 'AGAUNCWIH3L0PWLBVTAX20A1A0RKWFCYTQOBE3NGGU1MOQ44' # your Foursquare Secret
VERSION = '20191024' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: BG1ZXHRYFARQLA510FAXBBKEKOIBAHBQ5B13RNZ0SIDVO2MG
CLIENT_SECRET:AGAUNCWIH3L0PWLBVTAX20A1A0RKWFCYTQOBE3NGGU1MOQ44


# 2. Explore Neighbourhoods in Vancouver

### Let's create a function to 'get nearby venues' to all the neighbourhoods in Vancouver

In [22]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        LIMIT=100
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

### Now run the ''get nearby venues' function on each neighbourhood and create a new dataframe vancouver_venues

In [23]:
vancouver_venues = getNearbyVenues(names=neighbourhood['Neighbourhood'],
                                   latitudes=neighbourhood['Latitude'],
                                   longitudes=neighbourhood['Longitude']
                                  )
print(vancouver_venues.shape)
vancouver_venues.head()

Sunset
Mount Pleasant
Riley Park
Downtown
Kitsilano
Dunbar-Southlands
Kerrisdale
Arbutus-Ridge
West Point Grey
Marpole
Oakridge
Shaughnessy
Fairview
South Cambie
West End
Killarney
Renfrew-Collingwood
Hastings-Sunrise
Victoria-Fraserview
Kensington-Cedar Cottage
Strathcona
Grandview-Woodland
(663, 7)


Unnamed: 0,Neighbourhood,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Sunset,49.219593,-123.090239,Kalai's Dosa Hut,49.218998,-123.09116,South Indian Restaurant
1,Sunset,49.219593,-123.090239,New Novelty Restaurant and Sweets,49.223925,-123.090885,Dessert Shop
2,Mount Pleasant,49.26333,-123.096588,Dude Chilling Park,49.26373,-123.096796,Outdoor Sculpture
3,Mount Pleasant,49.26333,-123.096588,Sushiyama,49.262897,-123.097169,Sushi Restaurant
4,Mount Pleasant,49.26333,-123.096588,La petite cuillère,49.263533,-123.0999,Tea Room


Let's check how many venues were returned for each neighborhood

In [24]:
vancouver_venues.groupby('Neighbourhood').count()

Unnamed: 0_level_0,Neighbourhood Latitude,Neighbourhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighbourhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Arbutus-Ridge,4,4,4,4,4,4
Downtown,100,100,100,100,100,100
Dunbar-Southlands,7,7,7,7,7,7
Fairview,25,25,25,25,25,25
Grandview-Woodland,69,69,69,69,69,69
Hastings-Sunrise,12,12,12,12,12,12
Kensington-Cedar Cottage,21,21,21,21,21,21
Kerrisdale,38,38,38,38,38,38
Killarney,4,4,4,4,4,4
Kitsilano,51,51,51,51,51,51


In [25]:
#### Let's find out how many unique categories can be curated from all the returned venues
print('There are {} uniques categories.'.format(len(vancouver_venues['Venue Category'].unique())))

There are 154 uniques categories.


# 3. Analyze Each Neighbourhood

In [26]:
# one hot encoding
vancouver_onehot = pd.get_dummies(vancouver_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighbourhood column back to dataframe
vancouver_onehot['Neighbourhood'] = vancouver_venues['Neighbourhood'] 

# move neighbourhood column to the first column
fixed_columns = [vancouver_onehot.columns[-1]] + list(vancouver_onehot.columns[:-1])
vancouver_onehot = vancouver_onehot[fixed_columns]

vancouver_onehot.head()

Unnamed: 0,Neighbourhood,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Australian Restaurant,BBQ Joint,Bagel Shop,Bakery,...,Thrift / Vintage Store,Tiki Bar,Toy / Game Store,Track,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Wine Shop,Women's Store,Yoga Studio
0,Sunset,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Sunset,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Mount Pleasant,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Mount Pleasant,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Mount Pleasant,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
vancouver_onehot.shape

(663, 155)

### Next, let's group rows by neighbourhood and by taking the mean of the frequency of occurrence of each category

In [28]:
vancouver_grouped = vancouver_onehot.groupby('Neighbourhood').mean().reset_index()
vancouver_grouped.head()

Unnamed: 0,Neighbourhood,American Restaurant,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Australian Restaurant,BBQ Joint,Bagel Shop,Bakery,...,Thrift / Vintage Store,Tiki Bar,Toy / Game Store,Track,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Wine Shop,Women's Store,Yoga Studio
0,Arbutus-Ridge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Downtown,0.03,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.02,0.0,0.0,0.0,0.02,0.0,0.0,0.0
2,Dunbar-Southlands,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Fairview,0.0,0.0,0.0,0.08,0.0,0.0,0.04,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.04,0.0,0.0,0.0
4,Grandview-Woodland,0.0,0.0,0.0,0.0,0.0,0.0,0.014493,0.0,0.028986,...,0.0,0.0,0.014493,0.0,0.014493,0.0,0.0,0.014493,0.0,0.0


new size

In [29]:
vancouver_grouped.shape

(22, 155)

### Let's print each neighbourhood along with the top 5 most common venues

In [30]:
num_top_venues = 5

for hood in vancouver_grouped['Neighbourhood']:
    print("----"+hood+"----")
    temp = vancouver_grouped[vancouver_grouped['Neighbourhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Arbutus-Ridge----
                 venue  freq
0        Jewelry Store  0.25
1        Grocery Store  0.25
2               Bakery  0.25
3            Pet Store  0.25
4  American Restaurant  0.00


----Downtown----
            venue  freq
0           Hotel  0.08
1      Food Truck  0.06
2     Coffee Shop  0.05
3            Café  0.05
4  Clothing Store  0.04


----Dunbar-Southlands----
                  venue  freq
0  Fast Food Restaurant  0.14
1          Liquor Store  0.14
2   Japanese Restaurant  0.14
3    Italian Restaurant  0.14
4     Indian Restaurant  0.14


----Fairview----
                 venue  freq
0          Coffee Shop  0.16
1     Asian Restaurant  0.08
2                 Park  0.08
3    Korean Restaurant  0.04
4  Szechuan Restaurant  0.04


----Grandview-Woodland----
               venue  freq
0        Coffee Shop  0.12
1   Sushi Restaurant  0.06
2               Park  0.04
3               Café  0.04
4  Indian Restaurant  0.04


----Hastings-Sunrise----
                   ven

### Let's put that into a pandas dataframe

First, let's write a function to sort the venues in descending order.

In [44]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

Now let's create the new dataframe and display the top 10 venues for each neighborhood.

In [45]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighbourhoods_venues_sorted = pd.DataFrame(columns=columns)
neighbourhoods_venues_sorted['Neighbourhood'] = vancouver_grouped['Neighbourhood']

for ind in np.arange(vancouver_grouped.shape[0]):
    neighbourhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(vancouver_grouped.iloc[ind, :], num_top_venues)

neighbourhoods_venues_sorted

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Arbutus-Ridge,Jewelry Store,Grocery Store,Pet Store,Bakery,Falafel Restaurant,French Restaurant,Food Truck,Food Court,Filipino Restaurant,Fast Food Restaurant
1,Downtown,Hotel,Food Truck,Coffee Shop,Café,Clothing Store,Steakhouse,Lounge,Electronics Store,Dessert Shop,Concert Hall
2,Dunbar-Southlands,Japanese Restaurant,Liquor Store,Fast Food Restaurant,Coffee Shop,Salon / Barbershop,Indian Restaurant,Italian Restaurant,Diner,Discount Store,Donut Shop
3,Fairview,Coffee Shop,Park,Asian Restaurant,Indian Restaurant,Szechuan Restaurant,Diner,Nail Salon,Chinese Restaurant,Restaurant,Falafel Restaurant
4,Grandview-Woodland,Coffee Shop,Sushi Restaurant,Café,Park,Indian Restaurant,Pizza Place,Italian Restaurant,Bakery,French Restaurant,Seafood Restaurant
5,Hastings-Sunrise,Vietnamese Restaurant,Inn,Pharmacy,Coffee Shop,Sushi Restaurant,Park,Bakery,Fast Food Restaurant,Liquor Store,Sandwich Place
6,Kensington-Cedar Cottage,Bus Stop,Coffee Shop,Vietnamese Restaurant,Café,Filipino Restaurant,Sandwich Place,Greek Restaurant,Chinese Restaurant,Malay Restaurant,Restaurant
7,Kerrisdale,Coffee Shop,Chinese Restaurant,Tea Room,Sushi Restaurant,Sandwich Place,Pharmacy,Fast Food Restaurant,Hobby Shop,Noodle House,Portuguese Restaurant
8,Killarney,Italian Restaurant,Pool,Track,Gym,Yoga Studio,Falafel Restaurant,French Restaurant,Food Truck,Food Court,Filipino Restaurant
9,Kitsilano,Bakery,Coffee Shop,Sushi Restaurant,Thai Restaurant,Food Truck,Tea Room,French Restaurant,Japanese Restaurant,Ice Cream Shop,American Restaurant


# 4. Cluster Neighbourhoods

Run k-means to cluster the neighbourhood into 5 clusters.

In [46]:
# set number of clusters
kclusters = 5

vancouver_grouped_clustering = vancouver_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(vancouver_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:40] 

array([3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 3, 0, 3, 3, 1, 3, 3, 2, 0, 3, 3],
      dtype=int32)

Let's create a new dataframe that includes the cluster as well as the top 10 venues for each neighbourhood.

In [47]:
# add clustering labels
neighbourhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

vancouver_merged = neighbourhood

# merge vancouver_grouped with vancouver_data to add latitude/longitude for each neighborhood
vancouver_merged = vancouver_merged.join(neighbourhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

vancouver_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Vancouver,Sunset,49.2196,-123.09,2,Dessert Shop,South Indian Restaurant,Yoga Studio,Falafel Restaurant,French Restaurant,Food Truck,Food Court,Filipino Restaurant,Fast Food Restaurant,Farmers Market
1,Vancouver,Mount Pleasant,49.2633,-123.097,3,Coffee Shop,Sandwich Place,Diner,Sushi Restaurant,Breakfast Spot,Grocery Store,Vietnamese Restaurant,Arts & Crafts Store,Indian Restaurant,Thrift / Vintage Store
2,Vancouver,Riley Park,49.2474,-123.103,3,Coffee Shop,Japanese Restaurant,Restaurant,Café,Grocery Store,Sushi Restaurant,Thai Restaurant,Pub,Lounge,Chinese Restaurant
3,Vancouver,Downtown,49.2834,-123.117,3,Hotel,Food Truck,Coffee Shop,Café,Clothing Store,Steakhouse,Lounge,Electronics Store,Dessert Shop,Concert Hall
4,Vancouver,Kitsilano,49.2694,-123.155,3,Bakery,Coffee Shop,Sushi Restaurant,Thai Restaurant,Food Truck,Tea Room,French Restaurant,Japanese Restaurant,Ice Cream Shop,American Restaurant


Finally, let's visualize the resulting clusters

In [48]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=12)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster, common in zip(vancouver_merged['Latitude'], vancouver_merged['Longitude'], vancouver_merged['Neighbourhood'], vancouver_merged['Cluster Labels'], vancouver_merged['1st Most Common Venue']):
    label_text=str(str(poi) + ","+'Feature:' + str(common))
    label = folium.Popup(label_text, parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10,
        popup=label,
        color=rainbow[cluster-2],
        fill=True,
        fill_color=rainbow[cluster-2],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

# 5. Examine Clusters

Cluster 1

In [49]:
vancouver_merged.loc[vancouver_merged['Cluster Labels'] == 0, vancouver_merged.columns[[1] + list(range(5, vancouver_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
10,Oakridge,Vietnamese Restaurant,Convenience Store,Pizza Place,Pharmacy,Sushi Restaurant,Fast Food Restaurant,Sandwich Place,Gym,Yoga Studio,Event Space
18,Victoria-Fraserview,Convenience Store,Pizza Place,Sandwich Place,Fast Food Restaurant,Yoga Studio,Falafel Restaurant,French Restaurant,Food Truck,Food Court,Filipino Restaurant


Cluster 2

In [50]:
vancouver_merged.loc[vancouver_merged['Cluster Labels'] == 1, vancouver_merged.columns[[1] + list(range(5, vancouver_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
11,Shaughnessy,French Restaurant,Park,Yoga Studio,Farmers Market,Fried Chicken Joint,Food Truck,Food Court,Filipino Restaurant,Fast Food Restaurant,Falafel Restaurant


Cluster 3

In [51]:
vancouver_merged.loc[vancouver_merged['Cluster Labels'] == 2, vancouver_merged.columns[[1] + list(range(5, vancouver_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Sunset,Dessert Shop,South Indian Restaurant,Yoga Studio,Falafel Restaurant,French Restaurant,Food Truck,Food Court,Filipino Restaurant,Fast Food Restaurant,Farmers Market


Cluster 4

In [52]:
vancouver_merged.loc[vancouver_merged['Cluster Labels'] == 3, vancouver_merged.columns[[1] + list(range(5, vancouver_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Mount Pleasant,Coffee Shop,Sandwich Place,Diner,Sushi Restaurant,Breakfast Spot,Grocery Store,Vietnamese Restaurant,Arts & Crafts Store,Indian Restaurant,Thrift / Vintage Store
2,Riley Park,Coffee Shop,Japanese Restaurant,Restaurant,Café,Grocery Store,Sushi Restaurant,Thai Restaurant,Pub,Lounge,Chinese Restaurant
3,Downtown,Hotel,Food Truck,Coffee Shop,Café,Clothing Store,Steakhouse,Lounge,Electronics Store,Dessert Shop,Concert Hall
4,Kitsilano,Bakery,Coffee Shop,Sushi Restaurant,Thai Restaurant,Food Truck,Tea Room,French Restaurant,Japanese Restaurant,Ice Cream Shop,American Restaurant
5,Dunbar-Southlands,Japanese Restaurant,Liquor Store,Fast Food Restaurant,Coffee Shop,Salon / Barbershop,Indian Restaurant,Italian Restaurant,Diner,Discount Store,Donut Shop
6,Kerrisdale,Coffee Shop,Chinese Restaurant,Tea Room,Sushi Restaurant,Sandwich Place,Pharmacy,Fast Food Restaurant,Hobby Shop,Noodle House,Portuguese Restaurant
7,Arbutus-Ridge,Jewelry Store,Grocery Store,Pet Store,Bakery,Falafel Restaurant,French Restaurant,Food Truck,Food Court,Filipino Restaurant,Fast Food Restaurant
8,West Point Grey,Coffee Shop,Café,Japanese Restaurant,Sushi Restaurant,Bookstore,Pub,Vegetarian / Vegan Restaurant,Bakery,Liquor Store,Spa
9,Marpole,Pizza Place,Sushi Restaurant,Chinese Restaurant,Vietnamese Restaurant,Japanese Restaurant,Dim Sum Restaurant,Café,Bus Stop,Shanghai Restaurant,Massage Studio
12,Fairview,Coffee Shop,Park,Asian Restaurant,Indian Restaurant,Szechuan Restaurant,Diner,Nail Salon,Chinese Restaurant,Restaurant,Falafel Restaurant


Cluster 5

In [53]:
vancouver_merged.loc[vancouver_merged['Cluster Labels'] == 4, vancouver_merged.columns[[1] + list(range(5, vancouver_merged.shape[1]))]]

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
15,Killarney,Italian Restaurant,Pool,Track,Gym,Yoga Studio,Falafel Restaurant,French Restaurant,Food Truck,Food Court,Filipino Restaurant


## This summurizes the clustering of Vancouver based on the venue categories

# 6. Neighbourhood crime rate map of Vancouver

Download crime data from "https://vancouver.ca/police/Planning/2018/2018YEN.pdf"

In [54]:
from tabula import read_pdf

In [55]:
df=read_pdf("https://vancouver.ca/police/Planning/2018/2018YEN.pdf")
df.head()

Unnamed: 0.1,Unnamed: 0,Sex,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Theft from,Unnamed: 7,Unnamed: 8,Unnamed: 9,Offensive
0,Neighbourhood,Offences,Assaults,Robbery,B&E,Theft of MV,Auto,Theft<>$5K,Arson,Mischief,Weapons
1,Arbutus Ridge,5,10,2,90,15,104,73,3,43,2
2,Central Business District,157,1590,208,655,241,5223,4710,62,1553,289
3,Dunbar - Southlands,6,14,6,111,9,187,71,1,73,0
4,Fairview,23,106,18,210,90,675,845,7,197,22


In [56]:
vc_crime=df.loc[1:26,:]
vc_crime.columns=['Neighbourhood','Offences','Assaults','Robbery','B&E','Theft of MV','Auto','Theft<>$5K','Arson','Mischief','Weapons']
vc_crime.replace(to_replace ="GRAND TOTAL", 
                 value ="Vancouver") 
# converting 'object' data type into 'integer'
vc_crime['Neighbourhood']=vc_crime['Neighbourhood'].astype(str)
vc_crime['Offences']=vc_crime['Offences'].astype(str).astype(int)
vc_crime['Assaults']=vc_crime['Assaults'].astype(str).astype(int)
vc_crime['Robbery']=vc_crime['Robbery'].astype(str).astype(int)
vc_crime['B&E']=vc_crime['B&E'].astype(str).astype(int)
vc_crime['Theft of MV']=vc_crime['Theft of MV'].astype(str).astype(int)
vc_crime['Auto']=vc_crime['Auto'].astype(str).astype(int)
vc_crime['Theft<>$5K']=vc_crime['Theft<>$5K'].astype(str).astype(int)
vc_crime['Arson']=vc_crime['Arson'].astype(str).astype(int)
vc_crime['Mischief']=vc_crime['Mischief'].astype(str).astype(int)
vc_crime['Weapons']=vc_crime['Weapons'].astype(str).astype(int)
vc_crime.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Unnamed: 0,Neighbourhood,Offences,Assaults,Robbery,B&E,Theft of MV,Auto,Theft<>$5K,Arson,Mischief,Weapons
1,Arbutus Ridge,5,10,2,90,15,104,73,3,43,2
2,Central Business District,157,1590,208,655,241,5223,4710,62,1553,289
3,Dunbar - Southlands,6,14,6,111,9,187,71,1,73,0
4,Fairview,23,106,18,210,90,675,845,7,197,22
5,Grandview - Woodland,30,247,37,307,154,617,526,13,256,33


calculate total crime and percentage in each neighbourhood

In [57]:
vc_dum=vc_crime
vc_dum['Total']=vc_dum.sum(axis=1)
vc_dum.head(25)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,Neighbourhood,Offences,Assaults,Robbery,B&E,Theft of MV,Auto,Theft<>$5K,Arson,Mischief,Weapons,Total
1,Arbutus Ridge,5,10,2,90,15,104,73,3,43,2,347
2,Central Business District,157,1590,208,655,241,5223,4710,62,1553,289,14688
3,Dunbar - Southlands,6,14,6,111,9,187,71,1,73,0,478
4,Fairview,23,106,18,210,90,675,845,7,197,22,2193
5,Grandview - Woodland,30,247,37,307,154,617,526,13,256,33,2220
6,Hastings - Sunrise,38,115,12,164,86,599,275,8,177,13,1487
7,Kensington - Cedar Cottage,25,150,34,201,90,524,375,7,225,24,1655
8,Kerrisdale,6,18,1,123,16,161,83,5,42,3,458
9,Killarney,17,38,5,105,41,229,119,4,83,7,648
10,Kitsilano,26,92,22,268,74,730,556,16,306,14,2104


In [70]:
vc_crime.dtypes

Neighbourhood    object
Offences          int64
Assaults          int64
Robbery           int64
B&E               int64
Theft of MV       int64
Auto              int64
Theft<>$5K        int64
Arson             int64
Mischief          int64
Weapons           int64
Total             int64
dtype: object

now write a function to sort crime numbers in descending order

In [71]:
def return_most_common_crimes(row, num_top_crimes):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_crimes]

In [72]:
num_top_crimes = 4

indicators = ['st', 'nd', 'rd']

# create columns according to number of top crimes

columns = ['Neighbourhood']
for ind in np.arange(num_top_crimes):
    try:
        columns.append('{}{} Most Common crimes'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common crimes'.format(ind+1))

# create a new dataframe
neighbourhoods_crimes_sorted = pd.DataFrame(columns=columns)
neighbourhoods_crimes_sorted['Neighbourhood'] = vc_crime['Neighbourhood']

for ind in np.arange(vc_crime.shape[0]):
    neighbourhoods_crimes_sorted.iloc[ind, 1:] = return_most_common_crimes(vc_crime.iloc[ind, :], num_top_crimes)

neighbourhoods_crimes_sorted.rename(columns={'1st Most Common crimes':'Total','2nd Most Common crimes':'1st Most Common crimes',
                          '3rd Most Common crimes':'2nd Most Common crimes',
                          '4th Most Common crimes':'3rd Most Common crimes'}, 
                 inplace=True)
neighbourhoods_crimes_sorted.drop(columns=['Total'])


Unnamed: 0,Neighbourhood,1st Most Common crimes,2nd Most Common crimes,3rd Most Common crimes
1,Arbutus Ridge,Auto,B&E,Theft<>$5K
2,Central Business District,Auto,Theft<>$5K,Assaults
3,Dunbar - Southlands,Auto,B&E,Mischief
4,Fairview,Theft<>$5K,Auto,B&E
5,Grandview - Woodland,Auto,Theft<>$5K,B&E
6,Hastings - Sunrise,Auto,Theft<>$5K,Mischief
7,Kensington - Cedar Cottage,Auto,Theft<>$5K,Mischief
8,Kerrisdale,Auto,B&E,Theft<>$5K
9,Killarney,Auto,Theft<>$5K,B&E
10,Kitsilano,Auto,Theft<>$5K,Mischief


## new data frame to conatin information of 'neighbourhood', 'location', 'top crime' and 'Total'

In [73]:
# adding 'geolocation data' to the 'neighbourhood' column

mergedStuff = pd.merge(neighbourhood,neighbourhoods_crimes_sorted, on=['Neighbourhood'], how='inner')
mergedStuff['Total']=vc_dum['Total']
mergedStuff.fillna(1093)


Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Total,1st Most Common crimes,2nd Most Common crimes,3rd Most Common crimes
0,Vancouver,Sunset,49.219593,-123.090239,1093.0,Auto,Theft<>$5K,Mischief
1,Vancouver,Mount Pleasant,49.26333,-123.096588,347.0,Theft<>$5K,Auto,B&E
2,Vancouver,Riley Park,49.247438,-123.102966,14688.0,Auto,Theft<>$5K,B&E
3,Vancouver,Kitsilano,49.26941,-123.155267,478.0,Auto,Theft<>$5K,Mischief
4,Vancouver,Kerrisdale,49.234673,-123.155389,2193.0,Auto,B&E,Theft<>$5K
5,Vancouver,West Point Grey,49.264484,-123.185433,2220.0,Auto,B&E,Theft<>$5K
6,Vancouver,Marpole,49.209223,-123.13615,1487.0,Auto,Theft<>$5K,B&E
7,Vancouver,Oakridge,49.230829,-123.131134,1655.0,Theft<>$5K,Auto,B&E
8,Vancouver,Shaughnessy,49.251863,-123.138023,458.0,Auto,B&E,Theft<>$5K
9,Vancouver,Fairview,49.264113,-123.126835,648.0,Theft<>$5K,Auto,B&E


In [74]:
# categorizing 'total crime' into 3 groups by their statistical measure. low=50%, medium=up to 75%, High=upto maximum.
mergedStuff.describe()

Unnamed: 0,Total
count,13.0
mean,2312.153846
std,3821.015372
min,33.0
25%,478.0
50%,1487.0
75%,2193.0
max,14688.0


In [75]:
low = 1487
medium=2193
high=14688

In [76]:
# categorizing 'total crime' into 3 groups

bins = [0, low, medium, high]
names = ['Low', 'Medium', 'High']
names_num=[1,2,3]
mergedStuff['Crime category'] = pd.cut(mergedStuff['Total'], bins, labels=names)
mergedStuff['Crime level'] = pd.cut(mergedStuff['Total'], bins, labels=names_num)
mergedStuff.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Total,1st Most Common crimes,2nd Most Common crimes,3rd Most Common crimes,Crime category,Crime level
0,Vancouver,Sunset,49.2196,-123.09,,Auto,Theft<>$5K,Mischief,,
1,Vancouver,Mount Pleasant,49.2633,-123.097,347.0,Theft<>$5K,Auto,B&E,Low,1.0
2,Vancouver,Riley Park,49.2474,-123.103,14688.0,Auto,Theft<>$5K,B&E,High,3.0
3,Vancouver,Kitsilano,49.2694,-123.155,478.0,Auto,Theft<>$5K,Mischief,Low,1.0
4,Vancouver,Kerrisdale,49.2347,-123.155,2193.0,Auto,B&E,Theft<>$5K,Medium,2.0


In [77]:
mergedStuff = mergedStuff.drop([0],axis=0)

In [78]:
mergedStuff['Crime level']=mergedStuff['Crime level'].astype(int)
mergedStuff.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Total,1st Most Common crimes,2nd Most Common crimes,3rd Most Common crimes,Crime category,Crime level
1,Vancouver,Mount Pleasant,49.2633,-123.097,347.0,Theft<>$5K,Auto,B&E,Low,1
2,Vancouver,Riley Park,49.2474,-123.103,14688.0,Auto,Theft<>$5K,B&E,High,3
3,Vancouver,Kitsilano,49.2694,-123.155,478.0,Auto,Theft<>$5K,Mischief,Low,1
4,Vancouver,Kerrisdale,49.2347,-123.155,2193.0,Auto,B&E,Theft<>$5K,Medium,2
5,Vancouver,West Point Grey,49.2645,-123.185,2220.0,Auto,B&E,Theft<>$5K,High,3


In [79]:
mergedStuff

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude,Total,1st Most Common crimes,2nd Most Common crimes,3rd Most Common crimes,Crime category,Crime level
1,Vancouver,Mount Pleasant,49.2633,-123.097,347.0,Theft<>$5K,Auto,B&E,Low,1
2,Vancouver,Riley Park,49.2474,-123.103,14688.0,Auto,Theft<>$5K,B&E,High,3
3,Vancouver,Kitsilano,49.2694,-123.155,478.0,Auto,Theft<>$5K,Mischief,Low,1
4,Vancouver,Kerrisdale,49.2347,-123.155,2193.0,Auto,B&E,Theft<>$5K,Medium,2
5,Vancouver,West Point Grey,49.2645,-123.185,2220.0,Auto,B&E,Theft<>$5K,High,3
6,Vancouver,Marpole,49.2092,-123.136,1487.0,Auto,Theft<>$5K,B&E,Low,1
7,Vancouver,Oakridge,49.2308,-123.131,1655.0,Theft<>$5K,Auto,B&E,Medium,2
8,Vancouver,Shaughnessy,49.2519,-123.138,458.0,Auto,B&E,Theft<>$5K,Low,1
9,Vancouver,Fairview,49.2641,-123.127,648.0,Theft<>$5K,Auto,B&E,Low,1
10,Vancouver,South Cambie,49.2467,-123.121,2104.0,Auto,Theft<>$5K,B&E,Medium,2


### create map to display crime rate in vancouver neighbourhood

### only crime

In [83]:
# vancouver map
address= 'Vancouver, BC'
geolocator = Nominatim(user_agent="vancouver_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
map_vc_crime = folium.Map(location=[latitude, longitude], zoom_start=12)
# numer of categories
categories=3

# set color scheme for the 'crime group'
x = np.arange(categories)
ys = [i + x + (i*x)**2 for i in range(categories)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, level, crime in zip(mergedStuff['Latitude'], mergedStuff['Longitude'], mergedStuff['Neighbourhood'], mergedStuff['Crime level'], mergedStuff['2nd Most Common crimes']):
    label = folium.Popup(str(poi) + ', Crime :' + str(crime), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10*level,
        popup=label,
        color=rainbow[level-1],
        fill=True,
        fill_color=rainbow[level-1],
        fill_opacity=0.7).add_to(map_vc_crime)
       
map_vc_crime

### crime and feature data together over the vancouver neighbourhood 

In [84]:
# adding 'feature' data to the 'crime' data

merged_feature_crime = pd.merge(vancouver_merged,mergedStuff, on=['Neighbourhood'], how='inner')

merged_feature_crime.head()

Unnamed: 0,Borough_x,Neighbourhood,Latitude_x,Longitude_x,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,...,10th Most Common Venue,Borough_y,Latitude_y,Longitude_y,Total,1st Most Common crimes,2nd Most Common crimes,3rd Most Common crimes,Crime category,Crime level
0,Vancouver,Mount Pleasant,49.2633,-123.097,3,Coffee Shop,Sandwich Place,Diner,Sushi Restaurant,Breakfast Spot,...,Thrift / Vintage Store,Vancouver,49.2633,-123.097,347.0,Theft<>$5K,Auto,B&E,Low,1
1,Vancouver,Riley Park,49.2474,-123.103,3,Coffee Shop,Japanese Restaurant,Restaurant,Café,Grocery Store,...,Chinese Restaurant,Vancouver,49.2474,-123.103,14688.0,Auto,Theft<>$5K,B&E,High,3
2,Vancouver,Kitsilano,49.2694,-123.155,3,Bakery,Coffee Shop,Sushi Restaurant,Thai Restaurant,Food Truck,...,American Restaurant,Vancouver,49.2694,-123.155,478.0,Auto,Theft<>$5K,Mischief,Low,1
3,Vancouver,Kerrisdale,49.2347,-123.155,3,Coffee Shop,Chinese Restaurant,Tea Room,Sushi Restaurant,Sandwich Place,...,Portuguese Restaurant,Vancouver,49.2347,-123.155,2193.0,Auto,B&E,Theft<>$5K,Medium,2
4,Vancouver,West Point Grey,49.2645,-123.185,3,Coffee Shop,Café,Japanese Restaurant,Sushi Restaurant,Bookstore,...,Spa,Vancouver,49.2645,-123.185,2220.0,Auto,B&E,Theft<>$5K,High,3


In [85]:
# vancouver map
map_vc_crime_feature = folium.Map(location=[latitude, longitude], zoom_start=12)
# numer of categories
categories=3

# set color scheme for the 'crime group'
x = np.arange(categories)
ys = [i + x + (i*x)**2 for i in range(categories)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, level, crime, feature in zip(merged_feature_crime['Latitude_x'], merged_feature_crime['Longitude_x'], merged_feature_crime['Neighbourhood'], merged_feature_crime['Crime level'], merged_feature_crime['2nd Most Common crimes'], merged_feature_crime['1st Most Common Venue']):
    label = folium.Popup(str(poi) + ', Crime :' + str(crime)+', Feature:'+str(feature), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=10*level,
        popup=label,
        color=rainbow[level-4],
        fill=True,
        fill_color=rainbow[level-4],
        fill_opacity=0.7).add_to(map_vc_crime_feature)
       
map_vc_crime_feature

### This completes the analysis of this work