# Step 1: Install Wikipedia library and functions for this environment

In [20]:

import sys
!{sys.executable} -m pip install wikipedia
!{sys.executable} -m pip install folium

Requirement not upgraded as not directly required: wikipedia in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages
Requirement not upgraded as not directly required: requests<3.0.0,>=2.0.0 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from wikipedia)
Requirement not upgraded as not directly required: beautifulsoup4 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from wikipedia)
Requirement not upgraded as not directly required: chardet<3.1.0,>=3.0.2 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from requests<3.0.0,>=2.0.0->wikipedia)
Requirement not upgraded as not directly required: idna<2.7,>=2.5 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from requests<3.0.0,>=2.0.0->wikipedia)
Requirement not upgraded as not directly required: urllib3<1.23,>=1.21.1 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from requests<3.0.0,>=2.0.0->wikipedia)
Requirement not upgraded as not directly required: certifi>=2017.4.17 

# Step 2: Use inbuilt wikipedia functions to extract and clean the table in the desired datafame format

In [2]:
import pandas as pd
import wikipedia as wp
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn' used to ignore the warn message
 
#Get the html source
html = wp.page("List of postal codes of Canada: M").html().encode("UTF-8")
df = pd.read_html(html)[0]
df.to_csv('beautifulsoup_pandas.csv',header=0,index=False)
#Data cleanup - Renaming columns appropriately, marking Not assigned as NaN and dropping the irrelevant rows, using groupby and aggregate functions to get the data frame 
df1=df.rename(index=str,columns={0:"Postcode",1:"Borough",2:"Neighbourhood"})
df1['Borough'].replace("Borough",np.nan,inplace=True) 
df1['Borough'].replace("Not assigned",np.nan, inplace=True)
df2=df1.dropna()
df2.reset_index(drop=True, inplace=True)
df2['Neighbourhood'].replace("Not assigned",df2['Borough'],inplace=True)
df3=pd.DataFrame(df2.groupby('Postcode',as_index=False).agg(lambda x:','.join(set(x))))
df3


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Malvern,Rouge"
1,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek"
2,M1E,Scarborough,"Morningside,West Hill,Guildwood"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"Kennedy Park,Ionview,East Birchmount Park"
7,M1L,Scarborough,"Golden Mile,Clairlea,Oakridge"
8,M1M,Scarborough,"Cliffside,Cliffcrest,Scarborough Village West"
9,M1N,Scarborough,"Cliffside West,Birch Cliff"


# Step 3: Printing the dataframe shape

In [3]:
df3.shape

(103, 3)

# Step 4: Reading Geocoding Data

In [5]:
df4 = pd.read_csv("http://cocl.us/Geospatial_data")
df4.head()


Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


# Step 5: Adding geocoding data to the dataframe

In [8]:
keys= df3.Postcode.unique()
for keys in df4['Postal Code']:
     df3['Latitude'] = df4['Latitude']
     df3['Longitude']= df4['Longitude']
        
df3


Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Malvern,Rouge",43.806686,-79.194353
1,M1C,Scarborough,"Port Union,Rouge Hill,Highland Creek",43.784535,-79.160497
2,M1E,Scarborough,"Morningside,West Hill,Guildwood",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"Kennedy Park,Ionview,East Birchmount Park",43.727929,-79.262029
7,M1L,Scarborough,"Golden Mile,Clairlea,Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffside,Cliffcrest,Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Cliffside West,Birch Cliff",43.692657,-79.264848


In [39]:
from geopy.geocoders import Nominatim # importing the geocoder package
import folium

#Cleaning the data to replace east, west, central and downtown Toronto with Toronto
df3['Borough'].replace("East Toronto","Toronto",inplace=True)
df3['Borough'].replace("Central Toronto","Toronto",inplace=True)
df3['Borough'].replace("Downtown Toronto","Toronto",inplace=True)
df3['Borough'].replace("West Toronto","Toronto",inplace=True)
toronto_data = df3[df3['Borough'] == 'Toronto'].reset_index(drop=True)
toronto_data




Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M4E,Toronto,The Beaches,43.676357,-79.293031
1,M4K,Toronto,"The Danforth West,Riverdale",43.679557,-79.352188
2,M4L,Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572
3,M4M,Toronto,Studio District,43.659526,-79.340923
4,M4N,Toronto,Lawrence Park,43.72802,-79.38879
5,M4P,Toronto,Davisville North,43.712751,-79.390197
6,M4R,Toronto,North Toronto West,43.715383,-79.405678
7,M4S,Toronto,Davisville,43.704324,-79.38879
8,M4T,Toronto,"Summerhill East,Moore Park",43.689574,-79.38316
9,M4V,Toronto,"Summerhill West,Forest Hill SE,Deer Park,Rathn...",43.686412,-79.400049


# Step 6 : Setting up FourSquare API

In [40]:
CLIENT_ID = 'CCO0NCOVINXKNMLFOGJOPP2AUSXP44IJI5054EIYDOHPVHE0' # your Foursquare ID
CLIENT_SECRET = 'OUPMUQJEZ5VFSUFPDX3CBJXHNAPQAOWKPAAHK0NDFXCJNFXH' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
radius =50
LIMIT=100
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import json

# Step 7: Analyzing all Toronto neighbourhoods 

In [85]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighbourhood', 
                  'Neighbourhood Latitude', 
                  'Neighbourhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

toronto_venues = getNearbyVenues(names=toronto_data['Neighbourhood'],
                                   latitudes=toronto_data['Latitude'],
                                   longitudes=toronto_data['Longitude']
                                  )
#print(toronto_venues.shape)
#toronto_venues.head()
#toronto_venues.groupby('Neighborhood').count()
#print('There are {} uniques categories.'.format(len(toronto_venues['Venue Category'].unique())))
# one hot encoding
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
toronto_onehot['Neighbourhood'] = toronto_venues['Neighbourhood'] 

# move neighborhood column to the first column
fixed_columns = [toronto_onehot.columns[-1]] + list(toronto_onehot.columns[:-1])
toronto_onehot = toronto_onehot[fixed_columns]

toronto_onehot.head()
toronto_grouped = toronto_onehot.groupby('Neighbourhood').mean().reset_index()
toronto_grouped

Unnamed: 0,Neighbourhood,Adult Boutique,Afghan Restaurant,Airport,Airport Food Court,Airport Gate,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Video Game Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,"Adelaide,Richmond,King",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.01,0.0,0.0,0.01,0.0,0.0,0.01,0.0
1,Berczy Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Business Reply Mail Processing Centre 969 Eastern,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556
3,"Cabbagetown,St. James Town",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020833,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Central Bay Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012048,...,0.0,0.0,0.012048,0.0,0.0,0.012048,0.0,0.0,0.0,0.012048
5,"Chinatown,Kensington Market,Grange Park",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.05,0.0,0.05,0.01,0.0,0.0,0.0,0.0
6,Christie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Church and Wellesley,0.011628,0.011628,0.0,0.0,0.0,0.0,0.0,0.0,0.011628,...,0.0,0.0,0.0,0.011628,0.011628,0.0,0.0,0.011628,0.0,0.011628
8,"Commerce Court,Victoria Hotel",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0
9,Davisville,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Step 8 : Creating a new dataframe to capture Neighbourhood venue frequency information

In [95]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]


num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighbourhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighbourhood'] = toronto_grouped['Neighbourhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighbourhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,"Adelaide,Richmond,King",Coffee Shop,Café,Thai Restaurant,Steakhouse,American Restaurant,Clothing Store,Gym,Hotel,Bakery,Bar
1,Berczy Park,Coffee Shop,Restaurant,Cocktail Bar,Bakery,Cheese Shop,Farmers Market,Pub,Seafood Restaurant,Beer Bar,Italian Restaurant
2,Business Reply Mail Processing Centre 969 Eastern,Light Rail Station,Yoga Studio,Garden,Comic Shop,Pizza Place,Park,Recording Studio,Restaurant,Butcher,Burrito Place
3,"Cabbagetown,St. James Town",Coffee Shop,Restaurant,Bakery,Market,Café,Italian Restaurant,Pub,Pharmacy,Park,Pizza Place
4,Central Bay Street,Coffee Shop,Café,Italian Restaurant,Bar,Chinese Restaurant,Burger Joint,Middle Eastern Restaurant,Indian Restaurant,Ice Cream Shop,Sandwich Place


# Step 9: Running clustering analysis on neighbourhoods based on types of venues

In [96]:
from sklearn.cluster import KMeans
# set number of clusters
kclusters = 7 # Iterative setting to find the best possible clusters
toronto_grouped_clustering = toronto_grouped.drop('Neighbourhood', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 
# add clustering labels
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = toronto_data

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighbourhood'), on='Neighbourhood')

toronto_merged.head() # check the last columns! 

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,M4E,Toronto,The Beaches,43.676357,-79.293031,6,Coffee Shop,Health Food Store,Neighborhood,Pub,Park,Electronics Store,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant
1,M4K,Toronto,"The Danforth West,Riverdale",43.679557,-79.352188,6,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Bookstore,Yoga Studio,Brewery,Bakery,Juice Bar,Liquor Store
2,M4L,Toronto,"The Beaches West,India Bazaar",43.668999,-79.315572,0,Gym,Brewery,Sushi Restaurant,Steakhouse,Liquor Store,Fish & Chips Shop,Park,Fast Food Restaurant,Burger Joint,Intersection
3,M4M,Toronto,Studio District,43.659526,-79.340923,6,Café,Coffee Shop,Italian Restaurant,Bakery,American Restaurant,Fish Market,Bookstore,Middle Eastern Restaurant,Latin American Restaurant,Brewery
4,M4N,Toronto,Lawrence Park,43.72802,-79.38879,3,Park,Swim School,Dim Sum Restaurant,Bus Line,Yoga Studio,Doner Restaurant,Filipino Restaurant,Fast Food Restaurant,Farmers Market,Falafel Restaurant


# Step 9: Identification of different clusters
   Iteratively looking at the 'Cluster Labels' from 0 to 6 , 2 dense clusters were identified for labels -0 and labels -6

In [89]:
toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
2,Toronto,0,Gym,Brewery,Sushi Restaurant,Steakhouse,Liquor Store,Fish & Chips Shop,Park,Fast Food Restaurant,Burger Joint,Intersection
5,Toronto,0,Food & Drink Shop,Gym,Burger Joint,Clothing Store,Park,Sandwich Place,Hotel,Breakfast Spot,Event Space,Dog Run
7,Toronto,0,Pizza Place,Dessert Shop,Sandwich Place,Sushi Restaurant,Coffee Shop,Italian Restaurant,Café,Gym,Seafood Restaurant,Flower Shop
12,Toronto,0,Japanese Restaurant,Coffee Shop,Sushi Restaurant,Gay Bar,Restaurant,Burger Joint,Pub,Mediterranean Restaurant,Gastropub,Fast Food Restaurant
25,Toronto,0,Café,Coffee Shop,Bakery,Bookstore,Restaurant,Theater,Bar,Japanese Restaurant,Nightclub,Noodle House
26,Toronto,0,Café,Bar,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Bakery,Coffee Shop,Dumpling Restaurant,Chinese Restaurant,Mexican Restaurant,Dessert Shop
27,Toronto,0,Airport Lounge,Airport Service,Airport Terminal,Boat or Ferry,Boutique,Airport,Airport Food Court,Airport Gate,Plane,Harbor / Marina
30,Toronto,0,Café,Grocery Store,Park,Convenience Store,Restaurant,Baby Store,Nightclub,Diner,Italian Restaurant,Athletics & Sports
31,Toronto,0,Bakery,Pharmacy,Supermarket,Gym / Fitness Center,Pizza Place,Pool,Music Venue,Discount Store,Café,Middle Eastern Restaurant
32,Toronto,0,Bar,Men's Store,Coffee Shop,Asian Restaurant,Pizza Place,Vietnamese Restaurant,Restaurant,Bakery,Cocktail Bar,Café


In [97]:

toronto_merged.loc[toronto_merged['Cluster Labels'] == 6, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]

Unnamed: 0,Borough,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Toronto,6,Coffee Shop,Health Food Store,Neighborhood,Pub,Park,Electronics Store,Doner Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant
1,Toronto,6,Greek Restaurant,Coffee Shop,Ice Cream Shop,Italian Restaurant,Bookstore,Yoga Studio,Brewery,Bakery,Juice Bar,Liquor Store
3,Toronto,6,Café,Coffee Shop,Italian Restaurant,Bakery,American Restaurant,Fish Market,Bookstore,Middle Eastern Restaurant,Latin American Restaurant,Brewery
6,Toronto,6,Clothing Store,Sporting Goods Shop,Coffee Shop,Yoga Studio,Park,Chinese Restaurant,Rental Car Location,Dessert Shop,Salon / Barbershop,Sandwich Place
9,Toronto,6,Pub,Coffee Shop,Convenience Store,Light Rail Station,Sushi Restaurant,Supermarket,Sports Bar,Fried Chicken Joint,American Restaurant,Vietnamese Restaurant
11,Toronto,6,Coffee Shop,Restaurant,Bakery,Market,Café,Italian Restaurant,Pub,Pharmacy,Park,Pizza Place
13,Toronto,6,Coffee Shop,Bakery,Café,Park,Pub,Theater,Breakfast Spot,Mexican Restaurant,Restaurant,Gym / Fitness Center
14,Toronto,6,Coffee Shop,Clothing Store,Café,Middle Eastern Restaurant,Cosmetics Shop,Japanese Restaurant,Italian Restaurant,Restaurant,Pizza Place,Plaza
15,Toronto,6,Coffee Shop,Restaurant,Café,Hotel,Clothing Store,Cocktail Bar,Cosmetics Shop,Bakery,Park,Italian Restaurant
16,Toronto,6,Coffee Shop,Restaurant,Cocktail Bar,Bakery,Cheese Shop,Farmers Market,Pub,Seafood Restaurant,Beer Bar,Italian Restaurant


# Step 10 : Two Key clusters were noticed -  Label (0) - having cafes and Label (6)- having coffee shops as the highest no of venues in the locality of Toronto

In [111]:
clusters= pd.DataFrame(toronto_merged.groupby('Cluster Labels', as_index=False).agg(lambda x: set(x)))
clusters
clusters['Neighbourhood']


0    {The Beaches West,India Bazaar, King and Spadi...
1                                           {Roselawn}
2                                           {Rosedale}
3                                      {Lawrence Park}
4                         {Summerhill East,Moore Park}
5                 {Forest Hill West,Forest Hill North}
6    {Yorkville,The Annex,North Midtown, Design Exc...
Name: Neighbourhood, dtype: object