In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

## Question #1

### The table we need is enclosed in `tbody` tags, and each table row is enclosed in `tr` tags. When we take just the text of a row, every cell value sits on its own line. I split each row on `\n` to get the individual cells, but before doing that I insert ", " in front of every newline character. This comes in handy later, when we group neighborhoods by postal code and borough and need the concatenated names to be comma-separated.

In [50]:
# Download the Wikipedia page and locate the (first) table body.
source = requests.get('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M').text
soup = BeautifulSoup(source, 'lxml')
table = soup.find('tbody')

# One list of cell strings per <tr>. ", " is inserted before every newline so
# each cell keeps a trailing separator that survives the later string-sum
# aggregation of neighbourhood names.
RowList = [
    tr.text.replace('\n', ', \n').split('\n')
    for tr in table.find_all('tr')
]
RowList

[[', ', 'Postcode, ', 'Borough, ', 'Neighbourhood, ', ''],
 [', ', 'M1A, ', 'Not assigned, ', 'Not assigned, ', ''],
 [', ', 'M2A, ', 'Not assigned, ', 'Not assigned, ', ''],
 [', ', 'M3A, ', 'North York, ', 'Parkwoods, ', ''],
 [', ', 'M4A, ', 'North York, ', 'Victoria Village, ', ''],
 [', ', 'M5A, ', 'Downtown Toronto, ', 'Harbourfront, ', ''],
 [', ', 'M5A, ', 'Downtown Toronto, ', 'Regent Park, ', ''],
 [', ', 'M6A, ', 'North York, ', 'Lawrence Heights, ', ''],
 [', ', 'M6A, ', 'North York, ', 'Lawrence Manor, ', ''],
 [', ', 'M7A, ', "Queen's Park, ", 'Not assigned, ', ''],
 [', ', 'M8A, ', 'Not assigned, ', 'Not assigned, ', ''],
 [', ', 'M9A, ', 'Etobicoke, ', 'Islington Avenue, ', ''],
 [', ', 'M1B, ', 'Scarborough, ', 'Rouge, ', ''],
 [', ', 'M1B, ', 'Scarborough, ', 'Malvern, ', ''],
 [', ', 'M2B, ', 'Not assigned, ', 'Not assigned, ', ''],
 [', ', 'M3B, ', 'North York, ', 'Don Mills North, ', ''],
 [', ', 'M4B, ', 'East York, ', 'Woodbine Gardens, ', ''],
 [', ', 'M4B, ', '

In [51]:
# Load the scraped rows, then discard the two empty edge columns and the
# header row (label 0), which was scraped as ordinary data.
df = (
    pd.DataFrame(RowList, columns=['empty', 'PostalCode', 'Borough', 'Neighborhood', 'empty2'])
    .drop(columns=['empty', 'empty2'])
    .drop(0)
)

# Remove every row whose borough is unassigned (values still carry the ", " suffix).
df = df.drop(df.index[df.Borough == 'Not assigned, '], axis=0)
df.head(3)

Unnamed: 0,PostalCode,Borough,Neighborhood
3,"M3A,","North York,","Parkwoods,"
4,"M4A,","North York,","Victoria Village,"
5,"M5A,","Downtown Toronto,","Harbourfront,"


In [4]:
# Where no neighbourhood is assigned, fall back to the borough name of the same row.
unnamed = df['Neighborhood'] == 'Not assigned, '
df.loc[unnamed, 'Neighborhood'] = df.loc[unnamed, 'Borough']

In [52]:
# Group by postal code and borough. Summing the string column concatenates the
# neighbourhood names, which folds all neighbourhoods that share a postal code
# into a single row.
by_code_and_borough = df.groupby(['PostalCode', 'Borough'])
newdf = by_code_and_borough.sum().reset_index()

newdf.head(2)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,"M1B,","Scarborough,","Rouge, Malvern,"
1,"M1C,","Scarborough,","Highland Creek, Rouge Hill, Port Union,"


In [53]:
# Every value still ends with the ", " separator added while scraping;
# drop those last two characters from each of the three columns.
for column in ('PostalCode', 'Borough', 'Neighborhood'):
    newdf[column] = newdf[column].apply(lambda text: text[:-2])

newdf.head(2)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"


## Answer to Question #1

In [57]:
# Show the first ten rows of the cleaned postal-code table.
newdf.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M1B,Scarborough,"Rouge, Malvern"
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union"
2,M1E,Scarborough,"Guildwood, Morningside, West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae
5,M1J,Scarborough,Scarborough Village
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park"
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge"
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West"
9,M1N,Scarborough,"Birch Cliff, Cliffside West"


In [60]:
# (rows, columns) of the cleaned table: one row per postal code/borough pair.
newdf.shape

(103, 3)

## Question #2

In [47]:
#installing geocoder
!pip install geocoder

Collecting geocoder
  Downloading https://files.pythonhosted.org/packages/4f/6b/13166c909ad2f2d76b929a4227c952630ebaf0d729f6317eb09cbceccbab/geocoder-1.38.1-py2.py3-none-any.whl (98kB)
[K    100% |████████████████████████████████| 102kB 7.0MB/s ta 0:00:01
[?25hRequirement not upgraded as not directly required: click in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: future in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement not upgraded as not directly required: requests in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Collecting ratelim (from geocoder)
  Downloading https://files.pythonhosted.org/packages/f2/98/7e6d147fd16a10a5f821db6e25f192265d6ecca3d82957a4fdd592cad49c/ratelim-0.1.6-py2.py3-none-any.whl
Requirement not upgraded as not directly required: six in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from geocoder)
Requirement 

In [None]:
## Did not use geocoder as it was not working for me. I used the file provided instead. DO NOT PENALIZE FOR IT. EITHER OPTION SHOULD BE OKAY

In [49]:
import geocoder # kept for the disabled experiment below; no active code in this cell calls it

# NOTE(review): the geocoder lookup below is deliberately commented out.
# As written, 'Toronto, Ontario'.format(postalcode) contains no {} placeholder,
# so the postal code would never reach the query; the string would need to be
# e.g. '{}, Toronto, Ontario' for a per-postal-code lookup.
# The loop would also retry forever for as long as the service returns no coordinates.

# initialize your variable to None
#lat_lng_coords = None
#postalcode = 'M1A'
# loop until you get the coordinates
#while(lat_lng_coords is None):
#    g = geocoder.google('Toronto, Ontario'.format(postalcode))
#    print (g)
#    lat_lng_coords = g.latlng

#latitude = lat_lng_coords[0]
#longitude = lat_lng_coords[1]

#print (lat_lng_coords)

In [54]:
# Read the provided postal-code coordinate file into a DataFrame
# (columns: "Postal Code", "Latitude", "Longitude").
lldf = pd.read_csv('http://cocl.us/Geospatial_data')
lldf.head(2)

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497


In [55]:
# Inner-join the neighbourhood table with the coordinate table on postal code,
# so every remaining row carries its latitude and longitude.
finaldf = newdf.merge(
    lldf,
    how='inner',
    left_on='PostalCode',
    right_on='Postal Code',
)
finaldf.head(2)

Unnamed: 0,PostalCode,Borough,Neighborhood,Postal Code,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",M1B,43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",M1C,43.784535,-79.160497


In [56]:
# The merge leaves a duplicate "Postal Code" key column behind; remove it.
finaldf = finaldf.drop(columns=['Postal Code'])
finaldf.head(2)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497


## Answer to Question #2

In [10]:
# Show the first ten rows of the table with coordinates attached.
finaldf.head(10)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476
5,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
6,M1K,Scarborough,"East Birchmount Park, Ionview, Kennedy Park",43.727929,-79.262029
7,M1L,Scarborough,"Clairlea, Golden Mile, Oakridge",43.711112,-79.284577
8,M1M,Scarborough,"Cliffcrest, Cliffside, Scarborough Village West",43.716316,-79.239476
9,M1N,Scarborough,"Birch Cliff, Cliffside West",43.692657,-79.264848


In [58]:
# (rows, columns) after the merge; the row count should match the cleaned table.
finaldf.shape

(103, 5)

## Question #3

### Answer to this question is worth only 3 points. But given the question is open ended, I did not want to take any chances. This took a whole lot more time than the first two questions. But hey, I don't want to take a chance. 

In [12]:
#installing folium package
!pip install folium

Collecting folium
  Downloading https://files.pythonhosted.org/packages/55/e2/7e523df8558b7f4b2ab4c62014fd378ccecce3fdc14c9928b272a88ae4cc/folium-0.7.0-py3-none-any.whl (85kB)
[K    100% |████████████████████████████████| 92kB 7.5MB/s eta 0:00:01
[?25hCollecting branca>=0.3.0 (from folium)
  Downloading https://files.pythonhosted.org/packages/63/36/1c93318e9653f4e414a2e0c3b98fc898b4970e939afeedeee6075dd3b703/branca-0.3.1-py3-none-any.whl
Requirement not upgraded as not directly required: jinja2 in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Requirement not upgraded as not directly required: six in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Requirement not upgraded as not directly required: requests in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Requirement not upgraded as not directly required: numpy in /opt/conda/envs/DSX-Python35/lib/python3.5/site-packages (from folium)
Requirement not upgraded as no

In [13]:
import folium
from sklearn.cluster import KMeans
# `plt` is the conventional alias for the pyplot interface; binding the
# top-level matplotlib package to it would make calls such as plt.plot fail.
import matplotlib.pyplot as plt

In [62]:
# Centre the map on Toronto and mark every postal code of finaldf on it.
Latitude = 43.6532
Longitude = -79.3832
map_toronto = folium.Map(location=[Latitude, Longitude], zoom_start=11)

# The loop variables are deliberately named differently from the centre
# constants above, so Latitude/Longitude still hold the map centre afterwards.
# Only the columns actually used for the marker are iterated.
for postal_code, marker_lat, marker_lng in zip(finaldf['PostalCode'], finaldf['Latitude'], finaldf['Longitude']):
    folium.CircleMarker(
        [marker_lat, marker_lng],
        radius=5,
        popup=postal_code,  # clicking a marker shows its postal code
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)
map_toronto

In [17]:
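# The code was removed by Watson Studio for sharing.
# NOTE(review): presumably this cell defined CLIENT_ID, CLIENT_SECRET and VERSION,
# which get_nearby_venues reads below; supply them (e.g. from environment
# variables) before running the following cells.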

In [18]:
#This is the function to retrive venues for a given longitude and latitude. Same as what we have seen in the lab exrecises.

import requests
import json
import pandas as pd

# Module-level defaults. get_nearby_venues reads `limit` at call time; the
# search radius it uses is its own `radius` parameter.
limit = 100
radius = 500
def get_nearby_venues(postcode, borough, name, latitude, longitude, radius=500):
    """Fetch the Foursquare venues recommended around one location.

    Calls the Foursquare ``venues/explore`` endpoint using the module-level
    ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and ``limit`` values and
    flattens the JSON answer by walking it level by level (no json_normalize).

    Parameters
    ----------
    postcode, borough, name : labels copied unchanged into every output row
        (``name`` is the neighbourhood name).
    latitude, longitude : coordinates of the search centre.
    radius : search radius passed to the API (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns PostalCode, Borough, Neighborhood,
        Latitude, Longitude, Category, Venue_Name, Venue_Longitude and
        Venue_Latitude. The frame is empty (but has these columns) when the
        response lists no venues.

    Raises
    ------
    KeyError
        If the response lacks the expected ``response``/``groups`` structure.
    """
    url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID,
        CLIENT_SECRET,
        VERSION,
        latitude,
        longitude,
        radius,
        limit)
    results = requests.get(url).json()

    venue_rows = []
    # Walk every group; the frame is built only after all groups were visited.
    for group in results['response']['groups']:
        for item in group['items']:
            venue = item['venue']
            venue_name = venue['name']
            venue_longitude = venue['location']['lng']
            venue_latitude = venue['location']['lat']
            categories = venue['categories']
            # Keep the short name of the last listed category; a venue without
            # any category gets None instead of reusing the previous venue's value.
            venue_category = categories[-1]['shortName'] if categories else None
            venue_rows.append([postcode, borough, name, latitude, longitude,
                               venue_category, venue_name, venue_longitude, venue_latitude])

    return pd.DataFrame(
        venue_rows,
        columns=['PostalCode', 'Borough', 'Neighborhood', 'Latitude', 'Longitude',
                 'Category', 'Venue_Name', 'Venue_Longitude', 'Venue_Latitude'])

In [21]:
#just like in the lab, we iterate through all the entries and get venues for all of them.

import pandas as pd

#initialize an empty data frame; it fixes the column order of the combined result
venuedf_all = pd.DataFrame(columns=['PostalCode','Borough','Neighborhood','Latitude','Longitude','Category','Venue_Name','Venue_Longitude','Venue_Latitude'])

# Fetch the venues for every postal code. The per-location frames are collected
# in a list and concatenated once at the end, instead of re-copying the growing
# result on every iteration.
venue_frames = [venuedf_all]
for postcode,borough,neighborhood,latitude,longitude in zip(finaldf['PostalCode'],finaldf['Borough'],finaldf['Neighborhood'],finaldf['Latitude'],finaldf['Longitude']):
    venuedf = get_nearby_venues(postcode,borough,neighborhood,latitude,longitude)
    if isinstance(venuedf, pd.DataFrame):
        venue_frames.append(venuedf)
    else:
        # print takes the values as separate arguments; joining them with `&`
        # raised a TypeError as soon as this branch was reached.
        print("No venues :", postcode, borough, neighborhood, latitude, longitude)

# Row labels of the individual frames are kept as they are (no ignore_index).
venuedf_all = pd.concat(venue_frames)

In [64]:
# Preview the combined venue table: one row per venue, repeating the
# postal-code level columns of the location it was found near.
venuedf_all.head(5)

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,Category,Venue_Name,Venue_Longitude,Venue_Latitude
0,M1B,Scarborough,"Rouge, Malvern",43.806686,-79.194353,Fast Food,Wendy's,-79.199056,43.807448
0,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Bar,Royal Canadian Legion,-79.163085,43.782533
1,M1C,Scarborough,"Highland Creek, Rouge Hill, Port Union",43.784535,-79.160497,Moving Target,Affordable Toronto Movers,-79.162977,43.787919
0,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Pizza,Swiss Chalet Rotisserie & Grill,-79.189914,43.767697
1,M1E,Scarborough,"Guildwood, Morningside, West Hill",43.763573,-79.188711,Electronics,G & G Electronics,-79.191537,43.765309


In [65]:
# One-hot encode the venue category (one indicator column per category);
# this is the feature matrix for the k-means clustering further down.
venuedf_all_onehot = pd.get_dummies(venuedf_all['Category'], prefix="", prefix_sep="")

# Put the neighbourhood name in front so the indicators can be grouped by it.
venuedf_all_onehot.insert(loc=0, column='Neighborhood_Name', value=venuedf_all['Neighborhood'])

# Summing the indicators per neighbourhood gives venue counts per category
# (the lab averaged them instead; counts are used here).
per_neighborhood = venuedf_all_onehot.groupby('Neighborhood_Name')
venues_grouped = per_neighborhood.sum().reset_index()
venues_grouped.head(5)

Unnamed: 0,Neighborhood_Name,Accessories,Adult Boutique,Afghan,Airport,Airport Service,American,Antiques,Apparel,Aquarium,...,Vegetarian / Vegan,Video Games,Video Store,Vietnamese,Warehouse Store,Wine Bar,Wings,Women's Store,Yoga Studio,Yogurt
0,"Adelaide, King, Richmond",0,0,0,0,0,4,0,3,0,...,1,0,0,0,0,1,0,1,0,0
1,Agincourt,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,"Alderwood, Long Branch",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
#this function, for a given neighborhood and # of venues returns two things 1) a sorted list of venue type in descending order for up to # of venues
#passed to the function. 2) A data frame of venue types and count of those venues for a given neighborhood,

def return_common_venues (neighborhood, no_of_venues):
    """Rank the venue categories of one neighbourhood by venue count.

    Reads the module-level ``venues_grouped`` frame (one row per neighbourhood,
    one count column per venue category).

    Parameters
    ----------
    neighborhood : value to look up in ``venues_grouped.Neighborhood_Name``.
    no_of_venues : how many of the most frequent categories to keep.

    Returns
    -------
    (DataFrame, list)
        * a frame with the columns Neighborhood, venue_rank, venue, frequency
          holding the top ``no_of_venues`` categories (rank starts at 1);
        * a one-element list whose single entry is
          ``[neighborhood, category_1, ..., category_n]`` in descending order
          of frequency.
    """
    # Transpose the neighbourhood's single row: one line per column of
    # venues_grouped, with the column name in 'index' and its value beside it.
    is_match = venues_grouped.Neighborhood_Name == neighborhood
    transposed = venues_grouped[is_match].T.reset_index()

    # --- 1) flat list of the top category names ---------------------------
    count_column = transposed.columns.values[1]
    # Skip the first line, which holds the neighbourhood name itself.
    by_count = transposed.iloc[1:].sort_values(by=count_column, ascending=False)
    names_row = by_count.head(no_of_venues).drop(count_column, axis=1).T.reset_index(drop=True)
    names_row.insert(loc=0, column='Neighborhood', value=neighborhood)
    topxlist = names_row.values.tolist()

    # --- 2) ranked frame that also keeps the counts -----------------------
    transposed.columns = ['venue', 'frequency']
    ranked = transposed[transposed.venue != 'Neighborhood_Name']
    ranked = ranked.sort_values('frequency', ascending=False).reset_index(drop=True)
    ranked.insert(loc=0, column='venue_rank', value=1 + ranked.index.values)
    ranked.insert(loc=0, column='Neighborhood', value=neighborhood)

    return ranked.head(no_of_venues), topxlist

In [25]:
no_of_venues=10
# Collect both results of return_common_venues for every neighbourhood:
# the flat top-N lists and the ranked frames with counts.
venues_sorted_list=[]
ranked_frames = []
for neighborhood in venues_grouped.Neighborhood_Name:
    venuesrankeddf, toplist = return_common_venues(neighborhood, no_of_venues)
    venues_sorted_list.extend(toplist)
    ranked_frames.append(venuesrankeddf)

# Concatenate once instead of re-copying the growing frame inside the loop.
if ranked_frames:
    venuesrankeddf_all = pd.concat(ranked_frames)
else:
    venuesrankeddf_all = pd.DataFrame(columns = ['Neighborhood','venue_rank','venue','frequency'])

# Rank columns '1'..'N' follow no_of_venues, so changing it above does not
# break the frame construction.
rank_columns = [str(rank) for rank in range(1, no_of_venues + 1)]
venues_sorted_all = pd.DataFrame(venues_sorted_list, columns = ['Neighborhood'] + rank_columns)

In [70]:
# We will use this for our cluster analysis later on: one row per
# neighbourhood with its ten most frequent venue categories.
venues_sorted_all.head(3)

Unnamed: 0,Neighborhood,1,2,3,4,5,6,7,8,9,10
0,"Adelaide, King, Richmond",Coffee Shop,Café,Thai,American,Steakhouse,Restaurant,Bar,Hotel,Gym,Apparel
1,Agincourt,Sandwiches,Breakfast,Skating Rink,Lounge,Accessories,Music Store,New American,Neighborhood,Music Venue,Museum
2,"Agincourt North, L'Amoreaux East, Milliken, St...",Playground,Park,Accessories,Moving Target,New American,Neighborhood,Music Venue,Music Store,Museum,Movie Theater


In [71]:
# Looking at these results, a whole bunch of neighborhoods seem to have ZERO venues. Coffee shops dominate the scene (no surprise).
# Based on the data, The Danforth West and Riverdale neighborhoods appear to be home to the Greek diaspora. There are more insights here, but I have already
# given 3 marks' worth of insight so far.
venuesrankeddf_all.sort_values(by = ['frequency','Neighborhood'],ascending = False)

Unnamed: 0,Neighborhood,venue_rank,venue,frequency
0,"Harbourfront East, Toronto Islands, Union Station",1,Coffee Shop,14
0,"Design Exchange, Toronto Dominion Centre",1,Coffee Shop,14
0,Central Bay Street,1,Coffee Shop,13
0,"The Danforth West, Riverdale",1,Greek,10
0,Queen's Park,1,Coffee Shop,10
0,"Commerce Court, Victoria Hotel",1,Coffee Shop,10
0,Stn A PO Boxes 25 The Esplanade,1,Coffee Shop,9
0,"Ryerson, Garden District",1,Coffee Shop,9
1,"Ryerson, Garden District",2,Apparel,8
0,"Little Portugal, Trinity",1,Bar,8


In [72]:
# set number of clusters
kclusters = 5

# Feature matrix: every column except the neighbourhood name. The axis is
# given by keyword (`columns=`) because a positional axis argument to drop()
# is rejected by pandas 2.x.
venues_grouped_clustering = venues_grouped.drop(columns='Neighborhood_Name')

# run k-means clustering (fixed random_state for repeatable labels)
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(venues_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_ 


array([2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 0, 3, 1, 3,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 2, 1, 1,
       1, 1, 3, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1,
       1, 1, 1, 1, 3, 1, 1, 1, 3, 4, 1, 2, 2, 3, 1, 1, 1, 3, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1], dtype=int32)

In [73]:
# Attach the k-means label of each row as a trailing 'cluster' column.
venues_grouped = venues_grouped.assign(cluster=kmeans.labels_)
venues_grouped.head(5)

Unnamed: 0,Neighborhood_Name,Accessories,Adult Boutique,Afghan,Airport,Airport Service,American,Antiques,Apparel,Aquarium,...,Video Games,Video Store,Vietnamese,Warehouse Store,Wine Bar,Wings,Women's Store,Yoga Studio,Yogurt,cluster
0,"Adelaide, King, Richmond",0,0,0,0,0,4,0,3,0,...,0,0,0,0,1,0,1,0,0,2
1,Agincourt,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,"Agincourt North, L'Amoreaux East, Milliken, St...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,"Alderwood, Long Branch",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [74]:
# Number of neighbourhoods (rows) that take part in the clustering.
venues_grouped.shape[0]

99

In [75]:
# Add placeholder longitude, latitude and borough columns directly after the
# neighbourhood name (positions 1 to 3); they are filled in afterwards.
for position, (column_name, placeholder) in enumerate(
        [('longitude', 0.0), ('latitude', 0.0), ('borough', "")], start=1):
    venues_grouped.insert(loc=position, column=column_name, value=placeholder)

In [76]:
# Populate longitude, latitude and borough from finaldf, matching rows by
# neighbourhood NAME. Assigning finaldf's columns directly would align on the
# row index instead, pairing each neighbourhood with the coordinates of an
# unrelated postal code (the two frames are sorted differently).
# When a neighbourhood name occurs under several postal codes, the first
# occurrence in finaldf is used.
coords_by_name = finaldf.drop_duplicates(subset='Neighborhood').set_index('Neighborhood')
known = venues_grouped.Neighborhood_Name.isin(coords_by_name.index)
for target, source_column in [('longitude', 'Longitude'), ('latitude', 'Latitude'), ('borough', 'Borough')]:
    venues_grouped.loc[known, target] = (
        venues_grouped.loc[known, 'Neighborhood_Name'].map(coords_by_name[source_column])
    )

In [77]:
# Add a 'cluster' column to the sorted-venues DataFrame so that we can do the cluster analysis.
# 5 is a placeholder: with kclusters = 5 the real k-means labels run from 0 to 4.
venues_sorted_all.insert(loc=1,column='cluster', value = 5)

In [78]:
# Populate the cluster column by looking each neighbourhood up by NAME in
# venues_grouped, rather than relying on both frames having the same row order.
cluster_by_name = venues_grouped.set_index('Neighborhood_Name')['cluster']
has_cluster = venues_sorted_all.Neighborhood.isin(cluster_by_name.index)
venues_sorted_all.loc[has_cluster, 'cluster'] = (
    venues_sorted_all.loc[has_cluster, 'Neighborhood'].map(cluster_by_name)
)

In [79]:
# This is the final DataFrame used for the cluster analysis: neighbourhood,
# its cluster label, and its ten most frequent venue categories.
venues_sorted_all.head(5)

Unnamed: 0,Neighborhood,cluster,1,2,3,4,5,6,7,8,9,10
0,"Adelaide, King, Richmond",2,Coffee Shop,Café,Thai,American,Steakhouse,Restaurant,Bar,Hotel,Gym,Apparel
1,Agincourt,1,Sandwiches,Breakfast,Skating Rink,Lounge,Accessories,Music Store,New American,Neighborhood,Music Venue,Museum
2,"Agincourt North, L'Amoreaux East, Milliken, St...",1,Playground,Park,Accessories,Moving Target,New American,Neighborhood,Music Venue,Music Store,Museum,Movie Theater
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",1,Grocery Store,Sandwiches,Pizza,Fried Chicken,Pharmacy,Coffee Shop,Beer Store,Liquor Store,Fast Food,Accessories
4,"Alderwood, Long Branch",1,Pizza,Gym,Pool,Dance Studio,Pub,Sandwiches,Coffee Shop,Pharmacy,Skating Rink,Accessories


In [37]:
#creating the map to plot K means clusters
import numpy as np
import matplotlib.cm as cm
import matplotlib.colors as colors

# Centre the map explicitly on Toronto (same coordinates as the first map)
# instead of on whatever `latitude`/`longitude` values an earlier loop left behind.
map_clusters = folium.Map(location=[43.6532, -79.3832], zoom_start =11)

#set color scheme for the clusters: kclusters+1 evenly spaced rainbow colours
# (one more than the number of k-means labels, leaving a colour for the
# placeholder cluster value)
x = np.arange(kclusters+1)
ys = [i+x+(i*x)**2 for i in range(kclusters+1)]  # only len(ys) is used below
print(ys)
colors_array = cm.rainbow(np.linspace(0,1,len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
marker_colors = []

[array([0, 1, 2, 3, 4, 5]), array([ 1,  3,  7, 13, 21, 31]), array([  2,   7,  20,  41,  70, 107]), array([  3,  13,  41,  87, 151, 233]), array([  4,  21,  70, 151, 264, 409]), array([  5,  31, 107, 233, 409, 635])]


In [80]:
# Draw one circle per neighbourhood on the cluster map, coloured by its
# k-means label. The colour index is label - 1, so label 0 wraps around to
# the last entry of `rainbow`.
for lat, lon, poi, cluster in zip(venues_grouped['latitude'], venues_grouped['longitude'], venues_grouped['Neighborhood_Name'], venues_grouped['cluster']):
    marker_colour = rainbow[cluster-1]
    label = folium.Popup('{} Cluster {}'.format(poi, cluster), parse_html=True)
    marker = folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_colour,
        fill=True,
        fill_color=marker_colour,
        fill_opacity=0.7)
    marker.add_to(map_clusters)

map_clusters

## It looks like the map output does not render on GitHub. I have also pasted the direct link in the answers, if you want to take a look.

## Cluster 1

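### Dominated by coffee shops and hotels. Possibly urban areas with a lot of visitors? (Note: "Cluster N" in these headings corresponds to k-means label N-1, so this section shows label 0.)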

In [81]:
# All columns of the neighbourhoods that k-means assigned to label 0.
venues_sorted_all[venues_sorted_all['cluster'] == 0]

Unnamed: 0,Neighborhood,cluster,1,2,3,4,5,6,7,8,9,10
19,Central Bay Street,0,Coffee Shop,Café,Italian,Burgers,Bar,Sandwiches,Salad,Ice Cream,Chinese,Indian
32,"Design Exchange, Toronto Dominion Centre",0,Coffee Shop,Hotel,Café,Restaurant,American,Deli / Bodega,Italian,Gastropub,Gym,Seafood
49,"Harbourfront East, Toronto Islands, Union Station",0,Coffee Shop,Hotel,Aquarium,Pizza,Café,Restaurant,Italian,Scenic Lookout,Brewery,Bakery


## Cluster 2

### Based on a quick analysis, a reasonable hypothesis is these are work/entertainment/business areas of the city. Not spending any more time on analysis of clusters given this whole question counts only for 3 marks

In [42]:
# All columns of the neighbourhoods that k-means assigned to label 1.
venues_sorted_all[venues_sorted_all['cluster'] == 1]

Unnamed: 0,Neighborhood,cluster,1,2,3,4,5,6,7,8,9,10
1,Agincourt,1,Sandwiches,Breakfast,Skating Rink,Lounge,Accessories,Music Store,New American,Neighborhood,Music Venue,Museum
2,"Agincourt North, L'Amoreaux East, Milliken, St...",1,Playground,Park,Accessories,Moving Target,New American,Neighborhood,Music Venue,Music Store,Museum,Movie Theater
3,"Albion Gardens, Beaumond Heights, Humbergate, ...",1,Grocery Store,Sandwiches,Pizza,Fried Chicken,Pharmacy,Coffee Shop,Beer Store,Liquor Store,Fast Food,Accessories
4,"Alderwood, Long Branch",1,Pizza,Gym,Pool,Dance Studio,Pub,Sandwiches,Coffee Shop,Pharmacy,Skating Rink,Accessories
5,"Bathurst Manor, Downsview North, Wilson Heights",1,Coffee Shop,Yogurt,Grocery Store,Pizza,Diner,Pharmacy,Restaurant,Sandwiches,Fast Food,Bridal
6,Bayview Village,1,Japanese,Chinese,Bank,Café,Motel,Movie Theater,Moving Target,Museum,Music Store,Opera House
7,"Bedford Park, Lawrence Manor East",1,Fast Food,Sushi,Italian,Juice Bar,Coffee Shop,Pub,Restaurant,Butcher,Café,Sandwiches
9,"Birch Cliff, Cliffside West",1,Café,Stadium,Entertainment,Skating Rink,Museum,New American,Neighborhood,Music Venue,Music Store,Accessories
10,"Bloordale Gardens, Eringate, Markland Wood, Ol...",1,Pharmacy,Beer Store,Liquor Store,Pizza,Café,Convenience Store,Motel,Molecular Gastronomy,Movie Theater,Moving Target
11,"Brockton, Exhibition Place, Parkdale Village",1,Breakfast,Café,Coffee Shop,Furniture / Home,Office,Grocery Store,Gym,Climbing Gym,Performing Arts,Caribbean


## Cluster 3

In [82]:
# All columns of the neighbourhoods that k-means assigned to label 2.
venues_sorted_all[venues_sorted_all['cluster'] == 2]

Unnamed: 0,Neighborhood,cluster,1,2,3,4,5,6,7,8,9,10
0,"Adelaide, King, Richmond",2,Coffee Shop,Café,Thai,American,Steakhouse,Restaurant,Bar,Hotel,Gym,Apparel
27,"Commerce Court, Victoria Hotel",2,Coffee Shop,Café,Hotel,Restaurant,American,Bakery,Gastropub,Gym,Seafood,Steakhouse
43,"First Canadian Place, Underground city",2,Coffee Shop,Café,Hotel,Restaurant,American,Deli / Bodega,Bakery,Seafood,Gastropub,Gym
80,St. James Town,2,Coffee Shop,Restaurant,Café,Hotel,Apparel,Bakery,Breakfast,Park,Gastropub,Cosmetics
81,Stn A PO Boxes 25 The Esplanade,2,Coffee Shop,Restaurant,Café,Seafood,Pub,Beer Bar,Cocktail,Hotel,Italian,Fast Food


## Cluster 4

In [44]:
# All columns of the neighbourhoods that k-means assigned to label 3.
venues_sorted_all[venues_sorted_all['cluster'] == 3]

Unnamed: 0,Neighborhood,cluster,1,2,3,4,5,6,7,8,9,10
8,Berczy Park,3,Coffee Shop,Restaurant,Cocktail,Seafood,Steakhouse,Farmer's Market,Bakery,Pub,Beer Bar,Café
15,"Cabbagetown, St. James Town",3,Restaurant,Coffee Shop,Italian,Bakery,Pub,Pizza,Café,Market,Chinese,Gift Shop
20,"Chinatown, Grange Park, Kensington Market",3,Bar,Café,Vegetarian / Vegan,Vietnamese,Coffee Shop,Dumplings,Bakery,Chinese,Mexican,Burgers
22,Church and Wellesley,3,Japanese,Sushi,Coffee Shop,Gay Bar,Restaurant,Burgers,Men's Store,Bubble Tea,Gastropub,Pub
48,"Harbord, University of Toronto",3,Café,Gym,Coffee Shop,Restaurant,Bar,Bakery,Japanese,Bookstore,Noodles,Chinese
50,"Harbourfront, Regent Park",3,Coffee Shop,Bakery,Park,Pub,Café,Theater,Mexican,Breakfast,Restaurant,Farmer's Market
64,"Little Portugal, Trinity",3,Bar,Men's Store,Coffee Shop,Asian,Restaurant,Café,Pizza,Bakery,Cocktail,Vietnamese
73,Queen's Park,3,Coffee Shop,Gym,Japanese,Sushi,Diner,Bar,Nightclub,Smoothie Shop,Fast Food,Sandwiches
77,"Runnymede, Swansea",3,Pizza,Café,Coffee Shop,Sushi,Diner,Italian,Gastropub,Restaurant,Bookstore,South American
82,Studio District,3,Café,Coffee Shop,American,Italian,Bakery,Brewery,Seafood,Neighborhood,Park,Sandwiches


## Cluster 5

In [45]:
# All columns of the neighbourhoods that k-means assigned to label 4.
venues_sorted_all[venues_sorted_all['cluster'] == 4]

Unnamed: 0,Neighborhood,cluster,1,2,3,4,5,6,7,8,9,10
42,"Fairview, Henry Farm, Oriole",4,Apparel,Fast Food,Coffee Shop,Toys & Games,Restaurant,Electronics,Kids Store,Tea Room,Bakery,Food Court
78,"Ryerson, Garden District",4,Coffee Shop,Apparel,Café,Cosmetics,Middle Eastern,Theater,Diner,Restaurant,Ramen,Plaza
