## Jupyter notebook for Applied Data Science Capstone project

#### Importing necessary libraries

In [207]:
import pandas as pd
import numpy as np
import geocoder
import folium
import os
import requests
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

## Part 1 -Toronto postal codes

#### I used Pandas *read_html* method to read table from wikipedia

In [7]:
df_list = pd.read_html('https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M')

#### This method return *list*, and correct dataframe is it's first element

In [8]:
df_postal_codes = df_list[0]

In [9]:
df_postal_codes.head()

Unnamed: 0,Postal Code,Borough,Neighborhood
0,M1A,Not assigned,
1,M2A,Not assigned,
2,M3A,North York,Parkwoods
3,M4A,North York,Victoria Village
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### **Boroughs** with *Not assigned* value are filtered

In [10]:
df_postal_codes = df_postal_codes[~(df_postal_codes['Borough']=='Not assigned')]

#### I checked if there is no more missing values

In [11]:
df_postal_codes.isnull().sum()

Postal Code     0
Borough         0
Neighborhood    0
dtype: int64

In [12]:
df_postal_codes[df_postal_codes['Neighborhood']=='Not assigned'].count()

Postal Code     0
Borough         0
Neighborhood    0
dtype: int64

#### I checked if **Borough** with **Postal Code** M5A has two **Neighborhoods** assigned

In [13]:
df_postal_codes[df_postal_codes['Postal Code']=='M5A']

Unnamed: 0,Postal Code,Borough,Neighborhood
4,M5A,Downtown Toronto,"Regent Park, Harbourfront"


#### Shape of dataframe:

In [14]:
df_postal_codes.shape

(103, 3)

## Part 2 -neighborhoods coordinates

#### Google Maps API did not work, so I decided to test of *geocoder* package with data provider *arcgis* and it worked

In [15]:
 g = geocoder.arcgis('M5G, Toronto, Ontario')

In [16]:
g.geojson

{'type': 'FeatureCollection',
 'features': [{'type': 'Feature',
   'properties': {'address': 'M5G',
    'bbox': [-79.39065318999997,
     43.65107218800005,
     -79.38065318999998,
     43.661072188000055],
    'confidence': 7,
    'lat': 43.65607218800005,
    'lng': -79.38565318999997,
    'ok': True,
    'quality': 'Postal',
    'raw': {'name': 'M5G',
     'extent': {'xmin': -79.39065318999997,
      'ymin': 43.65107218800005,
      'xmax': -79.38065318999998,
      'ymax': 43.661072188000055},
     'feature': {'geometry': {'x': -79.38565318999997, 'y': 43.65607218800005},
      'attributes': {'Score': 100, 'Addr_Type': 'Postal'}}},
    'score': 100,
    'status': 'OK'},
   'bbox': [-79.39065318999997,
    43.65107218800005,
    -79.38065318999998,
    43.661072188000055],
   'geometry': {'type': 'Point',
    'coordinates': [-79.38565318999997, 43.65607218800005]}}]}

#### Retrieving geodata for all postal codes

In [17]:
postal_codes_coordinates=[]

In [18]:
for postal_code in df_postal_codes['Postal Code']:
    address = '{}, Toronto, Ontario'.format(postal_code)
    postal_code_coordinates = geocoder.arcgis(address)
    postal_codes_coordinates.append(postal_code_coordinates)

#### Verification of retrieved data.

In [19]:
len(postal_codes_coordinates)

103

In [20]:
df_postal_codes['Postal Code']

2      M3A
3      M4A
4      M5A
5      M6A
6      M7A
      ... 
160    M8X
165    M4Y
168    M7Y
169    M8Y
178    M8Z
Name: Postal Code, Length: 103, dtype: object

#### Extracting coordinates from geodata to separate lists.

In [21]:
longitudes = []
latitudes = []

In [22]:
for coordinates in postal_codes_coordinates:
    longitude = coordinates.geojson['features'][0]['geometry']['coordinates'][0]
    longitudes.append(longitude)
    
    latitude = coordinates.geojson['features'][0]['geometry']['coordinates'][1]
    latitudes.append(latitude)

#### Adding coordinates to dataframe and verification.

In [23]:
df_postal_codes.reset_index(inplace=True)

In [24]:
df_postal_codes['Latitude'] = latitudes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [25]:
df_postal_codes['Longitude'] = longitudes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [26]:
df_postal_codes.isnull().sum()

index           0
Postal Code     0
Borough         0
Neighborhood    0
Latitude        0
Longitude       0
dtype: int64

In [141]:
df_postal_codes.drop('index', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


#### Saving dataframe for potential future use.

In [28]:
df_postal_codes.to_csv('Toronto_postal_codes.csv')

### Part 3 -clustering neighborhoods

##### Creating map of Toronto with created postal codes coordinates.

In [30]:
toronto_coordinates = [43.7325, -79.3993]
map_toronto = folium.Map(location=toronto_coordinates, zoom_start=10)

# add markers to map
for lat, lng, postal_code, borough, neighborhood in zip(df_postal_codes['Latitude'], df_postal_codes['Longitude'],df_postal_codes['Postal Code'], df_postal_codes['Borough'], df_postal_codes['Neighborhood']):
    label = '{}, {}, {}'.format(postal_code,neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_toronto)  
    
map_toronto

#### Preparing parameters for url request.

In [47]:
CLIENT_ID = os.environ.get('FOURSQUAREID')
CLIENT_SECRET = os.environ.get('FOURSQUARESECRET')
VERSION = '20180605'
LIMIT = 100

##### Function for retrieving nearby venues.

In [48]:
def get_nearby_venues(names, latitudes, longitudes, radius=500):
    venues_list = []
    
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)

        results = requests.get(url).json()["response"]['groups'][0]['items']
        #print(results)
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
              'Neighborhood Latitude', 
              'Neighborhood Longitude', 
              'Venue', 
              'Venue Latitude', 
              'Venue Longitude', 
              'Venue Category']

    return(nearby_venues)

#### Venues for all neighborhoods.

In [228]:
toronto_venues = get_nearby_venues(names=df_postal_codes['Neighborhood'],
                                   latitudes=df_postal_codes['Latitude'],
                                   longitudes=df_postal_codes['Longitude']
                                  )

Parkwoods
Victoria Village
Regent Park, Harbourfront
Lawrence Manor, Lawrence Heights
Queen's Park, Ontario Provincial Government
Islington Avenue, Humber Valley Village
Malvern, Rouge
Don Mills
Parkview Hill, Woodbine Gardens
Garden District, Ryerson
Glencairn
West Deane Park, Princess Gardens, Martin Grove, Islington, Cloverdale
Rouge Hill, Port Union, Highland Creek
Don Mills
Woodbine Heights
St. James Town
Humewood-Cedarvale
Eringate, Bloordale Gardens, Old Burnhamthorpe, Markland Wood
Guildwood, Morningside, West Hill
The Beaches
Berczy Park
Caledonia-Fairbanks
Woburn
Leaside
Central Bay Street
Christie
Cedarbrae
Hillcrest Village
Bathurst Manor, Wilson Heights, Downsview North
Thorncliffe Park
Richmond, Adelaide, King
Dufferin, Dovercourt Village
Scarborough Village
Fairview, Henry Farm, Oriole
Northwood Park, York University
East Toronto, Broadview North (Old East York)
Harbourfront East, Union Station, Toronto Islands
Little Portugal, Trinity
Kennedy Park, Ionview, East Birchmo

In [229]:
toronto_venues.shape

(2274, 7)

In [230]:
toronto_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Parkwoods,43.752935,-79.335641,Brookbanks Park,43.751976,-79.33214,Park
1,Parkwoods,43.752935,-79.335641,Variety Store,43.751974,-79.333114,Food & Drink Shop
2,Parkwoods,43.752935,-79.335641,649 Variety,43.754513,-79.331942,Convenience Store
3,Victoria Village,43.728102,-79.31189,Tim Hortons,43.725517,-79.313103,Coffee Shop
4,Victoria Village,43.728102,-79.31189,Portugril,43.725819,-79.312785,Portuguese Restaurant


#### Saving dataframe for potential future use.

In [231]:
toronto_venues.to_csv('toronto_venues.csv')

In [232]:
#toronto_venues = pd.read_csv('toronto_venues.csv')

##### Inspecting retrieved data.

In [233]:
toronto_venues.groupby('Neighborhood').count().sort_values(by='Venue')

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Roselawn,1,1,1,1,1,1
"East Toronto, Broadview North (Old East York)",1,1,1,1,1,1
"Forest Hill North & West, Forest Hill Road Park",1,1,1,1,1,1
"Wexford, Maryvale",1,1,1,1,1,1
"Malvern, Rouge",1,1,1,1,1,1
...,...,...,...,...,...,...
Canada Post Gateway Processing Centre,100,100,100,100,100,100
"Business reply mail Processing Centre, South Central Letter Processing Plant Toronto",100,100,100,100,100,100
"Garden District, Ryerson",100,100,100,100,100,100
"First Canadian Place, Underground city",100,100,100,100,100,100


In [234]:
len(toronto_venues.Neighborhood.unique())

97

In [235]:
toronto_venues.groupby('Venue Category').count().sort_values(by='Venue', ascending=False)

Unnamed: 0_level_0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude
Venue Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Coffee Shop,186,186,186,186,186,186
Café,96,96,96,96,96,96
Restaurant,71,71,71,71,71,71
Park,58,58,58,58,58,58
Pizza Place,53,53,53,53,53,53
...,...,...,...,...,...,...
Pilates Studio,1,1,1,1,1,1
Golf Course,1,1,1,1,1,1
Government Building,1,1,1,1,1,1
Peruvian Restaurant,1,1,1,1,1,1


#### Some venues had category *Neighborhood*, and it coused problems in analyzing data, do I decided remove this rows.

In [236]:
(toronto_venues[['Venue Category']]=='Neighborhood').any()

Venue Category    True
dtype: bool

In [237]:
toronto_venues = toronto_venues.loc[~(toronto_venues['Venue Category']=='Neighborhood')]

#### one hot encoding

In [238]:
toronto_onehot = pd.get_dummies(toronto_venues[['Venue Category']], prefix="", prefix_sep="")
toronto_onehot.head()

Unnamed: 0,ATM,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,...,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Concatenate Neighborhood and onehot

In [239]:
toronto_neighborhood_df = pd.DataFrame(toronto_venues['Neighborhood'])
toronto_onehot= pd.concat([toronto_neighborhood_df,toronto_onehot], axis=1)
toronto_onehot.head()

Unnamed: 0,Neighborhood,ATM,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Parkwoods,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Victoria Village,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [240]:
toronto_onehot.shape

(2270, 265)

#### Grouping data by *Neighborhood*

In [241]:
toronto_grouped = toronto_onehot.groupby('Neighborhood').mean().reset_index()
toronto_grouped.head()

Unnamed: 0,Neighborhood,ATM,Accessories Store,Afghan Restaurant,Airport,American Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Vegetarian / Vegan Restaurant,Veterinarian,Video Game Store,Video Store,Vietnamese Restaurant,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Agincourt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Alderwood, Long Branch",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Bathurst Manor, Wilson Heights, Downsview North",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0
3,Bayview Village,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Bedford Park, Lawrence Manor East",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [242]:
toronto_grouped.shape

(97, 265)

#### Printing top venues for each neighborhood.

In [243]:
num_top_venues = 5

for hood in toronto_grouped['Neighborhood']:
    print("----"+hood+"----")
    temp = toronto_grouped[toronto_grouped['Neighborhood'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')

----Agincourt----
              venue  freq
0    Breakfast Spot   0.2
1       Supermarket   0.2
2   Badminton Court   0.2
3  Sushi Restaurant   0.2
4      Skating Rink   0.2


----Alderwood, Long Branch----
               venue  freq
0     Sandwich Place  0.11
1        Coffee Shop  0.11
2  Convenience Store  0.11
3                Pub  0.11
4        Pizza Place  0.11


----Bathurst Manor, Wilson Heights, Downsview North----
                 venue  freq
0                 Bank  0.11
1          Coffee Shop  0.11
2       Ice Cream Shop  0.05
3           Restaurant  0.05
4  Fried Chicken Joint  0.05


----Bayview Village----
                        venue  freq
0  Construction & Landscaping   0.5
1                       Trail   0.5
2                      Museum   0.0
3                Noodle House   0.0
4                Night Market   0.0


----Bedford Park, Lawrence Manor East----
                venue  freq
0      Sandwich Place  0.10
1         Coffee Shop  0.10
2  Italian Restaurant  0.10
3

4  New American Restaurant   0.0


----Mimico NW, The Queensway West, South of Bloor, Kingsway Park South West, Royal York South West----
               venue  freq
0      Burrito Place  0.14
1       Burger Joint  0.07
2  Fish & Chips Shop  0.07
3    Thai Restaurant  0.07
4               Bank  0.07


----Moore Park, Summerhill East----
                     venue  freq
0             Tennis Court  0.25
1                     Park  0.25
2               Playground  0.25
3                      Gym  0.25
4  New American Restaurant  0.00


----New Toronto, Mimico South, Humber Bay Shores----
           venue  freq
0            ATM   0.2
1  Grocery Store   0.2
2   Skating Rink   0.2
3           Park   0.2
4    Yoga Studio   0.2


----North Park, Maple Leaf Park, Upwood Park----
                     venue  freq
0                   Bakery  0.33
1         Basketball Court  0.33
2                     Park  0.33
3                      ATM  0.00
4  North Indian Restaurant  0.00


----North Toronto We

#### Top 10 venues in neighborhoods.

In [244]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [265]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = toronto_grouped['Neighborhood']

for ind in np.arange(toronto_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(toronto_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Agincourt,Breakfast Spot,Supermarket,Badminton Court,Skating Rink,Sushi Restaurant,Fast Food Restaurant,Farmers Market,Field,Farm,Dog Run
1,"Alderwood, Long Branch",Pizza Place,Pub,Coffee Shop,Gas Station,Gym,Athletics & Sports,Convenience Store,Pharmacy,Sandwich Place,Farm
2,"Bathurst Manor, Wilson Heights, Downsview North",Coffee Shop,Bank,Restaurant,Sushi Restaurant,Diner,Gas Station,Supermarket,Sandwich Place,Deli / Bodega,Middle Eastern Restaurant
3,Bayview Village,Construction & Landscaping,Trail,Yoga Studio,Falafel Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Farm
4,"Bedford Park, Lawrence Manor East",Italian Restaurant,Sandwich Place,Coffee Shop,Indian Restaurant,Butcher,Liquor Store,Sports Club,Café,Sushi Restaurant,Juice Bar


#### Grouping neighborhoods in 5 clusters.

In [266]:
kclusters = 5

toronto_grouped_clustering = toronto_grouped.drop('Neighborhood', 1)

kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(toronto_grouped_clustering)

kmeans.labels_[0:10]

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1], dtype=int32)

In [267]:
neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

toronto_merged = df_postal_codes.iloc[:,1:]

toronto_merged = toronto_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')

toronto_merged.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,North York,Parkwoods,43.752935,-79.335641,4.0,Food & Drink Shop,Convenience Store,Park,Falafel Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Farm
1,North York,Victoria Village,43.728102,-79.31189,1.0,Coffee Shop,Pizza Place,Portuguese Restaurant,Park,Intersection,French Restaurant,Ethiopian Restaurant,Dog Run,Donut Shop,Dumpling Restaurant
2,Downtown Toronto,"Regent Park, Harbourfront",43.650964,-79.353041,1.0,Pub,Coffee Shop,Café,Athletics & Sports,Bakery,Bank,Tech Startup,Chocolate Shop,Thai Restaurant,Theater
3,North York,"Lawrence Manor, Lawrence Heights",43.723265,-79.451211,1.0,Clothing Store,Cosmetics Shop,Pharmacy,Food Court,Restaurant,Toy / Game Store,Bookstore,American Restaurant,Men's Store,Furniture / Home Store
4,Downtown Toronto,"Queen's Park, Ontario Provincial Government",43.66179,-79.38939,3.0,Coffee Shop,Café,Sandwich Place,Italian Restaurant,Sushi Restaurant,Fried Chicken Joint,Bookstore,Smoothie Shop,Burrito Place,Yoga Studio


#### Verification of results. Because 2 rows dataframe contained some NA values, I decided to drop them. 

In [268]:
toronto_merged.isnull().sum()

Borough                   0
Neighborhood              0
Latitude                  0
Longitude                 0
Cluster Labels            2
1st Most Common Venue     2
2nd Most Common Venue     2
3rd Most Common Venue     2
4th Most Common Venue     2
5th Most Common Venue     2
6th Most Common Venue     2
7th Most Common Venue     2
8th Most Common Venue     2
9th Most Common Venue     2
10th Most Common Venue    2
dtype: int64

In [269]:
toronto_merged.loc[toronto_merged['Cluster Labels'].isnull()]

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
12,Scarborough,"Rouge Hill, Port Union, Highland Creek",43.785779,-79.157368,,,,,,,,,,,
95,Scarborough,Upper Rouge,43.834768,-79.204101,,,,,,,,,,,


In [270]:
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    '43.834768', 
    '-79.204101', 
    500, 
    LIMIT)

print(requests.get(url).json())



In [271]:
toronto_merged = toronto_merged.loc[~toronto_merged['Cluster Labels'].isnull()]

#### Creating map with neighborhoods markers.

In [272]:
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

markers_colors = []
for lat, lon, poi, cluster in zip(toronto_merged['Latitude'], toronto_merged['Longitude'], toronto_merged['Neighborhood'], toronto_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[int(cluster)-1],
        fill=True,
        fill_color=rainbow[int(cluster)-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

#### Examing clusters.

#### First cluster contain only 3 postal codes, and it seems Foursquare returned same results for two of them.

In [273]:
c0 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 0, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
c0

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
6,"Malvern, Rouge",Trail,Yoga Studio,Ethiopian Restaurant,Dog Run,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Falafel Restaurant,Distribution Center
26,Cedarbrae,Construction & Landscaping,Trail,Yoga Studio,Falafel Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Farm
39,Bayview Village,Construction & Landscaping,Trail,Yoga Studio,Falafel Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Farm


In [275]:
df_postal_codes.iloc[[26,39]]

Unnamed: 0,Postal Code,Borough,Neighborhood,Latitude,Longitude
26,M1H,Scarborough,Cedarbrae,43.768791,-79.238813
39,M2K,North York,Bayview Village,43.780607,-79.376921


#### Next cluster contain much more postal codes, and most popular venue is Coffee Shop

In [289]:
c1 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 1, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
c1.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Victoria Village,Coffee Shop,Pizza Place,Portuguese Restaurant,Park,Intersection,French Restaurant,Ethiopian Restaurant,Dog Run,Donut Shop,Dumpling Restaurant
2,"Regent Park, Harbourfront",Pub,Coffee Shop,Café,Athletics & Sports,Bakery,Bank,Tech Startup,Chocolate Shop,Thai Restaurant,Theater
3,"Lawrence Manor, Lawrence Heights",Clothing Store,Cosmetics Shop,Pharmacy,Food Court,Restaurant,Toy / Game Store,Bookstore,American Restaurant,Men's Store,Furniture / Home Store
8,"Parkview Hill, Woodbine Gardens",Fast Food Restaurant,Pizza Place,Gastropub,Rock Climbing Spot,Pharmacy,Pet Store,Gym / Fitness Center,Breakfast Spot,Athletics & Sports,Bank
9,"Garden District, Ryerson",Coffee Shop,Clothing Store,Middle Eastern Restaurant,Sandwich Place,Italian Restaurant,Cosmetics Shop,Hotel,Restaurant,Café,Bar


In [290]:
c1.shape

(69, 11)

#### Checking how many times given type of venue appears in top 10

In [291]:
c1.iloc[:,1:].stack().value_counts()[:5]

Coffee Shop       49
Café              28
Sandwich Place    24
Restaurant        24
Pizza Place       24
dtype: int64

#### Checking how many times given type of venue is 1st Most Common Venue

In [292]:
c1['1st Most Common Venue'].value_counts()[:5]

Coffee Shop       22
Pizza Place        7
Café               6
Park               3
Clothing Store     2
Name: 1st Most Common Venue, dtype: int64

#### Cluster with only one postal code

In [293]:
c2 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 2, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
c2.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
68,"Forest Hill North & West, Forest Hill Road Park",Gym / Fitness Center,Yoga Studio,Dog Run,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Falafel Restaurant,Distribution Center


#### Cluster with 10 postal codes, again mostly with Coffee Shops

In [294]:
c3 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 3, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
c3.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
4,"Queen's Park, Ontario Provincial Government",Coffee Shop,Café,Sandwich Place,Italian Restaurant,Sushi Restaurant,Fried Chicken Joint,Bookstore,Smoothie Shop,Burrito Place,Yoga Studio
7,Don Mills,Athletics & Sports,Bank,Burger Joint,Park,Other Great Outdoors,Restaurant,Trail,Coffee Shop,Ethiopian Restaurant,Donut Shop
13,Don Mills,Athletics & Sports,Bank,Burger Joint,Park,Other Great Outdoors,Restaurant,Trail,Coffee Shop,Ethiopian Restaurant,Donut Shop
22,Woburn,Coffee Shop,Korean Restaurant,Falafel Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Yoga Studio,Doctor's Office
45,"York Mills, Silver Hills",Bank,Coffee Shop,Ice Cream Shop,Burger Joint,Supermarket,Butcher,Sandwich Place,Baseball Field,Pharmacy,Cosmetics Shop


In [295]:
c3.iloc[:,1:].stack().value_counts()[:5]

Coffee Shop             8
Ethiopian Restaurant    6
Bank                    5
Donut Shop              5
Park                    5
dtype: int64

In [297]:
c3.shape

(10, 11)

#### In last cluster most popular are Parks

In [309]:
c4 = toronto_merged.loc[toronto_merged['Cluster Labels'] == 4, toronto_merged.columns[[1] + list(range(5, toronto_merged.shape[1]))]]
c4.head()

Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Parkwoods,Food & Drink Shop,Convenience Store,Park,Falafel Restaurant,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Farm
5,"Islington Avenue, Humber Valley Village",Park,Baseball Field,Skating Rink,Falafel Restaurant,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Yoga Studio,Dog Run
16,Humewood-Cedarvale,Hockey Arena,Grocery Store,Trail,Park,Field,Fast Food Restaurant,Farmers Market,Farm,Falafel Restaurant,Ethiopian Restaurant
21,Caledonia-Fairbanks,Park,Women's Store,Sporting Goods Shop,Spa,Bakery,Mexican Restaurant,Gym,Beer Store,Yoga Studio,Ethiopian Restaurant
27,Hillcrest Village,Residential Building (Apartment / Condo),Dog Run,Park,Yoga Studio,Donut Shop,Dumpling Restaurant,Eastern European Restaurant,Electronics Store,Ethiopian Restaurant,Distribution Center


In [305]:
c4.shape

(18, 11)

In [307]:
c4.iloc[:,1:].stack().value_counts()[:10]

Park                           18
Ethiopian Restaurant           13
Electronics Store              11
Eastern European Restaurant    10
Dumpling Restaurant            10
Donut Shop                      9
Yoga Studio                     9
Falafel Restaurant              9
Dog Run                         7
Farm                            7
dtype: int64

In [303]:
c4['1st Most Common Venue'].value_counts()[:5]

Playground        4
Park              4
Breakfast Spot    2
Theme Park        1
Yoga Studio       1
Name: 1st Most Common Venue, dtype: int64