# A Recommender System for Sports Shop

In [1]:
# Importing all my modules
import pandas as pd
import numpy as np
import folium
import json
from pandas.io.json import json_normalize 
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors
import requests

## Postal Codes in Toronto

In [2]:
df = pd.read_csv("tronto.csv")
df = df.loc[(df.Borough != 'Not assigned')] # Removing the not assigned values in Borough

df = df.groupby(['Postcode','Borough'])['Neighbourhood'].apply(",".join).reset_index() # Grouping the Neighbourhood with same Postcode

df.Neighbourhood[df.Neighbourhood == 'Not assigned'] = df.Borough # Replacing the Not Assigned Neighbourhood with Borough name

df = pd.DataFrame(df)
print(df.head(10))

print(df.columns)

print(df.shape)


  Postcode      Borough                                  Neighbourhood
0      M1B  Scarborough                                  Rouge,Malvern
1      M1C  Scarborough           Highland Creek,Rouge Hill,Port Union
2      M1E  Scarborough                Guildwood,Morningside,West Hill
3      M1G  Scarborough                                         Woburn
4      M1H  Scarborough                                      Cedarbrae
5      M1J  Scarborough                            Scarborough Village
6      M1K  Scarborough      East Birchmount Park,Ionview,Kennedy Park
7      M1L  Scarborough                  Clairlea,Golden Mile,Oakridge
8      M1M  Scarborough  Cliffcrest,Cliffside,Scarborough Village West
9      M1N  Scarborough                     Birch Cliff,Cliffside West
Index(['Postcode', 'Borough', 'Neighbourhood'], dtype='object')
(103, 3)


In [3]:
df_2 = pd.read_csv('Geospatial_Coordinates.csv')
df_2 = df_2.rename(columns={'Postal Code': 'Postcode'})

#print(df_2.head())
df_3 =  pd.merge(df, df_2, on='Postcode')
#print(df_3)

df_toronto = df_3
print(df_toronto.head())

  Postcode      Borough                         Neighbourhood   Latitude  \
0      M1B  Scarborough                         Rouge,Malvern  43.806686   
1      M1C  Scarborough  Highland Creek,Rouge Hill,Port Union  43.784535   
2      M1E  Scarborough       Guildwood,Morningside,West Hill  43.763573   
3      M1G  Scarborough                                Woburn  43.770992   
4      M1H  Scarborough                             Cedarbrae  43.773136   

   Longitude  
0 -79.194353  
1 -79.160497  
2 -79.188711  
3 -79.216917  
4 -79.239476  


## Create a Map of Toronto City (with its Postal Codes' Regions)

In [4]:
# for the city Toronto, latitude and longtitude are manually extracted via google search
toronto_latitude = 43.6932; toronto_longitude = -79.3832
map_toronto = folium.Map(location = [toronto_latitude, toronto_longitude], zoom_start = 10.7)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighbourhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    

map_toronto

In [41]:
%%html
<img src ="IBM Capstone (Toronto).png",height = 200,width=500>

## Focusing on the "North York" Borough in Toronto (its neighborhoods)

In [5]:

# selecting only neighborhoods regarding to "North York" borough.
north_york_data = df_toronto[df_toronto['Borough'] == 'North York']
north_york_data = north_york_data.reset_index(drop=True)
north_york_data.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M2H,North York,Hillcrest Village,43.803762,-79.363452
1,M2J,North York,"Fairview,Henry Farm,Oriole",43.778517,-79.346556
2,M2K,North York,Bayview Village,43.786947,-79.385975
3,M2L,North York,"Silver Hills,York Mills",43.75749,-79.374714
4,M2M,North York,"Newtonbrook,Willowdale",43.789053,-79.408493


## Create a Map of North York and Its Neighbourhoods

In [6]:

address_scar = 'North York, Toronto'
latitude_scar = 43.7615
longitude_scar = -79.4111
print('The geograpical coordinate of "North York" are: {}, {}.'.format(latitude_scar, longitude_scar))

map_North_York = folium.Map(location=[latitude_scar, longitude_scar], zoom_start=11.5)

# add markers to map
for lat, lng, label in zip(north_york_data['Latitude'],north_york_data['Longitude'],north_york_data['Neighbourhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius = 10,
        popup = label,
        color ='blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7).add_to(map_North_York)  
    
map_North_York

The geograpical coordinate of "North York" are: 43.7615, -79.4111.


In [42]:
%%html
<img src ="IBM Capstone (North York).png",height = 200,width=500>

In [7]:
def foursquare_crawler (postal_code_list, neighborhood_list, lat_list, lng_list, LIMIT = 500, radius = 1000):
    result_ds = []
    counter = 0
    for postal_code, neighborhood, lat, lng in zip(postal_code_list, neighborhood_list, lat_list, lng_list):
         
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, 
            lat, lng, radius, LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        tmp_dict = {}
        tmp_dict['Postal Code'] = postal_code; tmp_dict['Neighborhood(s)'] = neighborhood; 
        tmp_dict['Latitude'] = lat; tmp_dict['Longitude'] = lng;
        tmp_dict['Crawling_result'] = results;
        result_ds.append(tmp_dict)
        counter += 1
        print('{}.'.format(counter))
        print('Data is Obtained, for the Postal Code {} (and Neighborhoods {}) SUCCESSFULLY.'.format(postal_code, neighborhood))
    return result_ds;

In [8]:
CLIENT_SECRET = 'RNHJ0JQKZOH2OL43TB41XR2GTHPPW4FJOB3QXW2V122AGTZK'
CLIENT_ID = 'CE4EKZPXO1IGFXYKXIDIJ2CSCWCS4QPXU5LWYHOQHWFB442H' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: CE4EKZPXO1IGFXYKXIDIJ2CSCWCS4QPXU5LWYHOQHWFB442H
CLIENT_SECRET:RNHJ0JQKZOH2OL43TB41XR2GTHPPW4FJOB3QXW2V122AGTZK


## Crawling Internet (in fact only Foursquare database) for
## Venues in the Neighborhoods inside "North York"

In [9]:
print('Crawling different neighborhoods inside "North York"')
North_York_dataset = foursquare_crawler(list(north_york_data['Postcode']),
                                                   list(north_york_data['Neighbourhood']),
                                                   list(north_york_data['Latitude']),
                                                   list(north_york_data['Longitude']),)

Crawling different neighborhoods inside "North York"
1.
Data is Obtained, for the Postal Code M2H (and Neighborhoods Hillcrest Village) SUCCESSFULLY.
2.
Data is Obtained, for the Postal Code M2J (and Neighborhoods Fairview,Henry Farm,Oriole) SUCCESSFULLY.
3.
Data is Obtained, for the Postal Code M2K (and Neighborhoods Bayview Village) SUCCESSFULLY.
4.
Data is Obtained, for the Postal Code M2L (and Neighborhoods Silver Hills,York Mills) SUCCESSFULLY.
5.
Data is Obtained, for the Postal Code M2M (and Neighborhoods Newtonbrook,Willowdale) SUCCESSFULLY.
6.
Data is Obtained, for the Postal Code M2N (and Neighborhoods Willowdale South) SUCCESSFULLY.
7.
Data is Obtained, for the Postal Code M2P (and Neighborhoods York Mills West) SUCCESSFULLY.
8.
Data is Obtained, for the Postal Code M2R (and Neighborhoods Willowdale West) SUCCESSFULLY.
9.
Data is Obtained, for the Postal Code M3A (and Neighborhoods Parkwoods) SUCCESSFULLY.
10.
Data is Obtained, for the Postal Code M3B (and Neighborhoods Don 

### Saving results of Foursquare

In [10]:
import pickle
with open("North_York_foursquare_dataset.txt", "wb") as fp:   #Pickling
    pickle.dump(North_York_dataset, fp)
print('Received Data from Internet is Saved to Computer.')

Received Data from Internet is Saved to Computer.


In [11]:
with open("North_York_foursquare_dataset.txt", "rb") as fp:   # Unpickling
    North_York_dataset = pickle.load(fp)
# print(type(Scarborough_foursquare_dataset))
# Scarborough_foursquare_dataset

## Cleaning the RAW Data Received from Foursquare Database

In [12]:
# This function is created to connect to the saved list which is the received database. It will extract each venue 
# for every neighborhood inside the database

def get_venue_dataset(foursquare_dataset):
    result_df = pd.DataFrame(columns = ['Postal Code', 'Neighborhood', 
                                           'Neighborhood Latitude', 'Neighborhood Longitude',
                                          'Venue', 'Venue Summary', 'Venue Category', 'Distance'])
    # print(result_df)
    
    for neigh_dict in foursquare_dataset:
        postal_code = neigh_dict['Postal Code']; neigh = neigh_dict['Neighborhood(s)']
        lat = neigh_dict['Latitude']; lng = neigh_dict['Longitude']
        print('Number of Venuse in Coordination "{}" Posal Code and "{}" Negihborhood(s) is:'.format(postal_code, neigh))
        print(len(neigh_dict['Crawling_result']))
        
        for venue_dict in neigh_dict['Crawling_result']:
            summary = venue_dict['reasons']['items'][0]['summary']
            name = venue_dict['venue']['name']
            dist = venue_dict['venue']['location']['distance']
            cat =  venue_dict['venue']['categories'][0]['name']
            
            
            # print({'Postal Code': postal_code, 'Neighborhood': neigh, 
            #                   'Neighborhood Latitude': lat, 'Neighborhood Longitude':lng,
            #                   'Venue': name, 'Venue Summary': summary, 
            #                   'Venue Category': cat, 'Distance': dist})
            
            result_df = result_df.append({'Postal Code': postal_code, 'Neighborhood': neigh, 
                              'Neighborhood Latitude': lat, 'Neighborhood Longitude':lng,
                              'Venue': name, 'Venue Summary': summary, 
                              'Venue Category': cat, 'Distance': dist}, ignore_index = True)
            # print(result_df)
    
    return(result_df)

In [13]:
north_york_venues = get_venue_dataset(North_York_dataset)

Number of Venuse in Coordination "M2H" Posal Code and "Hillcrest Village" Negihborhood(s) is:
22
Number of Venuse in Coordination "M2J" Posal Code and "Fairview,Henry Farm,Oriole" Negihborhood(s) is:
44
Number of Venuse in Coordination "M2K" Posal Code and "Bayview Village" Negihborhood(s) is:
15
Number of Venuse in Coordination "M2L" Posal Code and "Silver Hills,York Mills" Negihborhood(s) is:
4
Number of Venuse in Coordination "M2M" Posal Code and "Newtonbrook,Willowdale" Negihborhood(s) is:
31
Number of Venuse in Coordination "M2N" Posal Code and "Willowdale South" Negihborhood(s) is:
100
Number of Venuse in Coordination "M2P" Posal Code and "York Mills West" Negihborhood(s) is:
14
Number of Venuse in Coordination "M2R" Posal Code and "Willowdale West" Negihborhood(s) is:
11
Number of Venuse in Coordination "M3A" Posal Code and "Parkwoods" Negihborhood(s) is:
29
Number of Venuse in Coordination "M3B" Posal Code and "Don Mills North" Negihborhood(s) is:
31
Number of Venuse in Coordin

## Showing Venues for Each Neighborhood in North York

In [14]:
north_york_venues.head()

Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
0,M2H,Hillcrest Village,43.803762,-79.363452,고려삼계탕 Korean Ginseng Chicken Soup & Bibimbap,This spot is popular,Korean Restaurant,754
1,M2H,Hillcrest Village,43.803762,-79.363452,Tastee,This spot is popular,Bakery,692
2,M2H,Hillcrest Village,43.803762,-79.363452,Cummer Park,This spot is popular,Park,776
3,M2H,Hillcrest Village,43.803762,-79.363452,Galati,This spot is popular,Grocery Store,815
4,M2H,Hillcrest Village,43.803762,-79.363452,Tim Hortons,This spot is popular,Coffee Shop,731


In [15]:
north_york_venues.tail()

Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
604,M9M,"Emery,Humberlea",43.724766,-79.532242,auctionmaxx,This spot is popular,Discount Store,625
605,M9M,"Emery,Humberlea",43.724766,-79.532242,Fiera Foods,This spot is popular,Bakery,823
606,M9M,"Emery,Humberlea",43.724766,-79.532242,Hwy 401 & Hwy 400,This spot is popular,Intersection,875
607,M9M,"Emery,Humberlea",43.724766,-79.532242,Joseph Bannon Park,This spot is popular,Park,889
608,M9M,"Emery,Humberlea",43.724766,-79.532242,U-Haul at Weston Rd,This spot is popular,Storage Facility,917


## End of Processing the Retrieved Information from Foursquare
## Saving a Cleaned Version of DataFrame as the Results from Foursquare

In [16]:
north_york_venues.to_csv('north_york_venues.csv')

### Loading Data from File (Saved "Foursquare " DataFrame for Venues)

In [17]:
north_york_venues = pd.read_csv('north_york_venues.csv')

### Some Summary Information about Neighborhoods inside "North York"

In [18]:
neigh_list = list(north_york_venues['Neighborhood'].unique())
print('Number of Neighborhoods inside North York:')
print(len(neigh_list))
print('List of Neighborhoods inside North York:')
neigh_list

Number of Neighborhoods inside North York:
24
List of Neighborhoods inside North York:


['Hillcrest Village',
 'Fairview,Henry Farm,Oriole',
 'Bayview Village',
 'Silver Hills,York Mills',
 'Newtonbrook,Willowdale',
 'Willowdale South',
 'York Mills West',
 'Willowdale West',
 'Parkwoods',
 'Don Mills North',
 'Flemingdon Park,Don Mills South',
 'Bathurst Manor,Downsview North,Wilson Heights',
 'Northwood Park,York University',
 'CFB Toronto,Downsview East',
 'Downsview West',
 'Downsview Central',
 'Downsview Northwest',
 'Victoria Village',
 'Bedford Park,Lawrence Manor East',
 'Lawrence Heights,Lawrence Manor',
 'Glencairn',
 'Downsview,North Park,Upwood Park',
 'Humber Summit',
 'Emery,Humberlea']

#### Some Summary Information about Neighborhoods inside "North York" Cont'd

In [19]:
neigh_venue_summary = north_york_venues.groupby('Neighborhood').count()
neigh_venue_summary.drop(columns = ['Unnamed: 0']).head()

Unnamed: 0_level_0,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
"Bathurst Manor,Downsview North,Wilson Heights",31,31,31,31,31,31,31
Bayview Village,15,15,15,15,15,15,15
"Bedford Park,Lawrence Manor East",37,37,37,37,37,37,37
"CFB Toronto,Downsview East",18,18,18,18,18,18,18
Don Mills North,31,31,31,31,31,31,31


In [20]:
print('There are {} uniques categories.'.format(len(north_york_venues['Venue Category'].unique())))

print('Here is the list of different categories:')
list(north_york_venues['Venue Category'].unique())

There are 152 uniques categories.
Here is the list of different categories:


['Korean Restaurant',
 'Bakery',
 'Park',
 'Grocery Store',
 'Coffee Shop',
 'Bank',
 'Pizza Place',
 'Sandwich Place',
 'Fast Food Restaurant',
 'Pharmacy',
 'Housing Development',
 'Chinese Restaurant',
 'Ice Cream Shop',
 'Shopping Mall',
 'Recreation Center',
 'Pool',
 'Residential Building (Apartment / Condo)',
 'Diner',
 'Convenience Store',
 'Toy / Game Store',
 'Movie Theater',
 'Burger Joint',
 'Tea Room',
 'Electronics Store',
 'American Restaurant',
 'Candy Store',
 'Department Store',
 'Salon / Barbershop',
 'Juice Bar',
 'Smoothie Shop',
 'Clothing Store',
 'Theater',
 'Caribbean Restaurant',
 'Japanese Restaurant',
 'Food Court',
 'Supermarket',
 'Restaurant',
 'Cosmetics Shop',
 'Liquor Store',
 'Video Game Store',
 'Beer Store',
 'Café',
 'Skating Rink',
 'Skate Park',
 'Intersection',
 'Trail',
 'Asian Restaurant',
 'Hookah Bar',
 'Middle Eastern Restaurant',
 'Dessert Shop',
 'Fried Chicken Joint',
 'Hot Dog Joint',
 'Indian Restaurant',
 'Ramen Restaurant',
 'Bus Lin

In [21]:
# optional
# Just for fun and deeper understanding
print(type(north_york_venues[['Venue Category']]))

print(type(north_york_venues['Venue Category']))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


### One-hot Encoding the "categroies" Column into Every Unique Categorical Feature.

In [22]:
# one hot encoding
north_york_onehot = pd.get_dummies(data = north_york_venues, drop_first  = False, 
                              prefix = "", prefix_sep = "", columns = ['Venue Category'])
north_york_onehot.head()

Unnamed: 0.1,Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Distance,Accessories Store,Airport,...,Toy / Game Store,Trail,Train Station,Turkish Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Wings Joint,Women's Store,Yoga Studio
0,0,M2H,Hillcrest Village,43.803762,-79.363452,고려삼계탕 Korean Ginseng Chicken Soup & Bibimbap,This spot is popular,754,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,M2H,Hillcrest Village,43.803762,-79.363452,Tastee,This spot is popular,692,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,M2H,Hillcrest Village,43.803762,-79.363452,Cummer Park,This spot is popular,776,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,M2H,Hillcrest Village,43.803762,-79.363452,Galati,This spot is popular,815,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,M2H,Hillcrest Village,43.803762,-79.363452,Tim Hortons,This spot is popular,731,0,0,...,0,0,0,0,0,0,0,0,0,0


## Manually Selecting (Subsetting) Related Features for the Sports Hub Contractor

In [23]:
# This list is created manually 
important_list_of_features = [
    
    
     'Neighborhood',
     'Neighborhood Latitude',
     'Neighborhood Longitude',
    
    
    
    
    'Recreation Center',
    'Toy / Game Store',
    'Skate Park',
    'Gym',
    'Sports Bar',
    'Yoga Studio',
    'Gym / Fitness Center',
    'Basketball Court',
    'Athletics & Sports',
    'Hockey Arena',
    'Ski Area',
    'Tennis Court',
    'Skating Rink',
    'Soccer Field',
    'Baseball Field',
    'Golf Course',
    'Sports Club',
    'Bowling Alley',
    'Playground'
       ]

## Updating the One-hot Encoded DataFrame and
## Grouping the Data by Neighborhoods

In [24]:
north_york_onehot = north_york_onehot[important_list_of_features].drop(
    columns = ['Neighborhood Latitude', 'Neighborhood Longitude']).groupby(
    'Neighborhood').sum()
    

north_york_onehot.head()

Unnamed: 0_level_0,Recreation Center,Toy / Game Store,Skate Park,Gym,Sports Bar,Yoga Studio,Gym / Fitness Center,Basketball Court,Athletics & Sports,Hockey Arena,Ski Area,Tennis Court,Skating Rink,Soccer Field,Baseball Field,Golf Course,Sports Club,Bowling Alley,Playground
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
"Bathurst Manor,Downsview North,Wilson Heights",0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
Bayview Village,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
"Bedford Park,Lawrence Manor East",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
"CFB Toronto,Downsview East",0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
Don Mills North,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0


In [25]:
north_york_onehot

Unnamed: 0_level_0,Recreation Center,Toy / Game Store,Skate Park,Gym,Sports Bar,Yoga Studio,Gym / Fitness Center,Basketball Court,Athletics & Sports,Hockey Arena,Ski Area,Tennis Court,Skating Rink,Soccer Field,Baseball Field,Golf Course,Sports Club,Bowling Alley,Playground
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
"Bathurst Manor,Downsview North,Wilson Heights",0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
Bayview Village,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
"Bedford Park,Lawrence Manor East",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
"CFB Toronto,Downsview East",0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0
Don Mills North,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0
Downsview Central,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
Downsview Northwest,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0
Downsview West,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
"Downsview,North Park,Upwood Park",0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
"Emery,Humberlea",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0


### Run k-means to Cluster Neighborhoods into 5 Clusters

In [36]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

# run k-means clustering
kmeans = KMeans(n_clusters = 5, random_state = 0).fit(north_york_onehot)

## Showing Centers of Each Cluster

In [37]:
means_df = pd.DataFrame(kmeans.cluster_centers_)
means_df.columns = north_york_onehot.columns
means_df.index = ['G1','G2','G3','G4','G5']
means_df['Total Sum'] = means_df.sum(axis = 1)
means_df.sort_values(axis = 0, by = ['Total Sum'], ascending=False)

Unnamed: 0,Recreation Center,Toy / Game Store,Skate Park,Gym,Sports Bar,Yoga Studio,Gym / Fitness Center,Basketball Court,Athletics & Sports,Hockey Arena,Ski Area,Tennis Court,Skating Rink,Soccer Field,Baseball Field,Golf Course,Sports Club,Bowling Alley,Playground,Total Sum
G5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
G2,0.0,0.0,0.0,0.4,0.0,0.0,0.6,0.2,1.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,0.0,0.2,0.2,3.0
G4,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.5,2.5
G1,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,1.0,0.0,0.0,0.0,0.333333,0.0,0.0,2.0
G3,0.076923,0.076923,0.0,0.076923,0.153846,0.076923,0.076923,0.0,0.0,0.076923,0.076923,0.0,0.0,0.0,0.076923,0.153846,0.0,0.0,0.0,0.923077


## Result:
### Best Group is G5;
### Inserting "kmeans.labels_" into the Original Scarborough DataFrame
#### Finding the Corresponding Group for Each Neighborhood.

In [38]:
neigh_summary = pd.DataFrame([north_york_onehot.index, 1 + kmeans.labels_]).T
neigh_summary.columns = ['Neighborhood', 'Group']
neigh_summary

Unnamed: 0,Neighborhood,Group
0,"Bathurst Manor,Downsview North,Wilson Heights",3
1,Bayview Village,1
2,"Bedford Park,Lawrence Manor East",1
3,"CFB Toronto,Downsview East",2
4,Don Mills North,2
5,Downsview Central,3
6,Downsview Northwest,2
7,Downsview West,3
8,"Downsview,North Park,Upwood Park",2
9,"Emery,Humberlea",3


# Deducing Results:Best Neighborhood Are...
## Best Neighborhood is


In [39]:
neigh_summary[neigh_summary['Group'] == 5]

Unnamed: 0,Neighborhood,Group
11,"Flemingdon Park,Don Mills South",5


In [40]:
name_of_neigh = list(neigh_summary[neigh_summary['Group'] == 5]['Neighborhood'])[0]
north_york_venues[north_york_venues['Neighborhood'] == name_of_neigh].iloc[0,1:5].to_dict()

{'Postal Code': 'M3C',
 'Neighborhood': 'Flemingdon Park,Don Mills South',
 'Neighborhood Latitude': 43.72589970000001,
 'Neighborhood Longitude': -79.340923}

### Thank You
#### Siddharth Pathania - siddp11@gmail.com