# Neighborhood Clustering for Remote Employees
### This notebook analyzes, clusters, and ranks an 11 by 11 grid of 121 markers, spaced roughly one mile apart, around a central location.  The rankings are based on the number of bars and coffee shops within a half-mile radius of each marker, together with its Walk Score and Bike Score.  The data is location based and is pulled from the Foursquare API and the Walk Score API.
### Author: Skyler Schilke

In [1]:
# Place to analyse, given as a free-form string (city, location, address, etc.).
# A later cell passes this string to the geocoder to find the centre of the grid.
place = 'Portland, OR'

In [2]:
# install geopy and folium
# !conda install -c conda-forge geopy --yes # uncomment this line if geopy is downloaded
# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if folium is downloaded
# import list of neighborhoods
import types
import pandas as pd
from botocore.client import Config
import ibm_boto3
import numpy as np
try:
    from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
except:
    print("Geopy Installing...")
    !conda install -c conda-forge geopy --yes
    print("Geopy Installed!")
try:
    import folium
except:
    print("Folium Installing...")
    !conda install -c conda-forge folium=0.5.0 --yes
    print("Folium Installed!")
import requests
from sklearn import preprocessing
from sklearn.cluster import KMeans
import matplotlib.cm as cm
import matplotlib.colors as colors

In [3]:
# Use geopy's Nominatim geocoder to turn `place` into the latitude and
# longitude that the grid will be centred on.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(str(place))
if location is None:
    # geocode() gives back None when nothing matches; stop here with a clear
    # message instead of an AttributeError on the next line.
    raise ValueError('Could not geocode place: {!r}'.format(place))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinates are {}, {}.'.format(latitude, longitude))

The geograpical coordinates are 45.5202471, -122.6741949.


In [4]:
# Build the two axes of an 11 x 11 grid centred on the geocoded location:
# five steps either side of the centre, 0.014 degrees of latitude and
# 0.018 degrees of longitude per step.
lat_start, lat_end = latitude - (.014 * 5), latitude + (.014 * 5)
lng_start, lng_end = longitude - (.018 * 5), longitude + (.018 * 5)
lat_list = np.linspace(lat_start, lat_end, 11).tolist()
lng_list = np.linspace(lng_start, lng_end, 11).tolist()


In [5]:
# Expand the two axes into every latitude/longitude combination (121 pairs):
# the longitude axis is cycled through once per row, while each latitude is
# repeated 11 times in a row.
full_lat_list = [lat_list[row] for row in range(11) for _ in range(11)]
full_lng_list = lng_list * 11

In [6]:
# Name the grid points A_1 ... A_121
area = ['A_' + str(n) for n in np.arange(1, 122)]
# Gather names and coordinates of all 121 areas in one pandas dataframe
data = {'Name': area, 'Latitude': full_lat_list, 'Longitude': full_lng_list}
df_locations = pd.DataFrame(data=data)[['Name', 'Latitude', 'Longitude']]
df_locations.head()

Unnamed: 0,Name,Latitude,Longitude
0,A_1,45.450247,-122.764195
1,A_2,45.450247,-122.746195
2,A_3,45.450247,-122.728195
3,A_4,45.450247,-122.710195
4,A_5,45.450247,-122.692195


In [7]:
# Draw a folium map centred on the geocoded location, with one blue circle
# marker per grid area; each marker's popup shows the area name.
map_areas = folium.Map(location=[latitude, longitude], zoom_start=11.5)

# add one marker per row of df_locations
for lat, lng, neigh in zip(df_locations['Latitude'], df_locations['Longitude'], df_locations['Name']):
    label = '{}'.format(neigh)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_areas)  
map_areas

In [8]:
# The code was removed by Watson Studio for sharing.

In [9]:
# The cell above (removed for sharing) defines the Foursquare API credentials
# CLIENT_ID, CLIENT_SECRET, and VERSION.
# Here the Foursquare API is used to find nearby bars and coffee shops, through
# a function that repeats the same request for every area of the grid.

LIMIT = 200
radius = 800 # half mile radius, since all markers are roughly a mile apart
def getNearbyVenues(query, names, latitudes, longitudes, radius=radius):
    """Query Foursquare's venues/explore endpoint around each location.

    Parameters
    ----------
    query : str
        Search term sent to Foursquare (for example 'coffee').
    names, latitudes, longitudes : iterables
        Parallel sequences giving the label and coordinates of each area.
    radius : number, optional
        Search radius sent to the API. The default is the value the
        module-level ``radius`` had when this function was defined.

    Returns
    -------
    pandas.DataFrame
        One row per venue returned by the API, with the columns 'Area',
        'Area Latitude', 'Area Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude' and 'Venue Category'. When no venue is found at
        all, the frame is empty but still carries these columns.

    Notes
    -----
    Reads the globals CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Area',
               'Area Latitude',
               'Area Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']
    endpoint = 'https://api.foursquare.com/v2/venues/explore'

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):

        # Let requests build the query string so every value is URL-encoded.
        params = {
            'query': query,
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': LIMIT,
        }

        # make the GET request; the timeout keeps one stalled call from
        # hanging the whole loop
        response = requests.get(endpoint, params=params, timeout=30)
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for v in results:
            venue = v['venue']
            # A venue may come back without any category; record None for it.
            categories = venue.get('categories') or []
            rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                categories[0]['name'] if categories else None))

    # Naming the columns at construction also works when `rows` is empty.
    nearby_venues = pd.DataFrame(rows, columns=columns)
    return nearby_venues

In [10]:
# Fetch the coffee shops around every area of the grid
df_neigh_coffee = getNearbyVenues(
    query='coffee',
    names=df_locations['Name'],
    latitudes=df_locations['Latitude'],
    longitudes=df_locations['Longitude'],
)
print('Coffee shop DF created')

Coffee shop DF created


In [11]:
# Fetch the bars around every area of the grid
df_neigh_bars = getNearbyVenues(
    query='bars',
    names=df_locations['Name'],
    latitudes=df_locations['Latitude'],
    longitudes=df_locations['Longitude'],
)
print('Bars DF created')

Bars DF created


In [12]:
# Count the coffee shops found per area, keyed by 'Name' so it can be merged
df_neigh_coffee_count = (
    df_neigh_coffee
    .groupby(['Area'])
    .size()
    .reset_index(name='CoffeeCount')
    .rename(columns={'Area': 'Name'})
)

# Left merge so that areas without any coffee shop are kept
df_locations = df_locations.merge(df_neigh_coffee_count, on=['Name'], how="left")

# Areas without a match are NaN after the merge: store them as 0, as floats
df_locations['CoffeeCount'] = df_locations['CoffeeCount'].fillna(0).astype(float)
df_locations.head()

Unnamed: 0,Name,Latitude,Longitude,CoffeeCount
0,A_1,45.450247,-122.764195,0.0
1,A_2,45.450247,-122.746195,0.0
2,A_3,45.450247,-122.728195,4.0
3,A_4,45.450247,-122.710195,0.0
4,A_5,45.450247,-122.692195,0.0


In [13]:
# Count the bars found per area, keyed by 'Name' so it can be merged
df_neigh_bars_count = (
    df_neigh_bars
    .groupby(['Area'])
    .size()
    .reset_index(name='BarsCount')
    .rename(columns={'Area': 'Name'})
)

# Left merge so that areas without any bar are kept
df_locations = df_locations.merge(df_neigh_bars_count, on=['Name'], how="left")

# Areas without a match are NaN after the merge: store them as 0, as floats
df_locations['BarsCount'] = df_locations['BarsCount'].fillna(0).astype(float)
df_locations.head()

Unnamed: 0,Name,Latitude,Longitude,CoffeeCount,BarsCount
0,A_1,45.450247,-122.764195,0.0,1.0
1,A_2,45.450247,-122.746195,0.0,2.0
2,A_3,45.450247,-122.728195,4.0,1.0
3,A_4,45.450247,-122.710195,0.0,1.0
4,A_5,45.450247,-122.692195,0.0,0.0


In [14]:
# The code was removed by Watson Studio for sharing.

In [15]:
# The cell above (removed for sharing) defines `walkscore_api`, the API key.
# Query the Walk Score API once per grid point, by coordinates, and store the
# walk score and bike score it returns.
# Both columns start out as None, so a point whose lookup fails keeps None.
df_locations['Walkscore'] = None
df_locations['Bikescore'] = None

# Column positions for the positional writes below
walk_col = df_locations.columns.get_loc('Walkscore')
bike_col = df_locations.columns.get_loc('Bikescore')

for row in range(len(df_locations)):
    try:
        # Positional access throughout, so the loop does not rely on the
        # index labels being 0..n-1.
        lat = df_locations['Latitude'].iloc[row]
        lng = df_locations['Longitude'].iloc[row]
        # NOTE(review): the API key travels over plain http here - confirm
        # whether the endpoint can be called over https instead.
        request_url = 'http://api.walkscore.com/score?format=json&lat=' + str(lat) + '&lon=' + str(lng) + '&transit=1&bike=1&wsapikey=' + walkscore_api
        request = pd.read_json(request_url)
        df_locations.iat[row, walk_col] = request['walkscore']['score']
        df_locations.iat[row, bike_col] = request['bike']['score']
    except Exception:
        # Best effort: skip points the API cannot score. Catching Exception
        # (not a bare except) still lets Ctrl-C interrupt the loop.
        continue

print('Walkscore and Bikescore loaded!')

Walkscore and Bikescore loaded!


In [16]:
# Cast both score columns to float, then replace the resulting NaNs
# (points left as None) with 0
df_locations['Walkscore'] = df_locations['Walkscore'].astype(float).fillna(0)
df_locations['Bikescore'] = df_locations['Bikescore'].astype(float).fillna(0)

### All data is now loaded.  Now we will normalize the data, cluster it, and rank it

In [17]:
# Keep only the metric columns by dropping the identifying ones.
# `columns=` is used because the positional axis argument of drop() is no
# longer accepted by recent pandas versions.
df_locations_cluster = df_locations.drop(columns=['Name', 'Latitude', 'Longitude'])
df_locations_cluster.head()

Unnamed: 0,CoffeeCount,BarsCount,Walkscore,Bikescore
0,0.0,1.0,48.0,72.0
1,0.0,2.0,24.0,44.0
2,4.0,1.0,65.0,75.0
3,0.0,1.0,27.0,45.0
4,0.0,0.0,10.0,40.0


In [18]:
# Min-max scale every metric column, keeping the original column names
min_max_scaler = preprocessing.MinMaxScaler()
x = df_locations_cluster.values  # the metrics as a numpy array
x_scaled = min_max_scaler.fit_transform(x)
df_locations_normal = pd.DataFrame(x_scaled, columns=df_locations_cluster.columns)
df_locations_normal.head()

Unnamed: 0,CoffeeCount,BarsCount,Walkscore,Bikescore
0,0.0,0.011765,0.484848,0.678161
1,0.0,0.023529,0.242424,0.356322
2,0.057971,0.011765,0.656566,0.712644
3,0.0,0.011765,0.272727,0.367816
4,0.0,0.0,0.10101,0.310345


In [19]:
# number of clusters to fit
kclusters = 5

# fit k-means on the normalized metrics, with a fixed random_state
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(df_locations_normal)

# put each area's cluster label in the first column of df_locations
df_locations.insert(loc=0, column='ClusterLabels', value=kmeans.labels_)
df_locations.head()

Unnamed: 0,ClusterLabels,Name,Latitude,Longitude,CoffeeCount,BarsCount,Walkscore,Bikescore
0,2,A_1,45.450247,-122.764195,0.0,1.0,48.0,72.0
1,1,A_2,45.450247,-122.746195,0.0,2.0,24.0,44.0
2,0,A_3,45.450247,-122.728195,4.0,1.0,65.0,75.0
3,1,A_4,45.450247,-122.710195,0.0,1.0,27.0,45.0
4,1,A_5,45.450247,-122.692195,0.0,0.0,10.0,40.0


In [20]:
# Overall score: the equally weighted average of the four normalized metrics
df_locations['Score'] = (df_locations_normal['BarsCount']
                         + df_locations_normal['CoffeeCount']
                         + df_locations_normal['Walkscore']
                         + df_locations_normal['Bikescore']) / 4

# Reorder the columns, rank from highest score to lowest, and renumber the rows
df_locations = (
    df_locations[['Name', 'Score', 'ClusterLabels',
                  'Latitude', 'Longitude', 'BarsCount',
                  'CoffeeCount', 'Walkscore', 'Bikescore']]
    .sort_values(by=['Score'], ascending=False)
    .reset_index(drop=True)
)

# view the top 20 locations
df_locations.head(20)

Unnamed: 0,Name,Score,ClusterLabels,Latitude,Longitude,BarsCount,CoffeeCount,Walkscore,Bikescore
0,A_61,0.985632,3,45.520247,-122.674195,85.0,69.0,99.0,95.0
1,A_62,0.820053,3,45.520247,-122.656195,69.0,36.0,96.0,98.0
2,A_60,0.756642,3,45.520247,-122.692195,61.0,36.0,95.0,85.0
3,A_71,0.675553,4,45.534247,-122.692195,34.0,29.0,93.0,95.0
4,A_83,0.633187,4,45.548247,-122.674195,35.0,17.0,90.0,97.0
5,A_63,0.628679,4,45.520247,-122.638195,40.0,11.0,91.0,97.0
6,A_73,0.611332,4,45.534247,-122.656195,18.0,19.0,96.0,99.0
7,A_96,0.598962,4,45.562247,-122.638195,36.0,14.0,83.0,94.0
8,A_53,0.596191,4,45.506247,-122.620195,27.0,11.0,91.0,99.0
9,A_52,0.583255,4,45.506247,-122.638195,26.0,13.0,91.0,93.0


In [21]:
# Build, for every area, a Google Maps search link on its coordinates and
# store it in a new 'Link' column
links_list = [
    'https://www.google.com/maps/search/?api=1&query=' + str(lat) + ',' + str(lon)
    for lat, lon in zip(df_locations['Latitude'], df_locations['Longitude'])
]

df_locations['Link'] = links_list

In [22]:
# Geocode `place` again to get the centre point of the cluster-coloured map
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(place)
latitude, longitude = location.latitude, location.longitude
print('The geograpical coordinates are {}, {}.'.format(latitude, longitude))

The geograpical coordinates are 45.5202471, -122.6741949.


In [23]:
# create map centred on the geocoded location
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11.5)

# one evenly spaced rainbow colour per cluster, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one marker per area to the map
for lat, lon, poi, cluster, link, score in zip(df_locations['Latitude'], df_locations['Longitude'],
                                               df_locations['Name'], df_locations['ClusterLabels'],
                                               df_locations['Link'], df_locations['Score']):
    # Popup: the area name as a link that opens in a new tab, followed by the
    # cluster and the score. The space before `target` keeps the two
    # attributes of the anchor tag separated.
    label = folium.Popup('<a href="' + link + '" target="_blank">' + poi + '</a>, Cluster: '
                         + str(cluster) + ', Score: ' + "{:.2f}".format(score))

    # cluster-1 is -1 for cluster 0, which wraps around to the last colour,
    # so every cluster still gets a colour of its own.
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters

In [24]:
# show every area assigned to cluster 0
df_locations.loc[df_locations['ClusterLabels'] == 0]

Unnamed: 0,Name,Score,ClusterLabels,Latitude,Longitude,BarsCount,CoffeeCount,Walkscore,Bikescore,Link
20,A_86,0.498701,0,45.548247,-122.620195,7.0,6.0,84.0,98.0,https://www.google.com/maps/search/?api=1&quer...
21,A_31,0.496157,0,45.478247,-122.620195,7.0,4.0,87.0,97.0,https://www.google.com/maps/search/?api=1&quer...
22,A_84,0.484547,0,45.548247,-122.656195,5.0,6.0,83.0,96.0,https://www.google.com/maps/search/?api=1&quer...
23,A_41,0.482573,0,45.492247,-122.638195,8.0,6.0,81.0,94.0,https://www.google.com/maps/search/?api=1&quer...
24,A_44,0.471673,0,45.492247,-122.584195,7.0,7.0,73.0,97.0,https://www.google.com/maps/search/?api=1&quer...
25,A_55,0.471108,0,45.506247,-122.584195,3.0,8.0,76.0,97.0,https://www.google.com/maps/search/?api=1&quer...
26,A_104,0.468308,0,45.576247,-122.692195,7.0,1.0,78.0,99.0,https://www.google.com/maps/search/?api=1&quer...
27,A_64,0.463314,0,45.520247,-122.620195,6.0,5.0,76.0,95.0,https://www.google.com/maps/search/?api=1&quer...
28,A_106,0.453842,0,45.576247,-122.656195,5.0,5.0,70.0,98.0,https://www.google.com/maps/search/?api=1&quer...
29,A_93,0.452981,0,45.562247,-122.692195,9.0,5.0,65.0,98.0,https://www.google.com/maps/search/?api=1&quer...


In [25]:
# show every area assigned to cluster 1
df_locations.loc[df_locations['ClusterLabels'] == 1]

Unnamed: 0,Name,Score,ClusterLabels,Latitude,Longitude,BarsCount,CoffeeCount,Walkscore,Bikescore,Link
83,A_120,0.206163,1,45.590247,-122.602195,6.0,13.0,23.0,42.0,https://www.google.com/maps/search/?api=1&quer...
84,A_11,0.201041,1,45.450247,-122.584195,4.0,6.0,14.0,59.0,https://www.google.com/maps/search/?api=1&quer...
86,A_37,0.190405,1,45.492247,-122.710195,0.0,1.0,33.0,49.0,https://www.google.com/maps/search/?api=1&quer...
87,A_17,0.178141,1,45.464247,-122.674195,1.0,0.0,25.0,52.0,https://www.google.com/maps/search/?api=1&quer...
88,A_45,0.177449,1,45.506247,-122.764195,0.0,3.0,25.0,49.0,https://www.google.com/maps/search/?api=1&quer...
89,A_6,0.174068,1,45.450247,-122.674195,0.0,0.0,20.0,56.0,https://www.google.com/maps/search/?api=1&quer...
90,A_59,0.170551,1,45.520247,-122.710195,0.0,1.0,24.0,50.0,https://www.google.com/maps/search/?api=1&quer...
91,A_4,0.163077,1,45.450247,-122.710195,1.0,0.0,27.0,45.0,https://www.google.com/maps/search/?api=1&quer...
92,A_117,0.158568,1,45.590247,-122.656195,0.0,0.0,15.0,55.0,https://www.google.com/maps/search/?api=1&quer...
93,A_2,0.155569,1,45.450247,-122.746195,2.0,0.0,24.0,44.0,https://www.google.com/maps/search/?api=1&quer...


In [26]:
# show every area assigned to cluster 2
df_locations.loc[df_locations['ClusterLabels'] == 2]

Unnamed: 0,Name,Score,ClusterLabels,Latitude,Longitude,BarsCount,CoffeeCount,Walkscore,Bikescore,Link
52,A_15,0.36257,2,45.464247,-122.710195,5.0,8.0,58.0,73.0,https://www.google.com/maps/search/?api=1&quer...
54,A_8,0.359271,2,45.450247,-122.638195,7.0,4.0,51.0,81.0,https://www.google.com/maps/search/?api=1&quer...
55,A_72,0.355022,2,45.534247,-122.674195,10.0,12.0,48.0,69.0,https://www.google.com/maps/search/?api=1&quer...
56,A_16,0.348726,2,45.464247,-122.692195,3.0,4.0,64.0,70.0,https://www.google.com/maps/search/?api=1&quer...
57,A_13,0.347923,2,45.464247,-122.746195,3.0,2.0,62.0,74.0,https://www.google.com/maps/search/?api=1&quer...
58,A_39,0.341744,2,45.492247,-122.674195,4.0,9.0,62.0,62.0,https://www.google.com/maps/search/?api=1&quer...
59,A_103,0.331325,2,45.576247,-122.710195,3.0,1.0,58.0,73.0,https://www.google.com/maps/search/?api=1&quer...
60,A_27,0.329637,2,45.478247,-122.692195,3.0,2.0,65.0,65.0,https://www.google.com/maps/search/?api=1&quer...
61,A_34,0.320561,2,45.492247,-122.764195,4.0,2.0,50.0,74.0,https://www.google.com/maps/search/?api=1&quer...
62,A_21,0.314573,2,45.464247,-122.602195,2.0,0.0,46.0,80.0,https://www.google.com/maps/search/?api=1&quer...


In [27]:
# show every area assigned to cluster 3
df_locations.loc[df_locations['ClusterLabels'] == 3]

Unnamed: 0,Name,Score,ClusterLabels,Latitude,Longitude,BarsCount,CoffeeCount,Walkscore,Bikescore,Link
0,A_61,0.985632,3,45.520247,-122.674195,85.0,69.0,99.0,95.0,https://www.google.com/maps/search/?api=1&quer...
1,A_62,0.820053,3,45.520247,-122.656195,69.0,36.0,96.0,98.0,https://www.google.com/maps/search/?api=1&quer...
2,A_60,0.756642,3,45.520247,-122.692195,61.0,36.0,95.0,85.0,https://www.google.com/maps/search/?api=1&quer...


In [28]:
# show every area assigned to cluster 4
df_locations.loc[df_locations['ClusterLabels'] == 4]

Unnamed: 0,Name,Score,ClusterLabels,Latitude,Longitude,BarsCount,CoffeeCount,Walkscore,Bikescore,Link
3,A_71,0.675553,4,45.534247,-122.692195,34.0,29.0,93.0,95.0,https://www.google.com/maps/search/?api=1&quer...
4,A_83,0.633187,4,45.548247,-122.674195,35.0,17.0,90.0,97.0,https://www.google.com/maps/search/?api=1&quer...
5,A_63,0.628679,4,45.520247,-122.638195,40.0,11.0,91.0,97.0,https://www.google.com/maps/search/?api=1&quer...
6,A_73,0.611332,4,45.534247,-122.656195,18.0,19.0,96.0,99.0,https://www.google.com/maps/search/?api=1&quer...
7,A_96,0.598962,4,45.562247,-122.638195,36.0,14.0,83.0,94.0,https://www.google.com/maps/search/?api=1&quer...
8,A_53,0.596191,4,45.506247,-122.620195,27.0,11.0,91.0,99.0,https://www.google.com/maps/search/?api=1&quer...
9,A_52,0.583255,4,45.506247,-122.638195,26.0,13.0,91.0,93.0,https://www.google.com/maps/search/?api=1&quer...
10,A_51,0.566798,4,45.506247,-122.656195,17.0,13.0,87.0,100.0,https://www.google.com/maps/search/?api=1&quer...
11,A_95,0.556461,4,45.562247,-122.656195,13.0,12.0,89.0,100.0,https://www.google.com/maps/search/?api=1&quer...
12,A_94,0.54818,4,45.562247,-122.674195,15.0,6.0,92.0,100.0,https://www.google.com/maps/search/?api=1&quer...


# Now here is the whole code run in a single cell, for place = 'New York City, NY' instead

In [29]:
# Place to analyse, given as a free-form string (city, location, address, etc.)
place = 'New York City, NY'

# Geocode the place with geopy to find the centre of the grid
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(str(place))
latitude, longitude = location.latitude, location.longitude

# Axes of an 11 x 11 grid centred on that location: five steps either side,
# 0.014 degrees of latitude and 0.018 degrees of longitude per step
lat_start, lat_end = latitude - (.014 * 5), latitude + (.014 * 5)
lng_start, lng_end = longitude - (.018 * 5), longitude + (.018 * 5)
lat_list = np.linspace(lat_start, lat_end, 11).tolist()
lng_list = np.linspace(lng_start, lng_end, 11).tolist()

# Expand the axes into every latitude/longitude combination (121 pairs):
# longitudes cycle once per row, each latitude is repeated 11 times in a row
full_lng_list = lng_list * 11
full_lat_list = [lat_list[row] for row in range(11) for _ in range(11)]

# Name the grid points A_1 ... A_121
area = ['A_' + str(n) for n in np.arange(1, 122)]
# Gather names and coordinates of all 121 areas in one pandas dataframe
data = {'Name': area, 'Latitude': full_lat_list, 'Longitude': full_lng_list}
df_locations = pd.DataFrame(data=data)[['Name', 'Latitude', 'Longitude']]

radius = 800
# Fetch the coffee shops around every area of the grid
df_neigh_coffee = getNearbyVenues(
    query='coffee',
    names=df_locations['Name'],
    latitudes=df_locations['Latitude'],
    longitudes=df_locations['Longitude'],
)
print('Coffee shop DF created')
# Fetch the bars around every area of the grid
df_neigh_bars = getNearbyVenues(
    query='bars',
    names=df_locations['Name'],
    latitudes=df_locations['Latitude'],
    longitudes=df_locations['Longitude'],
)
print('Bars DF created')

# Count the coffee shops found per area, keyed by 'Name' so it can be merged
df_neigh_coffee_count = (
    df_neigh_coffee
    .groupby(['Area'])
    .size()
    .reset_index(name='CoffeeCount')
    .rename(columns={'Area': 'Name'})
)

# Left merge so that areas without any coffee shop are kept
df_locations = df_locations.merge(df_neigh_coffee_count, on=['Name'], how="left")
# Areas without a match are NaN after the merge: store them as 0, as floats
df_locations['CoffeeCount'] = df_locations['CoffeeCount'].fillna(0).astype(float)

# Count the bars found per area, keyed by 'Name' so it can be merged
df_neigh_bars_count = (
    df_neigh_bars
    .groupby(['Area'])
    .size()
    .reset_index(name='BarsCount')
    .rename(columns={'Area': 'Name'})
)

# Left merge so that areas without any bar are kept
df_locations = df_locations.merge(df_neigh_bars_count, on=['Name'], how="left")
# Areas without a match are NaN after the merge: store them as 0, as floats
df_locations['BarsCount'] = df_locations['BarsCount'].fillna(0).astype(float)

# `walkscore_api` (the API key) must already be defined; it is set in an
# earlier cell that was removed for sharing.
# Query the Walk Score API once per grid point, by coordinates, and store the
# walk score and bike score it returns.
# Both columns start out as None, so a point whose lookup fails keeps None.
df_locations['Walkscore'] = None
df_locations['Bikescore'] = None

# Column positions for the positional writes below
walk_col = df_locations.columns.get_loc('Walkscore')
bike_col = df_locations.columns.get_loc('Bikescore')

for row in range(len(df_locations)):
    try:
        # Positional access throughout, so the loop does not rely on the
        # index labels being 0..n-1.
        lat = df_locations['Latitude'].iloc[row]
        lng = df_locations['Longitude'].iloc[row]
        # NOTE(review): the API key travels over plain http here - confirm
        # whether the endpoint can be called over https instead.
        request_url = 'http://api.walkscore.com/score?format=json&lat=' + str(lat) + '&lon=' + str(lng) + '&transit=1&bike=1&wsapikey=' + walkscore_api
        request = pd.read_json(request_url)
        df_locations.iat[row, walk_col] = request['walkscore']['score']
        df_locations.iat[row, bike_col] = request['bike']['score']
    except Exception:
        # Best effort: skip points the API cannot score. Catching Exception
        # (not a bare except) still lets Ctrl-C interrupt the loop.
        continue
print('Walkscore and Bikescore loaded!')

# Cast both score columns to float, then replace the resulting NaNs
# (points left as None) with 0
df_locations['Walkscore'] = df_locations['Walkscore'].astype(float).fillna(0)
df_locations['Bikescore'] = df_locations['Bikescore'].astype(float).fillna(0)

# Keep only the metric columns by dropping the identifying ones.
# `columns=` is used because the positional axis argument of drop() is no
# longer accepted by recent pandas versions.
df_locations_cluster = df_locations.drop(columns=['Name', 'Latitude', 'Longitude'])

# Min-max scale every metric column, keeping the original column names
x = df_locations_cluster.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_locations_normal = pd.DataFrame(x_scaled)
df_locations_normal.columns = df_locations_cluster.columns

# number of clusters to fit
kclusters = 5

# fit k-means on the normalized metrics, with a fixed random_state
kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(df_locations_normal)

# put each area's cluster label in the first column of df_locations
df_locations.insert(loc=0, column='ClusterLabels', value=kmeans.labels_)

# Overall score: the equally weighted average of the four normalized metrics
df_locations['Score'] = (df_locations_normal['BarsCount']
                         + df_locations_normal['CoffeeCount']
                         + df_locations_normal['Walkscore']
                         + df_locations_normal['Bikescore']) / 4
# Reorder the columns, rank from highest score to lowest, and renumber the rows
df_locations = (
    df_locations[['Name', 'Score', 'ClusterLabels',
                  'Latitude', 'Longitude', 'BarsCount',
                  'CoffeeCount', 'Walkscore', 'Bikescore']]
    .sort_values(by=['Score'], ascending=False)
    .reset_index(drop=True)
)

# Build, for every area, a Google Maps search link on its coordinates and
# store it in a new 'Link' column
links_list = [
    'https://www.google.com/maps/search/?api=1&query=' + str(lat) + ',' + str(lon)
    for lat, lon in zip(df_locations['Latitude'], df_locations['Longitude'])
]
df_locations['Link'] = links_list

# Geocode `place` again to get the centre point of the cluster-coloured map
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(place)
latitude, longitude = location.latitude, location.longitude


# create map centred on the geocoded location
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11.5)

# one evenly spaced rainbow colour per cluster, as hex strings
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(rgba) for rgba in colors_array]

# add one marker per area to the map
for lat, lon, poi, cluster, link, score in zip(df_locations['Latitude'], df_locations['Longitude'],
                                               df_locations['Name'], df_locations['ClusterLabels'],
                                               df_locations['Link'], df_locations['Score']):
    # Popup: the area name as a link that opens in a new tab, followed by the
    # cluster and the score. The space before `target` keeps the two
    # attributes of the anchor tag separated.
    label = folium.Popup('<a href="' + link + '" target="_blank">' + poi + '</a>, Cluster: '
                         + str(cluster) + ', Score: ' + "{:.2f}".format(score))

    # cluster-1 is -1 for cluster 0, which wraps around to the last colour,
    # so every cluster still gets a colour of its own.
    marker_color = rainbow[cluster-1]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=marker_color,
        fill=True,
        fill_color=marker_color,
        fill_opacity=0.7).add_to(map_clusters)

map_clusters



Coffee shop DF created
Bars DF created
Walkscore and Bikescore loaded!
