# Capstone assignment

Importing relevant libraries

In [1]:
import pandas as pd
import requests

import numpy as np # library to handle data in a vectorized manner

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

!conda install -c conda-forge geopy --yes
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe


# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

!conda install -c conda-forge folium=0.5.0 --yes
#=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

Solving environment: done

## Package Plan ##

  environment location: /home/jupyterlab/conda

  added / updated specs: 
    - geopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    geopy-1.18.1               |             py_0          51 KB  conda-forge
    openssl-1.0.2p             |    h14c3975_1002         3.1 MB  conda-forge
    geographiclib-1.49         |             py_0          32 KB  conda-forge
    ------------------------------------------------------------
                                           Total:         3.2 MB

The following NEW packages will be INSTALLED:

    geographiclib: 1.49-py_0         conda-forge
    geopy:         1.18.1-py_0       conda-forge

The following packages will be UPDATED:

    openssl:       1.0.2p-h470a237_1 conda-forge --> 1.0.2p-h14c3975_1002 conda-forge


Downloading and Extracting Packages
geopy-1.18.1         | 51 KB     | ##########

##### Download the New York neighborhood data (the same dataset used in the previous assignment of this course)

In [2]:
# Fetch the New York neighborhoods JSON with wget and store it as
# 'newyork_data.json' in the working directory (-q: quiet, -O: output file name).
# NOTE: the confirmation below is printed even when wget fails, because a
# failing `!` shell command does not raise a Python exception.
!wget -q -O 'newyork_data.json' https://ibm.box.com/shared/static/fbpwbovar7lf8p5sgddm06cgipa2rxpe.json
print('Data downloaded!')

Data downloaded!


##### Load and explore data

In [3]:
# Parse the downloaded file into a plain Python dict.
with open('newyork_data.json') as fh:
    newyork_data = json.load(fh)

##### Look at the data

In [4]:
newyork_data

{'type': 'FeatureCollection',
 'totalFeatures': 306,
 'features': [{'type': 'Feature',
   'id': 'nyu_2451_34572.1',
   'geometry': {'type': 'Point',
    'coordinates': [-73.84720052054902, 40.89470517661]},
   'geometry_name': 'geom',
   'properties': {'name': 'Wakefield',
    'stacked': 1,
    'annoline1': 'Wakefield',
    'annoline2': None,
    'annoline3': None,
    'annoangle': 0.0,
    'borough': 'Bronx',
    'bbox': [-73.84720052054902,
     40.89470517661,
     -73.84720052054902,
     40.89470517661]}},
  {'type': 'Feature',
   'id': 'nyu_2451_34572.2',
   'geometry': {'type': 'Point',
    'coordinates': [-73.82993910812398, 40.87429419303012]},
   'geometry_name': 'geom',
   'properties': {'name': 'Co-op City',
    'stacked': 2,
    'annoline1': 'Co-op',
    'annoline2': 'City',
    'annoline3': None,
    'annoangle': 0.0,
    'borough': 'Bronx',
    'bbox': [-73.82993910812398,
     40.87429419303012,
     -73.82993910812398,
     40.87429419303012]}},
  {'type': 'Feature',
 

##### Retrieve the relevant data in the *features* key, which is basically a list of the neighborhoods.

In [5]:
# 'features' is a list with one entry per neighborhood; each entry carries the
# name and borough under 'properties' and a point under 'geometry'.
neighborhoods_data = newyork_data['features']

##### Transform the data into a pandas dataframe
###### Start with creating an empty dataframe

In [6]:
# Column layout of the neighborhood table.
column_names = [
    'Borough',
    'Neighborhood',
    'Latitude',
    'Longitude',
]

# Start from an empty frame that already carries those columns.
neighborhoods = pd.DataFrame(columns=column_names)

##### Loop through the data and fill the dataframe one row at a time.

In [7]:
# Collect one record per feature and build the dataframe in a single step.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and it
# copies the whole frame on every call. Building from a list also makes this
# cell idempotent: re-running it no longer appends duplicate rows.
neighborhood_rows = []
for data in neighborhoods_data:
    properties = data['properties']
    # The coordinate pair is stored as [longitude, latitude].
    neighborhood_latlon = data['geometry']['coordinates']

    neighborhood_rows.append({'Borough': properties['borough'],
                              'Neighborhood': properties['name'],
                              'Latitude': neighborhood_latlon[1],
                              'Longitude': neighborhood_latlon[0]})

# Passing column_names fixes the column order and keeps the columns even when
# there are no rows.
neighborhoods = pd.DataFrame(neighborhood_rows, columns=column_names)

In [8]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


#### Segment and cluster only the neighborhoods in Manhattan. So let's slice the original dataframe and create a new dataframe of the Manhattan data.

In [9]:
# Keep only the Manhattan rows and renumber them from zero.
is_manhattan = neighborhoods['Borough'] == 'Manhattan'
manhattan_data = neighborhoods.loc[is_manhattan].reset_index(drop=True)
manhattan_data.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688


##### Get geographical coordinates of Manhattan

In [10]:
address = 'Manhattan, New York City'

# Look the address up through Nominatim; the lookup may take up to 60 seconds.
geolocator = Nominatim(user_agent="capstoneProject")
location = geolocator.geocode(address, timeout=60, exactly_one=True)

# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The decimal coordinates of Manhattan are {}, {}.'.format(latitude, longitude))

The decimal coordinates of Manhattan are 40.7900869, -73.9598295.


#### Work on a copy of the dataframe so the original can be restored easily

In [11]:
# Independent (deep) copy, so later edits never touch manhattan_data.
df_man = manhattan_data.copy(deep=True)
df_man

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Manhattan,Marble Hill,40.876551,-73.91066
1,Manhattan,Chinatown,40.715618,-73.994279
2,Manhattan,Washington Heights,40.851903,-73.9369
3,Manhattan,Inwood,40.867684,-73.92121
4,Manhattan,Hamilton Heights,40.823604,-73.949688
5,Manhattan,Manhattanville,40.816934,-73.957385
6,Manhattan,Central Harlem,40.815976,-73.943211
7,Manhattan,East Harlem,40.792249,-73.944182
8,Manhattan,Upper East Side,40.775639,-73.960508
9,Manhattan,Yorkville,40.77593,-73.947118


#### Create a map of Manhattan

In [12]:
# Base map centred on the geocoded Manhattan coordinates.
map_manhattan = folium.Map(location=[latitude, longitude], zoom_start=11)

# One blue circle per neighborhood, with the neighborhood name as popup text.
marker_rows = zip(df_man['Latitude'], df_man['Longitude'], df_man['Neighborhood'])
for lat, lng, local in marker_rows:
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup('{}'.format(local), parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7)
    marker.add_to(map_manhattan)

map_manhattan

#### Function to repeat the exploration process for all the neighborhoods in Manhattan

In [32]:
import urllib
def getNearbyVenues(names, latitudes, longitudes, radius=5000, limit=500, categoryIds='', timeout=60):
    """Collect Foursquare venues around each neighborhood into one dataframe.

    For every (name, latitude, longitude) triple one request is sent to the
    Foursquare ``venues/search`` endpoint. Every returned venue that has at
    least one category becomes one row of the result; venues without a
    category are skipped.

    The module-level ``CLIENT_ID``, ``CLIENT_SECRET`` and ``VERSION`` are read
    at call time, so they must be defined before this function is called.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Neighborhood names and their coordinates; iterated together with zip.
    radius : number
        Sent as the ``radius`` query parameter.
    limit : int
        Sent as the ``limit`` query parameter.
        NOTE(review): per-neighborhood counts in this notebook never exceed 50
        even with limit=500, so the API presumably caps the result size —
        confirm against the Foursquare documentation.
    categoryIds : str
        When non-empty, sent as the ``categoryId`` query parameter.
    timeout : number
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns: 'Neighborhood', 'Neighborhood Latitude',
        'Neighborhood Longitude', 'Venue', 'Venue Latitude',
        'Venue Longitude', 'Venue Category' (name of the venue's first
        category). The columns are present even when no venue is found.

    Raises
    ------
    RuntimeError
        If a response does not contain a venue list (for example an API error
        payload). Exceptions raised by ``requests`` itself propagate unchanged.
    """
    search_url = 'https://api.foursquare.com/v2/venues/search'
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # Let requests build and encode the query string instead of
        # formatting the URL by hand.
        params = {
            'client_id': CLIENT_ID,
            'client_secret': CLIENT_SECRET,
            'v': VERSION,
            'll': '{},{}'.format(lat, lng),
            'radius': radius,
            'limit': limit,
        }
        if categoryIds != '':
            params['categoryId'] = categoryIds

        payload = requests.get(search_url, params=params, timeout=timeout).json()
        try:
            results = payload['response']['venues']
        except (KeyError, TypeError):
            # Report the payload only; the request URL would expose the client secret.
            raise RuntimeError(
                'Foursquare response for {!r} contains no venue list: {!r}'.format(name, payload))

        for v in results:
            try:
                category = v['categories'][0]['name']
            except (KeyError, IndexError, TypeError):
                # Venue without a usable category: skip it.
                continue

            rows.append((name,
                         lat,
                         lng,
                         v['name'],
                         v['location']['lat'],
                         v['location']['lng'],
                         category))

    # Passing the columns here keeps the schema intact for an empty result.
    return pd.DataFrame(rows, columns=columns)

##### Define Foursquare Credentials and Version

In [33]:
import os

# SECURITY: the literal fallbacks below are real credentials committed in
# plain text. They are kept only so the notebook still runs unchanged; rotate
# them, export FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET instead, and
# then remove the fallbacks.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'FMR3OGXQDOSLEBPX30G4YWRNANUJWHIWWIK0A0ARKJDRCT5Y')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'H5US3SIKGAL0JNQSMVG0G3H4BCNIVTRG3UWFNVRR1DSXHIER')
# Sent to the API as the `v` query parameter.
VERSION = '20180605'

##### Retrieve health food stores in Manhattan

In [20]:
# Foursquare category id 50aa9e744b90af0d42d5de0e limits the search to health food stores.
man_health = getNearbyVenues(
    names=df_man['Neighborhood'],
    latitudes=df_man['Latitude'],
    longitudes=df_man['Longitude'],
    radius=1000,
    limit=500,
    categoryIds='50aa9e744b90af0d42d5de0e',
)
man_health.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,Vitamin Shoppe,40.87716,-73.905632,Supplement Shop
1,Marble Hill,40.876551,-73.91066,GNC,40.87976,-73.904484,Supplement Shop
2,Chinatown,40.715618,-73.994279,Wing Fung Hong Ltd,40.714108,-73.99492,Health Food Store
3,Chinatown,40.715618,-73.994279,Whole Foods Market,40.723875,-73.991976,Grocery Store
4,Chinatown,40.715618,-73.994279,GNC,40.718647,-73.995652,Supplement Shop


In [21]:
man_health.shape

(361, 7)

##### Function to add markers for given venues to map

In [22]:
def addToMap(df, color, existingMap):
    """Draw one circle marker per venue row of ``df`` onto ``existingMap``.

    Each marker is placed at the row's 'Venue Latitude' / 'Venue Longitude',
    uses ``color`` for both outline and fill, and carries a popup reading
    "<venue> (<venue category>) - <neighborhood>". The map is modified in
    place and nothing is returned.
    """
    fields = ['Venue Latitude', 'Venue Longitude', 'Neighborhood', 'Venue', 'Venue Category']
    for lat, lng, hood, venue_name, category in zip(*(df[field] for field in fields)):
        popup = folium.Popup('{} ({}) - {}'.format(venue_name, category, hood), parse_html=True)
        marker = folium.CircleMarker(
            [lat, lng],
            radius=5,
            popup=popup,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.7)
        marker.add_to(existingMap)

##### Map health food stores in Manhattan

In [23]:
# New map centred on Manhattan with every health food store drawn in red.
map_man_health = folium.Map(zoom_start=12, location=[latitude, longitude])
addToMap(df=man_health, color='red', existingMap=map_man_health)
map_man_health

##### Find gyms in Manhattan and map them

In [29]:
# Foursquare category id 4bf58dd8d48988d175941735 limits the search to gyms / fitness centres.
man_gym = getNearbyVenues(
    names=df_man['Neighborhood'],
    latitudes=df_man['Latitude'],
    longitudes=df_man['Longitude'],
    radius=1000,
    limit=50,
    categoryIds='4bf58dd8d48988d175941735',
)
man_gym.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,Bronx Boxing,40.875671,-73.908355,Boxing Gym
1,Marble Hill,40.876551,-73.91066,La Palestra 99,40.873919,-73.917065,Gym
2,Marble Hill,40.876551,-73.91066,Astral Fitness & Wellness Center,40.876705,-73.906372,Gym
3,Marble Hill,40.876551,-73.91066,3210 Riverdale Avenue - Wellness Center & Gym,40.882746,-73.907625,Gym
4,Marble Hill,40.876551,-73.91066,Planet Fitness,40.874088,-73.909137,Gym / Fitness Center


In [30]:
man_gym.shape

(1885, 7)

In [31]:
# New map centred on Manhattan with every gym drawn in green.
map_man_gym = folium.Map(zoom_start=12, location=[latitude, longitude])
addToMap(df=man_gym, color='green', existingMap=map_man_gym)
map_man_gym

##### Find metro stations in Manhattan

In [34]:
# Foursquare category id 4bf58dd8d48988d1fd931735 limits the search to metro stations.
man_met = getNearbyVenues(
    names=df_man['Neighborhood'],
    latitudes=df_man['Latitude'],
    longitudes=df_man['Longitude'],
    radius=1000,
    limit=500,
    categoryIds='4bf58dd8d48988d1fd931735',
)
man_met.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,MTA Subway - 225th St/Marble Hill (1),40.874486,-73.909589,Metro Station
1,Marble Hill,40.876551,-73.91066,MTA Subway - 231st St (1),40.878825,-73.90493,Metro Station
2,Marble Hill,40.876551,-73.91066,MTA Subway - 215th St (1),40.869417,-73.915396,Metro Station
3,Marble Hill,40.876551,-73.91066,215th Street Subway Stop (1 Line),40.869881,-73.915858,Metro Station
4,Marble Hill,40.876551,-73.91066,Mac #3,40.867435,-73.909081,Metro Station


In [35]:
man_met.shape

(497, 7)

In [36]:
# New map centred on Manhattan with every metro station drawn in gold.
map_man_met = folium.Map(zoom_start=12, location=[latitude, longitude])
addToMap(df=man_met, color='gold', existingMap=map_man_met)
map_man_met

##### Find offices in Manhattan

In [37]:
# Foursquare category id 4bf58dd8d48988d124941735 limits the search to offices.
man_off = getNearbyVenues(
    names=df_man['Neighborhood'],
    latitudes=df_man['Latitude'],
    longitudes=df_man['Longitude'],
    radius=1000,
    limit=500,
    categoryIds='4bf58dd8d48988d124941735',
)
man_off.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,Bettter Taxes,40.876311,-73.913145,Coworking Space
1,Marble Hill,40.876551,-73.91066,River Plaza Management Office,40.874742,-73.909925,Office
2,Marble Hill,40.876551,-73.91066,All Over Travel,40.874414,-73.911446,Office
3,Marble Hill,40.876551,-73.91066,The Dungeon,40.869243,-73.916442,Coworking Space
4,Marble Hill,40.876551,-73.91066,"Security Guards - National Security Service, LLC",40.877849,-73.910064,Office


In [38]:
man_off.shape

(1922, 7)

In [39]:
# New map centred on Manhattan with every office drawn in fuchsia.
map_man_off = folium.Map(zoom_start=12, location=[latitude, longitude])
addToMap(df=man_off, color='fuchsia', existingMap=map_man_off)
map_man_off

##### Find residential buildings (apartments / condos) in Manhattan

In [40]:
# Foursquare category id 4d954b06a243a5684965b473 limits the search to
# residential buildings (apartments / condos). No limit is passed here, so the
# function's default applies.
man_res = getNearbyVenues(
    names=df_man['Neighborhood'],
    latitudes=df_man['Latitude'],
    longitudes=df_man['Longitude'],
    radius=1000,
    categoryIds='4d954b06a243a5684965b473',
)
man_res.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Marble Hill,40.876551,-73.91066,Promenade Apts,40.875424,-73.913576,Residential Building (Apartment / Condo)
1,Marble Hill,40.876551,-73.91066,98 P.T.E.,40.871177,-73.915525,Residential Building (Apartment / Condo)
2,Marble Hill,40.876551,-73.91066,2390 Palisade AVE,40.87957,-73.920777,Residential Building (Apartment / Condo)
3,Marble Hill,40.876551,-73.91066,The Century Riverdale,40.878855,-73.914762,Residential Building (Apartment / Condo)
4,Marble Hill,40.876551,-73.91066,West 230th Street,40.875714,-73.90313,Residential Building (Apartment / Condo)


In [41]:
man_res.shape

(1998, 7)

##### Function to add columns

In [42]:
def addColumn(startDf, columnTitle, dataDf):
    """Add a per-neighborhood venue count to ``startDf`` in place.

    Counts the non-null 'Venue' entries of ``dataDf`` for each 'Neighborhood'
    and writes them to ``startDf[columnTitle]``, matching rows on the
    'Neighborhood' column. Neighborhoods that do not occur in ``dataDf`` get 0.

    Parameters
    ----------
    startDf : pandas.DataFrame
        Frame with a 'Neighborhood' column; modified in place.
    columnTitle : str
        Name of the column to create or overwrite.
    dataDf : pandas.DataFrame
        Venue table with 'Neighborhood' and 'Venue' columns.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``dataDf`` lacks 'Neighborhood' or 'Venue', or ``startDf`` lacks
        'Neighborhood'. A missing 'Venue' column used to be swallowed by a
        bare ``except`` and reported as a count of 0.
    """
    venue_counts = dataDf.groupby('Neighborhood')['Venue'].count()

    # map() leaves NaN for neighborhoods without venues; those become 0.
    # The column stays float, as it was with the row-by-row .loc assignment.
    startDf[columnTitle] = (
        startDf['Neighborhood'].map(venue_counts).fillna(0).astype(float)
    )

##### Copy of dataframe and adding columns

In [43]:
# Work on a copy so df_man keeps only the location columns.
df_data = df_man.copy()

# (column title, venue table) pairs, listed in the order the columns are added.
venue_tables = [
    ('Health food stores', man_health),
    ('Gym', man_gym),
    ('Metro stations', man_met),
    ('Offices', man_off),
    ('Residential building', man_res),
]
for column_title, venue_table in venue_tables:
    addColumn(df_data, column_title, venue_table)

df_data

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,Health food stores,Gym,Metro stations,Offices,Residential building
0,Manhattan,Marble Hill,40.876551,-73.91066,2.0,26.0,5.0,39.0,48.0
1,Manhattan,Chinatown,40.715618,-73.994279,9.0,50.0,15.0,50.0,50.0
2,Manhattan,Washington Heights,40.851903,-73.9369,1.0,22.0,5.0,37.0,50.0
3,Manhattan,Inwood,40.867684,-73.92121,3.0,13.0,11.0,38.0,50.0
4,Manhattan,Hamilton Heights,40.823604,-73.949688,2.0,39.0,6.0,45.0,50.0
5,Manhattan,Manhattanville,40.816934,-73.957385,1.0,46.0,5.0,46.0,50.0
6,Manhattan,Central Harlem,40.815976,-73.943211,3.0,48.0,14.0,49.0,50.0
7,Manhattan,East Harlem,40.792249,-73.944182,2.0,50.0,7.0,44.0,50.0
8,Manhattan,Upper East Side,40.775639,-73.960508,5.0,50.0,5.0,50.0,50.0
9,Manhattan,Yorkville,40.77593,-73.947118,4.0,50.0,3.0,49.0,50.0


In [44]:
# Weights applied to the per-neighborhood venue counts when the score is computed.

# Negative weight: the goal is to open a health food store, so existing
# stores are competition to be avoided as much as possible.
weight_health = -1

# Positive weight: people commuting by metro prefer to have the store close by.
weight_met = 1

# Positive weight: people exercising at the gym want to combine training with health food.
weight_gym = 1.5

# Positive weight: people working in offices want food that is both fast and healthy.
weight_off = 1.2

# Positive weight: residents want to be able to eat healthily.
weight_res = 2

In [45]:
# One-column frame of neighborhood names that will receive the score.
df_weighted = df_data.loc[:, ['Neighborhood']].copy()

In [48]:
# Weighted sum of the venue counts, one term per venue type.
df_weighted['Score'] = (
    df_data['Health food stores'] * weight_health
    + df_data['Gym'] * weight_gym
    + df_data['Metro stations'] * weight_met
    + df_data['Offices'] * weight_off
    + df_data['Residential building'] * weight_res
)

# Highest score first.
df_weighted = df_weighted.sort_values(by='Score', ascending=False)
df_weighted

Unnamed: 0,Neighborhood,Score
29,Financial District,252.0
28,Battery Park City,251.0
32,Civic Center,250.0
22,Little Italy,248.0
21,Tribeca,246.0
31,Noho,244.0
18,Greenwich Village,243.0
6,Central Harlem,241.8
1,Chinatown,241.0
23,Soho,241.0


### The best neighborhood in Manhattan for opening a health food store

In [51]:
map_man_result = folium.Map(location=[latitude, longitude], zoom_start=11)

# Neighborhood at the top of the score ranking.
winner = 'Financial District'
man_win = df_man[df_man['Neighborhood'] == winner]

# Blue marker for the winning neighborhood itself.
for lat, lng, local in zip(man_win['Latitude'], man_win['Longitude'], man_win['Neighborhood']):
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup('{}'.format(local), parse_html=True),
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.7)
    marker.add_to(map_man_result)

# Overlay that neighborhood's venues, one colour per venue type.
venue_layers = [
    (man_health, 'red'),
    (man_gym, 'green'),
    (man_met, 'gold'),
    (man_off, 'fuchsia'),
    (man_res, 'orange'),
]
for venues, layer_color in venue_layers:
    addToMap(venues[venues['Neighborhood'] == winner], layer_color, map_man_result)

map_man_result

### The Financial District is the best option for opening a health food store