## The Battle of the Neighborhoods - Week 2

### Imports Libraries

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import lxml
pd.set_option('display.max_rows', None)
import json # library to handle JSON files
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
from sklearn.metrics.pairwise import cosine_similarity # pairwise cosine similarity for content-based filtering
# Matplotlib and associated plotting modules
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium # map rendering library
print('Libraries imported.')

Libraries imported.


### Specify Hometown and its State Abbreviation

In [2]:
hometown = "Cupertino"#"Dallas" #"Cupertino" #"Mountain View"# "Hollywood" #"Menlo Park" # Tempe
state = "CA" #"TX" # "AZ"

**Read US city to geospatial coordinates mapping table into Pandas DataFrame**

In [3]:
df_cities = pd.read_csv('uscitiesv1.4.csv')
df_cities.head()

Unnamed: 0,city,city_ascii,state_id,state_name,county_fips,county_name,lat,lng,population,population_proper,density,source,incorporated,timezone,zips,id
0,Prairie Ridge,Prairie Ridge,WA,Washington,53053,Pierce,47.1443,-122.1408,,,1349.8,polygon,False,America/Los_Angeles,98360 98391,1840037882
1,Edison,Edison,WA,Washington,53057,Skagit,48.5602,-122.4311,,,127.4,polygon,False,America/Los_Angeles,98232,1840017314
2,Packwood,Packwood,WA,Washington,53041,Lewis,46.6085,-121.6702,,,213.9,polygon,False,America/Los_Angeles,98361,1840025265
3,Wautauga Beach,Wautauga Beach,WA,Washington,53035,Kitsap,47.5862,-122.5482,,,261.7,point,False,America/Los_Angeles,98366,1840037725
4,Harper,Harper,WA,Washington,53035,Kitsap,47.5207,-122.5196,,,342.1,point,False,America/Los_Angeles,98366,1840037659


**Retrieve Venue information of howntown city with Foursquare API** 

### NEW YORK Borough and Neighborhood

#### Use geopy library to get the latitude and longitude values of New York City.

In [5]:
address = 'New York City, NY'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7308619, -73.9871558.


In [6]:
with open('newyork_data.json') as json_data:
    newyork_data = json.load(json_data)

In [7]:
neighborhoods_data = newyork_data['features']

In [8]:
# define the dataframe columns
column_names = ['Borough', 'Neighborhood', 'Latitude', 'Longitude'] 

# instantiate the dataframe
neighborhoods = pd.DataFrame(columns=column_names)
df_hometown = pd.DataFrame(columns=column_names)

In [9]:
for data in neighborhoods_data:
    borough = neighborhood_name = data['properties']['borough'] 
    neighborhood_name = data['properties']['name']
        
    neighborhood_latlon = data['geometry']['coordinates']
    neighborhood_lat = neighborhood_latlon[1]
    neighborhood_lon = neighborhood_latlon[0]
    
    neighborhoods = neighborhoods.append({'Borough': borough,
                                          'Neighborhood': neighborhood_name,
                                          'Latitude': neighborhood_lat,
                                          'Longitude': neighborhood_lon}, ignore_index=True)

In [10]:
neighborhoods.head()

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [11]:
def getNearbyVenues(names, latitudes, longitudes, radius=800, LIMIT=100):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [12]:
# newyork_venues = getNearbyVenues(names=neighborhoods['Neighborhood'],
#                                    latitudes=neighborhoods['Latitude'],
#                                    longitudes=neighborhoods['Longitude'],
#                                  radius = 800,
#                                  LIMIT = 100
#                                   )

In [13]:
newyork_venues = pd.read_csv('newyork_venues.csv')

In [14]:
print(newyork_venues.shape)
newyork_venues.head()
#newyork_venues.to_csv('newyork_venues_r800_l100.csv',index=False)

(10326, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Rite Aid,40.896649,-73.844846,Pharmacy
2,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
3,Wakefield,40.894705,-73.847201,Dunkin Donuts,40.890631,-73.849027,Donut Shop
4,Wakefield,40.894705,-73.847201,Cooler Runnings Jamaican Restaurant Inc,40.898276,-73.850381,Caribbean Restaurant


#### Check New York Neighborhoods and the number of its Venues

In [15]:
newyork_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Allerton,29,29,29,29,29,29
Annadale,11,11,11,11,11,11
Arden Heights,5,5,5,5,5,5
Arlington,5,5,5,5,5,5
Arrochar,21,21,21,21,21,21
Arverne,16,16,16,16,16,16
Astoria,100,100,100,100,100,100
Astoria Heights,15,15,15,15,15,15
Auburndale,20,20,20,20,20,20
Bath Beach,47,47,47,47,47,47


In [16]:
# neighborhoods_data = newyork_data['features']

In [17]:
print('There are {} uniques categories.'.format(len(newyork_venues['Venue Category'].unique())))

There are 427 uniques categories.


#### Look Up Hometown geospatial coordinates and retrieve Venues information

In [18]:
df_tmp = df_cities[(df_cities.city == hometown) & (df_cities.state_id == state)]
latitude = df_tmp.lat.values[0]
longitude = df_tmp.lng.values[0]

df_hometown = df_hometown.append({'Borough': hometown,
                                          'Neighborhood': hometown,
                                          'Latitude': latitude,
                                          'Longitude': longitude}, ignore_index=True)

In [19]:
LIMIT = 150
radius = 1600
hometown_venues = getNearbyVenues(names=df_hometown['Neighborhood'],
                                   latitudes=df_hometown['Latitude'],
                                   longitudes=df_hometown['Longitude'],
                                  radius=radius,
                                  LIMIT = LIMIT
                                  )

Cupertino


In [20]:
print(hometown_venues.shape)
hometown_venues.head()

(100, 7)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Cupertino,37.3167,-122.0465,Apple Fitness Center Results Way,37.31749,-122.051129,Gym
1,Cupertino,37.3167,-122.0465,Fujitsu Planetarium De Anza College,37.318837,-122.046217,Planetarium
2,Cupertino,37.3167,-122.0465,Ike's Place,37.322801,-122.04122,Sandwich Place
3,Cupertino,37.3167,-122.0465,Shane Co.,37.323249,-122.046537,Jewelry Store
4,Cupertino,37.3167,-122.0465,Whole Foods Market,37.323478,-122.039759,Grocery Store


In [21]:
print('There are {} uniques categories.'.format(len(hometown_venues['Venue Category'].unique())))

There are 58 uniques categories.


#### merge howntown_venues with newyork_venues into merged_venues

In [22]:
merged_venues = pd.concat([hometown_venues,newyork_venues])
merged_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Cupertino,37.3167,-122.0465,Apple Fitness Center Results Way,37.31749,-122.051129,Gym
1,Cupertino,37.3167,-122.0465,Fujitsu Planetarium De Anza College,37.318837,-122.046217,Planetarium
2,Cupertino,37.3167,-122.0465,Ike's Place,37.322801,-122.04122,Sandwich Place
3,Cupertino,37.3167,-122.0465,Shane Co.,37.323249,-122.046537,Jewelry Store
4,Cupertino,37.3167,-122.0465,Whole Foods Market,37.323478,-122.039759,Grocery Store


In [23]:
print('There are {} uniques categories.'.format(len(merged_venues['Venue Category'].unique())))

There are 430 uniques categories.


In [24]:
# one hot encoding 
merged_onehot = pd.get_dummies(merged_venues[['Venue Category']], prefix="", prefix_sep="")
merged_onehot['Neighborhood'] = merged_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [merged_onehot.columns[-1]] + list(merged_onehot.columns[:-1])
merged_onehot = merged_onehot[fixed_columns]

In [25]:
merged_onehot.shape

(10426, 430)

In [26]:
merged_onehot.mean(axis=0).sort_values(ascending=False)[:10]

Pizza Place           0.041243
Italian Restaurant    0.031076
Deli / Bodega         0.026856
Coffee Shop           0.026568
Bakery                0.023115
Chinese Restaurant    0.021581
Bar                   0.020909
Mexican Restaurant    0.018224
Sandwich Place        0.017936
Grocery Store         0.017456
dtype: float64

In [27]:
# one hot encoding - later / after merge
hometown_onehot = pd.get_dummies(hometown_venues[['Venue Category']], prefix="", prefix_sep="")
# move neighborhood column to the first column
fixed_columns = [hometown_onehot.columns[-1]] + list(hometown_onehot.columns[:-1])
hometown_onehot.head() 

Unnamed: 0,American Restaurant,Asian Restaurant,Bakery,Bank,Big Box Store,Breakfast Spot,Bubble Tea Shop,Burger Joint,Café,Cantonese Restaurant,...,Sandwich Place,Spa,Sporting Goods Shop,Supplement Shop,Sushi Restaurant,Thai Restaurant,Video Game Store,Video Store,Vietnamese Restaurant,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
hometown_onehot.sum(axis=0).sort_values(ascending=False)[:10]

Coffee Shop              6
Chinese Restaurant       5
Park                     4
Bakery                   4
Sandwich Place           4
Grocery Store            3
Bubble Tea Shop          3
Vietnamese Restaurant    3
Japanese Restaurant      3
Mexican Restaurant       3
dtype: int64

In [29]:
merged_grouped = merged_onehot.groupby('Neighborhood').sum() 

### Use content-based recommender system appoarch

Where hometown city is equivalent to user profile (N x 1)
and neighborhoods x venues matrix is equivalent to Movie/Content matrix (M,N), where M is number of new york neighborhoods and N is the number of selected Venues.
To get the recommended neighborhoods that close to hometown, we just need to multiply Neighborhood(content) Matrix with hometown venues(user profile) and rank the results accordingly.

The governing formula is: 
find argmax(neighborhood i) = (M(New York neighborhoods),N(venues)).dot(hometown Venues N x 1)
Normalization of inputs is necessary - pairwise cosine similarity has built in normalization and is implemented here 

In [30]:
merged_grouped.shape

(303, 429)

In [31]:
# loop version for cosine similarity between hometown and each neighborhood

hometown_vec = merged_grouped.loc[hometown].values.reshape(1,-1)

scores = []
for i, neigh in enumerate(merged_grouped.index):
    neigh_vec = merged_grouped.loc[neigh].values.reshape(1,-1)
    score = np.round(cosine_similarity(hometown_vec, neigh_vec),2)
    scores.append(score)
    print(neigh, score)

Allerton [[0.31]]
Annadale [[0.25]]
Arden Heights [[0.26]]
Arlington [[0.26]]
Arrochar [[0.14]]
Arverne [[0.09]]
Astoria [[0.35]]
Astoria Heights [[0.21]]
Auburndale [[0.21]]
Bath Beach [[0.4]]
Battery Park City [[0.53]]
Bay Ridge [[0.39]]
Bay Terrace [[0.21]]
Baychester [[0.29]]
Bayside [[0.31]]
Bayswater [[0.15]]
Bedford Park [[0.32]]
Bedford Stuyvesant [[0.36]]
Beechhurst [[0.25]]
Bellaire [[0.19]]
Belle Harbor [[0.2]]
Bellerose [[0.34]]
Belmont [[0.28]]
Bensonhurst [[0.47]]
Bergen Beach [[0.]]
Blissville [[0.1]]
Bloomfield [[0.]]
Boerum Hill [[0.49]]
Borough Park [[0.32]]
Breezy Point [[0.]]
Briarwood [[0.25]]
Brighton Beach [[0.25]]
Broad Channel [[0.1]]
Broadway Junction [[0.13]]
Bronxdale [[0.32]]
Brooklyn Heights [[0.49]]
Brookville [[0.]]
Brownsville [[0.2]]
Bulls Head [[0.38]]
Bushwick [[0.41]]
Butler Manor [[0.03]]
Cambria Heights [[0.1]]
Canarsie [[0.31]]
Carnegie Hill [[0.47]]
Carroll Gardens [[0.34]]
Castle Hill [[0.2]]
Castleton Corners [[0.27]]
Central Harlem [[0.26]]
C

In [32]:
merged_grouped.insert(0,'scores',scores)

In [33]:
merged_grouped = merged_grouped.sort_values(by=['scores'], ascending=False)

In [34]:
merged_grouped.reset_index().head()

Unnamed: 0,Neighborhood,scores,Yoga Studio,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Terminal,Airport Tram,American Restaurant,...,Warehouse Store,Waste Facility,Watch Shop,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,Cupertino,[[1.0]],1,0,0,0,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
1,Manhattan Valley,[[0.6]],2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
2,Park Slope,[[0.57]],1,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,Rego Park,[[0.54]],0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,Chinatown,[[0.54]],0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0


### Create Top Ten Common Venues of Howntown and Recommended Neighborhoods

In [35]:
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]


In [36]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['Neighborhood']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = merged_grouped.reset_index()['Neighborhood']

for ind in np.arange(merged_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(merged_grouped.iloc[ind, :], num_top_venues)

print("Top Ten Common Venues of Howntown and Recommended Neighborhoods\n")
neighborhoods_venues_sorted.head(6)

Top Ten Common Venues of Howntown and Recommended Neighborhoods



Unnamed: 0,Neighborhood,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Cupertino,Coffee Shop,Chinese Restaurant,Bakery,Sandwich Place,Park,Vietnamese Restaurant,Grocery Store,American Restaurant,Bubble Tea Shop,Mexican Restaurant
1,Manhattan Valley,Pizza Place,Coffee Shop,Yoga Studio,Café,French Restaurant,Mexican Restaurant,Bar,Chinese Restaurant,Thai Restaurant,Burger Joint
2,Park Slope,Coffee Shop,Pizza Place,American Restaurant,Italian Restaurant,Burger Joint,Bookstore,Japanese Restaurant,Pub,Spa,Bagel Shop
3,Rego Park,Bakery,Grocery Store,Pharmacy,Pizza Place,Sushi Restaurant,Chinese Restaurant,Sandwich Place,Restaurant,Donut Shop,Cosmetics Shop
4,Chinatown,Chinese Restaurant,American Restaurant,Cocktail Bar,Dim Sum Restaurant,Vietnamese Restaurant,Bubble Tea Shop,Noodle House,Bakery,Salon / Barbershop,Ice Cream Shop
5,Lower East Side,Coffee Shop,Art Gallery,Chinese Restaurant,Café,Pizza Place,Ramen Restaurant,Cocktail Bar,Japanese Restaurant,Sandwich Place,Shoe Store


In [37]:
newyork_merged = neighborhoods
newyork_merged = newyork_merged.join(neighborhoods_venues_sorted.set_index('Neighborhood'), on='Neighborhood')
newyork_merged.head() # check the last columns!

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Bronx,Wakefield,40.894705,-73.847201,Pharmacy,Caribbean Restaurant,Donut Shop,Laundromat,Pizza Place,Dessert Shop,Ice Cream Shop,Food Truck,Sandwich Place,Exhibit
1,Bronx,Co-op City,40.874294,-73.829939,Baseball Field,Bus Station,Discount Store,Pharmacy,Grocery Store,Park,Pizza Place,Gift Shop,Salon / Barbershop,Mattress Store
2,Bronx,Eastchester,40.887556,-73.827806,Caribbean Restaurant,Deli / Bodega,Bus Station,Diner,Metro Station,Bus Stop,Bakery,Bowling Alley,Seafood Restaurant,Fast Food Restaurant
3,Bronx,Fieldston,40.895437,-73.905643,Playground,Plaza,Women's Store,Field,Event Service,Event Space,Exhibit,Eye Doctor,Factory,Falafel Restaurant
4,Bronx,Riverdale,40.890834,-73.912585,Park,Plaza,Food Truck,Bus Station,Gym,Home Service,Bank,Playground,Women's Store,Event Space


In [38]:
address = 'New York City, NY'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of New York City are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of New York City are 40.7308619, -73.9871558.


In [39]:
neighborhoods_venues_sorted.loc[0,'Neighborhood']

'Cupertino'

### Mark Top 5 Neighborhood recommendation on New York Map

In [40]:
# create map
map_top_match = folium.Map(location=[latitude, longitude], zoom_start = 11)
k_top = 5
df_top = merged_grouped.reset_index()[1:k_top+1]
# set color scheme for the clusters
x = np.arange(k_top)
ys = [i + x + (i*x)**2 for i in range(k_top)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

for i, neigh in enumerate(df_top['Neighborhood']):
    ny_neigh = newyork_merged.loc[newyork_merged['Neighborhood']==neigh]

    poi = neigh
    lat = float(ny_neigh['Latitude'].values)
    lon = float(ny_neigh['Longitude'].values)

    label = folium.Popup(' Top ' + str(i+1) + ' Match for ' + hometown + ": " + str(poi), \
                         parse_html=True, sticky=True, max_width=500)
    folium.CircleMarker(
        [lat, lon],
        radius= 15-i*2,
        popup=label,
        label = label,
        color=rainbow[i],
        fill=True,
        fill_color=rainbow[i],
        fill_opacity=0.7).add_to(map_top_match)

map_top_match
