In [None]:
import os

import shapely

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import json # library to handle JSON files
from pprint import pprint # data pretty printer

import requests # library to handle requests
from bs4 import BeautifulSoup  # library to handle web scraping

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import folium # map rendering library

import matplotlib.cm as cm # Matplotlib and associated plotting modules
import matplotlib.colors as colors # Matplotlib and associated plotting modules

from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from collections import Counter # count occurrences 

from sklearn.cluster import KMeans # import k-means from clustering stage


In [None]:
# Open the downloaded neighbourhood GeoJSON (path is relative to the working
# directory) and parse it into Python objects.
with open('static/Minneapolis_Neighborhoods.geojson') as json_data:
    minneapolis_data = json.load(json_data)
# Show the type of the parsed top-level object as the cell output.
type(minneapolis_data)

In [None]:
# For every top-level key, print the key and the length of the string
# representation of its value (a rough size indicator, not an element count).
print ("minneapolis_data key and value's length are: ") 
for key, value in minneapolis_data.items(): 
    print (key, len(str(value)))

In [None]:
# The 'features' entry holds the per-neighbourhood records used below.
neighborhoods_data = minneapolis_data['features']
# Pretty-print the first record to inspect its structure.
pprint(neighborhoods_data[0])

In [None]:
# Columns of the neighbourhood table built in the next cell.
column_names = ['Neighborhood', 'Latitude','Longitude'] 

# instantiate the (still empty) dataframe
neighborhoods = pd.DataFrame(columns=column_names)
neighborhoods

In [None]:
# Collect one record per GeoJSON feature and build the dataframe in a single
# step. DataFrame.append was removed in pandas 2.0, and growing a frame row by
# row is quadratic. Rebuilding from scratch also makes this cell idempotent:
# re-running it no longer duplicates rows.
neighborhood_records = []
for data in neighborhoods_data:
    # NOTE(review): the first vertex of the first ring is used as the
    # neighbourhood's representative point. This assumes Polygon geometries
    # with [longitude, latitude] vertices -- confirm against the GeoJSON file.
    first_vertex = data['geometry']['coordinates'][0][0]
    neighborhood_name = data['properties']['BDNAME']
    neighborhood_lat = first_vertex[1]
    neighborhood_lon = first_vertex[0]

    neighborhood_records.append({'Neighborhood': neighborhood_name,
                                 'Latitude': neighborhood_lat,
                                 'Longitude': neighborhood_lon})

neighborhoods = pd.DataFrame(neighborhood_records, columns=column_names)

In [None]:
# Preview the first rows of the neighbourhood table.
neighborhoods.head()

In [None]:
address = 'Minneapolis, MN'
location = None
MAX_GEOCODE_ATTEMPTS = 5  # bounded retries instead of looping forever

# define an instance of the geocoder -> mp_explorer
geolocator = Nominatim(user_agent="mp_explorer")
for attempt in range(1, MAX_GEOCODE_ATTEMPTS + 1):
    try:
        location = geolocator.geocode(address, timeout=10)
    except Exception as exc:
        # Report the failure instead of silently swallowing it, then retry.
        print('Geocoding attempt {} failed: {}'.format(attempt, exc))
        continue
    if location is not None:
        break

# geocode() returns None when the address cannot be resolved; fail loudly
# rather than spinning or continuing with undefined coordinates.
if location is None:
    raise RuntimeError('Could not geocode {!r} after {} attempts.'.format(address, MAX_GEOCODE_ATTEMPTS))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Minneapolis.MN are {}, {}.'.format(latitude, longitude))

In [None]:
# create map of Minneapolis centred on the geocoded latitude and longitude
map_minneapolis = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one circle marker per neighbourhood, with the neighbourhood name as popup
for lat, lng, neighborhood in zip(neighborhoods['Latitude'], neighborhoods['Longitude'], neighborhoods['Neighborhood']):
    label = '{}'.format(neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_minneapolis)

# display the map as the cell output
map_minneapolis

In [None]:
# Foursquare credentials are read from the environment so that secrets never
# live in the notebook source or in its saved output. Set
# FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before starting the kernel.
# NOTE: keys that were previously committed in this cell should be revoked.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

if CLIENT_ID and CLIENT_SECRET:
    # Confirm the credentials were found without echoing them into the output.
    print('Foursquare credentials loaded from the environment.')
else:
    print('WARNING: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare API requests will fail.')

In [None]:
# Request the full Foursquare venue-category hierarchy.
url = 'https://api.foursquare.com/v2/venues/categories?&client_id={}&client_secret={}&v={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION)
# A timeout keeps the cell from hanging indefinitely, and raise_for_status
# surfaces HTTP errors (e.g. bad credentials) here instead of as a confusing
# KeyError in a later cell.
category_response = requests.get(url, timeout=30)
category_response.raise_for_status()
category_results = category_response.json()

In [None]:
# Inspect the first top-level category: print each field name and the length
# of the string representation of its value.
for key, value in category_results['response']['categories'][0].items():
    print(key, len(str(value)))

In [None]:
# Top-level categories; each entry carries its sub-categories under 'categories'.
category_list = category_results['response']['categories']

In [None]:
# Number of top-level categories.
len(category_list)

In [None]:
# List the id and name of every top-level category.
for data in category_list:
    print(data['id'], data['name'])

In [None]:
# function to flatten a 'parent_id' category, returns all categories if checkParentID = False
def flatten_Hierarchy(category_list, checkParentID, category_dict, parent_id = ''):
    """Flatten a nested category hierarchy into an ``{id: name}`` mapping.

    Parameters
    ----------
    category_list : list of dict
        Category nodes. Each node has 'id' and 'name' keys and may have a
        'categories' key holding its child nodes.
    checkParentID : bool
        If true, only the node whose id equals ``parent_id`` and all of its
        descendants are collected. If false, every node in ``category_list``
        and all descendants are collected.
    category_dict : dict
        Mapping that is filled in place and also returned.
    parent_id : str, optional
        Id of the subtree root to collect when ``checkParentID`` is true.

    Returns
    -------
    dict
        ``category_dict`` with the collected ``id -> name`` entries added.
    """
    for data in category_list:
        # A missing or empty 'categories' entry is treated as "no children".
        children = data.get('categories') or []

        if not checkParentID or data['id'] == parent_id:
            # Either everything is wanted, or this is the requested subtree
            # root: record the node and collect all of its descendants.
            category_dict[data['id']] = data['name']
            flatten_Hierarchy(category_list = children, checkParentID = False, category_dict = category_dict)
        else:
            # Keep searching deeper so a parent_id that is not a top-level
            # category is still found.
            flatten_Hierarchy(category_list = children, checkParentID = True,
                              category_dict = category_dict, parent_id = parent_id)

    return category_dict

In [None]:
# Collect the category with the given parent id and all of its sub-categories
# as an {id: name} mapping. The id used here is the one labelled "Food" in the
# venue-search cell below.
category_dict = flatten_Hierarchy(category_list, checkParentID=True, category_dict = {}, parent_id = '4d4b7105d754a06374d81259')

In [None]:
# Name of the first neighbourhood, used for a single test query below.
neighborhoods.loc[0, 'Neighborhood']

In [None]:
# Pull the coordinates and name of the first neighbourhood (row label 0).
neighborhood_latitude = neighborhoods.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = neighborhoods.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = neighborhoods.loc[0, 'Neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

In [None]:
# Parameters for a single test query around the first neighbourhood.
LIMIT = 1 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
categoryId = '4d4b7105d754a06374d81259' # category ID for "Food"

# create URL

url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&categoryId={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius,
    categoryId,
    LIMIT)
# NOTE(review): the displayed URL embeds CLIENT_ID and CLIENT_SECRET, so the
# saved cell output exposes them -- clear this output before sharing.
url # display URL

In [None]:
# Send the test query and parse the JSON body.
# NOTE(review): no timeout or HTTP status check is applied here.
results = requests.get(url).json()

In [None]:
# Venues returned by the test query.
results['response']['venues']

In [None]:
# Gather every collected category id and join them into one comma-separated
# string.
# NOTE(review): this overwrites the earlier single-id `categoryId`, and
# getNearbyFood below hard-codes its own category id rather than reading this
# value -- confirm whether it is still needed.
categoryId_list = list(category_dict.keys())
categoryId = ','.join(categoryId_list)

In [None]:
def getNearbyFood(names, latitudes, longitudes, radius=1000, LIMIT=500):
    """Query Foursquare for venues in the "Food" category near each location.

    Parameters
    ----------
    names, latitudes, longitudes : iterables
        Parallel sequences giving each neighbourhood's name and coordinates.
    radius : int, optional
        Search radius passed to the API as ``radius``.
    LIMIT : int, optional
        Maximum number of venues requested per location (``limit``).
        NOTE(review): the API may cap this below 500 -- confirm in its docs.

    Returns
    -------
    pandas.DataFrame
        One row per venue with the neighbourhood name/coordinates, the venue
        name/coordinates and the venue's first category name. The frame has
        the expected columns even when no venue was found.

    Notes
    -----
    Reads the module-level CLIENT_ID, CLIENT_SECRET and VERSION. Venues with
    missing fields are skipped individually and counted; a failed request
    skips only that neighbourhood and is reported separately.
    """
    venue_columns = ['Neighborhood',
                     'Neighborhood Latitude',
                     'Neighborhood Longitude',
                     'Venue',
                     'Venue Latitude',
                     'Venue Longitude',
                     'Venue Category']
    not_found = 0         # venues skipped because a required field was missing
    failed_requests = 0   # neighbourhoods whose request or response was unusable
    print('***Start ', end='')
    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(' .', end='')

        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/search?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&categoryId={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius,
            "4d4b7105d754a06374d81259", # "Food" category id
            LIMIT)

        try:
            # make the GET request; the timeout prevents an indefinite hang
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            results = response.json()['response']['venues']
        except (requests.RequestException, KeyError, ValueError) as exc:
            failed_requests += 1
            print('\nRequest for {!r} failed: {}'.format(name, exc))
            continue

        # keep only the relevant information for each nearby venue; a venue
        # without e.g. a category is skipped on its own instead of discarding
        # every venue of the neighbourhood
        for v in results:
            try:
                venue_rows.append((
                    name,
                    lat,
                    lng,
                    v['name'],
                    v['location']['lat'],
                    v['location']['lng'],
                    v['categories'][0]['name']))
            except (KeyError, IndexError, TypeError):
                not_found += 1

    # Passing the column names to the constructor keeps the schema intact even
    # when no venue was collected.
    nearby_venues = pd.DataFrame(venue_rows, columns=venue_columns)
    print("\nDone*** with {} venues with incompelete information.".format(not_found))
    if failed_requests:
        print("{} neighborhood request(s) failed.".format(failed_requests))
    return(nearby_venues)

In [None]:
import pickle # to serialize and deserialize a Python object structure

# Load the venue table from the local cache if possible; otherwise query the
# API and write the cache.
# NOTE: pickle.load can execute arbitrary code -- only load a cache file that
# this notebook created.
try:
    with open('minneapolis_food_venues.pkl', 'rb') as f:
        minneapolis_venues = pickle.load(f)
    print("---Dataframe Existed and Deserialized---")
except (OSError, EOFError, pickle.UnpicklingError, AttributeError, ImportError) as cache_error:
    # Only a missing, unreadable or incompatible cache triggers a refetch, and
    # the reason is reported; other errors are no longer silently swallowed.
    print("Cache not usable ({}: {}); fetching from the API.".format(
        type(cache_error).__name__, cache_error))
    minneapolis_venues = getNearbyFood(names=neighborhoods['Neighborhood'],
                                        latitudes=neighborhoods['Latitude'],
                                        longitudes=neighborhoods['Longitude']
                                       )
    with open('minneapolis_food_venues.pkl', 'wb') as f:
        pickle.dump(minneapolis_venues, f)
    print("---Dataframe Created and Serialized---")

In [None]:
# Size of the venue table and a preview of its first rows.
print(minneapolis_venues.shape)
minneapolis_venues.head()

In [None]:
# Number of distinct venue categories, then venues per category (largest first).
print('There are {} uniques categories.'.format(len(minneapolis_venues['Venue Category'].unique())))
minneapolis_venues.groupby('Venue Category')['Venue Category'].count().sort_values(ascending=False)

In [None]:
# list all the distinct categories, in order of first appearance, on one line
unique_categories = minneapolis_venues['Venue Category'].unique().tolist()
print(', '.join(str(x) for x in unique_categories))

In [None]:
# manually create a list of generalized categories
# These names are subtracted from the observed categories in the next cell,
# i.e. venues carrying one of these categories are excluded from the analysis.
general_categories = ['Dessert Shop','Food','Bakery','Comfort Food Restaurant',
                    'Deli / Bodega','Food Truck','Restaurant','Frozen Yogurt Shop','Coffee Shop',
                    'Diner','Café','Juice Bar','Breakfast Spot','Grocery Store','Bar','Cupcake Shop',
                    'Pub','Cafeteria','Other Nightlife','Arcade','Hot Dog Joint','Food Court',
                    'Health Food Store','Convenience Store','Food & Drink Shop','Cocktail Bar','Cheese Shop',
                    'Snack Place','Sports Bar','Lounge','Theme Restaurant','Buffet','Bubble Tea Shop','Building',
                    'Irish Pub','College Cafeteria','Tea Room','Supermarket','Hotpot Restaurant','Gastropub','Beer Garden',
                    'Fish Market','Beer Bar','Clothing Store','Music Venue','Bistro','Salad Place','Wine Bar','Gourmet Shop',
                    'Indie Movie Theater','Art Gallery','Gift Shop','Pie Shop','Fruit & Vegetable Store',
                    'Street Food Gathering','Dive Bar','Factory','Farmers Market','Creperie',
                    'Candy Store','Event Space','Skating Rink','Miscellaneous Shop','Gas Station','Organic Grocery',
                    'Pastry Shop','Club House','Flea Market','Hotel','Furniture / Home Store','Bookstore','Pet Café',
                    'Gym / Fitness Center','Flower Shop','Financial or Legal Service','Hotel Bar','Hookah Bar',
                    'Market','Gluten-free Restaurant','Smoothie Shop','Butcher','Food Stand','Beach Bar','Beach',
                    'Soup Place','Rock Club','Residential Building (Apartment / Condo)','Laundry Service',
                    'Government Building','Bowling Alley','Nightclub','Park','Moving Target','Gay Bar', 'Racetrack','College Residence Hall',
                    'Office', 'Hospital', 'Ski Area', 'Food Service', 'Shopping Mall','Liquor Store', 'Bike Shop', 'Acupuncturist',
                    'Karaoke Bar', 'Church']

In [None]:
# fetch all the required food categories: observed categories minus the
# generalized ones. The result comes from a set difference, so its order is
# not defined.
food_categories =  list(set(unique_categories) - set(general_categories))
print(', '.join(str(x) for x in food_categories))

In [None]:
# Keep only venues whose category is one of the retained food categories.
# drop=True discards the old row labels instead of adding them as an 'index'
# column, which also keeps this cell safe to re-run.
minneapolis_venues = minneapolis_venues[minneapolis_venues['Venue Category'].isin(food_categories)].reset_index(drop=True)
minneapolis_venues.head(5)

In [None]:
# Number of distinct categories left after filtering.
print('There are {} unique FOOD categories.'.format(len(minneapolis_venues['Venue Category'].unique())))

In [None]:
# Number of distinct venue names left after filtering.
print('There are {} uniques venues.'.format(len(minneapolis_venues['Venue'].unique())))

In [None]:
# one hot encoding: one indicator column per venue category, named after the
# category itself (empty prefix and separator)
minneapolis_onehot = pd.get_dummies(minneapolis_venues[['Venue Category']], prefix="", prefix_sep="")
minneapolis_onehot.head()

In [None]:
# add neighborhood column back to dataframe (rows are aligned on the index
# shared with minneapolis_venues)
minneapolis_onehot['Neighborhood'] = minneapolis_venues['Neighborhood'] 
minneapolis_onehot.head()

In [None]:
# move neighborhood column to the first column: pop() removes the column from
# the frame in place and returns it, then it is re-inserted at position 0
Neighborhood = minneapolis_onehot.pop('Neighborhood')
minneapolis_onehot.insert(0, 'Neighborhood', Neighborhood)

minneapolis_onehot.head()

In [None]:
# (rows, columns) of the one-hot table.
minneapolis_onehot.shape

In [None]:
# Sum the indicator columns per neighbourhood: the number of venues of each
# category in each neighbourhood.
venue_counts = minneapolis_onehot.groupby('Neighborhood').sum()
venue_counts

In [None]:
# Summary statistics of the counts; after the transpose there is one row per
# category and one column per statistic.
venue_counts_described = venue_counts.describe().transpose()

In [None]:
# Order categories by their largest per-neighbourhood count, descending.
# NOTE(review): the name says "top10" but the slice keeps up to 200 rows; the
# plotting cell below only consumes the first 10 (it zips with 10 axes) --
# confirm which was intended.
venue_top10 = venue_counts_described.sort_values('max', ascending=False)[0:200]
venue_top10

In [None]:
# Category names in the sorted order above.
venue_top10_list = venue_top10.index.values.tolist()

In [None]:
# NOTE(review): these imports sit mid-notebook; later cells also rely on `plt`.
import seaborn as sns
import matplotlib.pyplot as plt

# A 5x2 grid of panels sharing the x axis, flattened so it can be zipped.
fig, axes =plt.subplots(5, 2, figsize=(20,20), sharex=True)
axes = axes.flatten()

# zip stops at the shorter input, so only the first 10 categories are plotted.
for ax, category in zip(axes, venue_top10_list):
    # The 10 neighbourhoods with the highest count for this category.
    data = venue_counts[[category]].sort_values([category], ascending=False)[0:10]
    pal = sns.color_palette("Blues", len(data))
    # The palette is reversed before being applied to the bars.
    sns.barplot(x=category, y=data.index, data=data, ax=ax, palette=np.array(pal[::-1]))

plt.tight_layout()
plt.show();

In [None]:
# Mean of the indicator columns per neighbourhood, i.e. the share of each
# category among the neighbourhood's venues; 'Neighborhood' becomes a regular
# first column again via reset_index.
minneapolis_grouped = minneapolis_onehot.groupby('Neighborhood').mean().reset_index()
minneapolis_grouped.head()

In [None]:
# (neighbourhoods, 1 + number of categories)
minneapolis_grouped.shape

In [None]:
def return_most_common_venues(row, num_top_venues):
    """Return the labels of the largest values in ``row``, ignoring its first entry.

    Parameters
    ----------
    row : pandas.Series
        The first element is skipped (callers in this notebook pass rows whose
        first entry is the neighbourhood name); the remaining entries are
        sorted in descending order.
    num_top_venues : int
        How many labels to return.

    Returns
    -------
    numpy.ndarray
        Index labels of the top ``num_top_venues`` entries, largest first.
    """
    ranked = row.iloc[1:].sort_values(ascending=False)
    return ranked.index.values[:num_top_venues]

In [None]:
num_top_venues = 5

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues: '1st', '2nd', '3rd' use
# the suffixes above and every later rank uses 'th'. An explicit bounds check
# replaces the former bare `except`, which would have hidden unrelated errors.
columns = ['Neighborhood']
for ind in range(num_top_venues):
    suffix = indicators[ind] if ind < len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(ind+1, suffix))

In [None]:
# create a new dataframe with the rank columns and one row per neighbourhood;
# the rank columns are filled in by the next cell
neighborhoods_venues_sorted = pd.DataFrame(columns=columns)
neighborhoods_venues_sorted['Neighborhood'] = minneapolis_grouped['Neighborhood']

In [None]:
# Fill every row's rank columns (all columns after the first) with the
# neighbourhood's most frequent categories, row by row.
for ind in np.arange(minneapolis_grouped.shape[0]):
    neighborhoods_venues_sorted.iloc[ind, 1:] = return_most_common_venues(minneapolis_grouped.iloc[ind, :], num_top_venues)

neighborhoods_venues_sorted

In [None]:
# Feature matrix for clustering: the per-neighbourhood category frequencies
# without the label column. The `columns=` keyword replaces the positional
# axis argument, which pandas 2.0 no longer accepts.
minneapolis_grouped_clustering = minneapolis_grouped.drop(columns='Neighborhood')

In [None]:
# Elbow scan: fit k-means for a range of k and record the inertia
# (within-cluster sum of squared distances) of each fit.
sum_of_squared_distances = []
# k cannot exceed the number of samples, so the upper bound is capped.
K = range(1, min(50, len(minneapolis_grouped_clustering) + 1))
for k in K:
    print(k, end=' ')
    # A fixed random_state makes the curve reproducible across runs.
    kmeans = KMeans(n_clusters=k, random_state=0).fit(minneapolis_grouped_clustering)
    sum_of_squared_distances.append(kmeans.inertia_)

In [None]:
# Plot inertia against k; the trailing semicolon suppresses the text repr.
plt.plot(K, sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('sum_of_squared_distances')
plt.title('Elbow Method For Optimal k');

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette scan: fit k-means for a range of k and record the mean
# silhouette score of each clustering.
sil = []
# minimum 2 clusters required, to define dissimilarity; the score also needs
# fewer clusters than samples, so the upper bound is capped accordingly
K_sil = range(2, min(50, len(minneapolis_grouped_clustering)))
for k in K_sil:
    print(k, end=' ')
    # A fixed random_state makes the curve reproducible across runs.
    kmeans = KMeans(n_clusters = k, random_state=0).fit(minneapolis_grouped_clustering)
    labels = kmeans.labels_
    sil.append(silhouette_score(minneapolis_grouped_clustering, labels, metric = 'euclidean'))

In [None]:
# Plot the silhouette score against k.
plt.plot(K_sil, sil, 'bx-')
plt.xlabel('k')
plt.ylabel('silhouette_score')
plt.title('Silhouette Method For Optimal k')
plt.show()

In [None]:
# set number of clusters
kclusters = 8

# run k-means clustering; the fixed random_state keeps the cluster ids stable
# between runs, which the per-cluster inspection cells below depend on
kmeans = KMeans(init="k-means++", n_clusters=kclusters, n_init=50, random_state=0).fit(minneapolis_grouped_clustering)

# number of neighbourhoods assigned to each cluster
print(Counter(kmeans.labels_))

In [None]:
# add clustering labels
# Overwrite the column when it already exists so that re-running this cell
# after re-clustering refreshes the labels. The previous try/except discarded
# the result of drop() and therefore kept stale labels.
if 'Cluster Labels' in neighborhoods_venues_sorted.columns:
    neighborhoods_venues_sorted['Cluster Labels'] = kmeans.labels_
else:
    neighborhoods_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

In [None]:
# Preview the labelled table.
neighborhoods_venues_sorted.head(5)

In [None]:
# merge neighborhoods_venues_sorted with neighborhoods to add latitude/longitude
# for each neighborhood (join keyed on the 'Neighborhood' name)
minneapolis_merged = neighborhoods_venues_sorted.join(neighborhoods.set_index('Neighborhood'), on='Neighborhood')
minneapolis_merged.head()

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=10)

# set color scheme for the clusters: one hex colour per cluster label
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(minneapolis_merged['Latitude'], minneapolis_merged['Longitude'], minneapolis_merged['Neighborhood'], minneapolis_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    # k-means labels run from 0 to kclusters-1, so they index the colour list
    # directly; the former `cluster-1` only worked for label 0 through
    # negative-index wrap-around.
    cluster_color = rainbow[cluster]
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [None]:
# Positions 2-4 of minneapolis_merged are the 1st-3rd most common venue
# columns (position 0 is 'Cluster Labels', position 1 is 'Neighborhood').
# The duplicated assignment of required_column_indices was removed.
required_column_indices = [2,3,4]
required_column = [list(minneapolis_merged.columns.values)[i] for i in required_column_indices]

In [None]:
# Neighbourhoods assigned to cluster 0; columns[1:12] skips the leading
# 'Cluster Labels' column and keeps the columns that follow it.
cluster_0 = minneapolis_merged.loc[minneapolis_merged['Cluster Labels'] == 0, minneapolis_merged.columns[1:12]]
cluster_0

In [None]:
# For each selected rank column, show how often each category occupies that
# rank within cluster 0, most frequent first.
for col in required_column:
    print(cluster_0[col].value_counts(ascending = False))
    print("---------------------------------------------")

In [None]:
# Neighbourhoods assigned to cluster 1; columns[1:12] skips the leading
# 'Cluster Labels' column and keeps the columns that follow it.
cluster_1 = minneapolis_merged.loc[minneapolis_merged['Cluster Labels'] == 1, minneapolis_merged.columns[1:12]]
cluster_1

In [None]:
# For each selected rank column, show how often each category occupies that
# rank within cluster 1, most frequent first.
for col in required_column:
    print(cluster_1[col].value_counts(ascending = False))
    print("---------------------------------------------")

In [None]:
# Neighbourhoods assigned to cluster 2; columns[1:12] skips the leading
# 'Cluster Labels' column and keeps the columns that follow it.
cluster_2 = minneapolis_merged.loc[minneapolis_merged['Cluster Labels'] == 2, minneapolis_merged.columns[1:12]]
cluster_2

In [None]:
# For each selected rank column, show how often each category occupies that
# rank within cluster 2, most frequent first.
for col in required_column:
    print(cluster_2[col].value_counts(ascending = False))
    print("---------------------------------------------")

In [None]:
# Neighbourhoods assigned to cluster 3; columns[1:12] skips the leading
# 'Cluster Labels' column and keeps the columns that follow it.
cluster_3 = minneapolis_merged.loc[minneapolis_merged['Cluster Labels'] == 3, minneapolis_merged.columns[1:12]]
cluster_3

In [None]:
# For each selected rank column, show how often each category occupies that
# rank within cluster 3, most frequent first.
for col in required_column:
    print(cluster_3[col].value_counts(ascending = False))
    print("---------------------------------------------")

In [None]:
# Neighbourhoods assigned to cluster 4; columns[1:12] skips the leading
# 'Cluster Labels' column and keeps the columns that follow it.
cluster_4 = minneapolis_merged.loc[minneapolis_merged['Cluster Labels'] == 4, minneapolis_merged.columns[1:12]]
cluster_4

In [None]:
# For each selected rank column, show how often each category occupies that
# rank within cluster 4, most frequent first.
for col in required_column:
    print(cluster_4[col].value_counts(ascending = False))
    print("---------------------------------------------")

In [None]:
# Neighbourhoods assigned to cluster 5; columns[1:12] skips the leading
# 'Cluster Labels' column and keeps the columns that follow it.
cluster_5 = minneapolis_merged.loc[minneapolis_merged['Cluster Labels'] == 5, minneapolis_merged.columns[1:12]]
cluster_5

In [None]:
# For each selected rank column, show how often each category occupies that
# rank within cluster 5, most frequent first.
for col in required_column:
    print(cluster_5[col].value_counts(ascending = False))
    print("---------------------------------------------")

In [None]:
# Neighbourhoods assigned to cluster 6; columns[1:12] skips the leading
# 'Cluster Labels' column and keeps the columns that follow it.
cluster_6 = minneapolis_merged.loc[minneapolis_merged['Cluster Labels'] == 6, minneapolis_merged.columns[1:12]]
cluster_6

In [None]:
# For each selected rank column, show how often each category occupies that
# rank within cluster 6, most frequent first.
for col in required_column:
    print(cluster_6[col].value_counts(ascending = False))
    print("---------------------------------------------")

In [None]:
# Neighbourhoods assigned to cluster 7; columns[1:12] skips the leading
# 'Cluster Labels' column and keeps the columns that follow it.
cluster_7 = minneapolis_merged.loc[minneapolis_merged['Cluster Labels'] == 7, minneapolis_merged.columns[1:12]]
cluster_7

In [None]:
# For each selected rank column, show how often each category occupies that
# rank within cluster 7, most frequent first.
for col in required_column:
    print(cluster_7[col].value_counts(ascending = False))
    print("---------------------------------------------")

In [2]:
import tensorflow
import keras


Using TensorFlow backend.
