This notebook is used for my IBM Data Science Coursera course capstone.

In [1]:
import sys

In [2]:
import json
import os
import sys

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests
from geopy.geocoders import Nominatim
from pandas.io.json import json_normalize
from sklearn.cluster import KMeans

In [144]:
from colour import Color

Import the geodata file we have found for Colorado

In [38]:
# Load the semicolon-delimited zip code table into a DataFrame.
df_colorado = pd.read_csv('us-zip-code.csv', sep=';')

In [39]:
df_colorado.head()

Unnamed: 0,Zip,City,State,Latitude,Longitude,Timezone,Daylight savings time flag,geopoint
0,80309,Boulder,CO,40.087835,-105.373507,-7,1,"40.087835,-105.373507"
1,80807,Burlington,CO,39.3124,-102.25094,-7,1,"39.3124,-102.25094"
2,81076,Sugar City,CO,38.26144,-103.67101,-7,1,"38.26144,-103.67101"
3,80525,Fort Collins,CO,40.532354,-105.0535,-7,1,"40.532354,-105.0535"
4,80610,Ault,CO,40.61798,-104.66762,-7,1,"40.61798,-104.66762"


Take only the Denver data points and drop all unnecessary columns.

In [169]:
# Boolean mask selecting the rows whose City is exactly "Denver".
is_denver = df_colorado['City'] == "Denver"
df_denver = df_colorado[is_denver]

In [170]:
# Columns that are not needed for the rest of the analysis.
unused_columns = ["Timezone", "Daylight savings time flag", "geopoint"]
df_denver = df_denver.drop(columns=unused_columns)

Sort the values by zip code and then rebuild the index.

In [171]:
# Order the rows by zip code, then round-trip 'Zip' through the index so the
# frame ends up with 'Zip' as its first column and a fresh 0..n-1 index.
df_denver = (
    df_denver
    .sort_values(by=['Zip'])
    .set_index('Zip')
    .reset_index()
)

In [172]:
df_denver.head()

Unnamed: 0,Zip,City,State,Latitude,Longitude
0,80201,Denver,CO,39.726303,-104.856808
1,80202,Denver,CO,39.751586,-104.99699
2,80203,Denver,CO,39.731286,-104.98306
3,80204,Denver,CO,39.734686,-105.01966
4,80205,Denver,CO,39.758986,-104.96678


Find the latitude and longitude of Denver.

In [173]:
# Resolve the map centre for Denver through the Nominatim geocoding service.
address = 'Denver, CO'

geolocator = Nominatim(user_agent="dever_explorer")
location = geolocator.geocode(address)
# geocode() yields None when the address cannot be resolved; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Denver are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Denver are 39.7392364, -104.9848623.


In [174]:
# Base map centred on the geocoded Denver coordinates.
map_Denver = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per zip code; the popup shows "zip, latitude, longitude".
# `parse_html` is an option of folium.Popup, not of CircleMarker, so it is only
# passed to the Popup.
for lat, lng, zipcode in zip(df_denver['Latitude'], df_denver['Longitude'], df_denver['Zip']):
    label = '{}, {}, {}'.format(zipcode, lat, lng)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(label, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_Denver)

map_Denver

As we can see in our data here, there are a few far-away postal codes that we wouldn't expect to be included. This is the Denver airport, which is for various reasons included in the Denver area. There are no houses available there and it is not a reasonable inclusion in our housing tool, so we remove these data points.

In [175]:
# Keep only the rows whose Longitude is below -104.41; rows at or east of that
# value are dropped.
df_denver = df_denver[df_denver['Longitude']<-104.41]

In [176]:
# Rebuild the map from the filtered zip codes.
map_Denver = folium.Map(location=[latitude, longitude], zoom_start=10)

# Add one circle marker per remaining zip code; the popup shows the zip code.
# `parse_html` is an option of folium.Popup, not of CircleMarker, so it is only
# passed to the Popup.
for lat, lng, zipcode in zip(df_denver['Latitude'], df_denver['Longitude'], df_denver['Zip']):
    label = '{}'.format(zipcode)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=folium.Popup(label, parse_html=True),
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_Denver)

map_Denver

Next we will define the functions we need to parse Foursquare data.

In [177]:
# function that extracts the category of the venue
def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    Parameters
    ----------
    row : mapping (e.g. dict or pandas Series)
        Venue record holding its category list under ``'categories'`` or,
        when that key is absent, under ``'venue.categories'``.

    Returns
    -------
    str or None
        The ``'name'`` of the first category, or ``None`` when the category
        list is empty or ``None``.

    Raises
    ------
    KeyError
        If neither ``'categories'`` nor ``'venue.categories'`` is present.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error propagates
        # instead of being silently swallowed.
        categories_list = row['venue.categories']

    if categories_list is None or len(categories_list) == 0:
        return None
    return categories_list[0]['name']
    
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Query the Foursquare v2 ``venues/explore`` endpoint around each location.

    Uses the notebook-level ``CLIENT_ID``, ``CLIENT_SECRET``, ``VERSION`` and
    ``LIMIT`` settings, and prints each name as it is processed.

    Parameters
    ----------
    names : iterable
        Label for each location (stored in the 'Neighborhood' column).
    latitudes, longitudes : iterable
        Coordinates of each location, aligned with ``names``.
    radius : int, optional
        Value sent as the ``radius`` request parameter (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per returned venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty (but has these columns) when no venue is returned.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    column_names = ['Neighborhood',
                    'Neighborhood Latitude',
                    'Neighborhood Longitude',
                    'Venue',
                    'Venue Latitude',
                    'Venue Longitude',
                    'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)

        # Let requests build and encode the query string instead of formatting
        # the credentials into the URL by hand; the timeout keeps a stalled
        # connection from hanging the notebook.
        response = requests.get(
            'https://api.foursquare.com/v2/venues/explore',
            params={
                'client_id': CLIENT_ID,
                'client_secret': CLIENT_SECRET,
                'v': VERSION,
                'll': '{},{}'.format(lat, lng),
                'radius': radius,
                'limit': LIMIT,
            },
            timeout=30,
        )
        # Surface HTTP errors (bad credentials, quota, ...) directly rather
        # than as a KeyError on the missing payload keys below.
        response.raise_for_status()
        results = response.json()["response"]['groups'][0]['items']

        # keep only the relevant information for each nearby venue
        for item in results:
            venue = item['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue may carry no category at all; record None for it.
                categories[0]['name'] if categories else None,
            ))

    # Passing the column names to the constructor also works for zero rows.
    nearby_venues = pd.DataFrame(venue_rows, columns=column_names)

    return nearby_venues

Then we set our Foursquare credentials to call for data.

In [178]:
# Foursquare credentials are read from the environment so that no secret is
# stored in the notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET
# before starting the kernel. Any key pair that was previously committed in
# this cell should be treated as leaked and rotated.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
if not (CLIENT_ID and CLIENT_SECRET):
    print('Warning: FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET are not set; '
          'Foursquare requests will fail.')
VERSION = '20180605' # Foursquare API version
LIMIT = 400 # limit of number of venues returned by Foursquare API

radius = 1000 # define radius

Make a call for each of the zip codes in our data.

In [179]:
# Fetch the venues around every Denver zip code (one API call per zip code).
# NOTE(review): no radius is passed here, so getNearbyVenues uses its default
# of 500; the `radius = 1000` defined next to the credentials is never used.
# Confirm which value is intended.
Denver_venues = getNearbyVenues(
    df_denver['Zip'],
    df_denver['Latitude'],
    df_denver['Longitude'],
)
print("Done colelcting venues")

80201
80202
80203
80204
80205
80206
80207
80209
80210
80211
80212
80214
80215
80216
80218
80219
80220
80221
80222
80223
80224
80225
80226
80227
80228
80229
80230
80231
80232
80233
80234
80235
80236
80237
80239
80241
80246
80247
80249
80256
80259
80260
80261
80262
80264
80266
80273
80274
80280
80281
80290
80291
80292
80293
80294
80295


In [180]:
Denver_venues.shape

(1519, 7)

Next we will get a count for each type of business.

In [181]:
# One indicator column per venue category; the empty prefix keeps the bare
# category names as column names.
denver_onehot = pd.get_dummies(Denver_venues[['Venue Category']], prefix="", prefix_sep="")
denver_onehot['Neighborhood'] = Denver_venues['Neighborhood']

# Move the 'Neighborhood' column to the front, keeping the others in order.
neighnumber = denver_onehot.columns.get_loc("Neighborhood")
all_columns = list(denver_onehot.columns)
fixed_columns = [all_columns[neighnumber]] + all_columns[:neighnumber] + all_columns[neighnumber + 1:]
denver_onehot = denver_onehot[fixed_columns]

denver_onehot.head()

Unnamed: 0,Neighborhood,ATM,Alternative Healer,American Restaurant,Antique Shop,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Trail,Train Station,Travel Lounge,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Yoga Studio
0,80201,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,80202,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,80202,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,80202,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,80202,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [182]:
# Sum the indicator columns per zip code to get venue counts per category,
# then turn the group key back into a regular column.
venue_counts = denver_onehot.groupby('Neighborhood').sum()
denver_grouped = venue_counts.reset_index()
denver_grouped.shape

(56, 216)

In [183]:
# Work on a separate frame that holds only the category count columns;
# drop() returns a new DataFrame, so denver_grouped itself is left untouched.
df_normalized = denver_grouped.drop(columns=['Neighborhood'])
df_normalized.head()

Unnamed: 0,ATM,Alternative Healer,American Restaurant,Antique Shop,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Astrologer,...,Trail,Train Station,Travel Lounge,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,6,0,0,0,0,0,1,0,...,0,1,0,0,0,0,1,2,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


Next we create a value to indicate how much of each business is in each neighborhood, where 100 indicates the area with the most of that business, and 0 indicates that this area has none.

In [184]:
# Scale every category column by its own maximum, so the zip code with the most
# venues of a category gets 1.0 for that category.
column_max = df_normalized.max()
# A column whose maximum is 0 would give 0/0 = NaN; divide by 1 instead so such
# a column simply stays at 0.
df_normalized = df_normalized / column_max.replace(0, 1)

In [185]:
# Convert the 0-1 ratios into whole-number percentages (0-100).
# Multiply first and round afterwards: rounding to two decimals and then
# multiplying leaves values such as 0.29 * 100 == 28.999999999999996, which the
# integer cast would truncate to 28 instead of 29.
df_normalized = (df_normalized * 100).round().astype('int32')

In [186]:
# Re-attach the 'Neighborhood' values from denver_grouped as an integer column.
df_normalized['Neighborhood'] = denver_grouped['Neighborhood'].astype(int)

In [187]:
# Rotate the column order by one so the last column becomes the first.
column_order = df_normalized.columns.tolist()
cols = column_order[-1:] + column_order[:-1]
df_normalized = df_normalized.loc[:, cols]

In [188]:
# Rename the key column to 'Zip' and join the df_denver columns onto each row
# (merge on 'Zip' with the default join type).
df_normalized = (
    df_normalized
    .rename(columns={"Neighborhood": "Zip"})
    .merge(df_denver, on='Zip')
)
df_normalized.head()

Unnamed: 0,Zip,ATM,Alternative Healer,American Restaurant,Antique Shop,Arcade,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,...,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Wine Shop,Yoga Studio,City,State,Latitude,Longitude
0,80201,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,Denver,CO,39.726303,-104.856808
1,80202,0,100,86,0,0,0,0,0,50,...,0,0,100,100,0,0,Denver,CO,39.751586,-104.99699
2,80203,0,0,14,0,0,0,0,0,0,...,0,0,0,100,0,0,Denver,CO,39.731286,-104.98306
3,80204,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,Denver,CO,39.734686,-105.01966
4,80205,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,Denver,CO,39.758986,-104.96678


Create a color space where green is the max value and blue is the minimum.

In [189]:
# Build a 101-step gradient so a 0-100 score can be used directly as a list
# index: index 0 is blue (minimum) and index 100 is green (maximum).
# The intermediate names avoid rebinding `colors`, which the import cell uses
# as the alias for matplotlib.colors.
gradient_start = Color("blue")
gradient = list(gradient_start.range_to(Color("green"), 101))
colorsHex = [step.hex for step in gradient]

Here you can indicate which business you are interested in sorting by.

In [190]:
# Name of the df_normalized column (a venue category) whose values colour the
# markers on the map below.
groupChoice = "Yoga Studio"

Create a heated dot map based on your choice

In [191]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=11)

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(df_normalized['Latitude'], df_normalized['Longitude'], df_normalized['Zip'], df_normalized[groupChoice]):
    label = folium.Popup(str(poi) + ' ' + str(groupChoice) + ' ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=colorsHex[cluster],
        fill=True,
        fill_color=colorsHex[cluster],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters