In [57]:
# Importing required libraries
import geocoder # import geocoder
import numpy as np
import pandas as pd
import requests  # library to handle requests
import os
from bs4 import BeautifulSoup

import json # library to handle JSON files

#!conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
import geopy.geocoders
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library


import certifi # Nominatum requires a certificate
import ssl

print('Libraries loaded')

Libraries loaded


# Getting neighborhoods data
* Bogotá is divided in neighborhoods and localities. It was required to have a dataframe describing this relationship.  After searching in diferent web sites, it was decided to use following pdf file, which come from the "Secretaria de Salud Distrital":

http://www.saludcapital.gov.co/DPYS/Tablas%20de%20Referencia/Codificación%20de%20Barrios%20por%20localidad.pdf

In [11]:
# It was required to perform a copy and paste from pdf file and clean some empty rows.
# The result was the following csv file

bogota_n = pd.read_csv('bogota_barrios.csv')

# Dropping empty cells using "PostalCode"
bogota_n = bogota_n.dropna(subset=['barrio'])

print(bogota_n.head())
print(bogota_n.shape)

   cod_barrio              barrio       Localidad   Co_digo Localidad 
0        1203            CARACAS   ANTONIO NARINO                   15
1        1202       CIUDAD BERNA   ANTONIO NARINO                   15
2        1204  CIUDAD JARDIN SUR   ANTONIO NARINO                   15
3        2301       EDUARDO FREI   ANTONIO NARINO                   15
4        2107          LA FRAGUA   ANTONIO NARINO                   15
(831, 4)


## Adding Geospatial coordinates

This part of the process required a lot of time. I decided to use the Nominatim geolocator from geopy library, the following code was used to generate a csv file containing the geospatial coordinates of each Bogota neighborhood

In [10]:
ctx = ssl.create_default_context(cafile=certifi.where())
geopy.geocoders.options.default_ssl_context = ctx

bogota_n = pd.read_csv('bogota_barrios.csv')
bogota_n = bogota_n.dropna(subset=['barrio'])

bog_loc_n = []
for index, neighborhood in bogota_n.iterrows():
    try:
        geolocator = Nominatim(user_agent='foursquare_agent')
        location = geolocator.geocode('{}, {}, Bogotá, Distrito Capital, Colombia'.format(neighborhood['barrio'], neighborhood['localidad']))
        if location:
            print(neighborhood['barrio'], location)
            bog_loc_n.append((neighborhood['barrio'], neighborhood['localidad'], location.latitude, location.longitude))
    except:
        pass

bog_loc_df = pd.DataFrame(bog_loc_n, columns=['neighborhood','locality','latitude','longitude'])
bog_loc_df.to_csv('bog_geospatial..csv', index=False)

print(bog_loc_df.head())

Empty DataFrame
Columns: [neighborhood, locality, latitude, longitude]
Index: []


The resulting csv file is the following:

In [15]:
bogota_geo = pd.read_csv('bog_geospatial.csv')

print(bogota_geo.head())
print(bogota_geo.shape)

         neighborhood         locality  latitude  longitude
0            CARACAS   ANTONIO NARINO   4.591831 -74.088903
1       CIUDAD BERNA   ANTONIO NARINO   4.582115 -74.090310
2  CIUDAD JARDIN SUR   ANTONIO NARINO   4.580311 -74.096289
3          LA FRAGUA   ANTONIO NARINO   4.602600 -74.137119
4        LA FRAGUITA   ANTONIO NARINO   4.594836 -74.100720
(501, 4)


#### Let's visualizate Bogota and the neighborhoods in it.

In [58]:
# create map of Bogota using latitude and longitude values
address = 'Bogota, Colombia'
geolocator = Nominatim(user_agent="bogota_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Bogota City are {}, {}.'.format(latitude, longitude))

map_bogota = folium.Map(location=[latitude, longitude], zoom_start=11)

# Adding the locality boundaries from https://bogota-laburbano.opendatasoft.com/explore/dataset/poligonos-localidades/
folium.GeoJson(
    'poligonos-localidades.geojson.json',
    name='Nombre de la localidad'
).add_to(map_bogota)

# add markers to map
for lat, lng, label in zip(bogota_geo['latitude'], bogota_geo['longitude'], bogota_geo['neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_bogota)  
    
map_bogota


The geograpical coordinate of Bogota City are 4.5980772, -74.0761028.


#### Now, I'm going to start utilizing the Foursquare API to explore the neighborhoods and segment them

In [21]:
CLIENT_ID = 'GRYPGFTPHX2GFL2L0GLN3W3EXBM3FORRA52M3YBGETW1NSDH' # your Foursquare ID
CLIENT_SECRET = 'BL0U24MID0ALWEZVA3VYFJQT4GD2DMMIYII2XEO3T0ZIDQSE' # your Foursquare Secret
VERSION = '20180604'
LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: GRYPGFTPHX2GFL2L0GLN3W3EXBM3FORRA52M3YBGETW1NSDH
CLIENT_SECRET:BL0U24MID0ALWEZVA3VYFJQT4GD2DMMIYII2XEO3T0ZIDQSE


#### Let's explore the first neighborhood in our dataframe

In [22]:
neighborhood_latitude = bogota_geo.loc[0, 'latitude'] # neighborhood latitude value
neighborhood_longitude = bogota_geo.loc[0, 'longitude'] # neighborhood longitude value
neighborhood_name = bogota_geo.loc[0, 'neighborhood'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

Latitude and longitude values of CARACAS  are 4.5918307, -74.0889028.


In [23]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 500 # define radius
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

'https://api.foursquare.com/v2/venues/explore?&client_id=GRYPGFTPHX2GFL2L0GLN3W3EXBM3FORRA52M3YBGETW1NSDH&client_secret=BL0U24MID0ALWEZVA3VYFJQT4GD2DMMIYII2XEO3T0ZIDQSE&v=20180604&ll=4.5918307,-74.0889028&radius=500&limit=100'

In [24]:
results = requests.get(url).json()

In [25]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Cleaning the json and structure it into a *pandas* dataframe.

In [26]:
venues = results['response']['groups'][0]['items']
    
nearby_venues = json_normalize(venues) # flatten JSON

# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

# filter the category for each row
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Chuletas Capital,Cajun / Creole Restaurant,4.588623,-74.090321
1,Sahara Arabian Food,Moroccan Restaurant,4.596074,-74.087894
2,Farmacia Mercy,Pharmacy,4.589708,-74.090909
3,Cromantic Professional Beauty Market,Cosmetics Shop,4.588724,-74.089259
4,Ortizo Instumentos Musicales,Art Gallery,4.589663,-74.085801


## 2. Explore Neighborhoods in Bogota

#### Repeating the same process to all the neighborhoods in Toronto

In [31]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        #print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [33]:
bogota_venues = getNearbyVenues(names=bogota_geo['neighborhood'],
                                 latitudes=bogota_geo['latitude'],
                                 longitudes=bogota_geo['longitude']
                                )
print(bogota_venues.head())

  Neighborhood  Neighborhood Latitude  Neighborhood Longitude  \
0     CARACAS                4.591831              -74.088903   
1     CARACAS                4.591831              -74.088903   
2     CARACAS                4.591831              -74.088903   
3     CARACAS                4.591831              -74.088903   
4     CARACAS                4.591831              -74.088903   

                                  Venue  Venue Latitude  Venue Longitude  \
0                      Chuletas Capital        4.588623       -74.090321   
1                   Sahara Arabian Food        4.596074       -74.087894   
2                        Farmacia Mercy        4.589708       -74.090909   
3  Cromantic Professional Beauty Market        4.588724       -74.089259   
4          Ortizo Instumentos Musicales        4.589663       -74.085801   

              Venue Category  
0  Cajun / Creole Restaurant  
1        Moroccan Restaurant  
2                   Pharmacy  
3             Cosmetics Shop

## 3. Analyzing Each Neighborhood

#### Grouping rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [34]:
# one hot encoding
bogota_onehot = pd.get_dummies(bogota_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
bogota_onehot['Neighborhood'] = bogota_venues['Neighborhood'] 

# move neighborhood column to the first column
fixed_columns = [bogota_onehot.columns[-1]] + list(bogota_onehot.columns[:-1])
bogota_onehot = bogota_onehot[fixed_columns]

bogota_onehot.head()

Unnamed: 0,Yoga Studio,Advertising Agency,African Restaurant,Airport,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Amphitheater,Arcade,...,Turkish Restaurant,University,Vegetarian / Vegan Restaurant,Video Game Store,Volleyball Court,Water Park,Wine Bar,Wine Shop,Wings Joint,Women's Store
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Next, let's group rows by neighborhood and by taking the mean of the frequency of occurrence of each category

In [59]:
bogota_grouped = bogota_onehot.groupby('Neighborhood').mean().reset_index()

# Selecting the coffee neighborhoods with coffee shops
bogota_coffee = bogota_grouped[['Neighborhood','Coffee Shop']]
bogota_coffee = bogota_coffee[bogota_coffee['Coffee Shop'] > 0]
bogota_coffee.to_csv('bogota_categories.csv', index=False)

# Filtering the geospatial locations, to see the coffee shop density per locality
neighborhood_list = bogota_coffee['Neighborhood'].unique().tolist()
bogota_geo_ftd = bogota_geo[bogota_geo['neighborhood'].isin(neighborhood_list)]


#### Creating a follium visualization in orde to verify the coffee shops density

In [56]:
# create map of Bogota using latitude and longitude values
address = 'Bogota, Colombia'
geolocator = Nominatim(user_agent="bogota_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Bogota City are {}, {}.'.format(latitude, longitude))

map_bogota = folium.Map(location=[latitude, longitude], zoom_start=12)

# Adding the locality boundaries from https://bogota-laburbano.opendatasoft.com/explore/dataset/poligonos-localidades/
folium.GeoJson(
    'poligonos-localidades.geojson.json',
    name='Nombre de la localidad'
).add_to(map_bogota)


# add markers to map
for lat, lng, label in zip(bogota_geo_ftd['latitude'], bogota_geo_ftd['longitude'], bogota_geo_ftd['neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_bogota)  
    
map_bogota

The geograpical coordinate of Bogota City are 4.5980772, -74.0761028.


It is possible to identify a big density of coffee shops in Four Localities:
- Santa Fe
- Martires
- Chapinero
- La Candelaria

#### Filtering the Localites with higher coffee shop density

Now, we analyze other economical aspects, excelso coffee tends to be much more expensive, then it is required to check the zones with a better income per capita