# Predicting the locations of São Paulo's financial districts from coffee-shop concentration using clustering techniques

Is it possible to predict the location of São Paulo's biggest business/financial districts just by looking at where its coffee shops are concentrated?

## 1. Gathering data
The data we're going to use are the venues provided by the Foursquare API.

In [1]:
import numpy as np
import pandas as pd
import folium
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize
import requests
from bs4 import BeautifulSoup

In [2]:
import os

# Foursquare API credentials.
# Prefer the FOURSQUARE_CLIENT_ID / FOURSQUARE_CLIENT_SECRET environment
# variables so that secrets do not have to live in the notebook; the literals
# below are only a fallback that keeps the notebook runnable as before.
# NOTE(review): these fallback values are committed in plain text and should
# be treated as compromised -- rotate them and drop the fallbacks.
Client_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'F3PD4NLXISCXBJTE52S5DBQ2G1ZYGNJYJ2ICSTRZJMSOF5WK')
Client_Secret = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'B10LINRN32HCDSMOORHUDHJKCUQL1LR0UE5XACS4AXN4IQ4J')

# Value sent as the `v` query parameter of every request.
Version = '20180604'

Since Foursquare limits the number of results returned per query (at most 50), we iterate the query over a grid of latitude/longitude points and then eliminate the duplicates.

In [3]:
# DISCLAIMER: this bounding box was picked by hand from Google searches.
# It is only a rough approximation, not the exact limits of the city.
N_Limit, S_Limit = -23.4142, -23.7403  # north: Serra da Cantareira / south: Represa de Guarapiranga
E_Limit, W_Limit = -46.3988, -46.792   # east: Cidade Tiradentes / west: Osasco

# Step, in degrees, between two consecutive query points of the grid.
Increment = 0.025

# Sent as the `radius` query parameter of each search
# (metres, per the Foursquare venues/search documentation).
Radius = 1500

In [4]:
# Sweep a latitude/longitude grid over the bounding box, once per search term,
# and gather every venue returned by Foursquare's venues/search endpoint.
# The sweep starts at the north-east corner and walks west, then south.
Latitude = N_Limit
Longitude = E_Limit

Counter = 0

Search_Query = ['Café', 'Coffee', 'Cafeteria']

URL = 'https://api.foursquare.com/v2/venues/search'

# One frame per query; they are concatenated once after the sweep instead of
# growing a DataFrame inside the loop (DataFrame.append copies the whole
# frame on every call and no longer exists in pandas 2.x).
Frames = []

for Search in Search_Query:
    print(Search)
    while Latitude >= S_Limit:
        while Longitude >= W_Limit:
            # Let requests build and percent-encode the query string
            # (the search terms contain non-ASCII characters).
            Params = {
                'client_id': Client_ID,
                'client_secret': Client_Secret,
                'll': '{},{}'.format(Latitude, Longitude),
                'v': Version,
                'query': Search,
                'radius': Radius,
            }

            # A timeout keeps one stalled connection from hanging the whole
            # sweep; raise_for_status turns an API error (bad credentials,
            # exhausted quota, ...) into an explicit HTTPError instead of a
            # KeyError on the missing 'venues' key below.
            Response = requests.get(URL, params=Params, timeout=30)
            Response.raise_for_status()
            Results = Response.json()

            # Assign relevant part of JSON to venues
            Venues = Results['response']['venues']

            # Transform venues into a dataframe
            Dataframe = json_normalize(Venues)

            Frames.append(Dataframe)

            Longitude = Longitude - Increment
            Counter = Counter + 1

        # Row finished: rewind to the eastern edge and step south
        Longitude = E_Limit
        Latitude = Latitude - Increment
    # Search term finished: rewind to the northern edge for the next one
    Latitude = N_Limit

# Row labels are kept as produced by each query (no ignore_index), matching
# what repeated appends used to yield; duplicates are removed in a later step.
Coffee_Shops = pd.concat(Frames, sort=False) if Frames else pd.DataFrame()

print("We've just made %d queries through Foursquare's API" % Counter)

Café
Coffee
Cafeteria
We've just made 672 queries through Foursquare's API


In [5]:
# Peek at the first rows of the raw query results
Coffee_Shops.head()

Unnamed: 0,categories,hasPerk,id,location.cc,location.city,location.country,location.distance,location.formattedAddress,location.labeledLatLngs,location.lat,location.lng,location.state,name,referralId,location.address,location.postalCode,location.crossStreet,location.neighborhood,venuePage.id
0,"[{'id': '4bf58dd8d48988d116941735', 'name': 'B...",False,52be1cd1498efd3ee4c8c521,BR,,Brasil,1495,[Brasil],"[{'label': 'display', 'lat': -23.4261519313639...",-23.426152,-46.43049,,cafe filosofico rock bar,v-1555768388,,,,,
1,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",False,4fccf703e4b09949e4fc7ce4,BR,,Brasil,1298,[Brasil],"[{'label': 'display', 'lat': -23.4183807832593...",-23.418381,-46.435666,,Art Café conveniência,v-1555768388,,,,,
2,"[{'id': '4f4532974b9074f6e4fb0104', 'name': 'D...",False,51f57da8498eeefdff8be716,BR,Guarulhos,Brasil,1307,"[Guarulhos, SP, Brasil]","[{'label': 'display', 'lat': -23.4179611828491...",-23.417961,-46.435929,SP,Cafézinho da Leny,v-1555768388,,,,,
0,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",False,5972817d364d972852d1236b,BR,Guarulhos,Brasil,988,"[Avenida Mulungú, 262, Guarulhos, SP, 07151-38...","[{'label': 'display', 'lat': -23.413462, 'lng'...",-23.413462,-46.458443,SP,Edy Café,v-1555768388,"Avenida Mulungú, 262",07151-380,,,
1,"[{'id': '4bf58dd8d48988d1e0931735', 'name': 'C...",False,4fccf703e4b09949e4fc7ce4,BR,,Brasil,1420,[Brasil],"[{'label': 'display', 'lat': -23.4183807832593...",-23.418381,-46.435666,,Art Café conveniência,v-1555768388,,,,,


In [6]:
# Select only columns that matter
Coffee_Shops = Coffee_Shops[['location.address','location.city','location.lat','location.lng','location.postalCode','location.neighborhood', 'name']]

# Drop duplicate entries (the same venue is returned by overlapping queries).
# drop_duplicates is applied as a plain reassignment rather than in place on
# the column slice above, which avoids pandas' SettingWithCopyWarning. The
# index is rebuilt because the concatenated per-query frames carry repeated
# row labels (0, 1, 2, 0, 1, ...).
Coffee_Shops = Coffee_Shops.drop_duplicates().reset_index(drop=True)
Coffee_Shops.describe()

Unnamed: 0,location.lat,location.lng
count,2622.0,2622.0
mean,-23.577241,-46.630902
std,0.065558,0.079707
min,-23.752893,-46.789371
25%,-23.62405,-46.690202
50%,-23.569419,-46.641138
75%,-23.534647,-46.571355
max,-23.397865,-46.382414


## 2. Clustering venues
By using k-means we'll be able to find clusters of venues. 

In [16]:
from sklearn.cluster import KMeans

# Feature matrix: one (latitude, longitude) row per venue
X = Coffee_Shops.loc[:, ['location.lat', 'location.lng']]

# Fit k-means with a fixed seed so the cluster assignment is reproducible
kmeans = KMeans(n_clusters=48, random_state=0)
kmeans.fit(X)
y, centers = kmeans.labels_, kmeans.cluster_centers_

# Venue name and coordinates together with the cluster each venue was assigned to
Lat_Long_Dataframe = (
    Coffee_Shops[['name', 'location.lat', 'location.lng']]
    .rename(columns={
        'name': 'Name',
        'location.lat': 'Latitude',
        'location.lng': 'Longitude',
    })
    .assign(Cluster=y)
)

In [17]:
# One row per cluster: number of venues assigned to it plus its centroid.
# groupby sorts by cluster label, so row i lines up with centers[i].
Cluster_Dataframe = Lat_Long_Dataframe.groupby('Cluster')['Name'].count().to_frame('Venues')
Cluster_Dataframe['Latitude_center'] = centers[:, 0]
Cluster_Dataframe['Longitude_center'] = centers[:, 1]

In [9]:
# Peek at the first clusters (venue count and centroid coordinates)
Cluster_Dataframe.head()

Unnamed: 0_level_0,Venues,Latitude_center,Longitude_center
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,32,-23.692481,-46.626891
1,104,-23.547329,-46.600895
2,142,-23.596515,-46.686092
3,110,-23.658285,-46.527461
4,71,-23.583679,-46.72714


In [18]:
# Centre the map on the mean of all cluster centroids
Map_Center = [
    Cluster_Dataframe['Latitude_center'].mean(),
    Cluster_Dataframe['Longitude_center'].mean(),
]
clusters_map = folium.Map(location=Map_Center, zoom_start=11)
folium.TileLayer('cartodbpositron').add_to(clusters_map)

# One circle per cluster, drawn at its centroid and sized by its venue count
for cluster in Cluster_Dataframe.itertuples():
    marker = folium.CircleMarker(
        [cluster.Latitude_center, cluster.Longitude_center],
        radius=cluster.Venues / 10,
        fill=True,
        color='blue',
        fill_color='blue',
        fill_opacity=0.6,
    )
    marker.add_to(clusters_map)

In [19]:
# Display the map of all clusters
clusters_map

## 3. Evaluating results and accuracy 
Top 3 clusters and expected results  

In [20]:
# The three clusters holding the most venues
Cluster_Dataframe.sort_values(by='Venues', ascending=False).head(3)

Unnamed: 0_level_0,Venues,Latitude_center,Longitude_center
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
19,141,-23.542867,-46.63764
22,130,-23.595629,-46.686629
27,129,-23.562322,-46.654495


In [21]:
# Keep only the three clusters with the most venues
Top_Cluster_Dataframe = Cluster_Dataframe.sort_values(by='Venues', ascending=False).head(3)

# Centre the map on the mean of those three centroids
Top_Map_Center = [
    Top_Cluster_Dataframe['Latitude_center'].mean(),
    Top_Cluster_Dataframe['Longitude_center'].mean(),
]
top_clusters_map = folium.Map(location=Top_Map_Center, zoom_start=13)
folium.TileLayer('cartodbpositron').add_to(top_clusters_map)

# One circle per top cluster, drawn at its centroid and sized by its venue count
for cluster in Top_Cluster_Dataframe.itertuples():
    marker = folium.CircleMarker(
        [cluster.Latitude_center, cluster.Longitude_center],
        radius=cluster.Venues / 10,
        fill=True,
        color='blue',
        fill_color='blue',
        fill_opacity=0.6,
    )
    marker.add_to(top_clusters_map)

In [22]:
# Display the map of the three largest clusters
top_clusters_map

According to Wikipedia, there are 3 main financial districts in São Paulo: Avenida Brigadeiro Faria Lima, Avenida Paulista and the city's historical downtown. 
Let's plot them as well so we can see if coffee shops are a good proxy for finding them.

In [23]:
# Reference locations of the three financial districts, as (popup label, [lat, lng])
Financial_Districts = [
    ('Faria Lima', [-23.5863, -46.6831]),  # Avenida Brigadeiro Faria Lima
    ('Paulista', [-23.5629, -46.6544]),    # Avenida Paulista
    ('Downtown', [-23.5442, -46.6339]),    # City's Historical Downtown
]

# Overlay each district as a large, faint green circle on the top-clusters map.
# NOTE(review): folium documents CircleMarker's radius as screen pixels, so
# these circles would not scale with the zoom level -- confirm that is intended.
for label, position in Financial_Districts:
    district_marker = folium.CircleMarker(
        position,
        popup=label,
        radius=75,
        fill=True,
        color='green',
        fill_color='green',
        fill_opacity=0.1,
    )
    district_marker.add_to(top_clusters_map)

top_clusters_map

As expected, the biggest agglomerations of coffee shops are next to São Paulo's biggest financial districts.