### This notebook was made almost entirely with the help of IBM-Coursera's course, where many of the scripts here are taught and encouraged to be used in this final project.

In [3]:
#!conda install -c conda-forge folium=0.5.0 --yes
#!conda install -c conda-forge geopy --yes 

In [4]:
import pandas as pd
import numpy as np
import folium 
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests # library to handle requests
import json # library to handle JSON files
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans

In [5]:
## Getting the data
Capital_cities_BR = pd.read_html("https://gps.pezquiza.com/apontamento-de-antena/latitude-e-longitude-das-capitais-brasileiras-para-usar-no-localizador-de-satelites/")
Capital_cities_BR = pd.DataFrame(Capital_cities_BR[0])

## Setting the header as the first column
new_header = Capital_cities_BR.iloc[0] #grab the first row for the header
Capital_cities_BR = Capital_cities_BR[1:-1] #take the data less the header row (and without the last as well)
Capital_cities_BR.columns = new_header #set the header row as the df header

Capital_cities_BR.dtypes
Capital_cities_BR.tail()

Unnamed: 0,Capital-Estado,Latitude,Longitude
23,Salvador – Bahia,12º58Ž16″,38º30Ž39″
24,São Luís – Maranhão,02º31Ž47″,44º18Ž10″
25,São Paulo – São Paulo,23º32Ž51″,46º38Ž10″
26,Teresina – Piauí,05º05Ž21″,42º48Ž07″
27,Vitória – Espírito Santo,20º19Ž10″,40º20Ž16″


In [6]:
## Transforming the latitude and longitude in numeric

## This functions puts a "-" before the latitudes in the south hemisfere (also the longitude will also have the minus sign)
def negative_coordinates(x):
    if x == "N":
        res = ""
    else: 
        res = "-"
    return(res)


for i in range(1,28):
    Capital_cities_BR.loc[i,'Latitude'] = "".join([negative_coordinates(Capital_cities_BR.loc[i,'Latitude'][-1]),Capital_cities_BR.loc[i,'Latitude'][0:2],".",Capital_cities_BR.loc[i,'Latitude'][3:5],Capital_cities_BR.loc[i,'Latitude'][6:8]])
    Capital_cities_BR.loc[i,'Longitude'] = "".join(["-",Capital_cities_BR.loc[i,'Longitude'][0:2],".",Capital_cities_BR.loc[i,'Longitude'][3:5],Capital_cities_BR.loc[i,'Longitude'][6:8]])
Capital_cities_BR.head(28)

Unnamed: 0,Capital-Estado,Latitude,Longitude
1,Aracaju – Sergipe,-10.544,-37.0418
2,Belém – Pará,-1.2721,-48.3016
3,Belo Horizonte – Minas Gerais,-19.5515,-43.5616
4,Boa Vista – Roraima,2.4911,-60.4024
5,Brasília – Distrito Federal,-15.4647,-47.5547
6,Campo Grande – Mato Grosso do Sul,-20.2634,-54.3847
7,Cuiabá – Mato Grosso,-15.3546,-56.0558
8,Curitiba – Paraná,-25.254,-49.1623
9,Florianópolis – Santa Catarina,-27.3548,-48.3257
10,Fortaleza – Ceará,-3.4302,-38.3235


In [9]:
# create map of Brazil using the mean latitude and longitude from the capitals

latitude = pd.to_numeric(Capital_cities_BR['Latitude']).mean()
longitude =  pd.to_numeric(Capital_cities_BR['Longitude']).mean()
print(latitude,longitude)


map_Brazil = folium.Map(location=[latitude, longitude], zoom_start=4)

# add markers to map
for lat, lng, city in zip(pd.to_numeric(Capital_cities_BR['Latitude']), pd.to_numeric(Capital_cities_BR['Longitude']), Capital_cities_BR['Capital-Estado']):
    label = '{}'.format(city)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_Brazil)  
    
print("This map is available in the report and the presentation")

-12.207077777777776 -46.96158518518519
This map is available in the report and the presentation


In [24]:
## Credentials to get the foursquare data

CLIENT_ID = 'XXXXXXXXXXXXXXX' # your Foursquare ID
CLIENT_SECRET = 'XXXXXXXXXXXXXXXXXX' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('The credentials are personal and anyone can get them in the Foursquare website, in the Developer Portal')

The credentials are personal and anyone can get them in the Foursquare website, in the Developer Portal


In [22]:
## The functions that takes the Venues from the center of each city, they will be used to compare the cities later

def getNearbyVenues(names, latitudes, longitudes, radius=100000,LIMIT = 500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 
                  'City Latitude', 
                  'City Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [12]:
## Getting the venues from each city
Brazil_venues = getNearbyVenues(names= Capital_cities_BR["Capital-Estado"],
                                   latitudes=Capital_cities_BR['Latitude'],
                                   longitudes=Capital_cities_BR['Longitude']
                                  )

Aracaju – Sergipe
Belém – Pará
Belo Horizonte – Minas Gerais
Boa Vista – Roraima
Brasília – Distrito Federal
Campo Grande – Mato Grosso do Sul
Cuiabá – Mato Grosso
Curitiba – Paraná
Florianópolis – Santa Catarina
Fortaleza – Ceará
Goiânia – Goiás
João Pessoa – Paraíba
Macapá – Amapá
Maceió – Alagoas
Manaus – Amazonas
Natal – Rio Grande do Norte
Palmas – Tocantins
Porto Alegre – Rio Grande do Sul
Porto Velho – Rondônia
Recife – Pernambuco
Rio Branco – Acre
Rio de Janeiro – Rio de Janeiro
Salvador – Bahia
São Luís – Maranhão
São Paulo – São Paulo
Teresina – Piauí
Vitória – Espírito Santo


In [13]:
Brazil_venues.shape ## we have 2495 venues of all cities

(2470, 7)

In [14]:
Brazil_venues.head()

Unnamed: 0,City,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Aracaju – Sergipe,-10.544,-37.0418,Museu da Gente Sergipana,-10.91709,-37.047771,History Museum
1,Aracaju – Sergipe,-10.544,-37.0418,IL Sordo Gelateria,-10.9302,-37.050115,Ice Cream Shop
2,Aracaju – Sergipe,-10.544,-37.0418,Forneria,-10.938256,-37.052529,Bakery
3,Aracaju – Sergipe,-10.544,-37.0418,Atacadao das embalagens,-10.907181,-37.05034,Department Store
4,Aracaju – Sergipe,-10.544,-37.0418,Sorveteria Castelo Branco,-10.93286,-37.079808,Ice Cream Shop


In [15]:
## To work with this data lates, we have to make it as a one hot encoding, with the get_dummies function of pandas library

# one hot encoding
Brazil_onehot = pd.get_dummies(Brazil_venues[['Venue Category']], prefix="", prefix_sep="")

Brazil_onehot['City'] = Brazil_venues['City']

Brazil_onehot.head()


Unnamed: 0,Acai House,Accessories Store,African Restaurant,Airport,Airport Lounge,American Restaurant,Aquarium,Argentinian Restaurant,Art Gallery,Art Museum,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Store,Water Park,Waterfall,Waterfront,Wine Bar,Yoga Studio
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
## We are interested in fact in the percentage of each category represents from the venues of the city

Brazil_grouped = Brazil_onehot.groupby('City').mean().reset_index()
Brazil_grouped.head()

Unnamed: 0,City,Acai House,Accessories Store,African Restaurant,Airport,Airport Lounge,American Restaurant,Aquarium,Argentinian Restaurant,Art Gallery,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Store,Water Park,Waterfall,Waterfront,Wine Bar,Yoga Studio
0,Aracaju – Sergipe,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0
1,Belo Horizonte – Minas Gerais,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0
2,Belém – Pará,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.01,0.0,0.0
3,Boa Vista – Roraima,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.019608,0.0,0.0,0.0,0.0
4,Brasília – Distrito Federal,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0


In [17]:
## Selecting the most common venue for each city

def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]


num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['City']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
City_venues_sorted = pd.DataFrame(columns=columns)
City_venues_sorted['City'] = Brazil_grouped['City']

for ind in np.arange(Brazil_grouped.shape[0]):
    City_venues_sorted.iloc[ind, 1:] = return_most_common_venues(Brazil_grouped.iloc[ind, :], num_top_venues)

City_venues_sorted.head()

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
0,Aracaju – Sergipe,Bakery,Brazilian Restaurant,Burger Joint,Beach,Park,Ice Cream Shop,Café,Pet Store,Fast Food Restaurant,Restaurant
1,Belo Horizonte – Minas Gerais,Bakery,Snack Place,Supermarket,Hotel,Ice Cream Shop,Plaza,Pizza Place,Historic Site,Japanese Restaurant,Park
2,Belém – Pará,Ice Cream Shop,Beach,Plaza,Bakery,Historic Site,Brazilian Restaurant,Food Truck,Burger Joint,Pizza Place,Museum
3,Boa Vista – Roraima,Ice Cream Shop,Plaza,Pizza Place,Park,Supermarket,Clothing Store,BBQ Joint,Shopping Mall,Middle Eastern Restaurant,Mexican Restaurant
4,Brasília – Distrito Federal,Hotel,Bakery,Ice Cream Shop,Park,Brazilian Restaurant,Plaza,Snack Place,Pastry Shop,Restaurant,Seafood Restaurant


In [18]:
# set number of clusters
kclusters = 5

Brazil_grouped_clustering = Brazil_grouped.drop('City', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(Brazil_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 
Brazil_grouped_clustering.head()


Unnamed: 0,Acai House,Accessories Store,African Restaurant,Airport,Airport Lounge,American Restaurant,Aquarium,Argentinian Restaurant,Art Gallery,Art Museum,...,Trail,Train Station,Vegetarian / Vegan Restaurant,Veterinarian,Video Store,Water Park,Waterfall,Waterfront,Wine Bar,Yoga Studio
0,0.0,0.0,0.01,0.0,0.0,0.0,0.01,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0
1,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.01,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01,0.01,0.0,0.0,0.01,0.0,0.01,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.019608,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.01,0.0,0.0


In [19]:
# add clustering labels
City_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

Brazil_merged = Capital_cities_BR

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
Brazil_merged = Brazil_merged.join(City_venues_sorted.set_index('City'), on='Capital-Estado')

Brazil_merged.head() # check the last columns!

Unnamed: 0,Capital-Estado,Latitude,Longitude,Cluster Labels,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue
1,Aracaju – Sergipe,-10.544,-37.0418,3,Bakery,Brazilian Restaurant,Burger Joint,Beach,Park,Ice Cream Shop,Café,Pet Store,Fast Food Restaurant,Restaurant
2,Belém – Pará,-1.2721,-48.3016,3,Ice Cream Shop,Beach,Plaza,Bakery,Historic Site,Brazilian Restaurant,Food Truck,Burger Joint,Pizza Place,Museum
3,Belo Horizonte – Minas Gerais,-19.5515,-43.5616,3,Bakery,Snack Place,Supermarket,Hotel,Ice Cream Shop,Plaza,Pizza Place,Historic Site,Japanese Restaurant,Park
4,Boa Vista – Roraima,2.4911,-60.4024,2,Ice Cream Shop,Plaza,Pizza Place,Park,Supermarket,Clothing Store,BBQ Joint,Shopping Mall,Middle Eastern Restaurant,Mexican Restaurant
5,Brasília – Distrito Federal,-15.4647,-47.5547,3,Hotel,Bakery,Ice Cream Shop,Park,Brazilian Restaurant,Plaza,Snack Place,Pastry Shop,Restaurant,Seafood Restaurant


In [20]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=5)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]
print(rainbow)
print(colors_array)

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(Brazil_merged['Latitude'], Brazil_merged['Longitude'], Brazil_merged['Capital-Estado'], Brazil_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [float(lat), float(lon)],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters


['#8000ff', '#00b5eb', '#80ffb4', '#ffb360', '#ff0000']
[[5.00000000e-01 0.00000000e+00 1.00000000e+00 1.00000000e+00]
 [1.96078431e-03 7.09281308e-01 9.23289106e-01 1.00000000e+00]
 [5.03921569e-01 9.99981027e-01 7.04925547e-01 1.00000000e+00]
 [1.00000000e+00 7.00543038e-01 3.78411050e-01 1.00000000e+00]
 [1.00000000e+00 1.22464680e-16 6.12323400e-17 1.00000000e+00]]
