In [1]:
# @hidden_cell
# The project token is an authorization token that is used to access project resources like data sources, connections, and used by platform APIs.
# NOTE(review): the project id and access token are hardcoded below, so anyone who can
# read this notebook source can reuse them. Consider rotating the token and loading it
# from the environment or a secrets store before sharing the notebook.
from project_lib import Project
project = Project(project_id='073e093e-9637-4264-a6f8-b0f7613a0467', project_access_token='p-1df74eadaadb87725116aa0992cbdec6a8ddd03d')
# Keep a handle on the project context object exposed by the Project instance.
pc = project.project_context


# 1 - Plot Initial Sao Paulo metro data

# 1.1 Importing needed libs

In [107]:
import ast
import requests
import pandas as pd
from pandas.io.json import json_normalize
from sklearn.preprocessing import MinMaxScaler, StandardScaler
!pip install geopandas
!pip install descartes
!pip install folium
import geopandas as gpd
import folium
from sklearn.cluster import KMeans



# 1.2 Reading the geolocation data

In [118]:
# Fetch the stations file from the project storage; the returned object is used as a
# file-like handle, so rewind it before handing it to pandas.
geo_location = project.get_file("datasets_135214_321309_metrosp_stations.csv")
geo_location.seek(0)
geo_csv = pd.read_csv(geo_location)
# Drop the columns the analysis does not use. The `columns=` keyword is used because
# passing the axis positionally (`drop(labels, 1)`) was removed in pandas 2.0.
geo_csv = geo_csv.drop(columns=['Unnamed: 0', 'station', 'neigh'])

In [119]:
# Preview the first five stations to sanity-check the columns that were kept.
geo_csv.head(n=5)

Unnamed: 0,name,lat,lon,line
0,Aacd Servidor,-23.597825,-46.652374,['lilas']
1,Adolfo Pinheiro,-23.650073,-46.704206,['lilas']
2,Alto Da Boa Vista,-23.641625,-46.699434,['lilas']
3,Alto Do Ipiranga,-23.602237,-46.612486,['verde']
4,Ana Rosa,-23.581871,-46.638104,"['azul', 'verde']"


# 1.3 Plotting the map

In [120]:
def get_metro_color(lines):
    """Return the folium marker colour for the first metro line of a station.

    Parameters
    ----------
    lines : str
        String form of a Python list of line names, as stored in the ``line``
        column (for example ``"['azul', 'verde']"``). Only the first entry is
        used, so interchange stations are drawn in the colour of their first line.

    Returns
    -------
    str
        A folium icon colour name.

    Raises
    ------
    ValueError
        If the decoded list is empty or its first line name is not a known line.
    """
    # Portuguese line name -> folium icon colour.
    line_colors = {
        'azul': 'blue',
        'verde': 'green',
        'vermelha': 'red',
        'prata': 'lightgray',
        'lilas': 'purple',
        'amarela': 'orange',
    }

    # literal_eval only parses Python literals, so it is safe on the CSV text.
    lines_decoded = ast.literal_eval(lines)
    if not lines_decoded:
        raise ValueError('No metro line found in {!r}'.format(lines))

    line_color = lines_decoded[0]
    marker_color = line_colors.get(line_color)
    if marker_color is None:
        raise ValueError('Unknown color {}'.format(line_color))
    return marker_color

# Map centred on Sao Paulo with one marker per station, coloured by metro line.
m = folium.Map(location=[-23.533773, -46.625290], zoom_start=12)
for index, row in geo_csv.iterrows():
    line_icon = folium.Icon(color=get_metro_color(row['line']))
    station_marker = folium.Marker(
        location=[row['lat'], row['lon']],
        icon=line_icon,
        radius=8,
    )
    station_marker.add_to(m)
folium.LayerControl().add_to(m)
m

# 2 - Exploring the data with Foursquare

In [121]:

import os

# Foursquare credentials are read from the environment so that real secrets never
# have to be pasted into the notebook source. When the variables are not set the
# previous placeholder value is kept, so existing behaviour is unchanged.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', 'HIDDEN')
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', 'HIDDEN')
# Sent as the `v` query parameter of the Foursquare request.
VERSION = '20180605'
# Sent as the `radius` query parameter (reassigned to 1000 in the following cell).
radius = 500
# Sent as the `limit` query parameter.
LIMIT = 100

In [122]:
# NOTE(review): `search_query` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
search_query = 'venues'
# Replaces the radius of 500 assigned in the previous cell; 1000 is the value in
# effect when get_venues builds its request.
radius = 1000
# Same value as the earlier LIMIT assignment.
LIMIT = 100

In [123]:
def get_category_type(row):
    """Return the name of the first category attached to a venue record.

    Parameters
    ----------
    row : mapping or pandas.Series
        Venue record exposing its category list under ``'categories'`` or,
        when that key is absent, under ``'venue.categories'``.

    Returns
    -------
    str or None
        The ``'name'`` of the first category, or ``None`` when the record has
        no usable category list (empty list, ``None`` or NaN).

    Raises
    ------
    KeyError
        If neither key is present in ``row``.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key triggers the fallback; any other error is a real
        # problem and must not be silently swallowed.
        categories_list = row['venue.categories']

    # Missing values arrive as None/NaN rather than a list; treat them like
    # "no category" instead of failing on len().
    if not isinstance(categories_list, (list, tuple)) or len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [124]:
def get_venues(row):
    """Count the venues Foursquare recommends around a station, per category.

    Calls the Foursquare ``venues/explore`` endpoint for the coordinates in
    ``row`` using the module-level ``CLIENT_ID``, ``CLIENT_SECRET``,
    ``VERSION``, ``radius`` and ``LIMIT`` settings.

    Parameters
    ----------
    row : mapping or pandas.Series
        Station record providing ``'lat'`` and ``'lon'``.

    Returns
    -------
    pandas.DataFrame
        One row per venue category with the columns ``category`` and
        ``occurrences``. Empty (same columns) when the API returns no venues.

    Raises
    ------
    requests.HTTPError
        If the API answers with an HTTP error status.
    requests.RequestException
        On connection problems or when the request times out.
    """
    url = 'https://api.foursquare.com/v2/venues/explore'
    # Let requests build and escape the query string instead of formatting it by hand.
    params = {
        'client_id': CLIENT_ID,
        'client_secret': CLIENT_SECRET,
        'v': VERSION,
        'll': '{},{}'.format(row['lat'], row['lon']),
        'radius': radius,
        'limit': LIMIT,
    }
    # A timeout keeps one stalled request from hanging the whole notebook run.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    results = response.json()

    groups = results.get('response', {}).get('groups') or []
    venues = groups[0].get('items', []) if groups else []
    if not venues:
        # Nothing found around this station: return an empty table with the usual
        # columns rather than failing on the column selection below.
        return pd.DataFrame(columns=['category', 'occurrences'])

    filtered_columns = ['venue.name', 'venue.id', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
    # Work on an explicit copy so the column assignment does not target a slice.
    nearby_venues = json_normalize(venues).loc[:, filtered_columns].copy()

    # Collapse each venue's category list to the name of its first category.
    nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
    occurrence_df = nearby_venues.groupby('venue.categories').size().reset_index()
    occurrence_df.columns = ['category', 'occurrences']

    return occurrence_df

In [125]:
# Never populated by this cell; kept only so the name still exists for any code
# that may reference it.
venues_dict_info = {}

# Collect one {category: occurrences} record per station and build the count table
# once, instead of growing the frame one cell at a time with .loc.
venue_count_records = []
for index, row in geo_csv.iterrows():
    venues = get_venues(row)
    venue_count_records.append(dict(zip(venues['category'], venues['occurrences'])))

# Categories absent at a station become NaN; cast to float so every count column
# has the same dtype whether or not it contains missing values.
venue_counts = pd.DataFrame(venue_count_records, index=geo_csv.index).astype(float)
geo_csv_with_venues_info = pd.concat([geo_csv, venue_counts], axis=1)

geo_csv_with_venues_info.head()

Unnamed: 0,name,lat,lon,line,Art Museum,Arts & Crafts Store,Athletics & Sports,BBQ Joint,Bagel Shop,Bakery,...,Irish Pub,Social Club,Swiss Restaurant,Herbs & Spices Store,High School,Bus Stop,Hawaiian Restaurant,Borek Place,Piadineria,Post Office
0,Aacd Servidor,-23.597825,-46.652374,['lilas'],2.0,2.0,2.0,1.0,1.0,2.0,...,,,,,,,,,,
1,Adolfo Pinheiro,-23.650073,-46.704206,['lilas'],,2.0,1.0,,,2.0,...,,,,,,,,,,
2,Alto Da Boa Vista,-23.641625,-46.699434,['lilas'],,1.0,,,,1.0,...,,,,,,,,,,
3,Alto Do Ipiranga,-23.602237,-46.612486,['verde'],,1.0,1.0,,1.0,3.0,...,,,,,,,,,,
4,Ana Rosa,-23.581871,-46.638104,"['azul', 'verde']",,,,,,4.0,...,,,,,,,,,,


In [126]:
# A missing value means no venue of that category was counted for the station,
# so replace it with a count of zero.
geo_csv_with_venues_info = geo_csv_with_venues_info.fillna(value=0)
geo_csv_with_venues_info.head(n=5)

Unnamed: 0,name,lat,lon,line,Art Museum,Arts & Crafts Store,Athletics & Sports,BBQ Joint,Bagel Shop,Bakery,...,Irish Pub,Social Club,Swiss Restaurant,Herbs & Spices Store,High School,Bus Stop,Hawaiian Restaurant,Borek Place,Piadineria,Post Office
0,Aacd Servidor,-23.597825,-46.652374,['lilas'],2.0,2.0,2.0,1.0,1.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Adolfo Pinheiro,-23.650073,-46.704206,['lilas'],0.0,2.0,1.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Alto Da Boa Vista,-23.641625,-46.699434,['lilas'],0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Alto Do Ipiranga,-23.602237,-46.612486,['verde'],0.0,1.0,1.0,0.0,1.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Ana Rosa,-23.581871,-46.638104,"['azul', 'verde']",0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2.1 Normalizing

In [87]:
scaler = MinMaxScaler()

ml_df = geo_csv_with_venues_info.copy()
# These columns are already removed when the stations file is loaded, so on a fresh
# top-to-bottom run they no longer exist here; errors='ignore' keeps the cell from
# raising KeyError while still dropping them if they are present.
ml_df = ml_df.drop(columns=['Unnamed: 0', 'station', 'neigh'], errors='ignore')

# Station identity/position columns are left untouched; every other column is a
# venue-category count and gets scaled.
column_names_to_not_normalize = ['name', 'lat', 'lon', 'line']
column_names_to_normalize = [x for x in list(ml_df) if x not in column_names_to_not_normalize]
x = ml_df[column_names_to_normalize].values
x_scaled = scaler.fit_transform(x)
# Rebuild a frame with the original index so the assignment aligns row by row.
df_temp = pd.DataFrame(x_scaled, columns=column_names_to_normalize, index=ml_df.index)
ml_df[column_names_to_normalize] = df_temp
# Show a sample rather than dumping the whole frame.
ml_df.head(10)

Unnamed: 0,name,lat,lon,line,Art Museum,Arts & Crafts Store,Athletics & Sports,BBQ Joint,Bagel Shop,Bakery,...,Irish Pub,Social Club,Swiss Restaurant,Herbs & Spices Store,High School,Bus Stop,Hawaiian Restaurant,Borek Place,Piadineria,Post Office
0,Aacd Servidor,-23.597825,-46.652374,['lilas'],0.666667,0.50,1.0,0.333333,0.5,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Adolfo Pinheiro,-23.650073,-46.704206,['lilas'],0.000000,0.50,0.5,0.000000,0.0,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Alto Da Boa Vista,-23.641625,-46.699434,['lilas'],0.000000,0.25,0.0,0.000000,0.0,0.142857,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Alto Do Ipiranga,-23.602237,-46.612486,['verde'],0.000000,0.25,0.5,0.000000,0.5,0.428571,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Ana Rosa,-23.581871,-46.638104,"['azul', 'verde']",0.000000,0.00,0.0,0.000000,0.0,0.571429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Anhangabau,-23.547825,-46.639180,['vermelha'],0.333333,0.25,0.0,0.000000,0.0,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,Armenia,-23.525410,-46.629259,['azul'],0.333333,0.00,0.5,0.333333,0.0,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Artur Alvim,-23.540244,-46.484706,['vermelha'],0.000000,0.00,0.0,0.000000,0.0,0.857143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,Belem,-23.542872,-46.589615,['vermelha'],0.000000,0.00,0.0,0.000000,0.5,0.571429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,Borba Gato,-23.633466,-46.692867,['lilas'],0.000000,0.00,0.0,0.000000,0.0,0.285714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2.2 Apply K-means clustering

In [127]:
# Feature matrix: the scaled venue-category columns only.
clustering_data = ml_df[column_names_to_normalize].values
# fit_predict returns the cluster label of each row, so `kmeans` holds labels, not
# the estimator. n_init is set explicitly because its default changed across
# scikit-learn releases, which would otherwise change the clustering result.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=0).fit_predict(clustering_data)

## 2.3 Plotting the first clustering result

In [129]:
def get_kmeans_color(value):
    """Translate a cluster label into a folium marker colour.

    Labels 0 through 11 each map to a fixed colour; any other value falls back
    to ``'orange'``.
    """
    # Position in the tuple is the cluster label it belongs to.
    palette = (
        'blue',
        'green',
        'red',
        'lightgray',
        'darkred',
        'darkblue',
        'purple',
        'pink',
        'cadetblue',
        'black',
        'beige',
        'lightgreen',
    )
    for cluster_label, color in enumerate(palette):
        if value == cluster_label:
            return color
    return 'orange'

# Same station map as before, but each marker is coloured by its cluster label.
m = folium.Map(location=[-23.533773, -46.625290], zoom_start=12)
for index, row in geo_csv.iterrows():
    cluster_icon = folium.Icon(color=get_kmeans_color(kmeans[index]))
    station_marker = folium.Marker(
        location=[row['lat'], row['lon']],
        icon=cluster_icon,
        radius=8,
    )
    station_marker.add_to(m)
folium.LayerControl().add_to(m)
m

## 2.4 Plotting with other cluster sizes

In [130]:
# Re-cluster the same features into 6 groups. n_init is set explicitly because its
# default changed across scikit-learn releases, which would alter the result.
kmeans = KMeans(n_clusters=6, init='k-means++', n_init=10, random_state=0).fit_predict(clustering_data)

# Station map with each marker coloured by its cluster label.
m = folium.Map(location=[-23.533773, -46.625290], zoom_start=12)
for index, row in geo_csv.iterrows():
    cluster_icon = folium.Icon(color=get_kmeans_color(kmeans[index]))
    folium.Marker(location=[row['lat'], row['lon']],
                  icon=cluster_icon,
                  radius=8).add_to(m)
folium.LayerControl().add_to(m)
m

In [136]:
# Re-cluster the same features into 4 groups. n_init is set explicitly because its
# default changed across scikit-learn releases, which would alter the result.
kmeans = KMeans(n_clusters=4, init='k-means++', n_init=10, random_state=0).fit_predict(clustering_data)

# Station map with each marker coloured by its cluster label.
m = folium.Map(location=[-23.533773, -46.625290], zoom_start=12)
for index, row in geo_csv.iterrows():
    cluster_icon = folium.Icon(color=get_kmeans_color(kmeans[index]))
    folium.Marker(location=[row['lat'], row['lon']],
                  icon=cluster_icon,
                  radius=8).add_to(m)
folium.LayerControl().add_to(m)
m

## 2.5 Analysing by specific venue type

In [148]:
# Cluster on a single feature, the scaled 'Irish Pub' column, into two groups.
# Note that this rebinds `clustering_data` to that one-column matrix.
clustering_data = ml_df[['Irish Pub']].values
# n_init is set explicitly because its default changed across scikit-learn
# releases, which would alter the result.
kmeans = KMeans(n_clusters=2, init='k-means++', n_init=10, random_state=0).fit_predict(clustering_data)

# Station map with each marker coloured by its cluster label.
m = folium.Map(location=[-23.533773, -46.625290], zoom_start=12)
for index, row in geo_csv.iterrows():
    cluster_icon = folium.Icon(color=get_kmeans_color(kmeans[index]))
    folium.Marker(location=[row['lat'], row['lon']],
                  icon=cluster_icon,
                  radius=8).add_to(m)
folium.LayerControl().add_to(m)
m