## São Paulo price analysis per borough

In [22]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import matplotlib.cm as cm
import matplotlib.colors as colors
!conda install -c conda-forge geopy --yes 
from geopy.geocoders import Nominatim
!conda install -c conda-forge folium=0.5.0 --yes 
import folium
import json
from pandas.io.json import json_normalize 
from sklearn.cluster import KMeans

print("Done importing everything!")

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Done importing everything!


In [94]:
#Request the wiki page that lists the Sao Paulo boroughs
url_sp_boroughs = "https://en.wikipedia.org/wiki/Subdivisions_of_São_Paulo"
#Use a timeout so the cell cannot hang forever, and stop on an HTTP error
#instead of silently parsing an error page
response = requests.get(url_sp_boroughs, timeout=30)
response.raise_for_status()
page_text = response.text

#From the whole html of the wiki page, let's separate the interesting table
page_html = BeautifulSoup(page_text,"lxml")
boroughs_table_html = page_html.find("table",class_="toccolours")

#find() gives back None when no element matches; fail here with a clear message
#rather than later, when the table is converted to a dataframe
if boroughs_table_html is None:
    raise ValueError("No table with class 'toccolours' found at " + url_sp_boroughs)

In [95]:
#Parse the html table into dataframes; the first one parsed is the boroughs table
parsed_tables = pd.read_html(str(boroughs_table_html))
sp_boroughs = parsed_tables[0]

#Check what it looks like
sp_boroughs

Unnamed: 0,Regional prefecture,Regional prefecture.1,Regional prefecture.2,Regional prefecture.3,Unnamed: 4
0,,,,,
1,1.0,Aricanduva,,17.0,M'Boi Mirim
2,2.0,Butantã,,18.0,Mooca
3,3.0,Campo Limpo,,19.0,Parelheiros
4,4.0,Capela do Socorro,,20.0,Penha
5,5.0,Casa Verde,,21.0,Perus
6,6.0,Cidade Ademar,,22.0,Pinheiros
7,7.0,Cidade Tiradentes,,23.0,Pirituba-Jaraguá
8,8.0,Ermelino Matarazzo,,24.0,Santana-Tucuruvi
9,9.0,Freguesia-Brasilândia,,25.0,Santo Amaro


In [96]:
#Keep only the two columns that carry borough names, then discard every row that contains a NaN
sp_boroughs = (
    sp_boroughs
    .drop(columns=["Regional prefecture","Regional prefecture.2","Regional prefecture.3"])
    .dropna(axis=0)
)

#The names are split over two columns; we want all of them in one single column

#Auxiliary dataframe: the right-hand names, relabelled so they line up with the left-hand column
sp_boroughs_aux = sp_boroughs[["Unnamed: 4"]].rename(columns={"Unnamed: 4": "Regional prefecture.1"})

#Stack the auxiliary names under the left-hand ones, discard the now redundant
#right-hand column and call the remaining column "Borough"
sp_boroughs = (
    pd.concat([sp_boroughs,sp_boroughs_aux],ignore_index=True,axis=0,sort=False)
    .drop(columns=["Unnamed: 4"])
    .rename(columns={"Regional prefecture.1": "Borough"})
)

#Check out the result
sp_boroughs

Unnamed: 0,Borough
0,Aricanduva
1,Butantã
2,Campo Limpo
3,Capela do Socorro
4,Casa Verde
5,Cidade Ademar
6,Cidade Tiradentes
7,Ermelino Matarazzo
8,Freguesia-Brasilândia
9,Guaianases


In [97]:
#Function to retrieve the latitude from a borough in a given row
def get_latitude_borough_sp(row):
    """Geocode the borough named in ``row`` and return its latitude.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the borough name (a string) under the key "Borough".

    Returns
    -------
    The ``latitude`` attribute of the geocoder's match, or ``np.nan`` when
    the geocoder finds no match for the address.
    """
    address = row["Borough"] + ", São Paulo, SP, Brazil"
    geolocator = Nominatim(user_agent="sp_explorer")
    location = geolocator.geocode(address)
    #geocode() gives back None when nothing matches; check for that explicitly
    #instead of hiding every possible error behind a bare except
    if location is None:
        return np.nan
    return location.latitude

#Function to retrieve the longitude from a borough in a given row
def get_longitude_borough_sp(row):
    """Geocode the borough named in ``row`` and return its longitude.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the borough name (a string) under the key "Borough".

    Returns
    -------
    The ``longitude`` attribute of the geocoder's match, or ``np.nan`` when
    the geocoder finds no match for the address.
    """
    address = row["Borough"] + ", São Paulo, SP, Brazil"
    geolocator = Nominatim(user_agent="sp_explorer")
    location = geolocator.geocode(address)
    #geocode() gives back None when nothing matches; check for that explicitly
    #instead of hiding every possible error behind a bare except
    if location is None:
        return np.nan
    return location.longitude

In [98]:
#Geocode every borough row by row, then store the results as the "Latitude" column
borough_latitudes = sp_boroughs.apply(get_latitude_borough_sp, axis=1)
sp_boroughs["Latitude"] = borough_latitudes

In [100]:
#Geocode every borough row by row, then store the results as the "Longitude" column
borough_longitudes = sp_boroughs.apply(get_longitude_borough_sp, axis=1)
sp_boroughs["Longitude"] = borough_longitudes

In [101]:
#Display the boroughs dataframe to inspect the geocoded coordinates
sp_boroughs

Unnamed: 0,Borough,Latitude,Longitude
0,Aricanduva,-23.578024,-46.511454
1,Butantã,-23.5719,-46.70809
2,Campo Limpo,-23.64888,-46.758729
3,Capela do Socorro,,
4,Casa Verde,-23.505927,-46.656138
5,Cidade Ademar,-23.673012,-46.655281
6,Cidade Tiradentes,-23.582497,-46.409207
7,Ermelino Matarazzo,-23.491674,-46.48407
8,Freguesia-Brasilândia,-23.483494,-46.687149
9,Guaianases,-23.542308,-46.415605


In [102]:
#Discard the rows that contain a NaN (boroughs that do not have latitude or longitude),
#then renumber the remaining rows from 0 after dropping
sp_boroughs = sp_boroughs.dropna(axis=0).reset_index(drop=True)

#Show the result
sp_boroughs

Unnamed: 0,Borough,Latitude,Longitude
0,Aricanduva,-23.578024,-46.511454
1,Butantã,-23.5719,-46.70809
2,Campo Limpo,-23.64888,-46.758729
3,Casa Verde,-23.505927,-46.656138
4,Cidade Ademar,-23.673012,-46.655281
5,Cidade Tiradentes,-23.582497,-46.409207
6,Ermelino Matarazzo,-23.491674,-46.48407
7,Freguesia-Brasilândia,-23.483494,-46.687149
8,Guaianases,-23.542308,-46.415605
9,Ipiranga,-23.589273,-46.606162
