<a href="https://colab.research.google.com/github/renan-cav/IDHM-Brasil-2010/blob/main/Raspagem_IDHM_2010.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Raspagem de tabela do site do Programa das Nações Unidas para o Desenvolvimento

Site: https://www.br.undp.org/content/brazil/pt/home/idh0/rankings/idhm-municipios-2010.html

Código Fonte: https://www.br.undp.org/content/brazil/pt/home/idh0/rankings/idhm-municipios-2010.html

In [None]:
!pip install quickda # Instala a biblioteca quickda que serve para análise de dados

In [253]:
import csv # Importa as bibliotecas necessárias
import pandas as pd
import quickda

from quickda.clean_data import * # Limpa o dataframe
from quickda.explore_data import *

In [270]:
# UNDP Brazil page that publishes the 2010 municipal HDI (IDHM) ranking.
url = "https://www.br.undp.org/content/brazil/pt/home/idh0/rankings/idhm-municipios-2010.html"

# Download the page and parse every <table> in it; the result is a list of DataFrames.
html = pd.read_html(io=url)

In [271]:
# `html` holds the tables parsed from the page; the ranking is the first one.
df = html[0] # Take the first parsed table as the working DataFrame

In [272]:
# Preview the first rows of the raw scraped table, before any cleaning.
df.head()

Unnamed: 0,Ranking IDHM 2010,Município,IDHM 2010,IDHM Renda 2010,IDHM Longevidade 2010,IDHM Educação 2010
0,1 º,São Caetano do Sul (SP),862,891,887,811
1,2 º,Águas de São Pedro (SP),854,849,890,825
2,3 º,Florianópolis (SC),847,870,873,800
3,4 º,Balneário Camboriú (SC),845,854,894,789
4,4 º,Vitória (ES),845,876,855,805


In [273]:
# "Município" holds values shaped like "São Caetano do Sul (SP)".
# Split on the LAST "(" only, so a name that itself contains a parenthesis
# still yields exactly two parts: the municipality name and the state code.
name_and_state = df["Município"].str.rsplit("(", n=1, expand=True)

# Trim the space that preceded "(" so names do not carry trailing whitespace.
df["municipio"] = name_and_state[0].str.strip()

# Drop the closing parenthesis; regex=False makes ")" a literal on every
# pandas version instead of depending on the changing `regex` default.
df["estado"] = name_and_state[1].str.replace(")", "", regex=False).str.strip()

# Shorten the column names (the year suffix is redundant in this dataset).
df = df.rename(columns={"Ranking IDHM 2010": "Ranking", "IDHM Renda 2010": "IDHM Renda", "IDHM Longevidade 2010": "IDHM Longev", "IDHM Educação 2010" : "IDHM Edu"})

# Remove the original combined column. `columns=` is used because passing the
# axis positionally (`drop("Município", 1)`) raises TypeError on pandas >= 2.0.
df = df.drop(columns="Município")

In [274]:
# Check the split: `municipio` and `estado` should now be separate columns.
df.head()

Unnamed: 0,Ranking,IDHM 2010,IDHM Renda,IDHM Longev,IDHM Edu,municipio,estado
0,1 º,862,891,887,811,São Caetano do Sul,SP
1,2 º,854,849,890,825,Águas de São Pedro,SP
2,3 º,847,870,873,800,Florianópolis,SC
3,4 º,845,854,894,789,Balneário Camboriú,SC
4,4 º,845,876,855,805,Vitória,ES


In [275]:
# Ranking: remove the ordinal marker "º" and then the stray "td>" fragment
# (a leftover from malformed HTML on the source page).
ranking_text = df["Ranking"].str.replace("º", "")
df["Ranking"] = ranking_text.str.replace("td>", "")

# Income and longevity columns: remove the same "td>" fragment, then any commas.
for column in ("IDHM Renda", "IDHM Longev"):
    without_tag = df[column].str.replace("td>", "")
    df[column] = without_tag.str.replace(",","")

In [276]:
# Confirm that the "º" marker is gone from the Ranking column.
df.head()

Unnamed: 0,Ranking,IDHM 2010,IDHM Renda,IDHM Longev,IDHM Edu,municipio,estado
0,1,862,891,887,811,São Caetano do Sul,SP
1,2,854,849,890,825,Águas de São Pedro,SP
2,3,847,870,873,800,Florianópolis,SC
3,4,845,854,894,789,Balneário Camboriú,SC
4,4,845,876,855,805,Vitória,ES


In [277]:
# Let pandas choose the best-fitting extension dtype for every column
# (object inference is enabled by default).
df = df.convert_dtypes()

In [278]:
# These three columns were cleaned as text above; cast them to integers.
int_columns = ["Ranking", "IDHM Renda", "IDHM Longev"]
df[int_columns] = df[int_columns].astype(int)

In [279]:
# Preview the table after the dtype conversion.
df.head()

Unnamed: 0,Ranking,IDHM 2010,IDHM Renda,IDHM Longev,IDHM Edu,municipio,estado
0,1,862,891,887,811,São Caetano do Sul,SP
1,2,854,849,890,825,Águas de São Pedro,SP
2,3,847,870,873,800,Florianópolis,SC
3,4,845,854,894,789,Balneário Camboriú,SC
4,4,845,876,855,805,Vitória,ES


In [280]:
# Final column order: rank first, then location, then the index and its three components.
column_order = [
    "Ranking",
    "municipio",
    "estado",
    "IDHM 2010",
    "IDHM Renda",
    "IDHM Longev",
    "IDHM Edu",
]
df = df[column_order]

In [281]:
# quickda's `clean` with method='standardize' rewrites the column headers;
# the preview after this cell shows them lowercase and underscore-separated.
df = clean(df, method='standardize') # Standardize the dataset's column headers (quickda)

In [283]:
# Preview the table with the standardized column names.
df.head()

Unnamed: 0,ranking,municipio,estado,idhm_2010,idhm_renda,idhm_longev,idhm_edu
0,1,São Caetano do Sul,SP,862,891,887,811
1,2,Águas de São Pedro,SP,854,849,890,825
2,3,Florianópolis,SC,847,870,873,800
3,4,Balneário Camboriú,SC,845,854,894,789
4,4,Vitória,ES,845,876,855,805


In [282]:
# quickda summary table: one row per column with dtype, counts, nulls,
# distinct values and basic descriptive statistics.
explore(df, method='summarize')

Unnamed: 0,dtypes,count,null_sum,null_pct,nunique,min,25%,50%,75%,max,mean,median,std,skew
estado,string,5565,0,0.0,27,AC,-,-,-,TO,-,-,-,-
idhm_2010,Int64,5565,0,0.0,349,418,599,665,718,862,659.157,665,71.9973,-0.155605
idhm_edu,Int64,5565,0,0.0,466,207,490,560,631,825,559.094,560,93.3277,-0.0984634
idhm_longev,int64,5565,0,0.0,220,672,769,808,836,894,801.564,808,44.6809,-0.409358
idhm_renda,int64,5565,0,0.0,390,400,572,654,707,891,642.873,654,80.6617,-0.101024
municipio,string,5565,0,0.0,5291,Abadia de Goiás,-,-,-,Óleo,-,-,-,-
ranking,int64,5565,0,0.0,349,1,1362,2776,4167,5565,2772.27,2776,1607.38,0.00369577


In [35]:
# Save the cleaned table next to the notebook.
# Note: with pandas defaults the row index is written as the first, unnamed column.
output_path = "IDHM_2010_BR.csv"
df.to_csv(output_path)