<a href="https://colab.research.google.com/github/schammass/BeautifulSoup/blob/main/WebScrapping2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Importing libraries**

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# **Request + Selector**

In [3]:
# Creating a web scraping function:

def scrap_state_info(state: str, timeout: float = 30.0) -> dict:
    """
    Scrape the indicator panel of a Brazilian state page on the IBGE website.

    Downloads ``https://www.ibge.gov.br/cidades-e-estados/{state}.html``,
    selects every element matching the ``.indicador`` CSS selector and maps the
    text of its ``.ind-label`` child to the text of its ``.ind-value`` child.
    Texts are returned exactly as found in the page (no stripping or parsing).

    :param state: State abbreviation used to build the page URL (e.g. ``'sp'``)
    :param timeout: Seconds to wait for the server before giving up
    :returns state_dict: dictionary with the state indicators plus an
        ``'Estado'`` key holding ``state`` as given
    :raises requests.HTTPError: if the server answers with a 4xx/5xx status
    :raises requests.Timeout: if the server does not answer within ``timeout``
    """
    print(f'⛷️ Picking {state} info...')
    state_url = f'https://www.ibge.gov.br/cidades-e-estados/{state}.html'
    # Without a timeout a stalled connection would hang the notebook forever.
    page = requests.get(state_url, timeout=timeout)
    # Fail loudly on an error response instead of parsing an error page into a
    # dictionary that only contains the 'Estado' key.
    page.raise_for_status()

    soup = BeautifulSoup(page.content, 'html.parser')

    state_dict = {}
    for ind in soup.select('.indicador'):
        label = ind.select_one('.ind-label')
        value = ind.select_one('.ind-value')
        # Skip indicator blocks that lack a label or a value rather than
        # crashing with an IndexError.
        if label is None or value is None:
            continue
        state_dict[label.text] = value.text

    state_dict['Estado'] = state

    return state_dict

# Try the scraper on a single state ('sp') and display the returned dictionary.
scrap_state_info('sp')

⛷️ Picking sp info...


{'Capital': 'São Paulo\xa0\xa0\xa0[2010]',
 'Densidade demográfica': '166,25 hab/km²\xa0\xa0\xa0[2010]',
 'Despesas empenhadas': '231.982.243,69 R$ (×1000)\xa0\xa0\xa0[2017]',
 'Estado': 'sp',
 'Gentílico': 'paulista',
 'Governador': 'JOÃO AGRIPINO DA COSTA DORIA JUNIOR\xa0\xa0\xa0[2019]',
 'IDH Índice de desenvolvimento humano': '0,783\xa0\xa0\xa0[2010]',
 'Matrículas no ensino fundamental': '5.414.208 matrículas\xa0\xa0\xa0[2020]',
 'População estimada': '46.649.132 pessoas\xa0\xa0\xa0[2021]',
 'Receitas realizadas': '232.822.496,57 R$ (×1000)\xa0\xa0\xa0[2017]',
 'Rendimento mensal domiciliar per capita': '1.814 R$\xa0\xa0\xa0[2020]',
 'Total de veículos': '30.778.960 veículos\xa0\xa0\xa0[2020]',
 'Área Territorial': '248.219,481 km²\xa0\xa0\xa0[2020]'}

In [4]:
# State abbreviations to scrape (27 entries), in the order the rows will appear.
states = [
    'AC', 'AL', 'AP', 'AM', 'BA', 'CE', 'DF', 'ES', 'GO',
    'MA', 'MT', 'MS', 'MG', 'PA', 'PB', 'PR', 'PE', 'PI',
    'RJ', 'RN', 'RS', 'RO', 'RR', 'SC', 'SP', 'SE', 'TO',
]

# Scrape each state page in turn; the result is one dictionary per state.
states_data = list(map(scrap_state_info, states))

⛷️ Picking AC info...
⛷️ Picking AL info...
⛷️ Picking AP info...
⛷️ Picking AM info...
⛷️ Picking BA info...
⛷️ Picking CE info...
⛷️ Picking DF info...
⛷️ Picking ES info...
⛷️ Picking GO info...
⛷️ Picking MA info...
⛷️ Picking MT info...
⛷️ Picking MS info...
⛷️ Picking MG info...
⛷️ Picking PA info...
⛷️ Picking PB info...
⛷️ Picking PR info...
⛷️ Picking PE info...
⛷️ Picking PI info...
⛷️ Picking RJ info...
⛷️ Picking RN info...
⛷️ Picking RS info...
⛷️ Picking RO info...
⛷️ Picking RR info...
⛷️ Picking SC info...
⛷️ Picking SP info...
⛷️ Picking SE info...
⛷️ Picking TO info...


In [5]:
# Build a DataFrame from the scraped records in states_data and preview the first rows.
df = pd.DataFrame(data=states_data)
df.head(n=5)

Unnamed: 0,Governador,Capital,Gentílico,Área Territorial,População estimada,Densidade demográfica,Matrículas no ensino fundamental,IDH Índice de desenvolvimento humano,Receitas realizadas,Despesas empenhadas,Rendimento mensal domiciliar per capita,Total de veículos,Estado
0,GLADSON DE LIMA CAMELI [2019],Rio Branco [2010],acriano,"164.173,431 km² [2020]",906.876 pessoas [2021],"4,47 hab/km² [2010]",156.679 matrículas [2020],"0,663 [2010]","6.632.883,11 R$ (×1000) [2017]","6.084.416,81 R$ (×1000) [2017]",917 R$ [2020],306.258 veículos [2020],AC
1,JOSE RENAN VASCONCELOS CALHEIROS FILHO [2019],Maceió [2010],alagoano,"27.830,656 km² [2020]",3.365.351 pessoas [2021],"112,33 hab/km² [2010]",464.704 matrículas [2020],"0,631 [2010]","11.950.438,46 R$ (×1000) [2017]","10.460.634,92 R$ (×1000) [2017]",796 R$ [2020],930.933 veículos [2020],AL
2,ANTONIO WALDEZ GÓES DA SILVA [2019],Macapá [2010],amapaense,"142.470,762 km² [2020]",877.613 pessoas [2021],"4,69 hab/km² [2010]",134.820 matrículas [2020],"0,708 [2010]","5.396.417,14 R$ (×1000) [2017]","4.224.464,09 R$ (×1000) [2017]",893 R$ [2020],215.330 veículos [2020],AP
3,WILSON MIRANDA LIMA [2019],Manaus [2010],amazonense,"1.559.167,878 km² [2020]",4.269.995 pessoas [2021],"2,23 hab/km² [2010]",700.104 matrículas [2020],"0,674 [2010]","17.328.459,43 R$ (×1000) [2017]","15.324.896,56 R$ (×1000) [2017]",852 R$ [2020],969.722 veículos [2020],AM
4,RUI COSTA DOS SANTOS [2019],Salvador [2010],baiano,"564.760,427 km² [2020]",14.985.284 pessoas [2021],"24,82 hab/km² [2010]",1.947.177 matrículas [2020],"0,660 [2010]","50.191.003,24 R$ (×1000) [2017]","45.570.160,00 R$ (×1000) [2017]",965 R$ [2020],4.506.825 veículos [2020],BA


In [6]:
# Check the dtype and the non-null count of each column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 13 columns):
 #   Column                                   Non-Null Count  Dtype 
---  ------                                   --------------  ----- 
 0   Governador                               27 non-null     object
 1   Capital                                  27 non-null     object
 2   Gentílico                                27 non-null     object
 3   Área Territorial                         27 non-null     object
 4   População estimada                       27 non-null     object
 5   Densidade demográfica                    27 non-null     object
 6   Matrículas no ensino fundamental        27 non-null     object
 7   IDH Índice de desenvolvimento humano    27 non-null     object
 8   Receitas realizadas                      27 non-null     object
 9   Despesas empenhadas                      27 non-null     object
 10  Rendimento mensal domiciliar per capita  27 non-null     object


# **Cleaning**

In [7]:
# Work on a copy so the scraped frame (df) stays untouched.
states_df = df.copy()
# Columns are renamed by position, so this list has to follow the column order
# of df (i.e. the order in which the indicators were scraped).
states_df.columns = [
    'governor', 'capital', 'gentile', 'area', 'population',
    'demographic_density', 'primary_school_enrollment', 'idh',
    'realized_revenue', 'committed_expenses', 'income_per_capita',
    'vehicle_total', 'code',
]
# Keep only the columns of interest, with the state code first.
states_df = states_df[[
    'code', 'governor', 'population', 'area', 'idh', 'income_per_capita',
    'vehicle_total', 'primary_school_enrollment', 'committed_expenses',
    'realized_revenue',
]]

# Regex clean-up applied to every column: drop thousands separators, turn the
# decimal comma into a dot, and remove the "[year]" suffix and the unit labels.
# Patterns containing backslashes are raw strings so the escapes reach the
# regex engine untouched and Python does not warn about invalid escape sequences.
states_df = states_df.replace({
    r'\.': '',
    ',': '.',
    r'\[\d+\]': '',
    ' hab/km²': '',
    ' km²': '',
    ' pessoas': '',
    ' matrículas': '',
    r'R\$.*': '',
    ' veículos': ''
}, regex=True)

num_cols = ['population', 'area', 'idh', 'income_per_capita', 'vehicle_total', 'primary_school_enrollment', 'committed_expenses', 'realized_revenue']

# Trim the whitespace left around the values by the replacements above. The
# governor name also keeps trailing non-breaking spaces once its "[year]"
# suffix is removed, so it is stripped together with the numeric columns.
strip_cols = ['governor'] + num_cols
states_df[strip_cols] = states_df[strip_cols].apply(lambda x: x.str.strip())
# NOTE(review): the numeric conversion below is disabled, so the columns in
# num_cols remain strings (dtype object) - confirm whether this is intended.
# states_df[num_cols] = states_df[num_cols].apply(pd.to_numeric)
states_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   code                       27 non-null     object
 1   governor                   27 non-null     object
 2   population                 27 non-null     object
 3   area                       27 non-null     object
 4   idh                        27 non-null     object
 5   income_per_capita          27 non-null     object
 6   vehicle_total              27 non-null     object
 7   primary_school_enrollment  27 non-null     object
 8   committed_expenses         27 non-null     object
 9   realized_revenue           27 non-null     object
dtypes: object(10)
memory usage: 2.2+ KB


# **Export**

In [11]:
# Write the cleaned table to a CSV file; with to_csv's defaults the DataFrame
# index is written as the first column.
# NOTE(review): the target looks like a Google Drive folder mounted in Colab and
# no mount step is visible in this notebook - confirm it exists before running.
output_path = '/content/drive/MyDrive/ibge_states.csv'
states_df.to_csv(output_path)