# Extração dos Dados

In [0]:
import requests
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from io import BytesIO
from zipfile import ZipFile

In [0]:
# Returns `num_months` consecutive months in '%Y%m' format, newest first,
# starting with the month of `current_date`.
def generate_past_months(current_date, num_months):
    """Return `num_months` consecutive calendar months ending at `current_date`.

    Args:
        current_date: Object exposing `.year` and `.month` (e.g. a `date` or
            `datetime`); its month is the first (newest) entry of the result.
        num_months: How many months to return; 0 yields an empty list.

    Returns:
        A list of 'YYYYMM' strings ordered from newest to oldest.
    """
    # Work on a linear month index instead of subtracting multiples of 30
    # days: day-based steps skip short months (e.g. March 1st minus 30 days
    # lands in January, so February was never produced).
    newest_index = current_date.year * 12 + (current_date.month - 1)

    months = []
    for offset in range(num_months):
        year, month_zero_based = divmod(newest_index - offset, 12)
        months.append(f"{year:04d}{month_zero_based + 1:02d}")
    return months

In [0]:
def check_folders(category_path, months):
    """Tell whether every month in `months` has a folder under `category_path`.

    The entries returned by `dbutils.fs.ls(category_path)` are compared by
    name, with trailing slashes stripped. Any exception raised while listing
    or comparing is treated as "no data yet" (e.g. the folder has not been
    created on a first run) and results in False.

    Returns:
        True when all months are present, False otherwise.
    """
    try:
        present = []
        for entry in dbutils.fs.ls(category_path):
            present.append(entry.name.rstrip('/'))

        # Bail out on the first month that has no folder.
        for month in months:
            if month not in present:
                return False
        return True
    except Exception:
        return False

In [0]:
# Extracts the files from the zipped payload and saves them to DBFS.
def save_data(data, category_path, month, encoding="cp1252"):
    """Unzip `data` in memory and write each member under `category_path/month`.

    Args:
        data: Raw bytes of a zip archive.
        category_path: Base destination folder.
        month: Sub-folder name appended to `category_path`.
        encoding: Text encoding used to decode each archive member before it
            is handed to `dbutils.fs.put`.

    Any exception (bad archive, decode failure, write failure) is caught and
    reported with a printed message; nothing is raised to the caller.
    """
    try:
        with ZipFile(BytesIO(data)) as zip_file:
            for filename in zip_file.namelist():
                # Honor the caller-supplied encoding; it used to be ignored
                # in favor of a hard-coded "cp1252".
                text = zip_file.read(filename).decode(encoding=encoding)
                dbfs_path = f"{category_path}/{month}/{filename}"

                # NOTE(review): third positional argument is presumably the
                # `overwrite` flag of dbutils.fs.put — confirm against docs.
                dbutils.fs.put(dbfs_path, text, True)

                print(f"Arquivo {dbfs_path} salvo.")
    except Exception as e:
        print(f"Erro ao salvar os dados: {e}")



In [0]:
def get_data(category, category_path, month, timeout=(10, 300)):
    """Download one month of data for `category` and store it via `save_data`.

    Args:
        category: Category name interpolated into the download URL.
        category_path: Destination folder forwarded to `save_data`.
        month: Month string interpolated into the URL and forwarded to
            `save_data` as the sub-folder name.
        timeout: Value passed to `requests.get(timeout=...)`; the default is a
            (connect, read) pair in seconds.

    A non-200 response is reported with a printed message. Exceptions raised
    by `requests.get` (including timeouts) propagate to the caller.
    """
    url = f"https://portaldatransparencia.gov.br/download-de-dados/servidores/{month}_{category}_SIAPE"
    # Without a timeout a stalled connection would block the job forever.
    response = requests.get(url, timeout=timeout)

    if response.status_code == 200:
        save_data(response.content, category_path, month)
    else:
        print(f"Erro na requisicao. Status code: {response.status_code}")


In [0]:
# Destination folder for each category.
category_path = {
    "Aposentados": "/bronze/SIAPE/Aposentados",
    "Pensionistas": "/bronze/SIAPE/Pensionistas",
    "Servidores": "/bronze/SIAPE/Servidores"
}

# The newest month fetched is (current month - 2); three older months are
# kept as history, so four months are required in total.
two_months_ago = datetime.now() - relativedelta(months=2)
required_months = generate_past_months(two_months_ago, 4)

latest_month = two_months_ago.strftime('%Y%m')
# Only the older months count as "history". Including the newest month in the
# check would make a regular monthly run (newest folder still absent) look
# like a first run and re-download every month.
history_months = [month for month in required_months if month != latest_month]

# A distinct loop variable keeps the `category_path` dict from being
# overwritten by the string it is iterating over.
for category, target_path in category_path.items():
    # History already exists: fetch only (current month - 2).
    if check_folders(target_path, history_months):
        get_data(category, target_path, latest_month)

    # First run (or incomplete history): fetch every required month.
    else:
        for month in required_months:
            get_data(category, target_path, month)

print("Extracao de dados finalizada!")

Wrote 161089678 bytes.
Arquivo /bronze/SIAPE/Aposentados/202408/202408_Cadastro.csv salvo.
Wrote 80096 bytes.
Arquivo /bronze/SIAPE/Aposentados/202408/202408_Observacoes.csv salvo.
Wrote 129589165 bytes.
Arquivo /bronze/SIAPE/Aposentados/202408/202408_Remuneracao.csv salvo.
Wrote 128527432 bytes.
Arquivo /bronze/SIAPE/Pensionistas/202408/202408_Cadastro.csv salvo.
Wrote 20354 bytes.
Arquivo /bronze/SIAPE/Pensionistas/202408/202408_Observacoes.csv salvo.
Wrote 87015161 bytes.
Arquivo /bronze/SIAPE/Pensionistas/202408/202408_Remuneracao.csv salvo.
Wrote 475566 bytes.
Arquivo /bronze/SIAPE/Servidores/202408/202408_Afastamentos.csv salvo.
Wrote 439131436 bytes.
Arquivo /bronze/SIAPE/Servidores/202408/202408_Cadastro.csv salvo.
Wrote 1600887 bytes.
Arquivo /bronze/SIAPE/Servidores/202408/202408_Observacoes.csv salvo.
Wrote 169080321 bytes.
Arquivo /bronze/SIAPE/Servidores/202408/202408_Remuneracao.csv salvo.
Extracao de dados finalizada!
