In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import io
import re
import time

In [None]:
def _extract_bancovde_links(html, base_url):
    """Return the unique 'bancovde' .xlsx links found in ``html``, in page order.

    :param html: HTML text of the page to scan.
    :param base_url: URL used to resolve hrefs that do not start with 'http'.
    :return: List of link strings with duplicates removed (first occurrence kept).
    """
    soup = BeautifulSoup(html, 'html.parser')

    found = []
    for anchor in soup.find_all('a', href=True):
        href = anchor['href']
        if 'bancovde' in href and 'xlsx' in href:
            # Ensure the link is absolute
            if not href.startswith('http'):
                href = requests.compat.urljoin(base_url, href)
            found.append(href)

    # dict keys keep insertion order, so this de-duplicates without reordering
    return list(dict.fromkeys(found))


def _download_excel(session, link, year, max_retries, delay):
    """Download one spreadsheet with retries and return it as a DataFrame.

    :param session: ``requests.Session`` used to issue the GET requests.
    :param link: URL of the Excel file.
    :param year: Label used in log messages and stored in 'ano_referencia_arquivo'.
    :param max_retries: Number of download attempts before giving up.
    :param delay: Seconds to sleep between failed attempts.
    :return: The parsed DataFrame, or None if every attempt failed.
    """
    for attempt in range(1, max_retries + 1):
        try:
            print(f"Downloading {year} (Attempt {attempt}/{max_retries})...")
            r = session.get(link, timeout=60)
            r.raise_for_status()  # Raises on 4xx/5xx responses

            # Parsing stays inside the try block so a truncated or corrupt
            # payload also triggers a new attempt.
            df_year = pd.read_excel(io.BytesIO(r.content))
            df_year['ano_referencia_arquivo'] = year

            print(f"--- Success processing the file for {year}!")
            return df_year

        except Exception as e:
            # Deliberately broad: network errors and Excel parsing errors are
            # both handled by retrying the download.
            print(f"!!! Error on attempt {attempt} for year {year}: {e}")
            if attempt < max_retries:
                print(f"Waiting {delay} seconds for a new attempt...")
                time.sleep(delay)
            else:
                print(f"XXX Final failure for file {year} after {max_retries} attempts.")

    return None


def download_sinesp(max_retries=5, delay=3):
    """
    Accesses the SINESP page, extracts spreadsheet links, and consolidates them into a DataFrame.

    Every link on the page whose href contains both 'bancovde' and 'xlsx' is
    downloaded and read with ``pd.read_excel``. Each resulting frame gets an
    'ano_referencia_arquivo' column holding the four-digit year parsed from the
    link (or "Unknown" when the link does not match ``bancovde-YYYY``). Files
    that still fail after all attempts are skipped and listed in a final
    warning message.

    :param max_retries: Number of times the script will attempt to download each file in case of error.
    :param delay: Seconds to wait between download attempts.
    :return: Concatenated DataFrame of all successfully downloaded files, or
        None if the main page could not be fetched or no file was downloaded.
    """
    url_base = "https://www.gov.br/mj/pt-br/assuntos/sua-seguranca/seguranca-publica/estatistica/dados-nacionais-1/base-de-dados-e-notas-metodologicas-dos-gestores-estaduais-sinesp-vde-2022-e-2023"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    dataframes = []
    failed_years = []

    # A single session reuses the underlying connection for all downloads and
    # is closed automatically when the block exits.
    with requests.Session() as session:
        session.headers.update(headers)

        print("Accessing the page to search for links...")
        try:
            response = session.get(url_base, timeout=30)
            response.raise_for_status()
        except Exception as e:
            print(f"Fatal error accessing the main page: {e}")
            return None

        # Collect all links pointing to Excel files of type 'bancovde'
        links = _extract_bancovde_links(response.text, url_base)
        print(f"Found {len(links)} unique files to process.")

        for link in links:
            # Try to extract the year from the link for logging purposes
            ano_match = re.search(r'bancovde-(\d{4})', link)
            year = ano_match.group(1) if ano_match else "Unknown"

            df_year = _download_excel(session, link, year, max_retries, delay)
            if df_year is None:
                failed_years.append(year)
            else:
                dataframes.append(df_year)

    # Make partial results explicit instead of silently dropping failed files
    if failed_years:
        print(f"Warning: {len(failed_years)} file(s) could not be downloaded and are missing from the result: {', '.join(failed_years)}")

    if dataframes:
        print("\nConsolidating all years into a single DataFrame...")
        df_final = pd.concat(dataframes, ignore_index=True)
        print(f"Completed! Total records: {df_final.shape[0]}")
        return df_final
    else:
        print("No data was successfully downloaded.")
        return None

# Entry point: runs only when this cell/script is executed directly
if __name__ == "__main__":
    # Adjust the number of attempts and the pause between them here
    df_consolidated = download_sinesp(delay=2, max_retries=5)

    # download_sinesp returns None when nothing could be downloaded
    if df_consolidated is not None:
        # Preview the first rows of the consolidated table
        print(df_consolidated.head())

        # Optional: uncomment to save the result as CSV
        #df_consolidated.to_csv("sinesp_2015_2025_consolidated.csv", index=False, encoding='utf-8-sig')

Accessing the page to search for links...
Found 11 unique files to process.
Downloading 2025 (Attempt 1/5)...
--- Success processing the file for 2025!
Downloading 2024 (Attempt 1/5)...
!!! Error on attempt 1 for year 2024: ('Connection broken: IncompleteRead(3520668 bytes read, 23352920 more expected)', IncompleteRead(3520668 bytes read, 23352920 more expected))
Waiting 2 seconds for a new attempt...
Downloading 2024 (Attempt 2/5)...
--- Success processing the file for 2024!
Downloading 2023 (Attempt 1/5)...
