In [90]:
import os
import requests
from bs4 import BeautifulSoup
from datetime import datetime

BASE_URL = "https://boe.es/diario_boe/xml.php?id=BOE-S-{}"
BOE_URL = "https://www.boe.es"

In [91]:
def get_valid_boe_list(start_date):

    current_date = datetime.strptime(start_date, "%Y%m%d")
    valid_boe_list = []

    while True:
        formatted_date = current_date.strftime("%Y%m%d")
        url = BASE_URL.format(formatted_date)

        response = requests.get(url)

        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "lxml-xml")  # Change to "lxml"

            # Check if the file exists
            error_element = soup.find("descripcion")
            if error_element and "No se encontró el sumario original." in error_element.text:
                print(f"No data available for {formatted_date}.")
                break

            # Valid date, append to the list
            valid_boe_list.append(formatted_date)  # Append only the date

            # Check the next date inside the XML, if it exists we continue
            fecha_sig_element = soup.find("fechaSig")
            if fecha_sig_element and fecha_sig_element.text:
                next_date_str = fecha_sig_element.text
                next_date = datetime.strptime(next_date_str, "%d/%m/%Y")

                current_date = next_date
            else:
                break
        else:
            print(f"Failed to fetch data for {formatted_date}. Status code: {response.status_code}")
            break

    return valid_boe_list


In [92]:
def get_url_for_discapacidad(dates, label):
    urls_for_discapacidad = {}

    for date in dates:
        url = BASE_URL.format(date)

        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "lxml-xml")
            items = soup.find_all("item")

            for item in items:
                titulo_tag = item.find("titulo")
                if titulo_tag and label in titulo_tag.text.lower():
                    url_pdf_tag = item.find("urlPdf")
                    url_xml_tag = item.find("urlXml")
                    if url_pdf_tag and url_xml_tag:
                        urls_for_discapacidad[date] = {
                            "urlPdf": url_pdf_tag.text,
                            "urlXml": url_xml_tag.text
                        }
        else:
            print(f"Failed to fetch data for {date}. Status code: {response.status_code}")

    return urls_for_discapacidad


In [93]:
def download_files(file_urls, folder="files"):
    # Create a folder for the files, one for PDFs and one for XMLs
    os.makedirs(os.path.join(folder, "pdf"), exist_ok=True)
    os.makedirs(os.path.join(folder, "xml"), exist_ok=True)

    for date, urls in file_urls.items():
        pdf_url = BOE_URL + urls['urlPdf']
        xml_url = BOE_URL + urls['urlXml']

        pdf_filename = os.path.join(folder, "pdf", f"{date}.pdf")
        xml_filename = os.path.join(folder, "xml", f"{date}.xml")

        download_file(pdf_url, pdf_filename)
        download_file(xml_url, xml_filename)

def download_file(url, filename):
    response = requests.get(url)
    if response.status_code == 200:
        with open(filename, 'wb') as file:
            file.write(response.content)
        print(f"Downloaded: {filename}")
    else:
        print(f"Failed to download {filename}. Status code: {response.status_code}")


In [94]:

# Example usage:
start_date = "20231101"
boe_list = get_valid_boe_list(start_date)
print(boe_list)

label = "discapacidad"
pdf_urls_for_discapacidad = get_url_for_discapacidad(boe_list, label)

for date, pdf_url in pdf_urls_for_discapacidad.items():
    print(f"Date: {date}, PDF URL for 'discapacidad': {pdf_url}")

download_files(pdf_urls_for_discapacidad, folder="downloaded_files")


['20231101', '20231102', '20231103', '20231104', '20231106', '20231107', '20231108', '20231109', '20231110', '20231111', '20231113', '20231114', '20231115', '20231116', '20231117', '20231118', '20231120', '20231121', '20231122', '20231123', '20231124', '20231125']
Date: 20231107, PDF URL for 'discapacidad': {'urlPdf': '/boe/dias/2023/11/07/pdfs/BOE-A-2023-22678.pdf', 'urlXml': '/diario_boe/xml.php?id=BOE-A-2023-22678'}
Date: 20231108, PDF URL for 'discapacidad': {'urlPdf': '/boe/dias/2023/11/08/pdfs/BOE-B-2023-32820.pdf', 'urlXml': '/diario_boe/xml.php?id=BOE-B-2023-32820'}
Date: 20231114, PDF URL for 'discapacidad': {'urlPdf': '/boe/dias/2023/11/14/pdfs/BOE-A-2023-23110.pdf', 'urlXml': '/diario_boe/xml.php?id=BOE-A-2023-23110'}
Date: 20231115, PDF URL for 'discapacidad': {'urlPdf': '/boe/dias/2023/11/15/pdfs/BOE-B-2023-33684.pdf', 'urlXml': '/diario_boe/xml.php?id=BOE-B-2023-33684'}
Date: 20231121, PDF URL for 'discapacidad': {'urlPdf': '/boe/dias/2023/11/21/pdfs/BOE-B-2023-34719.pdf'

FileNotFoundError: [Errno 2] No such file or directory: 'downloaded_files\\pdf\\20231107.pdf'