In [1]:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options



def get_ao_recents(limit=20):
    """Fetch the most recently published BOAMP records from the open-data API.

    Records are requested sorted by ``dateparution`` in descending order.

    Args:
        limit: Maximum number of records to request.
            NOTE(review): the Opendatasoft Explore API is believed to cap
            ``limit`` per request — confirm against the API documentation.

    Returns:
        The list found under the ``"results"`` key of the JSON response, or
        an empty list when the request fails, the status code is not 200,
        the body is not valid JSON, or the key is absent.
    """
    url = "https://boamp-datadila.opendatasoft.com/api/explore/v2.1/catalog/datasets/boamp/records"
    params = {
        "order_by": "dateparution desc",
        "limit": limit,
    }

    # Without a timeout, requests can block forever on a stalled connection.
    try:
        response = requests.get(url, params=params, timeout=30)
    except requests.RequestException as exc:
        # Same best-effort contract as the HTTP-error path below.
        print("Erreur :", exc)
        return []

    if response.status_code != 200:
        print("Erreur :", response.status_code)
        return []

    try:
        data = response.json()
    except ValueError as exc:
        # requests raises a ValueError subclass when the body is not JSON.
        print("Erreur :", exc)
        return []

    return data.get("results", [])
    
def _boamp_save_pdf(pdf_url, filepath, timeout=30):
    """Download *pdf_url* into *filepath*; return True only on HTTP 200."""
    pdf_resp = requests.get(pdf_url, timeout=timeout)
    if pdf_resp.status_code != 200:
        return False
    with open(filepath, "wb") as f:
        f.write(pdf_resp.content)
    return True


def download_boamp_pdf_robuste(idweb, dateparution, filename=None, save_dir="pdf_boamp"):
    """Download the PDF of a BOAMP notice, trying three strategies in turn.

    1. Fetch the notice page with ``requests`` and look for a link whose
       href contains ``/telechargements/`` and ends with ``.pdf``.
    2. Rebuild the PDF URL from the publication date (and ``filename`` for
       dates before 2024) and request it directly.
    3. Load the notice page in headless Chrome through Selenium and read
       the PDF link from the rendered DOM.

    Each strategy is best-effort: any exception is printed and the next
    strategy is tried.

    Args:
        idweb: Notice identifier; used in the page URL and as the file name.
        dateparution: Publication date as a ``YYYY-MM-DD`` string.
        filename: Optional path component used in the rebuilt URL when the
            publication year is before 2024.
        save_dir: Directory where the PDF is written (created if missing).

    Returns:
        The path of the saved PDF, or ``None`` if every strategy failed.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import section.
    from urllib.parse import urljoin

    os.makedirs(save_dir, exist_ok=True)
    filepath = os.path.join(save_dir, f"{idweb}.pdf")
    headers = {"User-Agent": "Mozilla/5.0"}
    page_url = f'https://www.boamp.fr/pages/avis/?q=idweb:"{idweb}"'

    # --- Strategy 1: scrape the static HTML of the notice page ------------
    try:
        r = requests.get(page_url, headers=headers, timeout=10)
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, "html.parser")
            tag = next(
                (a for a in soup.find_all("a", href=True)
                 if "/telechargements/" in a["href"] and a["href"].endswith(".pdf")),
                None
            )
            if tag:
                # urljoin handles both root-relative and already-absolute
                # hrefs; plain concatenation would corrupt the latter.
                pdf_url = urljoin("https://www.boamp.fr", tag["href"])
                if _boamp_save_pdf(pdf_url, filepath):
                    print(f"[{idweb}] PDF téléchargé (HTML) : {filepath}")
                    return filepath
                print(f"[{idweb}] Lien HTML trouvé mais téléchargement échoué.")
    except Exception as e:
        print(f"[{idweb}]  Erreur scraping HTML : {e}")

    # --- Strategy 2: rebuild the download URL from the publication date ---
    try:
        dt = datetime.strptime(dateparution, "%Y-%m-%d")
        year = dt.strftime("%Y")
        month = dt.strftime("%m")

        if filename and dt.year < 2024:
            pdf_url = f"https://www.boamp.fr/telechargements/PDF/{year}/{filename}/{idweb}.pdf"
        else:
            pdf_url = f"https://www.boamp.fr/telechargements/FILES/PDF/{year}/{month}/{idweb}.pdf"

        if _boamp_save_pdf(pdf_url, filepath):
            print(f"[{idweb}] PDF téléchargé (reconstruit) : {filepath}")
            return filepath
        print(f"[{idweb}] Lien reconstruit et sera téléchargé avec sélénium")
    except Exception as e:
        print(f"[{idweb}] Erreur reconstruction : {e}")

    # --- Strategy 3: render the page with headless Chrome -----------------
    try:
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(options=chrome_options)

        try:
            driver.get(page_url)
            # Fixed pause to let the page's scripts insert the link.
            time.sleep(3)

            link = driver.find_element(
                "xpath",
                "//a[contains(@href, '/telechargements/') and contains(@href, '.pdf')]",
            )
            pdf_url = link.get_attribute("href")
        finally:
            # Always release the browser, even when the link is not found;
            # otherwise each failure leaks a Chrome process.
            driver.quit()

        if _boamp_save_pdf(pdf_url, filepath):
            print(f"[{idweb}] PDF téléchargé (Selenium) : {filepath}")
            return filepath
        print(f"[{idweb}] PDF introuvable après Selenium : {pdf_url}")
        return None
    except Exception as e:
        print(f"[{idweb}] Erreur Selenium : {e}")
        return None

def get_ao_pdf(limit=20):
    """Download the PDF of each of the most recently published records.

    Fetches up to ``limit`` records with ``get_ao_recents`` and calls
    ``download_boamp_pdf_robuste`` for each one, passing its ``idweb`` and
    ``dateparution`` fields.

    Args:
        limit: Maximum number of records to fetch.
    """
    for record in get_ao_recents(limit):
        idweb = record.get("idweb")
        if not idweb:
            # Without an identifier every URL and the output file name
            # would be built from "None", so the record cannot be fetched.
            print("Enregistrement ignoré : idweb manquant")
            continue
        download_boamp_pdf_robuste(idweb, record.get("dateparution"))

In [2]:
# Fetch up to 100 of the most recently published records and download their PDFs.
get_ao_pdf(100)

[25-73001] PDF téléchargé (reconstruit) : pdf_boamp\25-73001.pdf
[25-73015] PDF téléchargé (reconstruit) : pdf_boamp\25-73015.pdf
[25-73030] PDF téléchargé (reconstruit) : pdf_boamp\25-73030.pdf
[25-73016] PDF téléchargé (reconstruit) : pdf_boamp\25-73016.pdf
[25-72852] PDF téléchargé (reconstruit) : pdf_boamp\25-72852.pdf
[25-72866] PDF téléchargé (reconstruit) : pdf_boamp\25-72866.pdf
[25-72964] PDF téléchargé (reconstruit) : pdf_boamp\25-72964.pdf
[25-72965] PDF téléchargé (reconstruit) : pdf_boamp\25-72965.pdf
[25-72976] PDF téléchargé (reconstruit) : pdf_boamp\25-72976.pdf
[25-73128] PDF téléchargé (reconstruit) : pdf_boamp\25-73128.pdf
[25-73149] PDF téléchargé (reconstruit) : pdf_boamp\25-73149.pdf
[25-73160] PDF téléchargé (reconstruit) : pdf_boamp\25-73160.pdf
[25-73064] PDF téléchargé (reconstruit) : pdf_boamp\25-73064.pdf
[25-73085] PDF téléchargé (reconstruit) : pdf_boamp\25-73085.pdf
[25-72883] PDF téléchargé (reconstruit) : pdf_boamp\25-72883.pdf
[25-72885] PDF téléchargé