In [2]:
import requests
from bs4 import BeautifulSoup
import os
import urllib.parse

def download_pdfs_from_fda(url, output_folder="fda_pdfs"):
    """
    Downloads PDF files from a given FDA CDER MAPP webpage.

    Args:
        url (str): The URL of the FDA CDER MAPP page.
        output_folder (str): The folder to save the downloaded PDFs.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)

        soup = BeautifulSoup(response.content, "html.parser")

        pdf_links = []
        file_links = []
        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            full_url = urllib.parse.urljoin(url, href)
            # data_entity = a_tag["data-entity-substitution"]
            if href.lower().endswith(".pdf"):
                
                pdf_links.append(full_url)
                
            if href.startswith("/media/"):
                # print(a_tag)
                for attr, value in a_tag.attrs.items():
                    # print(f"{attr}: {value}")
                    
                    if attr =="title":
                        name=value.split("/")[-1]
                        # print(name)
                file_links.append((name,full_url))
                
            # print(data_entity)
            

        if not os.path.exists(output_folder):
            os.makedirs(output_folder)

        for pdf_url in pdf_links:
            try:
                pdf_response = requests.get(pdf_url, stream=True)
                pdf_response.raise_for_status()

                filename = os.path.join(output_folder, pdf_url.split("/")[-1])

                with open(filename, "wb") as pdf_file:
                    for chunk in pdf_response.iter_content(chunk_size=8192):
                        pdf_file.write(chunk)

                print(f"Downloaded: {filename}")

            except requests.exceptions.RequestException as pdf_err:
                print(f"Error downloading {pdf_url}: {pdf_err}")
        
        print(len(file_links))
        for name,file_url in file_links:
            try:
                file_response = requests.get(file_url, stream=True)
                file_response.raise_for_status()

                # filename = os.path.join(output_folder, file_url.split("/")[-1])
                filename=os.path.join(output_folder,name+".pdf")
                

                with open(filename, "wb") as file:
                    for chunk in file_response.iter_content(chunk_size=8192):
                        print(f"Chunk received: {chunk[:50]}...")
                        file.write(chunk)
        

                print(f"Downloaded: {filename}")

            except requests.exceptions.RequestException as file_err:
                print(f"Error downloading {file_url}: {file_err}")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
    except Exception as general_e:
        print(f"An unexpected error occurred: {general_e}")


# Example usage:
fda_url = "https://www.fda.gov/about-fda/center-drug-evaluation-and-research-cder/cder-manual-policies-procedures-mapp"
download_pdfs_from_fda(fda_url)

Downloaded: fda_pdfs/Procedures-for-Completing-and-Processing-the-Form-%5CAnnual-Status-Report-Review-Form--PMR-and-PMC-Summary%5C.pdf
128
Chunk received: b'%PDF-1.6\r%\xe2\xe3\xcf\xd3\r\n46 0 obj\r<</Linearized 1/L 186741/'...
Chunk received: b"Q\x01+\x91$:\x88\xfbDo\xcc\xc8X1I\xbc \xde\x11_\x88\xfd\xe2K\xf1\xbd\xb8\x84\xb9\x91X\xa3\x18\xe9\x97\xb1\xb2\xb9\x1c('\xcb#Vu\xccS?k\x90"...
Chunk received: b'\xfb\xea\xf7\xc2\x14|<\x825\xbc\xfcW\xdfs,?=\xc8\xfb\xd21\xf7\xf7\x10v\x11*\x85q\xfdl\xd7\xc78\xef\xa7\x96EN\xd38\xfb\xb9k\xdf_\xac\xf6$\xe5'...
Chunk received: b'\xbb\xef\x9a\xaeEw<\xb3\xd7\xff\xb6F\x9b\x9f~\r\xfb\xd0=\xa1\x8c\xda\xec\xc6\xe3]\xd8\xf5\xd4\x1f\xf0\xbb\x18\xdfq\xe6\x8e\xb3\xc5\xe8\xdf\xe5\x13\xae\x7f\xaf\xef\xf4\xeb{'...
Chunk received: b'1\xe0 \x1d\xa7\x11Wcd\xe3K"\x8b*\x1cB\xf0m~~N\x8fh~\x141H\x94\xbb\xa5\\S\xe1Xa\xd8yk\xbc\x12|R\xa9\xa1\x03\xc6`\x80\x8dE'...
Chunk received: b'c\x1cC\xbf_\xa4\x81\xae\xc9\xe0;\x19\xf5\xe6c^\xbcD\t\xc6\xfb\x98\x97?\xa1\xcd\xef\x02\x8d\xa9

KeyboardInterrupt: 

In [15]:
import requests
from bs4 import BeautifulSoup
import os
import urllib.parse

def download_files_from_fda(url, output_folder="fda_files"):
    """
    Downloads PDF and media files from a given FDA webpage.

    Args:
        url (str): The URL of the FDA page.
        output_folder (str): The folder to save the downloaded files.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors

        soup = BeautifulSoup(response.content, "html.parser")

        file_links = []

        for a_tag in soup.find_all("a", href=True):
            # print(a_tag)
            href = a_tag["href"]

            # Normalize the URL
            full_url = urllib.parse.urljoin(url, href)

            # Check for PDF files or media files
            if href.lower().endswith(".pdf"):
                file_links.append(full_url)
                
            if href.startswith("/media/"):
                print(a_tag['title'])
                file_links.append(full_url)

        if not os.path.exists(output_folder):
            os.makedirs(output_folder)

        for file_url in file_links:
            try:
                file_response = requests.get(file_url, stream=True)
                file_response.raise_for_status()

                filename = os.path.join(output_folder, file_url.split("/")[-1])

                with open(filename, "wb") as file:
                    for chunk in file_response.iter_content(chunk_size=8192):
                        file.write(chunk)

                print(f"Downloaded: {filename}")

            except requests.exceptions.RequestException as file_err:
                print(f"Error downloading {file_url}: {file_err}")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the webpage: {e}")
    except Exception as general_e:
        print(f"An unexpected error occurred: {general_e}")


# Example usage:
fda_url = "https://www.fda.gov/drugs/guidance-compliance-regulatory-information/pharmaceutical-inspections-and-compliance#CGMP%20Inspections"
download_files_from_fda(fda_url)


Understanding CDER’s Risk-Based Site Selection Model
NAI 90 Day Decisional Letter
voluntary action indicated (VAI)
OAI 90 Day Decisional Letter
00524342 drug_shortages_cy_2023_rtc_final
Downloaded: fda_files/download?attachment
Downloaded: fda_files/download?attachment
Downloaded: fda_files/download?attachment
Downloaded: fda_files/download?attachment
Downloaded: fda_files/download?attachment


In [None]:
https://www.fda.gov/drugs/guidance-compliance-regulatory-information/pharmaceutical-inspections-and-compliance#CGMP%20Inspections