<a href="https://colab.research.google.com/github/rjrizani/pdf_scraper/blob/main/PDF_Metadata_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install PyPDF2
%pip install requests
%pip install beautifulsoup4

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting requests
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Using cached charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl.metadata (36 kB)
Collecting idna<4,>=2.5 (from requests)
  Downloading idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.3.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests)
  Using cached certifi-2025.1.31-py3-none-any.whl.metadata (2.5 kB)
Downloading requests-2.32.3-py3-none-any.whl (64 kB)
Using cached certifi-2025.1.31-py3-none-any.whl (166 kB)
Using cached charset_normalizer-3.4.1-cp313-cp313-win_amd64.whl (102 kB)
Downloading idna-3.10-py3-none-any.whl (70 kB)
Downloading urllib3-2.3.0-py3-none-any.whl (128 kB)
Installing collected packages: urllib3, idna, charset-normalizer, certifi, requests
Successfully installed certifi-2025.1.31 charset-normalizer-3.4.1 idna-3.10 requests-2.32.3 u


[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting beautifulsoup4
  Using cached beautifulsoup4-4.13.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Using cached soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Collecting typing-extensions>=4.0.0 (from beautifulsoup4)
  Downloading typing_extensions-4.13.0-py3-none-any.whl.metadata (3.0 kB)
Using cached beautifulsoup4-4.13.3-py3-none-any.whl (186 kB)
Using cached soupsieve-2.6-py3-none-any.whl (36 kB)
Downloading typing_extensions-4.13.0-py3-none-any.whl (45 kB)
Installing collected packages: typing-extensions, soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.13.3 soupsieve-2.6 typing-extensions-4.13.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import csv
import os
import time
from datetime import datetime, timedelta
from urllib.parse import unquote, urljoin, urlparse

import PyPDF2
import requests
from bs4 import BeautifulSoup

# Timeout (in seconds) applied to every HTTP request so a stalled server cannot
# hang the notebook indefinitely.
_REQUEST_TIMEOUT_SECONDS = 30


def _is_pdf_link(href):
    """Return True if an href points at a PDF, even with a query string or fragment."""
    return (
        href.lower().endswith('.pdf')
        or urlparse(href).path.lower().endswith('.pdf')
    )


def _filename_from_url(pdf_url):
    """Derive a local filename from the URL path, ignoring any query string or fragment."""
    filename = os.path.basename(unquote(urlparse(pdf_url).path))
    if not filename:
        filename = "unnamed_pdf_" + str(time.time()) + ".pdf"  # Generate a unique name
    return filename


def _append_metadata_row(csv_filepath, row):
    """Append one row to the metadata CSV, keeping the header and all rows aligned.

    PDFs do not share a common set of metadata keys. If `row` introduces keys that
    the existing header lacks, the file is rewritten with a widened header (earlier
    rows get empty cells for the new columns); otherwise the row is simply appended.
    """
    fieldnames = []
    if os.path.exists(csv_filepath):
        with open(csv_filepath, mode='r', newline='', encoding='utf-8') as csv_file:
            fieldnames = next(csv.reader(csv_file), [])

    new_columns = [key for key in row if key not in fieldnames]

    if fieldnames and not new_columns:
        # Fast path: the existing header already covers this row.
        with open(csv_filepath, mode='a', newline='', encoding='utf-8') as csv_file:
            csv.DictWriter(csv_file, fieldnames=fieldnames).writerow(row)
        return

    existing_rows = []
    if fieldnames:
        with open(csv_filepath, mode='r', newline='', encoding='utf-8') as csv_file:
            existing_rows = list(csv.DictReader(csv_file))

    # Write to a temporary file first so a failure mid-write cannot truncate
    # the metadata collected so far.
    temp_filepath = csv_filepath + ".tmp"
    with open(temp_filepath, mode='w', newline='', encoding='utf-8') as csv_file:
        # extrasaction='ignore' drops overflow cells (DictReader's None key) that
        # a previously misaligned file may contain, instead of raising.
        writer = csv.DictWriter(
            csv_file, fieldnames=fieldnames + new_columns, extrasaction='ignore'
        )
        writer.writeheader()
        writer.writerows(existing_rows)
        writer.writerow(row)
    os.replace(temp_filepath, csv_filepath)


def _extract_and_record_metadata(filepath, filename, csv_filepath):
    """Read the metadata of one downloaded PDF and record it in the CSV file."""
    try:
        with open(filepath, 'rb') as pdf_file:
            metadata = PyPDF2.PdfReader(pdf_file).metadata
            if not metadata:
                print(f"No metadata found in {filename}")
                return
            # Copy the values while the file is still open.
            metadata_dict = dict(metadata.items())

        metadata_dict['filename'] = filename
        _append_metadata_row(csv_filepath, metadata_dict)
        print(f"Metadata for {filename} saved to {csv_filepath}")
    except PyPDF2.errors.PdfReadError:
        print(f"Error reading PDF: {filename}. Skipping metadata extraction.")
    except Exception as e:
        print(f"Error extracting metadata from {filename}: {e}")


def fetch_and_extract_metadata(url, download_directory="papers", max_downloads=None):
    """
    Fetches PDF files linked from a given URL, downloads them, and extracts their metadata.

    Every PDF link found on the page is downloaded into `download_directory`, and
    the metadata of each readable PDF is written as one row of
    `<download_directory>/metadata.csv`. Errors are reported with `print` and never
    raised: a failure on one PDF does not stop the remaining downloads. Failed
    requests are not retried.

    Args:
        url (str): The URL of the page containing the PDF links.
        download_directory (str, optional): The directory where PDFs will be downloaded.
            Created if it does not exist. Defaults to "papers".
        max_downloads (int, optional): The maximum number of PDFs to download.
            If None, all PDFs will be downloaded. Defaults to None.

    Returns:
        None
    """
    try:
        # Create the download directory if it doesn't exist
        os.makedirs(download_directory, exist_ok=True)

        # Fetch the listing page; the timeout keeps a stalled server from hanging the run
        response = requests.get(url, timeout=_REQUEST_TIMEOUT_SECONDS)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(response.content, 'html.parser')

        # Resolve every PDF href against the page URL. dict.fromkeys removes
        # duplicate links while preserving page order, so a PDF linked twice is
        # downloaded and recorded only once.
        pdf_links = list(dict.fromkeys(
            urljoin(url, a['href'])
            for a in soup.find_all('a', href=True)
            if _is_pdf_link(a['href'])
        ))

        if not pdf_links:
            print(f"No PDF links found on the page: {url}")
            return

        print(f"Found {len(pdf_links)} PDF links.")

        csv_filepath = os.path.join(download_directory, "metadata.csv")

        # Iterate through each PDF link
        download_count = 0
        for pdf_url in pdf_links:
            if max_downloads is not None and download_count >= max_downloads:
                print(f"Reached maximum download limit of {max_downloads}. Stopping.")
                break

            try:
                # Get the PDF content with a timeout
                pdf_response = requests.get(pdf_url, timeout=_REQUEST_TIMEOUT_SECONDS)
                pdf_response.raise_for_status()

                filename = _filename_from_url(pdf_url)
                filepath = os.path.join(download_directory, filename)

                # Write the PDF content to a file
                with open(filepath, 'wb') as f:
                    f.write(pdf_response.content)

                print(f"Downloaded: {filename}")
                download_count += 1

                # Extract metadata from the downloaded PDF and save to a CSV file
                _extract_and_record_metadata(filepath, filename, csv_filepath)

            except requests.exceptions.RequestException as e:
                print(f"Error downloading PDF from {pdf_url}: {e}")
            except Exception as e:
                print(f"An unexpected error occurred while processing {pdf_url}: {e}")

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL {url}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

def get_daily_download_path():
    """
    Builds the download directory path for the current local date.

    Returns:
        str: "papers" joined with today's date formatted as YYYY-MM-DD
            (e.g., "papers/2024-07-24").
    """
    today_folder = f"{datetime.now():%Y-%m-%d}"
    return os.path.join("papers", today_folder)

def main():
    """
    Entry point: scrape the journal's article listing into today's folder.

    Resolves the dated download directory and downloads at most five PDFs
    from the listing page, recording their metadata alongside them.
    """
    listing_url = "https://jis-eurasipjournals.springeropen.com/articles"
    fetch_and_extract_metadata(
        url=listing_url,
        download_directory=get_daily_download_path(),
        max_downloads=5,  # Limiting to 5 downloads
    )

# Run the scraper only when this code is executed directly (as a script or as a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Found 50 PDF links.
Downloaded: s13635-025-00195-6.pdf
Metadata for s13635-025-00195-6.pdf saved to papers\2025-03-27\metadata.csv
Downloaded: s13635-025-00191-w.pdf
Metadata for s13635-025-00191-w.pdf saved to papers\2025-03-27\metadata.csv
Downloaded: s13635-025-00197-4.pdf
Metadata for s13635-025-00197-4.pdf saved to papers\2025-03-27\metadata.csv
Downloaded: s13635-024-00185-0.pdf
Metadata for s13635-024-00185-0.pdf saved to papers\2025-03-27\metadata.csv
Downloaded: s13635-025-00194-7.pdf
Metadata for s13635-025-00194-7.pdf saved to papers\2025-03-27\metadata.csv
Reached maximum download limit of 5. Stopping.
