<a href="https://colab.research.google.com/github/rjrizani/pdf_scraper/blob/main/PDF_Metadata_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install PyPDF2
!pip install requests
!pip install beautifulsoup4

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 232.6/232.6 kB 3.6 MB/s eta 0:00:00
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
import os
import time
from datetime import datetime, timedelta
from urllib.parse import urljoin, urlparse

import PyPDF2
import requests
from bs4 import BeautifulSoup

def _is_pdf_href(href):
    """Return True when *href* points at a ``.pdf`` resource.

    Besides hrefs that literally end in ``.pdf``, this also accepts hrefs whose
    URL path ends in ``.pdf`` but carry a query string or fragment
    (e.g. ``paper.pdf?download=true``).
    """
    if href.lower().endswith('.pdf'):
        return True
    try:
        return urlparse(href).path.lower().endswith('.pdf')
    except ValueError:
        # urlparse rejects some malformed hrefs; such a link is simply not a PDF link.
        return False


def _pdf_filename_from_url(pdf_url):
    """Return the local file name to use for *pdf_url*.

    The name is the last component of the URL *path*, so query strings and
    fragments never end up in the file name. A timestamp-based name is
    generated when the path has no last component.
    """
    filename = os.path.basename(urlparse(pdf_url).path)
    if not filename:
        return "unnamed_pdf_" + str(time.time()) + ".pdf"  # Generate a unique name
    if not filename.lower().endswith('.pdf'):
        # The ".pdf" was in the query/fragment rather than the path; keep the extension.
        filename += ".pdf"
    return filename


def _print_pdf_metadata(filepath, filename):
    """Print the metadata entries of the PDF stored at *filepath*.

    Read errors are reported on stdout and never propagated, so one broken
    PDF does not stop the remaining downloads.
    """
    try:
        with open(filepath, 'rb') as pdf_file:
            metadata = PyPDF2.PdfReader(pdf_file).metadata
            # Print while the file is still open: values may be resolved lazily.
            if metadata:
                print(f"Metadata for {filename}:")
                for key, value in metadata.items():
                    print(f"  {key}: {value}")
            else:
                print(f"No metadata found in {filename}")
    except PyPDF2.errors.PdfReadError:
        print(f"Error reading PDF: {filename}.  Skipping metadata extraction.")
    except Exception as e:
        print(f"Error extracting metadata from {filename}: {e}")


def _process_pdf_link(pdf_url, download_directory, timeout):
    """Download one PDF into *download_directory* and print its metadata.

    Any failure is reported on stdout and swallowed so the caller can carry
    on with the next link.
    """
    try:
        pdf_response = requests.get(pdf_url, timeout=timeout)
        pdf_response.raise_for_status()

        filename = _pdf_filename_from_url(pdf_url)
        filepath = os.path.join(download_directory, filename)

        with open(filepath, 'wb') as f:
            f.write(pdf_response.content)
        print(f"Downloaded: {filename}")

        _print_pdf_metadata(filepath, filename)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading PDF from {pdf_url}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred while processing {pdf_url}: {e}")


def fetch_and_extract_metadata(url, download_directory="papers", timeout=30):
    """
    Fetches PDF files linked from a given URL, downloads them, and prints their metadata.

    Every error (network failure, unreadable PDF, file-system problem) is
    reported on stdout instead of being raised; a failure on one PDF does not
    prevent the remaining PDFs from being processed. Nothing is retried.

    Args:
        url (str): The URL of the page containing the PDF links.
        download_directory (str, optional): The directory where PDFs will be downloaded.
            It is created if missing. Defaults to "papers".
        timeout (float, optional): Timeout in seconds applied to every HTTP request,
            including the request for the index page. Defaults to 30.

    Returns:
        None
    """
    try:
        # exist_ok avoids the check-then-create race of a separate os.path.exists test.
        os.makedirs(download_directory, exist_ok=True)

        # Without a timeout, requests may wait forever on an unresponsive server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes

        soup = BeautifulSoup(response.content, 'html.parser')

        # Resolve relative hrefs against the page URL so every link is absolute.
        pdf_links = [
            urljoin(url, a['href'])
            for a in soup.find_all('a', href=True)
            if _is_pdf_href(a['href'])
        ]

        if not pdf_links:
            print(f"No PDF links found on the page: {url}")
            return

        print(f"Found {len(pdf_links)} PDF links.")

        for pdf_url in pdf_links:
            _process_pdf_link(pdf_url, download_directory, timeout)

    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL {url}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

def get_daily_download_path():
    """
    Build the download directory path for today's local date.

    Returns:
        str: "papers" joined with the current date formatted as YYYY-MM-DD
            (e.g., "papers/2024-07-24").
    """
    today = datetime.now()
    # format() with a strftime spec is equivalent to calling today.strftime(...).
    return os.path.join("papers", f"{today:%Y-%m-%d}")

def main():
    """
    Script entry point.

    Scans the EURASIP Journal on Information Security article listing for PDF
    links and stores the downloads in today's dated directory.
    """
    fetch_and_extract_metadata(
        "https://jis-eurasipjournals.springeropen.com/articles",
        get_daily_download_path(),
    )


# Run only when executed as a script or notebook cell, not when imported.
if __name__ == "__main__":
    main()

Found 50 PDF links.
Downloaded: s13635-025-00195-6.pdf
Metadata for s13635-025-00195-6.pdf:
  /Keywords: Self-sovereign identity;SSI;Identity management;Security incident response;SIR;Security
  /CrossMarkDomains[1]: springer.com
  /Creator: Adobe InDesign 15.1 (Windows)
  /ModDate: D:20250320180219+01'00'
  /Trapped: /False
  /CreationDate: D:20250320220924+08'00'
  /CrossmarkMajorVersionDate: 2010-04-23
  /Subject: EURASIP Journal on Information Security, https://doi.org/10.1186/s13635-025-00195-6
  /Author:  Leonhard Ziegler 
  /Title: Designing a security incident response process for self-sovereign identities
  /CrossmarkDomainExclusive: true
  /robots: noindex
  /Producer: Adobe PDF Library 15.0; modified using iText® 5.3.5 ©2000-2012 1T3XT BVBA (SPRINGER SBM; licensed version)
  /doi: 10.1186/s13635-025-00195-6
  /CrossMarkDomains[2]: springerlink.com
Downloaded: s13635-025-00191-w.pdf
Metadata for s13635-025-00191-w.pdf:
  /Keywords: IDS;BERT;GRU;DL
  /CrossMarkDomains[1]: spri

KeyboardInterrupt: 