In [8]:
# importing all the required modules
import pypdf
import os
import networkx as nx
from urllib.parse import urlparse
from uuid import uuid4
from base64 import b64encode
import requests
from collections import deque
from selenium import webdriver
# NOTE: this cell only imports; the PDF reader object is created inside extract_document_links.

In [4]:
def extract_document_links(file_path):
    """Collect the URIs of link annotations in a PDF that point at other PDFs.

    Every annotation on every page is inspected. A link is kept when its
    URI ends with ``.pdf`` (case-insensitive comparison).

    Args:
        file_path: Location of the PDF to scan; passed straight to
            ``pypdf.PdfReader``.

    Returns:
        set: The distinct ``.pdf`` URIs found, in no particular order.
        Empty when the document has no such links.
    """
    pdf_links = set()
    reader = pypdf.PdfReader(file_path)
    for page in reader.pages:
        if "/Annots" not in page:
            continue
        # Walk *all* annotations: a page usually carries several links, so
        # stopping after the first one would silently drop the rest.
        for annot in page["/Annots"]:
            annot_obj = annot.get_object()
            if "/A" not in annot_obj or "/URI" not in annot_obj["/A"]:
                continue  # annotation has no URI action attached
            uri = annot_obj["/A"]["/URI"]
            # URI values that are not text (e.g. raw byte strings) are skipped
            # rather than raising on the str-based suffix check.
            if isinstance(uri, str) and uri.lower().endswith(".pdf"):
                pdf_links.add(uri)
    return pdf_links

In [5]:
def download_document(pdf_link, save_folder = ""):
    """Download ``pdf_link`` into ``save_folder`` and return the saved path.

    Args:
        pdf_link: URL of the document to fetch.
        save_folder: Destination directory, created when it does not exist.
            The default ``""`` saves into the current working directory.

    Returns:
        The path of the written file when the server answers HTTP 200.
        ``None`` when the server answers 401/403 or any other status, or
        when ``requests`` raises a ``RequestException``. A message is
        printed in every case.
    """
    # os.makedirs("") raises FileNotFoundError, so only create a directory
    # when one was actually given; exist_ok avoids a check-then-create race.
    if save_folder:
        os.makedirs(save_folder, exist_ok=True)

    parsed_url = urlparse(pdf_link)
    # A URL whose path has no final segment (e.g. "https://host/") gets a
    # random unique name. uuid4().hex is a plain str and safe in a file name.
    document_name = os.path.basename(parsed_url.path) or f"{uuid4().hex}.pdf"
    filename = os.path.join(save_folder, document_name)

    try:
        response = requests.get(pdf_link, timeout=10)
        if response.status_code == 200:
            with open(filename, "wb") as f:
                f.write(response.content)
            print(f"Downloaded: {filename}")
            return filename
        elif response.status_code in [401, 403]:
            print(f"Protected document detected: {pdf_link}")

            # TODO: handle authorization errors (no retry/auth flow implemented yet)
            return None
        else:
            print(f"Failed to fetch {pdf_link} (Status: {response.status_code})")
            return None
    except requests.RequestException as e:
        print(f"Error downloading {pdf_link}: {e}")
        return None

In [9]:
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in background
driver = webdriver.Chrome(options=options)

try:
    driver.get("https://www.sap.com/dmc/benchmark/2020/Cert20021.pdf")

    # Save the page source (useful for debugging).
    # NOTE(review): page_source is the markup of whatever the browser ended up
    # rendering, not the PDF's bytes, so "document.pdf" is unlikely to be a
    # valid PDF -- confirm the intended file name/extension.
    with open("document.pdf", "wb") as f:
        f.write(driver.page_source.encode("utf-8"))
finally:
    # Always shut the browser down, even when navigation or the write fails,
    # so no headless Chrome process is left running.
    driver.quit()

In [6]:
def build_document_graph(pdf_path, document_folder, node_count):
    """Crawl PDF-to-PDF links breadth-first and return them as a directed graph.

    Starting from ``pdf_path``, each scanned document's ``.pdf`` links are
    downloaded via ``download_document`` and an edge
    ``scanned document -> downloaded file`` is recorded. Each distinct link
    is downloaded at most once; links whose download failed are ignored.

    Args:
        pdf_path: Local path of the starting (root) PDF.
        document_folder: Folder handed to ``download_document`` for saving
            the linked documents.
        node_count: Upper bound on the number of documents (root included)
            that are queued for link extraction. Documents discovered after
            the bound is reached are still downloaded and appear as edge
            targets, but their own links are not followed.

    Returns:
        A ``networkx.DiGraph`` whose nodes are local file paths. The root is
        always present, even when it has no resolvable links.

    Raises:
        Whatever ``extract_document_links`` raises for the root document.
        Failures on downloaded documents are reported and skipped.
    """
    graph = nx.DiGraph()
    # Register the root up front so a document without links still yields a
    # one-node graph instead of an empty one.
    graph.add_node(pdf_path)

    all_links = {}          # link URL -> downloaded path (None if the download failed)
    all_paths = {pdf_path}  # documents already queued for scanning
    queue = deque([pdf_path])

    while queue:
        document_path = queue.popleft()
        try:
            pdf_links = extract_document_links(document_path)
        except Exception as exc:
            # A bad root is the caller's problem; surface it unchanged.
            if document_path == pdf_path:
                raise
            # Downloaded content is untrusted (e.g. an HTML error page served
            # with status 200). One unreadable file must not abort the crawl.
            print(f"Skipping unreadable document {document_path}: {exc}")
            continue

        for link in pdf_links:
            if link not in all_links:
                all_links[link] = download_document(link, document_folder)
            downloaded_path = all_links[link]
            if downloaded_path is None:
                continue
            graph.add_edge(document_path, downloaded_path)
            if downloaded_path not in all_paths and len(all_paths) < node_count:
                all_paths.add(downloaded_path)
                queue.append(downloaded_path)
    return graph

In [7]:
# Crawl outward from the seed white paper: linked PDFs are saved under
# ./pdfs/ and at most 10 documents are queued for link extraction.
document_graph = build_document_graph(
    pdf_path='./input-dir/sap-hana-on-vmware-vsphere-bp_0.pdf',
    document_folder="./pdfs/",
    node_count=10,
)

Protected document detected: https://www.sap.com/dmc/benchmark/2021/Cert21058.pdf
Failed to fetch https://pubs.vmware.com/vsphere-60/topic/com.vmware.ICbase/PDF/vsphere-esxi-vcenter-server-60-resource-management-guide.pdf (Status: 400)
Protected document detected: https://www.sap.com/dmc/benchmark/2023/Cert23030.pdf
Protected document detected: https://www.sap.com/dmc/benchmark/2020/Cert20021.pdf
