<a href="https://colab.research.google.com/github/stevenbowler/EmployeeSurvey/blob/main/ConvertPDFtoPNG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --------------------------------------------------------------
# 1. Install system packages + Python libs
# --------------------------------------------------------------
# The lines starting with "!" are notebook shell escapes, not Python:
# they run in the runtime's shell each time this cell is executed.
#   - apt-get update / install : refreshes the package index and installs
#     poppler-utils (presumably the PDF rendering backend that pdf2image
#     relies on — confirm against the pdf2image install notes).
#   - pip install              : installs pdf2image and tqdm, both of which
#     are imported further down in this cell.
# -qq / -q only reduce the amount of installer output.
!apt-get update -qq
!apt-get install -y poppler-utils -qq
!pip install -q pdf2image tqdm

# --------------------------------------------------------------
# 2. Mount Google Drive
# --------------------------------------------------------------
# Mounts Google Drive at /content/drive so the rest of the cell can reach
# Drive folders through ordinary filesystem paths (os.walk, os.makedirs, …).
# NOTE(review): the source/target paths configured below presumably live
# under this mount point — confirm against the configured values.
from google.colab import drive
drive.mount('/content/drive')

import os, shutil
from pdf2image import convert_from_path
from tqdm import tqdm

# --------------------------------------------------------------
# 3. CONFIG – the two folders are read through google.colab's
#    userdata.get(); define these two keys there:
#      PDF_FOLDER_PATH  → folder that contains the PDFs
#      PNG_TARGET_PATH  → folder that receives the PNGs
# --------------------------------------------------------------
# `userdata` was used without ever being imported in this cell, which
# fails with NameError; import it right where it is needed.
from google.colab import userdata

SRC_ROOT   = userdata.get('PDF_FOLDER_PATH')      # ← your folder with PDFs
TARGET_ROOT = userdata.get('PNG_TARGET_PATH')     # ← where PNGs go

DPI = 300                     # 200 = faster / smaller files
# --------------------------------------------------------------

# Fail early with a clear message: an empty value or a missing source
# folder would otherwise just produce "Found 0 PDF files" further down,
# because os.walk() yields nothing for a directory that does not exist.
if not SRC_ROOT or not TARGET_ROOT:
    raise ValueError(
        "PDF_FOLDER_PATH and PNG_TARGET_PATH must both be set to non-empty paths"
    )
if not os.path.isdir(SRC_ROOT):
    raise FileNotFoundError(f"Source folder not found: {SRC_ROOT}")

# Make sure target exists
os.makedirs(TARGET_ROOT, exist_ok=True)

# --------------------------------------------------------------
# 4. Find every PDF (recursive)
# --------------------------------------------------------------
# Walk the whole tree below SRC_ROOT and keep the full path of every file
# whose name ends in ".pdf"; the suffix is compared in lower case, so
# ".PDF" and ".Pdf" are picked up as well.
pdf_paths = [
    os.path.join(folder, filename)
    for folder, _subdirs, filenames in os.walk(SRC_ROOT)
    for filename in filenames
    if filename.lower().endswith('.pdf')
]

print(f"Found {len(pdf_paths):,} PDF files. Converting @ {DPI} DPI …")

# --------------------------------------------------------------
# 5. Convert – mirror folder structure + name_pageXXX.png
# --------------------------------------------------------------
# Each PDF is rendered straight to PNG files in a temporary folder and the
# files are then copied into the mirrored target folder.  Asking pdf2image
# for file paths (output_folder + paths_only=True) instead of a list of
# in-memory images keeps memory use flat even for very long PDFs at 300 DPI.
import tempfile

failed = []  # (pdf_path, exception) for every PDF that could not be converted

for pdf_path in tqdm(pdf_paths, desc="Converting", unit="pdf"):
    # Deliberately broad: one unreadable PDF must not abort the whole batch.
    try:
        # ----- relative path to keep the same sub-folder tree -----
        rel_path   = os.path.relpath(pdf_path, SRC_ROOT)          # e.g. sub1/sub2/doc.pdf
        rel_dir    = os.path.dirname(rel_path)                   # sub1/sub2
        name_noext = os.path.splitext(os.path.basename(pdf_path))[0]

        out_dir = os.path.join(TARGET_ROOT, rel_dir)
        os.makedirs(out_dir, exist_ok=True)

        # ----- render all pages to PNG files in a scratch folder -----
        # The scratch folder (and anything left in it) is removed when the
        # `with` block exits, including when a conversion fails half-way.
        with tempfile.TemporaryDirectory() as scratch_dir:
            page_files = convert_from_path(
                pdf_path,
                dpi=DPI,
                fmt="png",
                output_folder=scratch_dir,
                paths_only=True,
            )

            # ----- copy each page under its final name -----
            # The returned paths are expected to be in page order, so the
            # running index is the 1-based page number.
            # copyfile() transfers content only, which avoids metadata
            # calls that a mounted drive may not support.
            for i, page_file in enumerate(page_files, start=1):
                png_name = f"{name_noext}_page{i:03d}.png"
                shutil.copyfile(page_file, os.path.join(out_dir, png_name))

    except Exception as e:
        failed.append((pdf_path, e))
        print(f"\n[ERROR] {pdf_path} → {e}")

print("\nDONE! PNGs are in:")
print(TARGET_ROOT)

# Make failures visible at the end instead of leaving them buried in the
# progress output above.
if failed:
    print(
        f"{len(failed):,} of {len(pdf_paths):,} PDFs could not be converted "
        "(see the [ERROR] lines above)."
    )