In [2]:
# Text extractor from pdf
import fitz  # PyMuPDF

# Open the PDF file
pdf_document = "sample_hyperlink_pdf.pdf"
pdf = fitz.open(pdf_document)

# File to save the extracted text
output_file = "sample_hyperlink_text.txt"

with open(output_file, "w", encoding="utf-8") as text_file:
    # Iterate through the pages
    for page_num in range(len(pdf)):
        page = pdf.load_page(page_num)
        text = page.get_text()
        text_file.write(text)
        text_file.write("\n\n")  # Add some spacing between pages

print(f"Text extracted and saved to {output_file}")


Text extracted and saved to sample_hyperlink_text.txt


In [5]:
# Image extractor from pdf
import fitz  # PyMuPDF

def extract_images_from_pdf(pdf_path, output_folder):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)

    for page_num in range(len(pdf_document)):
        # Get the page
        page = pdf_document.load_page(page_num)
        image_list = page.get_images(full=True)
        
        print(f"[INFO] Found {len(image_list)} images on page {page_num}")

        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = pdf_document.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = f"image_page{page_num+1}_{img_index+1}.{image_ext}"
            print(img)

            with open(image_filename, "wb") as image_file:
                image_file.write(image_bytes)

            print(f"[INFO] Image {img_index+1} on page {page_num+1} saved as {image_filename}")

# Example usage
pdf_path = "sample_hyperlink_pdf.pdf"
output_folder = "extracted_images"
extract_images_from_pdf(pdf_path, output_folder)


[INFO] Found 1 images on page 0
(7, 0, 100, 100, 8, 'DeviceRGB', '', 'I1', 'FlateDecode', 0)
[INFO] Image 1 on page 1 saved as image_page1_1.png


In [7]:
# Hyperlink extraction and find location
import fitz  # PyMuPDF

def extract_hyperlinks_from_pdf(pdf_path):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    
    hyperlinks = []

    for page_num in range(len(pdf_document)):
        # Get the page
        page = pdf_document.load_page(page_num)
        links = page.get_links()
        
        for link in links:
            if link["uri"]:
                hyperlinks.append({
                    "page": page_num + 1,
                    "uri": link["uri"],
                    "rect": link["from"]
                })
    
    return hyperlinks

# Example usage
pdf_path = "sample_hyperlink_pdf.pdf"
hyperlinks = extract_hyperlinks_from_pdf(pdf_path)

print(hyperlinks)
for link in hyperlinks:
    print(f"Page {link['page']}: {link['uri']} at {link['rect']}")

# Optionally, save to a file
with open("extracted_hyperlinks.txt", "w") as file:
    for link in hyperlinks:
        file.write(f"Page {link['page']}: {link['uri']} at {link['rect']}\n")


[{'page': 1, 'uri': 'https://www.openai.com', 'rect': Rect(31.190000534057617, 206.60003662109375, 165.89999389648438, 218.60003662109375)}, {'page': 1, 'uri': 'https://www.example.com/image-link', 'rect': Rect(31.190000534057617, 390.8500061035156, 103.22000122070312, 402.8500061035156)}]
Page 1: https://www.openai.com at Rect(31.190000534057617, 206.60003662109375, 165.89999389648438, 218.60003662109375)
Page 1: https://www.example.com/image-link at Rect(31.190000534057617, 390.8500061035156, 103.22000122070312, 402.8500061035156)


In [11]:
import fitz  # PyMuPDF

def extract_text_locations(pdf_path):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    
    all_text_locations = []

    for page_num in range(len(pdf_document)):
        # Get the page
        page = pdf_document.load_page(page_num)
        
        # Extract text and its bounding boxes
        text_instances = page.get_text("blocks")
        
        for inst in text_instances:
            # inst is a tuple containing the text and its bounding box
            bbox = inst[:4]  # First 4 elements are the bounding box
            text = inst[4]   # Fifth element is the text
            all_text_locations.append({
                "page": page_num + 1,
                "bbox": bbox,
                "text": text
            })

    return all_text_locations

# Example usage
pdf_path = "sample_hyperlink_pdf.pdf"
text_locations = extract_text_locations(pdf_path)

for loc in text_locations:
    print(f"Page {loc['page']}: '{loc['text']}' at {loc['bbox']}")

# # Optionally, save to a file
# with open("text_locations.txt", "w") as file:
#     for loc in text_locations:
#         file.write(f"Page {loc['page']}: '{loc['text']}' at {loc['bbox']}\n")


Page 1: 'Sample PDF with Hyperlinks
' at (216.6199951171875, 33.27999496459961, 378.66796875, 49.803993225097656)
Page 1: 'Chapter 1: Introduction
' at (31.190000534057617, 61.63003158569336, 163.86199951171875, 78.1540298461914)
Page 1: 'This is a sample PDF created to demonstrate adding hyperlinks and images with hyperlinks using
' at (31.190000534057617, 118.2600326538086, 564.097900390625, 134.74803161621094)
Page 1: 'the FPDF library.
' at (31.190000534057617, 146.61001586914062, 121.87400817871094, 163.09800720214844)
Page 1: 'Click here to visit OpenAI
' at (31.190000534057617, 203.30001831054688, 165.9019775390625, 219.7880096435547)
Page 1: 'Link to Image
' at (31.190000534057617, 387.5500183105469, 103.22599792480469, 404.03802490234375)


In [12]:
# Find text location and Fill with white color 
import fitz  # PyMuPDF

def extract_text_locations(pdf_path):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    
    all_text_locations = []

    for page_num in range(len(pdf_document)):
        # Get the page
        page = pdf_document.load_page(page_num)
        
        # Extract text and its bounding boxes
        text_instances = page.get_text("blocks")
        
        for inst in text_instances:
            # inst is a tuple containing the text and its bounding box
            bbox = inst[:4]  # First 4 elements are the bounding box
            text = inst[4]   # Fifth element is the text
            all_text_locations.append({
                "page": page_num + 1,
                "bbox": bbox,
                "text": text
            })

    return all_text_locations

def redact_text_with_white_color(pdf_path, output_path):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    
    # Extract all text locations
    text_locations = extract_text_locations(pdf_path)

    for loc in text_locations:
        page_num = loc["page"] - 1  # Page numbers are 0-indexed in PyMuPDF
        bbox = loc["bbox"]
        
        # Get the page
        page = pdf_document.load_page(page_num)
        
        # Draw a white rectangle over the text
        page.draw_rect(bbox, color=(1, 1, 1), fill=(1, 1, 1))
    
    # Save the redacted PDF to a new file
    pdf_document.save(output_path)

# Example usage
pdf_path = "sample_hyperlink_pdf.pdf"
output_path = "redacted_pdf_file.pdf"

redact_text_with_white_color(pdf_path, output_path)

print(f"The text has been redacted in the PDF and saved to '{output_path}'")


The text has been redacted in the PDF and saved to 'redacted_pdf_file.pdf'


In [14]:
# Convert pdf to png file
import fitz  # PyMuPDF

def convert_pdf_to_png(pdf_path, output_folder):
    # Open the PDF file
    pdf_document = fitz.open(pdf_path)
    
    for page_num in range(len(pdf_document)):
        # Get the page
        page = pdf_document.load_page(page_num)
        
        # Render page to an image
        pix = page.get_pixmap()
        
        # Save the image as a PNG file
        output_path = f"page_{page_num + 1}.png"
        pix.save(output_path)
        
        print(f"Page {page_num + 1} saved as {output_path}")

# Example usage
pdf_path = "redacted_pdf_file.pdf"
output_folder = "output_images"
convert_pdf_to_png(pdf_path, output_folder)


Page 1 saved as page_1.png


In [16]:
# OCR
import pytesseract
from PIL import Image

def extract_text_from_png(png_path, text_output_path):
    # Open the image file
    img = Image.open(png_path)
    
    # Use Tesseract to do OCR on the image
    text = pytesseract.image_to_string(img)
    
    # Save the extracted text to a file
    with open(text_output_path, "w") as text_file:
        text_file.write(text)
    
    print(f"Text extracted and saved to {text_output_path}")

# Example usage
png_path = "page_1.png"
text_output_path = "extracted_text.txt"
extract_text_from_png(png_path, text_output_path)


TesseractNotFoundError: tesseract is not installed or it's not in your PATH. See README file for more information.