<a href="https://colab.research.google.com/github/suacalis/IK-Analitigi-2023/blob/main/Python_Tools_for_Working_with_PDFs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import PyPDF2

def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        One string holding the text extracted from each page, in page
        order, with nothing inserted between pages.
    """
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Walk the pages directly and join once, rather than indexing by
        # page number and growing a string with ``+=``. The ``or ''`` is
        # defensive: a page that yields no text contributes nothing instead
        # of breaking the join.
        return ''.join(page.extract_text() or '' for page in reader.pages)


# Usage: print the text extracted from 'sample.pdf'.
# Assumes 'sample.pdf' is present in the working directory.
file_path = 'sample.pdf'
print(extract_text_from_pdf(file_path))

In [None]:
from pdfminer.high_level import extract_text

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument

def extract_text_with_pdfminer(file_path):
    """Extract the text of a PDF through pdfminer's high-level helper.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        Whatever ``pdfminer.high_level.extract_text`` returns for the file.
    """
    document_text = extract_text(file_path)
    return document_text

def extract_metadata(file_path):
    """Return the first document-information dictionary of a PDF.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The first entry of ``PDFDocument.info``, or an empty dict when the
        document carries no information dictionary at all.
    """
    with open(file_path, 'rb') as file:
        parser = PDFParser(file)
        doc = PDFDocument(parser)
        # ``info`` is a list and may be empty; indexing it blindly would
        # raise IndexError for PDFs that have no metadata.
        metadata = doc.info[0] if doc.info else {}
    return metadata


# Usage: print the extracted text, then the metadata, of 'sample.pdf'.
# Assumes 'sample.pdf' is present in the working directory.
file_path = 'sample.pdf'
print(extract_text_with_pdfminer(file_path))
print(extract_metadata(file_path))

In [None]:
from reportlab.lib.pagesizes import letter

from reportlab.pdfgen import canvas

def create_pdf(file_path):
    """Create a letter-sized PDF containing a single line of text.

    Args:
        file_path: Destination path of the PDF to write.
    """
    pdf_canvas = canvas.Canvas(file_path, pagesize=letter)
    # Position handed to drawString as its (x, y) arguments.
    text_x, text_y = 100, 750
    pdf_canvas.drawString(text_x, text_y, "Hello from Encord!")
    pdf_canvas.save()


# Usage: write the generated PDF to 'test.pdf'.
create_pdf("test.pdf")

In [None]:
from PyPDF2 import PdfMerger

def merge_pdfs(pdf_list, output_path):
    """Concatenate several PDFs, in the given order, into one output file.

    Args:
        pdf_list: Iterable of PDF inputs, each passed to
            ``PdfMerger.append``.
        output_path: Destination passed to ``PdfMerger.write``.
    """
    merger = PdfMerger()
    try:
        for pdf in pdf_list:
            merger.append(pdf)
        merger.write(output_path)
    finally:
        # Always release the merger, even when an input fails to load or
        # the output cannot be written.
        merger.close()

# Usage: merge 'file1.pdf' and 'file2.pdf' (in that order) into 'merged.pdf'.
# Assumes both input files are present in the working directory.
pdf_list = ['file1.pdf', 'file2.pdf']
merge_pdfs(pdf_list, 'merged.pdf')

In [None]:
from PyPDF2 import PdfReader, PdfWriter

def split_pdf(input_path, start_page, end_page, output_path):
    """Copy a contiguous range of pages from one PDF into a new PDF.

    Args:
        input_path: Path of the PDF to read pages from.
        start_page: Index of the first page to copy (used directly as an
            index into ``reader.pages``).
        end_page: Index at which copying stops; this page is not included.
        output_path: Path of the PDF file to write.
    """
    source = PdfReader(input_path)
    extracted = PdfWriter()
    page_indices = range(start_page, end_page)
    for index in page_indices:
        selected_page = source.pages[index]
        extracted.add_page(selected_page)
    with open(output_path, 'wb') as destination:
        extracted.write(destination)

# Usage: copy the pages at indices 0 and 1 of 'merged.pdf' into
# 'split_output.pdf' (the end index 2 is exclusive).
split_pdf('merged.pdf', 0, 2, 'split_output.pdf')

In [None]:
# Code example: rotating every page of a PDF with PyPDF2
from PyPDF2 import PdfReader, PdfWriter

def rotate_pdf(input_path, output_path, rotation_degrees=90):
    """Rotate every page of a PDF and save the result as a new file.

    Args:
        input_path: Path of the PDF to read.
        output_path: Path of the rotated PDF to write.
        rotation_degrees: Angle passed to each page's ``rotate`` method.
            Defaults to 90.
    """
    source = PdfReader(input_path)
    rotated = PdfWriter()
    for current_page in source.pages:
        current_page.rotate(rotation_degrees)
        rotated.add_page(current_page)
    with open(output_path, 'wb') as destination:
        rotated.write(destination)

# Usage: rotate every page of 'input.pdf' by 90 degrees and save the result
# as 'rotated_output.pdf'. Assumes 'input.pdf' is present in the working
# directory.
input_path = 'input.pdf'
output_path = 'rotated_output.pdf'
rotate_pdf(input_path, output_path, 90)

In [None]:
# Extracting images from PDFs using PyMuPDF (imported as fitz)

import fitz

def extract_images(file_path):
    """Save every image referenced by the pages of a PDF as its own file.

    Each image is written, relative to the working directory, as
    ``image<page>_<index>.<ext>``, where ``<page>`` is the 1-based page
    number, ``<index>`` is the 0-based position of the image in that page's
    image list, and ``<ext>`` is the extension reported by PyMuPDF.

    Args:
        file_path: Path of the PDF file to read.
    """
    # The context manager closes the document even if extraction fails.
    with fitz.open(file_path) as pdf_document:
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            images = page.get_images(full=True)
            for image_index, img in enumerate(images):
                # The first field of each image entry is its xref number.
                xref = img[0]
                base_image = pdf_document.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                with open(f"image{page_num+1}_{image_index}.{image_ext}", "wb") as image_file:
                    # The write must sit inside the ``with`` body; at the
                    # same indentation as ``with`` it is a syntax error.
                    image_file.write(image_bytes)

# Usage: write every image found in 'sample.pdf' to the working directory.
# Assumes 'sample.pdf' is present in the working directory.
extract_images('sample.pdf')

In [None]:
# Naive Bayes: classify the scikit-learn digits dataset with GaussianNB
from sklearn.datasets import load_digits
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# Fetch the digits data as a (features, labels) pair
X, y = load_digits(return_X_y=True)

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a Gaussian Naive Bayes classifier on the training portion
# (fit() returns the estimator itself, so it can be chained)
model = GaussianNB().fit(X_train, y_train)

# Predict labels for the held-out samples and show them
y_pred = model.predict(X_test)
print("Predictions:", y_pred)