*Please note that the majority of this codebase (approximately 90%) was AI-generated by the large language model `anthropic/claude-sonnet-4`.
The author's contributions included the initial design, code review, integration, and comprehensive testing.*

## Cell 1: Introduction (Markdown)
### Mistral OCR-based PDF Processing and Text Extraction Pipeline

This notebook demonstrates a complete workflow to:
- OCR historical PDFs with the Mistral OCR engine via the OpenRouter API,
- Output the extracted text in both `.txt` and `.pdf` form while preserving formatting,
- Generate a new PDF with original and OCR text side-by-side,
- Automatically process all PDFs in a folder.

**Key steps covered:**
- API call setup and usage for OCR extraction,
- Reading and writing PDF content,
- Assembling results and exporting.

**Required packages:**  
- `requests`, `PyMuPDF` (`fitz`), `PyPDF2`, `tqdm`

**⚠️ Before you start:**  
- Update all file paths to match your local environment.
- Insert your OpenRouter API key.

**💰 Cost Consideration:**
- The Mistral OCR API accessed via OpenRouter typically costs around $2.00 per 1000 pages processed.


## Cell 2: Imports and Global Settings

In [1]:
import os
import sys
import base64
import json
import requests
import tempfile
from pathlib import Path
from PyPDF2 import PdfReader, PdfWriter
import logging
from tqdm import tqdm
import time
import glob
import fitz  # PyMuPDF

# Define input and output paths

INPUT_FOLDER = "/Users/Downloads/test/"  # ⚠️ Replace with your actual input folder
OUTPUT_PDF_PATH = "/Users/Downloads/test/mistral-ocr-test.pdf"  # ⚠️ Replace with your desired output path

# API key: taken from the OPENROUTER_API_KEY environment variable when it is
# set, so the secret does not have to be stored in the notebook itself.
# Otherwise the placeholder below is used and must be replaced by hand.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "sk-or-v1-")  # ⚠️ Put your actual API key here!

# Prompt sent as the text part of every OCR request (see ocr_with_mistral).
OCR_INSTRUCTION = """
Extract all text from this historical document.
Correct likely OCR errors while preserving historical language.
Consider paper aging, faded ink, old typography, and handwritten annotations.
Preserve the original text layout as much as possible, maintaining paragraph structure and formatting
Include all visible text on each page.
DO NOT add content that isn't visible in the original
Mark uncertain text with [?] if unsure

"""

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


## Cell 3: Utility: PDF → base64

In [2]:
def encode_pdf_to_base64(pdf_path):
    """
    Read the file at ``pdf_path`` in binary mode and return its content
    as a base64-encoded text string (used for the API upload).
    """
    with open(pdf_path, "rb") as handle:
        raw_bytes = handle.read()
    # b64encode yields bytes; decode so the caller gets a plain str.
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')


## Cell 4: OCR Extraction with Mistral OCR via the OpenRouter API

In [3]:
def ocr_with_mistral(pdf_path, OCR_INSTRUCTION, timeout=600):
    """
    Use the OpenRouter chat-completions API to extract text from a PDF.

    The PDF is sent inline as a base64 data URL together with the
    instruction text. The request enables the ``file-parser`` plugin with
    the ``mistral-ocr`` PDF engine and addresses the
    ``google/gemma-3-27b-it`` model.

    Parameters:
        pdf_path: Path of the PDF file to upload.
        OCR_INSTRUCTION: Prompt text sent alongside the file.
        timeout: Seconds to wait for the HTTP request before giving up
            (passed to ``requests.post``). Without a timeout a stalled
            connection would block the whole batch indefinitely.

    Returns:
        The extracted text as a string, or "" when the request fails or
        the response does not contain text (the problem is logged).

    Raises:
        ValueError: If no usable API key is configured.
    """
    # Reject an empty key as well as the untouched "sk-or-v1-" placeholder,
    # which is truthy and would otherwise only fail later at the API.
    api_key = (OPENROUTER_API_KEY or "").strip()
    if not api_key or api_key == "sk-or-v1-":
        raise ValueError("API key is missing. Set the OPENROUTER_API_KEY variable.")

    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": "https://localhost",  # Replace with your actual domain if needed
        "X-Title": "PDF OCR Application"
    }

    base64_pdf = encode_pdf_to_base64(pdf_path)
    data_url = f"data:application/pdf;base64,{base64_pdf}"

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": OCR_INSTRUCTION
                },
                {
                    "type": "file",
                    "file": {
                        "filename": Path(pdf_path).name,
                        "file_data": data_url
                    }
                }
            ]
        }
    ]

    # Specify using Mistral's OCR engine via plugin
    plugins = [
        {
            "id": "file-parser",
            "pdf": {
                "engine": "mistral-ocr"
            }
        }
    ]

    payload = {
        "model": "google/gemma-3-27b-it",
        "messages": messages,
        "plugins": plugins,
        # A "max_tokens" entry can be added here to cap the response length.
    }

    try:
        logger.info(f"Sending OCR request for {pdf_path}")
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
        response.raise_for_status()
        result = response.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"API request failed: {e}")
        if getattr(e, 'response', None) is not None:
            logger.error(f"Response: {e.response.text}")
        return ""
    except ValueError as e:
        # Older versions of requests raise a plain ValueError for a body
        # that is not valid JSON.
        logger.error(f"API request failed: {e}")
        return ""

    # Any deviation from the expected response shape (no choices, missing
    # keys, null content) is treated as "no text" instead of crashing.
    try:
        extracted_text = result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError):
        extracted_text = None

    if not isinstance(extracted_text, str):
        logger.error(f"Unexpected API response: {result}")
        return ""

    logger.info(f"Successfully extracted {len(extracted_text)} characters of text")
    return extracted_text


## Cell 5: Utility: Convert Extracted Text to a Paginated PDF

In [4]:
def _split_text_into_pages(text_content, max_lines_per_page=55, overflow_allowance=10):
    """
    Split text into page-sized chunks by line count.

    Once a chunk holds ``max_lines_per_page`` lines it is closed at the next
    blank line; if no blank line appears within ``overflow_allowance``
    further lines, it is closed anyway.
    """
    text_pages = []
    current_page = []
    for line in text_content.split('\n'):
        current_page.append(line)
        line_count = len(current_page)
        page_full_at_blank = line_count >= max_lines_per_page and line.strip() == ''
        if page_full_at_blank or line_count >= max_lines_per_page + overflow_allowance:
            text_pages.append('\n'.join(current_page))
            current_page = []
    if current_page:
        text_pages.append('\n'.join(current_page))
    return text_pages


def create_text_pdf(text_content, output_path, original_filename="", page_offset=0):
    """
    Create an A4 PDF file from text content.

    The text is paginated by line count and drawn in Courier. When
    ``original_filename`` is given, each page gets a centered header
    "<original_filename> - Page <n>" (n starts at ``page_offset + 1``)
    with a thin rule underneath.

    Parameters:
        text_content: The text to render.
        output_path: Where the PDF is saved.
        original_filename: Name shown in the page header; no header if empty.
        page_offset: Number added to the page numbers shown in the header.

    Returns:
        The output file path.
    """
    from contextlib import closing

    font_size = 10
    min_font_size = 4  # Lower bound when shrinking text to fit a page
    font_name = "Courier"
    header_font_size = 8

    # Use A4 size (595x842 pts)
    page_width = 595
    page_height = 842
    left_margin = 50
    right_margin = 50
    top_margin = 70
    bottom_margin = 50

    header_rect = fitz.Rect(left_margin, 20, page_width - right_margin, 50)
    text_rect = fitz.Rect(
        left_margin,
        top_margin,
        page_width - right_margin,
        page_height - bottom_margin
    )

    # closing() guarantees the document is released even if drawing or
    # saving raises.
    with closing(fitz.open()) as doc:
        for page_num, page_text in enumerate(_split_text_into_pages(text_content)):
            page = doc.new_page(width=page_width, height=page_height)

            # Insert header if required
            if original_filename:
                header_text = f"{original_filename} - Page {page_offset + page_num + 1}"
                page.insert_textbox(
                    header_rect,
                    header_text,
                    fontsize=header_font_size,
                    fontname="helvetica-bold",
                    align=1,
                    color=(0.5, 0.5, 0.5)
                )
                line_y = 55
                page.draw_line(
                    fitz.Point(left_margin, line_y),
                    fitz.Point(page_width - right_margin, line_y),
                    color=(0.7, 0.7, 0.7),
                    width=0.5
                )

            # Insert main text. insert_textbox returns a negative number and
            # writes nothing when the text does not fit the rectangle (too
            # many lines, or long lines that wrap), which would leave the
            # page blank. Retry with a smaller font until it fits.
            current_size = font_size
            while True:
                unused_space = page.insert_textbox(
                    text_rect,
                    page_text,
                    fontsize=current_size,
                    fontname=font_name,
                    align=0
                )
                if unused_space >= 0:
                    break
                if current_size <= min_font_size:
                    logger.warning(
                        f"Text for page {page_num + 1} of {output_path} does not fit "
                        f"even at font size {min_font_size}; page left without body text"
                    )
                    break
                current_size = max(min_font_size, current_size - 0.5)
        doc.save(output_path)
    return output_path


## Cell 6: Utility: Merge Original PDF and Text PDF Side-by-Side

In [5]:
def merge_pdfs_side_by_side(original_pdf_path, text_pdf_path, output_path, original_filename=""):
    """
    Merge original PDF and OCR text PDF side by side, page by page.

    Every output page is twice as wide as the FIRST page of the original
    (A4 if the original has no pages): the original page is placed on the
    left half and the text page on the right half. The output has as many
    pages as the longer of the two inputs. Where the text PDF has no page
    left, only a header "<original_filename> - Page <n>" is drawn on the
    right side (if ``original_filename`` is given).

    Parameters:
        original_pdf_path: Path of the source PDF (left side).
        text_pdf_path: Path of the OCR text PDF (right side).
        output_path: Where the merged PDF is saved.
        original_filename: Name used in the fallback header.

    Returns:
        The output file path.
    """
    from contextlib import closing

    # closing() releases all three documents even when a page cannot be
    # placed or the save fails, so no file handles are left open.
    with closing(fitz.open(original_pdf_path)) as original_doc, \
            closing(fitz.open(text_pdf_path)) as text_doc, \
            closing(fitz.open()) as result_doc:
        if original_doc.page_count > 0:
            first_rect = original_doc[0].rect
            original_width = first_rect.width
            original_height = first_rect.height
        else:
            original_width = 595
            original_height = 842

        left_half = fitz.Rect(0, 0, original_width, original_height)
        right_half = fitz.Rect(original_width, 0, original_width * 2, original_height)
        max_pages = max(original_doc.page_count, text_doc.page_count)

        for page_num in range(max_pages):
            result_page = result_doc.new_page(
                width=original_width * 2,
                height=original_height
            )
            if page_num < original_doc.page_count:
                result_page.show_pdf_page(left_half, original_doc, page_num)
            if page_num < text_doc.page_count:
                result_page.show_pdf_page(right_half, text_doc, page_num)
            elif original_filename:
                # No OCR page for this original page: label the empty side.
                header_text = f"{original_filename} - Page {page_num + 1}"
                header_rect = fitz.Rect(original_width + 50, 20, original_width * 2 - 50, 50)
                result_page.insert_textbox(
                    header_rect,
                    header_text,
                    fontsize=8,
                    fontname="helvetica-bold",
                    align=1,
                    color=(0.5, 0.5, 0.5)
                )
        result_doc.save(output_path)
    return output_path


## Cell 7: Process a Single PDF File

In [None]:
def process_pdf(pdf_path, output_pdf_path, page_offset=0):
    """
    Process a single PDF:
    - OCR via Mistral/OpenRouter,
    - Create text PDF (in a temporary directory),
    - Merge side-by-side into ``output_pdf_path``,
    - Save OCR text as .txt next to the output PDF (same name, .txt suffix).

    Parameters:
        pdf_path: Path of the PDF to process.
        output_pdf_path: Where the side-by-side PDF is written.
        page_offset: Passed to create_text_pdf to offset header page numbers.

    Returns:
        The output PDF path (as passed in) and the text file path.

    Raises:
        ValueError: Propagated from ocr_with_mistral when no API key is set.
    """
    logger.info(f"Processing PDF: {pdf_path}")
    original_filename = Path(pdf_path).stem

    # OCR extraction step
    extracted_text = ocr_with_mistral(pdf_path, OCR_INSTRUCTION)
    if not extracted_text.strip():
        # ocr_with_mistral returns "" on request/response problems; keep
        # going so the original pages still appear in the output.
        logger.warning(f"No OCR text returned for {pdf_path}; the text side will be empty")

    with tempfile.TemporaryDirectory() as temp_dir:
        # Paths are handed to the PDF helpers as plain strings.
        text_pdf_path = str(Path(temp_dir) / "text.pdf")
        # PDF from OCR output
        create_text_pdf(extracted_text, text_pdf_path, original_filename, page_offset)
        # Merge original and OCR PDFs
        merge_pdfs_side_by_side(pdf_path, text_pdf_path, str(output_pdf_path), original_filename)

    # Save a copy of text alongside the output PDF
    text_output_path = Path(output_pdf_path).with_suffix('.txt')
    with open(text_output_path, "w", encoding="utf-8") as f:
        f.write(extracted_text)

    logger.info(f"Completed processing {pdf_path}")
    return output_pdf_path, text_output_path


## Cell 8: Batch Process All PDFs in a Folder

In [None]:
def process_folder(input_folder, output_pdf_path):
    """
    Process all .pdf files in a folder:
    - Run OCR for each,
    - Assemble all results into a single final PDF,
    - Combine all OCR text into a single .txt file
      (same name as the output PDF, .txt suffix).

    Files are processed in sorted order. A file that raises during
    processing is logged and skipped. The output PDF itself is never
    treated as an input, even if it lies inside ``input_folder``.

    Parameters:
        input_folder: Folder that is searched (non-recursively) for "*.pdf".
        output_pdf_path: Path of the combined side-by-side PDF.

    Returns:
        (output_pdf_path, text_output_path) on success, or (None, None)
        when no input PDFs were found or none could be processed.
    """
    from contextlib import closing

    # Skip a combined PDF left over from a previous run: with the output
    # stored in the input folder it would otherwise be OCR'd again.
    output_resolved = Path(output_pdf_path).resolve()
    pdf_files = sorted(
        candidate
        for candidate in glob.glob(os.path.join(input_folder, "*.pdf"))
        if Path(candidate).resolve() != output_resolved
    )
    if not pdf_files:
        logger.error(f"No PDF files found in {input_folder}")
        return None, None
    logger.info(f"Found {len(pdf_files)} PDF files to process")

    # Make sure the destination exists before any OCR work is done, so the
    # final save cannot fail on a missing directory.
    output_resolved.parent.mkdir(parents=True, exist_ok=True)

    text_parts = []
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_dir_path = Path(temp_dir)
        processed_pdfs = []
        total_pages = 0
        for index, pdf_file in enumerate(tqdm(pdf_files, desc="Processing PDF files")):
            output_file = temp_dir_path / f"processed_{index}.pdf"
            try:
                _, text_file = process_pdf(pdf_file, output_file, total_pages)
                processed_pdfs.append(output_file)
                # Header page numbers of the next file continue after the
                # pages of this original.
                with closing(fitz.open(pdf_file)) as doc:
                    total_pages += doc.page_count
                # Add result to combined text
                pdf_name = os.path.basename(pdf_file)
                text_parts.append(f"\n\n=== {pdf_name} ===\n\n")
                if Path(text_file).exists():
                    text_parts.append(Path(text_file).read_text(encoding="utf-8"))
                    text_parts.append("\n\n")
            except Exception as e:
                # Best effort: one bad file must not abort the batch.
                logger.error(f"Error processing {pdf_file}: {e}", exc_info=True)

        if not processed_pdfs:
            logger.error("No PDFs were successfully processed")
            return None, None

        # Combine PDFs into one
        with closing(fitz.open()) as combined_doc:
            for pdf_path in processed_pdfs:
                with closing(fitz.open(str(pdf_path))) as doc:
                    combined_doc.insert_pdf(doc)
            combined_doc.save(str(output_pdf_path))

        # Save combined text
        text_output_path = Path(output_pdf_path).with_suffix('.txt')
        with open(text_output_path, "w", encoding="utf-8") as f:
            f.write("".join(text_parts))
        logger.info(f"Combined PDF saved to {output_pdf_path}")
        logger.info(f"Combined text saved to {text_output_path}")
        return output_pdf_path, text_output_path


## Cell 9: Main Entrypoint Function

In [6]:
def main():
    """
    Entrypoint for batch processing. Kicks off the process for the folder
    and prints a one-line summary of the outcome.
    """
    try:
        combined_pdf, _ = process_folder(INPUT_FOLDER, OUTPUT_PDF_PATH)
    except Exception as e:
        logger.error(f"Error processing PDFs: {e}", exc_info=True)
        print(f"Error: {e}")
        return

    # process_folder signals failure with (None, None). That tuple is itself
    # truthy, so the first element must be checked, not the tuple.
    if combined_pdf is not None:
        print(f"Successfully processed PDFs from {INPUT_FOLDER} to {OUTPUT_PDF_PATH}")
    else:
        print("Processing completed with errors. Check the log for details.")


## Cell 10: Run the Program
### NOTE: Only run this after you've configured the paths and API key!

In [None]:

# Start the batch run only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()
