<a href="https://colab.research.google.com/github/smypmsa/pdf-searcher/blob/main/PDF_searcher.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 🔍 PDF Keyword Search with OCR

## Overview
This notebook searches PDF files for specific keywords using OCR. It is well suited to processing scanned documents and exporting keyword-presence data to an Excel report.

## How to Use
1. Click **"Install Dependencies"** button once
2. Upload your keywords file (keywords.txt) and PDF files
3. Click **"Search Keywords in PDFs"** button
4. Download the generated Excel report
5. Use **"Clear Workspace"** button between sessions

## Keywords File Format
- One keyword per line
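- Supports wildcards (`*`): `*` matches any run of characters; where a keyword has no leading or trailing `*`, that end must fall on a word boundary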
- Case-insensitive search
- Examples: `contract`, `payment*`, `*agreement*`

---

In [1]:
#@title 🔧 Install Dependencies - OCR & Processing (Run Once) { display-mode: "form" }
#@markdown Click Run to install all packages including OCR support. Takes 2-3 minutes.

import sys
import subprocess

def install_packages():
    """Install the system and Python packages the notebook needs, then verify them.

    Runs ``apt-get update`` and ``apt-get install`` for the system packages,
    ``pip install`` for the Python packages, and finally tries to import the
    key modules. Progress and failures are printed; nothing is returned, and a
    failing install command does not raise.
    """

    def run_step(command, label):
        """Run one install command quietly; print a diagnostic and return False on failure."""
        try:
            completed = subprocess.run(command, capture_output=True,
                                       text=True, errors='replace')
        except OSError as e:
            # E.g. apt-get is not available on this machine.
            print(f"⚠️  {label} could not be started: {e}")
            return False

        if completed.returncode != 0:
            # Output is captured to keep the cell quiet, so surface the tail of
            # stderr here; otherwise a failed install would be invisible.
            print(f"⚠️  {label} failed (exit code {completed.returncode})")
            for line in (completed.stderr or "").strip().splitlines()[-5:]:
                print(f"    {line}")
            return False
        return True

    print("🔧 Installing OCR and processing packages...")
    print("⏳ This may take 2-3 minutes...")

    # Install system dependencies
    print("📦 Installing system packages...")
    system_packages = [
        'ghostscript',
        'python3-tk',
        'tesseract-ocr',           # OCR engine
        'tesseract-ocr-eng',       # English language data
        'poppler-utils',           # PDF utilities
        'ffmpeg',                  # Media processing
        'libsm6',                  # OpenCV dependencies
        'libxext6',
    ]
    run_step(['apt-get', 'update', '-qq'], "apt-get update")
    run_step(['apt-get', 'install', '-y'] + system_packages + ['-qq'],
             "apt-get install")

    # Install Python packages
    print("🐍 Installing Python packages...")
    packages = [
        'easyocr',                     # Advanced OCR
        'pymupdf',                     # PDF processing
        'openpyxl',                    # Excel file creation
        'opencv-python-headless',      # Image preprocessing
        'Pillow',                      # Image handling
        'numpy',                       # Array operations
        'pandas',                      # Data manipulation
    ]
    run_step([sys.executable, '-m', 'pip', 'install'] + packages + ['-q'],
             "pip install")

    # Verify installations by importing the modules the search cell relies on
    print("\n🔍 Verifying installations...")
    try:
        import easyocr
        import fitz
        import openpyxl
        import cv2
        print("✅ All packages installed successfully!")
        print("✅ EasyOCR ready for text extraction")
        print("✅ PyMuPDF ready for PDF processing")
        print("✅ OpenPyXL ready for Excel generation")
    except ImportError as e:
        print(f"⚠️  Some packages failed: {e}")
        print("Try running the cell again")

    print("\n👇 Ready! Scroll down to search keywords in PDFs")

# Run the installation as soon as this cell is executed.
install_packages()

🔧 Installing OCR and processing packages...
⏳ This may take 2-3 minutes...
📦 Installing system packages...
🐍 Installing Python packages...

🔍 Verifying installations...
✅ All packages installed successfully!
✅ EasyOCR ready for text extraction
✅ PyMuPDF ready for PDF processing
✅ OpenPyXL ready for Excel generation

👇 Ready! Scroll down to search keywords in PDFs


In [None]:
#@title 🔍 PDF Keyword Search with OCR { display-mode: "form" }
#@markdown Upload keywords file and PDF files, then search for keyword matches

import os
import re
from pathlib import Path
import easyocr
import fitz  # PyMuPDF
from openpyxl import Workbook
from concurrent.futures import ThreadPoolExecutor, as_completed
from google.colab import files
import tempfile
import zipfile
import shutil

def read_keywords_from_content(content):
    """Parse the content of a keywords file into a list of keywords.

    Args:
        content: Raw file content as ``bytes``; an already-decoded ``str`` is
            accepted as well.

    Returns:
        The non-blank lines in file order, each stripped of surrounding
        whitespace.
    """
    if isinstance(content, str):
        text = content
    else:
        try:
            # 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte-order
            # mark, which would otherwise stay glued to the first keyword and
            # stop it from ever matching.
            text = content.decode('utf-8-sig')
        except UnicodeDecodeError:
            # latin-1 maps every byte to a character, so this cannot fail.
            text = content.decode('latin-1')

    stripped_lines = (line.strip() for line in text.split('\n'))
    return [line for line in stripped_lines if line]

def ocr_pdf(pdf_path, reader, *, min_confidence=0.5, zoom=2):
    """Extract text from a PDF by rendering each page and running OCR on it.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        reader: OCR reader whose ``readtext(image_bytes)`` yields
            ``(bbox, text, confidence)`` tuples (an ``easyocr.Reader`` in this
            notebook).
        min_confidence: OCR fragments whose confidence is not above this value
            are discarded.
        zoom: Scale factor applied on both axes when rendering a page; larger
            values give the OCR engine a bigger image.

    Returns:
        The recognised text with every whitespace run collapsed to a single
        space and no leading/trailing whitespace. If an error occurs it is
        printed and the text gathered so far (possibly empty) is returned.
    """
    fragments = []
    try:
        # The context manager closes the document even when a page fails
        # part-way through, instead of leaking the open file handle.
        with fitz.open(pdf_path) as pdf_document:
            for page_num in range(pdf_document.page_count):
                page = pdf_document[page_num]
                pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom))
                img_data = pix.tobytes("png")

                for _bbox, extracted_text, confidence in reader.readtext(img_data):
                    if confidence > min_confidence:  # Filter low confidence results
                        fragments.append(extracted_text)
    except Exception as e:
        print(f"❌ Error processing PDF: {e}")

    # Join once, then normalise whitespace so keyword matching sees one line.
    return re.sub(r'\s+', ' ', ' '.join(fragments)).strip()

def search_keywords_in_text(text, keywords):
    """Report, for each keyword, whether it occurs in the text.

    Matching is case-insensitive (both sides are lower-cased). ``*`` in a
    keyword is a wildcard for any run of characters; every other character is
    matched literally. An end of the keyword that is not a ``*`` must sit on a
    word boundary.

    Args:
        text: The text to search.
        keywords: Iterable of keyword strings, optionally containing ``*``.

    Returns:
        Dict mapping each keyword (as given) to ``True`` or ``False``.
    """
    text_lower = text.lower()
    results = {}

    for keyword in keywords:
        keyword_lower = keyword.lower()

        # Escape the literal pieces so characters such as '+', '(' or '.' in a
        # keyword cannot break the pattern or act as regex operators; only '*'
        # is turned into a wildcard.
        regex_pattern = '.*'.join(
            re.escape(piece) for piece in keyword_lower.split('*')
        )

        # Anchor to word boundaries on any side that is not a wildcard
        if not keyword_lower.startswith('*'):
            regex_pattern = r'\b' + regex_pattern
        if not keyword_lower.endswith('*'):
            regex_pattern = regex_pattern + r'\b'

        # Only presence matters, so stop at the first match.
        results[keyword] = re.search(regex_pattern, text_lower) is not None

    return results

def create_excel_report(results, keywords, output_path):
    """Write the keyword search results to an Excel workbook.

    The sheet has a bold header row ('PDF File' followed by one column per
    keyword) and one row per PDF, each keyword cell holding 'YES' or 'NO'.

    Args:
        results: Mapping of PDF file name to a ``{keyword: found}`` dict;
            keywords missing from that dict are reported as 'NO'.
        keywords: Keywords in the order their columns should appear.
        output_path: Destination handed to ``Workbook.save``.
    """
    wb = Workbook()
    ws = wb.active
    ws.title = "Keyword Search Results"

    # Header row: file-name column first, then one column per keyword, all bold
    for col, header in enumerate(['PDF File', *keywords], start=1):
        cell = ws.cell(row=1, column=col, value=header)
        # NOTE(review): the style proxy's copy() appears to be deprecated in
        # recent openpyxl releases - confirm before upgrading.
        cell.font = cell.font.copy(bold=True)

    # Data
    for row, (pdf_file, keyword_results) in enumerate(results.items(), start=2):
        ws.cell(row=row, column=1, value=pdf_file)
        for col, keyword in enumerate(keywords, start=2):
            cell_value = 'YES' if keyword_results.get(keyword, False) else 'NO'
            cell = ws.cell(row=row, column=col, value=cell_value)

            # Color coding
            if cell_value == 'YES':
                cell.font = cell.font.copy(color='00008000')  # Green
            else:
                cell.font = cell.font.copy(color='00800000')  # Red

    # Auto-adjust column widths: longest cell text plus padding, capped at 50.
    # str() accepts any cell value, so no exception guard is needed here.
    for column in ws.columns:
        max_length = max((len(str(cell.value)) for cell in column), default=0)
        column_letter = column[0].column_letter
        ws.column_dimensions[column_letter].width = min(max_length + 2, 50)

    wb.save(output_path)

def process_single_pdf(pdf_file, keywords, reader):
    """OCR one PDF and check it against the keyword list.

    Prints progress for the file and returns ``(file name, {keyword: found})``.
    When OCR yields no text, every keyword is reported as not found.
    """
    name = pdf_file.name
    print(f"🔍 Processing {name}...")
    extracted = ocr_pdf(pdf_file, reader)

    # Nothing recognised: skip the search and mark every keyword as absent.
    if not extracted.strip():
        print(f"⚠️ {name}: No text extracted")
        return name, dict.fromkeys(keywords, False)

    matches = search_keywords_in_text(extracted, keywords)
    hits = len([found for found in matches.values() if found])
    print(f"✅ {name}: Found {hits}/{len(keywords)} keywords")
    return name, matches

# Main processing: upload the inputs, OCR every PDF, build and download the report
print("📁 Upload files for keyword search:")
print("1️⃣ First upload your keywords.txt file (one keyword per line)")
print("2️⃣ Then upload your PDF files (can select multiple)")
uploaded = files.upload()

if uploaded:
    keywords_file = None
    pdf_files = []

    # Working directory for the uploaded PDFs and the generated report
    temp_dir = Path(tempfile.mkdtemp())

    try:
        # Separate keywords file and PDFs; other file types are ignored
        for filename, file_content in uploaded.items():
            lowered_name = filename.lower()

            if lowered_name.endswith('.txt'):
                keywords_file = file_content
                print(f"📝 Keywords file: {filename}")
            elif lowered_name.endswith('.pdf'):
                file_path = temp_dir / filename
                with open(file_path, 'wb') as f:
                    f.write(file_content)
                pdf_files.append(file_path)
                print(f"📄 PDF file: {filename}")

        # Compare against None so an uploaded-but-empty keywords file is
        # reported as "no keywords in the file" rather than "file not found".
        if keywords_file is None:
            print("❌ No keywords.txt file found. Please upload a text file with keywords.")
        elif not pdf_files:
            print("❌ No PDF files found. Please upload at least one PDF file.")
        else:
            try:
                # Read keywords
                keywords = read_keywords_from_content(keywords_file)
                print(f"\n🔑 Loaded {len(keywords)} keywords: {', '.join(keywords[:5])}{'...' if len(keywords) > 5 else ''}")

                if not keywords:
                    print("❌ No keywords found in the file")
                else:
                    print(f"\n🚀 Starting OCR processing for {len(pdf_files)} PDF files...")
                    print("⏳ This may take several minutes depending on file size and count...")

                    # Initialize OCR reader
                    print("🤖 Initializing OCR engine...")
                    reader = easyocr.Reader(['en'], gpu=False)  # Set gpu=True if available

                    results = {}

                    # Process PDFs (sequential for better resource management in Colab)
                    for pdf_file in pdf_files:
                        pdf_name, keyword_results = process_single_pdf(pdf_file, keywords, reader)
                        results[pdf_name] = keyword_results

                    # Create Excel report
                    output_file = temp_dir / 'keyword_search_results.xlsx'
                    create_excel_report(results, keywords, output_file)

                    # Download the report
                    files.download(str(output_file))

                    print(f"\n📊 Report Summary:")
                    print(f"   📁 Processed files: {len(results)}")
                    print(f"   🔑 Keywords searched: {len(keywords)}")
                    total_matches = sum(sum(1 for found in file_results.values() if found)
                                      for file_results in results.values())
                    print(f"   ✅ Total keyword matches: {total_matches}")
                    print(f"\n💾 Downloaded: keyword_search_results.xlsx")
                    print("✅ Processing complete!")

            except Exception as e:
                print(f"❌ Error during processing: {str(e)}")
    finally:
        # Cleanup runs on every path, including the validation failures above,
        # so the temporary directory is never left behind. Best effort only.
        shutil.rmtree(temp_dir, ignore_errors=True)
else:
    print("❌ No files uploaded")

📁 Upload files for keyword search:
1️⃣ First upload your keywords.txt file (one keyword per line)
2️⃣ Then upload your PDF files (can select multiple)


In [None]:
#@title 🗑️ Clear Workspace { display-mode: "form" }
#@markdown Clear all uploaded files and temporary data

import shutil
from pathlib import Path
import os

# Directories that are emptied (removed and recreated) in full
directories_to_clear = ["/content/sample_data"]

# Also clear uploaded files in /content (but preserve system folders)
content_dir = Path("/content")
system_folders = {".config", "sample_data", "__pycache__"}

# Only top-level files with these extensions are treated as uploads and deleted
uploaded_file_suffixes = {'.pdf', '.txt', '.xlsx', '.xls', '.docx', '.doc', '.png', '.jpg', '.jpeg'}

# Auto-confirm for convenience; set to False to cancel without deleting anything
AUTO_CONFIRM = True

# Warning message
print("⚠️ WARNING: This will delete all uploaded files and clear workspace:")
for directory in directories_to_clear:
    print(f"- {directory}")
print("- All uploaded files in /content (excluding system folders)")

if AUTO_CONFIRM:
    # Clear specified directories by removing and recreating them
    for directory in directories_to_clear:
        dir_path = Path(directory)
        if dir_path.exists() and dir_path.is_dir():
            shutil.rmtree(dir_path)
            dir_path.mkdir(parents=True, exist_ok=True)
            print(f"✅ '{directory}' has been cleared.")
        else:
            print(f"ℹ️ The '{directory}' directory does not exist.")

    # Clear uploaded files from /content
    if content_dir.exists():
        cleared_files = 0
        for item in content_dir.iterdir():
            if item.name in system_folders:
                continue

            if item.is_file():
                # Check if it's likely an uploaded file (common extensions)
                if item.suffix.lower() in uploaded_file_suffixes:
                    item.unlink()
                    cleared_files += 1
                    print(f"✅ Removed: {item.name}")
            elif item.is_dir():
                # Don't remove hidden (dot-prefixed) directories
                if not item.name.startswith('.'):
                    try:
                        shutil.rmtree(item)
                        print(f"✅ Removed folder: {item.name}")
                    except OSError as e:
                        # Best effort: keep going, but say what was left behind
                        print(f"⚠️ Could not remove folder {item.name}: {e}")

        # The summary counts files only; removed folders are reported above
        if cleared_files == 0:
            print("ℹ️ No uploaded files found to clear")
        else:
            print(f"\n🧹 Workspace cleared! Removed {cleared_files} files.")
            print("Ready for next processing session.")
else:
    print("Operation cancelled. No files were deleted.")