# PDF Search System
## Comprehensive PDF Document Search with Highlighting and Citations

This notebook provides functionality to:
- Import and process all PDF files in the workspace
- Search for relevant content across all documents
- Generate responses with citations, page numbers, and clickable URLs
- Highlight relevant content in yellow
- Create URLs that open a PDF directly at a specific page


In [None]:
# Install required packages
import subprocess
import sys

def install_package(package):
    """Install *package* with pip, using the interpreter running this notebook.

    Invoking pip as ``<sys.executable> -m pip`` targets the same environment
    the kernel runs in. ``subprocess.check_call`` raises
    ``subprocess.CalledProcessError`` if pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# pip distribution names required by this notebook
packages = [
    "PyPDF2",
    "pymupdf",
    "sentence-transformers",
    "faiss-cpu",
    "numpy",
    "pandas",
    "flask",
    "flask-cors",
    "nltk",
    "scikit-learn"
]

# Distributions whose importable module name is NOT simply the distribution
# name with '-' replaced by '_'. Without this map the import probe below
# always fails for these packages and pip is re-run on every execution.
import_names = {
    "pymupdf": "fitz",
    "faiss-cpu": "faiss",
    "scikit-learn": "sklearn",
}

for package in packages:
    module_name = import_names.get(package, package.replace('-', '_'))
    try:
        # Probe by importing; an ImportError means the package is missing.
        __import__(module_name)
    except ImportError:
        print(f"Installing {package}...")
        install_package(package)
    else:
        print(f"{package} already installed")


In [None]:
import os
import glob
import PyPDF2
import fitz  # pymupdf
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import json
import re
from typing import List, Dict, Tuple
import urllib.parse
from IPython.display import HTML, display
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Make sure the NLTK 'punkt' tokenizer data is available: look it up first and
# download it only when the lookup fails with LookupError.
# NOTE(review): nothing in the visible cells calls an NLTK tokenizer, and newer
# NLTK releases may look for 'punkt_tab' instead of 'punkt' - confirm whether
# this download is still needed.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

print("All packages imported successfully!")


In [None]:
class PDFSearchSystem:
    """Hybrid (semantic + keyword) paragraph search over a directory of PDFs.

    Intended call order: ``load_all_pdfs()`` -> ``create_embeddings()`` ->
    ``search()`` -> ``format_search_results()``.
    """

    # Paragraphs must be longer than this many characters to be indexed.
    MIN_PARAGRAPH_CHARS = 50
    # Query terms must be longer than this many characters to be highlighted.
    MIN_TERM_CHARS = 2
    # Opening tag wrapped around every highlighted match.
    MARK_OPEN = '<mark style="background-color: yellow; padding: 2px;">'

    def __init__(self, pdf_directory="."):
        """Set up the models; no PDF is read until ``load_all_pdfs()`` is called.

        Args:
            pdf_directory: Root directory that is walked recursively for PDFs.
        """
        self.pdf_directory = pdf_directory
        self.documents = []          # one dict per extracted paragraph
        self.embeddings = None       # paragraph embeddings (set by create_embeddings)
        self.index = None            # FAISS index over self.embeddings
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
        self.tfidf_matrix = None     # TF-IDF rows aligned with self.documents

    def extract_text_from_pdf(self, pdf_path: str) -> List[Dict]:
        """Extract the paragraphs of one PDF together with their page metadata.

        Page text is split on blank lines; only paragraphs longer than
        ``MIN_PARAGRAPH_CHARS`` are kept.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            A list of dicts with the keys ``file_name``, ``file_path``,
            ``page_number`` (1-based), ``paragraph_index``, ``text`` and
            ``url``. Extraction is best-effort: on an error the message is
            printed and whatever was collected so far is returned.
        """
        documents = []
        file_name = os.path.basename(pdf_path)

        try:
            # The context manager closes the document even when a page fails
            # part-way through, so the file handle is never leaked.
            with fitz.open(pdf_path) as doc:
                for page_number, page in enumerate(doc, start=1):
                    text = page.get_text()
                    if not text.strip():
                        continue  # skip empty pages

                    page_url = self.generate_pdf_url(pdf_path, page_number)
                    paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

                    for i, paragraph in enumerate(paragraphs):
                        if len(paragraph) > self.MIN_PARAGRAPH_CHARS:
                            documents.append({
                                'file_name': file_name,
                                'file_path': pdf_path,
                                'page_number': page_number,
                                'paragraph_index': i,
                                'text': paragraph,
                                'url': page_url,
                            })

            print(f"Extracted {len(documents)} paragraphs from {file_name}")

        except Exception as e:
            # Deliberate best-effort: one unreadable PDF must not abort the run.
            print(f"Error processing {pdf_path}: {str(e)}")

        return documents

    def generate_pdf_url(self, pdf_path: str, page_number: int) -> str:
        """Build a ``file://`` URL that opens the PDF at ``page_number``.

        The path is made absolute and percent-encoded, so spaces, ``#`` and
        other reserved characters in file names cannot break the link or be
        mistaken for the ``#page=`` fragment.

        Args:
            pdf_path: Path of the PDF (relative or absolute).
            page_number: 1-based page number to open.

        Returns:
            The encoded file URL with a ``#page=N`` fragment appended.
        """
        from pathlib import Path

        file_url = Path(os.path.abspath(pdf_path)).as_uri()
        return f"{file_url}#page={page_number}"

    def load_all_pdfs(self):
        """Load every PDF under ``pdf_directory`` (recursively).

        Any previously built index is discarded, because its row numbers
        would no longer correspond to the reloaded ``self.documents``.

        Returns:
            The list of paragraph dicts, also stored in ``self.documents``.
        """
        pdf_files = []
        for root, _dirs, files in os.walk(self.pdf_directory):
            for file_name in files:
                if file_name.lower().endswith('.pdf'):
                    pdf_files.append(os.path.join(root, file_name))
        # Sort so the document order does not depend on directory listing order.
        pdf_files.sort()

        print(f"Found {len(pdf_files)} PDF files")

        all_documents = []
        for pdf_file in pdf_files:
            all_documents.extend(self.extract_text_from_pdf(pdf_file))

        self.documents = all_documents
        # Invalidate search structures built for the previous document set.
        self.embeddings = None
        self.index = None
        self.tfidf_matrix = None
        print(f"Total paragraphs extracted: {len(self.documents)}")

        return self.documents

    def create_embeddings(self):
        """Build the FAISS index and the TF-IDF matrix for ``self.documents``.

        Prints a hint and returns without doing anything when no documents
        have been loaded.
        """
        if not self.documents:
            print("No documents loaded. Please run load_all_pdfs() first.")
            return

        print("Creating embeddings...")
        texts = [doc['text'] for doc in self.documents]

        self.embeddings = self.model.encode(texts)

        # Inner product over L2-normalised vectors equals cosine similarity.
        dimension = self.embeddings.shape[1]
        self.index = faiss.IndexFlatIP(dimension)

        # normalize_L2 works in place, so self.embeddings is stored normalised.
        faiss.normalize_L2(self.embeddings)
        self.index.add(self.embeddings)

        # TF-IDF matrix for the keyword half of the hybrid score.
        self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)

        print("Embeddings created successfully!")

    def search(self, query: str, top_k: int = 5, hybrid_weight: float = 0.7) -> List[Dict]:
        """Rank paragraphs by a weighted mix of semantic and keyword similarity.

        Only the ``2 * top_k`` nearest semantic neighbours receive a semantic
        component; every paragraph receives a keyword component.

        Args:
            query: Free-text search query.
            top_k: Maximum number of results to return.
            hybrid_weight: Weight of the semantic score; the keyword score is
                weighted by ``1 - hybrid_weight``.

        Returns:
            Up to ``top_k`` copies of the paragraph dicts, best first, each
            extended with ``relevance_score`` and ``highlighted_text``.
            Returns ``[]`` when the index has not been built or ``top_k`` is
            not positive.
        """
        if self.embeddings is None or self.index is None or self.tfidf_matrix is None:
            print("Embeddings not created. Please run create_embeddings() first.")
            return []

        if top_k <= 0:
            return []

        doc_count = len(self.documents)
        # Never request more neighbours than the index holds: FAISS pads the
        # missing slots with index -1.
        neighbour_count = min(top_k * 2, self.index.ntotal)
        if neighbour_count <= 0:
            return []

        # Semantic search using embeddings
        query_embedding = self.model.encode([query])
        faiss.normalize_L2(query_embedding)
        semantic_scores, semantic_indices = self.index.search(query_embedding, neighbour_count)

        # Keyword search using TF-IDF
        query_tfidf = self.tfidf_vectorizer.transform([query])
        keyword_scores = cosine_similarity(query_tfidf, self.tfidf_matrix)[0]

        keyword_weight = 1 - hybrid_weight
        final_scores = {}

        for score, idx in zip(semantic_scores[0], semantic_indices[0]):
            idx = int(idx)
            # Reject the -1 padding as well as out-of-range rows; a plain
            # `idx < doc_count` test would let -1 alias the last document.
            if 0 <= idx < doc_count:
                final_scores[idx] = hybrid_weight * float(score)

        for idx, score in enumerate(keyword_scores):
            final_scores[idx] = final_scores.get(idx, 0.0) + keyword_weight * float(score)

        ranked = sorted(final_scores.items(), key=lambda item: item[1], reverse=True)[:top_k]

        results = []
        for idx, score in ranked:
            doc = self.documents[idx].copy()
            doc['relevance_score'] = float(score)
            doc['highlighted_text'] = self.highlight_text(doc['text'], query)
            results.append(doc)

        return results

    def highlight_text(self, text: str, query: str) -> str:
        """Return ``text`` as HTML with the query terms wrapped in ``<mark>``.

        Matching is case-insensitive and done in a single pass, so the
        original casing of the text is preserved and a term can never match
        inside markup inserted for another term. All text is HTML-escaped.

        Args:
            text: Raw paragraph text.
            query: Search query; whitespace-separated terms longer than
                ``MIN_TERM_CHARS`` characters are highlighted.

        Returns:
            An HTML fragment safe to embed in element content.
        """
        import html

        # Longest terms first so the alternation prefers the longest match.
        terms = sorted(
            {term for term in query.lower().split() if len(term) > self.MIN_TERM_CHARS},
            key=lambda term: (-len(term), term),
        )
        if not terms:
            return html.escape(text, quote=False)

        pattern = re.compile("|".join(re.escape(term) for term in terms), re.IGNORECASE)

        pieces = []
        cursor = 0
        for match in pattern.finditer(text):
            pieces.append(html.escape(text[cursor:match.start()], quote=False))
            pieces.append(self.MARK_OPEN + html.escape(match.group(0), quote=False) + '</mark>')
            cursor = match.end()
        pieces.append(html.escape(text[cursor:], quote=False))

        return "".join(pieces)

    def format_search_results(self, results: List[Dict], query: str) -> str:
        """Render search results as an HTML fragment with clickable PDF links.

        The query, file name and URL are HTML-escaped; ``highlighted_text``
        is inserted as-is because it is already an HTML fragment.

        Args:
            results: Result dicts as returned by ``search()``.
            query: The query shown in the heading.

        Returns:
            The HTML string, or a "no results" paragraph for an empty list.
        """
        import html

        if not results:
            return "<p>No relevant documents found.</p>"

        sections = [f"<h3>Search Results for: '{html.escape(query)}'</h3>\n"]

        for i, result in enumerate(results, 1):
            score = result['relevance_score']
            file_name = html.escape(str(result['file_name']))
            page_number = result['page_number']
            highlighted = result['highlighted_text']
            url = html.escape(str(result['url']), quote=True)
            sections.append(f"""
            <div style="border: 1px solid #ddd; padding: 15px; margin: 10px 0; border-radius: 5px; background-color: #f9f9f9;">
                <h4>Result {i} - Relevance Score: {score:.3f}</h4>
                <p><strong>Document:</strong> {file_name}</p>
                <p><strong>Page:</strong> {page_number}</p>
                <p><strong>Content:</strong></p>
                <div style="background-color: white; padding: 10px; border-left: 4px solid #007acc; margin: 10px 0;">
                    {highlighted}
                </div>
                <p><a href="{url}" target="_blank" style="color: #007acc; text-decoration: none; font-weight: bold;">📄 Open PDF at Page {page_number}</a></p>
            </div>
            """)

        return "".join(sections)


In [None]:
# Build the search system rooted at the current directory and read every PDF
pdf_search = PDFSearchSystem(".")
documents = pdf_search.load_all_pdfs()

# Print a per-file overview of what was extracted
if not documents:
    print("No documents were loaded. Please check your PDF files.")
else:
    df = pd.DataFrame(documents)
    print("\n=== Document Summary ===")
    print(f"Total paragraphs: {len(documents)}")
    print("\nDocuments by file:")
    print(df['file_name'].value_counts())
    print("\nPages covered:")
    # sort=False keeps the files in order of first appearance
    page_ranges = df.groupby('file_name', sort=False)['page_number'].agg(
        first_page='min', last_page='max'
    )
    for summary in page_ranges.itertuples():
        print(f"  {summary.Index}: Pages {summary.first_page}-{summary.last_page}")


In [None]:
# Build the search structures for the loaded paragraphs. create_embeddings()
# fills the FAISS index and the TF-IDF matrix on pdf_search; if no documents
# were loaded it only prints a hint and returns.
print("Creating embeddings for all documents...")
pdf_search.create_embeddings()
print("\nReady for search queries!")


In [None]:
# Interactive search function
def search_pdfs(query: str, num_results: int = 5):
    """Run a query against the global ``pdf_search`` and render the hits.

    Args:
        query: Free-text search query.
        num_results: Maximum number of results requested from the search.

    Returns:
        A dict with the keys ``query``, ``total_results`` and ``results``;
        ``results`` is an empty list when nothing was found.
    """
    print(f"Searching for: '{query}'...\n")

    matches = pdf_search.search(query, top_k=num_results)

    # Nothing found: report it and hand back an empty payload.
    if not matches:
        print("No relevant documents found.")
        return {'query': query, 'total_results': 0, 'results': []}

    # Show the formatted HTML inline in the notebook ...
    display(HTML(pdf_search.format_search_results(matches, query)))

    # ... and return the structured data for API use.
    return {
        'query': query,
        'total_results': len(matches),
        'results': matches
    }

# Usage hints: emitted as one newline-joined print instead of separate calls
usage_lines = [
    "=== Example Search Queries ===",
    "You can now search using the search_pdfs() function.",
    "Examples:",
    "  search_pdfs('data protection rights')",
    "  search_pdfs('GDPR compliance')",
    "  search_pdfs('consent processing')",
    "  search_pdfs('personal data breach')",
]
print("\n".join(usage_lines))


In [None]:
# Example search - GDPR compliance
# Requests at most 3 results; search_pdfs displays them and returns a dict
# with the keys 'query', 'total_results' and 'results'.
search_result = search_pdfs('GDPR compliance requirements', 3)


In [None]:
# Persist the search artefacts so the Flask API can reuse them
import pickle

# Snapshot of the attributes to save. The FAISS index and the sentence model
# are not part of this dict.
# NOTE(review): pickle files must only be loaded from a trusted location -
# confirm the Flask side reads this file from a path users cannot write to.
search_data = {
    field: getattr(pdf_search, field)
    for field in ('documents', 'embeddings', 'tfidf_vectorizer', 'tfidf_matrix')
}

with open('pdf_search_data.pkl', 'wb') as f:
    pickle.dump(search_data, f)

print("Search system data saved successfully!")
print("This data will be used by the Flask API.")
