# PDF Question Answering System with FastAPI

This notebook implements a comprehensive PDF Question Answering system using Azure OpenAI with FastAPI. 

## Features:
- **FastAPI REST API** with automatic documentation
- **OCR Support** for image-based PDFs (PyMuPDF, PDF2Image)
- **Azure OpenAI GPT-4** for intelligent question answering
- **Interactive Swagger UI** at `/docs` for API testing
- **Multiple PDF Support** with content inspection
- **Type Safety** with Pydantic models
- **High Performance** async API server

## API Endpoints:
- `POST /ask` - Ask questions about PDFs
- `GET /health` - Health check and status
- `GET /pdfs` - List available PDFs  
- `GET /inspect/{pdf_name}` - Inspect PDF content
- `GET /docs` - Interactive API documentation

In [21]:
import os
from typing import List, Dict, Optional
import json

# ---------- Azure OpenAI Setup ----------
def setup_azure_openai():
    """Setup Azure OpenAI client"""
    try:
        from openai import AzureOpenAI
        
        # Azure OpenAI configuration
        client = AzureOpenAI(
            api_key="8OgwTbueNSFrNWeEUZ2tOgnlVwYC7PXLiULoOZKz6JQgWkNcWjucJQQJ99BHACL93NaXJ3w3AAAAACOGzn2y",  # Replace with your Azure OpenAI API key
            api_version="2024-02-01",
            azure_endpoint="https://azureaitestenv.cognitiveservices.azure.com/"  # Replace with your Azure OpenAI endpoint
        )
        deployment = "gpt-4"  # Replace with your deployment name
        
        return client, deployment
    except Exception as e:
        print(f"Azure OpenAI setup failed: {e}")
        print("Using mock client for demonstration...")
        
        # Mock client for demonstration
        class MockClient:
            class Chat:
                class Completions:
                    def create(self, **kwargs):
                        class MockResponse:
                            def __init__(self):
                                self.choices = [MockChoice()]
                        class MockChoice:
                            def __init__(self):
                                self.message = MockMessage()
                        class MockMessage:
                            def __init__(self):
                                self.content = '{"answer": "Mock response - please configure Azure OpenAI credentials", "confidence": 0.5, "language": "en", "citations": []}'
                        return MockResponse()
                
                def __init__(self):
                    self.completions = self.Completions()
            
            def __init__(self):
                self.chat = self.Chat()
        
        return MockClient(), "mock-deployment"

# Setup Azure OpenAI
client, deployment = setup_azure_openai()
print(f"✅ Azure OpenAI client configured with deployment: {deployment}")

# ---------- Install required packages for OCR ----------
def install_ocr_packages():
    """Install required packages for OCR functionality"""
    import subprocess
    import sys
    
    packages = [
        'pymupdf',
        'pillow', 
        'pytesseract',
        'pdf2image'
    ]
    
    for package in packages:
        try:
            __import__(package.replace('-', '_'))
            print(f"✓ {package} already installed")
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            print(f"✓ {package} installed successfully")

# Install packages
install_ocr_packages()

# ---------- Enhanced PDF text extraction with OCR ----------
def extract_pdf_text_per_page(pdf_path: str, use_ocr: bool = True) -> List[str]:
    """
    Extract text per page using PyMuPDF (fitz) with OCR fallback.
    
    Args:
        pdf_path: Path to PDF file
        use_ocr: Whether to use OCR for image-based content
    
    Returns:
        List of extracted text per page
    """
    try:
        import fitz  # PyMuPDF
        if use_ocr:
            import pytesseract
            from PIL import Image
            import io
    except ImportError as e:
        print(f"Import error: {e}")
        print("Please ensure all required packages are installed")
        return []

    doc = fitz.open(pdf_path)
    pages = []
    
    for page_num, page in enumerate(doc, 1):
        
        
        # First, try to extract text directly
        text = page.get_text("text")
        
        # If no text found or very little text, try OCR
        if use_ocr and (not text.strip() or len(text.strip()) < 50):
            print(f"  Using OCR for page {page_num} (little/no direct text found)")
            try:
                # Convert page to image
                pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))  # 2x zoom for better OCR
                img_data = pix.tobytes("png")
                img = Image.open(io.BytesIO(img_data))
                
                # Perform OCR
                ocr_text = pytesseract.image_to_string(img, lang='eng')
                
                # Use OCR text if it's longer than direct extraction
                if len(ocr_text.strip()) > len(text.strip()):
                    text = ocr_text
                    print(f"  OCR extracted {len(ocr_text)} characters")
                    
            except Exception as ocr_error:
                print(f"  OCR failed for page {page_num}: {ocr_error}")
                # Fallback to blocks method
                text_blocks = page.get_text("blocks")
                if isinstance(text_blocks, list):
                    text = "\n".join([b[4] for b in text_blocks if len(b) >= 5 and isinstance(b[4], str)])
        
        # Fallback to blocks if still no text
        if not text.strip():
            text_blocks = page.get_text("blocks")
            if isinstance(text_blocks, list):
                text = "\n".join([b[4] for b in text_blocks if len(b) >= 5 and isinstance(b[4], str)])
        
        pages.append(text.strip())
    
    doc.close()
    print(f"✓ Extracted text from {len(pages)} pages")
    return pages

# ---------- Alternative OCR method using pdf2image ----------
def extract_pdf_text_with_pdf2image(pdf_path: str) -> List[str]:
    """
    Alternative OCR method using pdf2image + pytesseract
    Better for completely image-based PDFs
    """
    try:
        from pdf2image import convert_from_path
        import pytesseract
    except ImportError as e:
        print(f"Required packages missing: {e}")
        return []
    
    try:
        # Convert PDF to images
        print("Converting PDF to images...")
        images = convert_from_path(pdf_path, dpi=200)  # Higher DPI for better OCR
        
        pages = []
        for i, image in enumerate(images, 1):
            
            
            # Perform OCR on the image
            text = pytesseract.image_to_string(image, lang='eng')
            pages.append(text.strip())
            
        print(f"✓ OCR completed for {len(pages)} pages")
        return pages
        
    except Exception as e:
        print(f"Error in pdf2image OCR: {e}")
        return []

# ---------- Baseline prompt templates ----------
FREE_TEXT_PROMPT = """You are an assistant that answers questions strictly based on the provided PDF content.

Rules:
- Use only information contained in the PDF text.
- If the answer is found, give a concise and precise response.
- Always include the page number(s) and, if applicable, diagram/table references (if they appear in the text).
- If the information is not in the PDF, say exactly:
  "The provided PDF does not contain enough information to answer this question."
- Match the user's language: reply in English if the question is English; reply in Chinese if the question is Chinese.
- Do not add extra information from outside the PDF.

PDF CONTENT (paged):
{pdf_text_block}

QUESTION:
{user_question}

ANSWER:
"""

JSON_PROMPT = """You are an assistant that answers questions strictly from the provided PDF content.

Output JSON schema (return valid JSON only, no extra text):
{{
  "answer": "<concise answer or the exact string: The provided PDF does not contain enough information to answer this question.>",
  "language": "en|zh",
  "citations": [
    {{
      "page": <integer page number, 1-based>,
      "quote": "<short supporting snippet from that page>"
    }}
  ],
  "confidence": <float 0..1>
}}

Rules:
- Use only the PDF content below.
- If uncertain or unsupported by the text, use the exact insufficiency string above.
- Keep "answer" ≤ 120 words unless the question explicitly asks for a long explanation.
- Always include at least one citation when you provide a substantive answer.
- Match the user's language (English/Chinese) for "answer" and "language".
- Do not invent references.

PDF CONTENT (paged):
{pdf_text_block}

QUESTION:
{user_question}
"""

def build_paged_block(docs_pages: Dict[str, List[str]], max_chars: int = 120000) -> str:
    """
    Turn multiple PDFs into a single paged text block:
    === filename.pdf — Page 1 ===
    <text>
    === filename.pdf — Page 2 ===
    ...
    Hard-truncate by character count to keep under model limits.
    """
    lines = []
    total = 0
    for fname, pages in docs_pages.items():
        for i, text in enumerate(pages, start=1):
            header = f"=== {os.path.basename(fname)} — Page {i} ==="
            chunk = f"{header}\n{text}\n"
            if total + len(chunk) > max_chars:
                # stop when exceeding limit
                return "\n".join(lines)
            lines.append(chunk)
            total += len(chunk)
    return "\n".join(lines)

# ---------- Azure OpenAI caller ----------
def call_azure_openai(prompt: str, format_type: str = "free") -> str:
    """
    Calls Azure OpenAI chat completions endpoint.
    """
    if format_type == "json":
        system_message = "You only answer from the provided PDF content. Return valid JSON only."
    else:
        system_message = "You only answer from the provided PDF content."
    
    resp = client.chat.completions.create(
        model=deployment,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
        temperature=0.1,
        max_tokens=1000,
    )
    return resp.choices[0].message.content.strip()

# ---------- Enhanced QA function with OCR options ----------
def pdf_qa(pdf_path: str, question: str, format_type: str = "json", max_chars: int = 120000, 
           ocr_method: str = "pymupdf") -> str:
    """
    Enhanced PDF Question Answering with OCR support
    
    Args:
        pdf_path: Path to PDF file
        question: User question
        format_type: "json" or "free" 
        max_chars: Max characters from PDF to include
        ocr_method: "pymupdf" (default), "pdf2image", or "no_ocr"
    
    Returns:
        Answer as string
    """
    # 1) Read PDF
    if not os.path.exists(pdf_path):
        return f"Error: File not found: {pdf_path}"
    
    try:
        print(f"Extracting text using method: {ocr_method}")
        
        if ocr_method == "pdf2image":
            pages = extract_pdf_text_with_pdf2image(pdf_path)
        elif ocr_method == "no_ocr":
            pages = extract_pdf_text_per_page(pdf_path, use_ocr=False)
        else:  # pymupdf (default)
            pages = extract_pdf_text_per_page(pdf_path, use_ocr=True)
        
        if not pages:
            return "Error: Could not extract any text from PDF"
            
        docs_pages = {pdf_path: pages}
        
        # 2) Build paged text block
        pdf_text_block = build_paged_block(docs_pages, max_chars=max_chars)
        
        # 3) Build prompt
        if format_type == "json":
            prompt = JSON_PROMPT.format(pdf_text_block=pdf_text_block, user_question=question)
        else:
            prompt = FREE_TEXT_PROMPT.format(pdf_text_block=pdf_text_block, user_question=question)
        
        # 4) Call Azure OpenAI
        answer = call_azure_openai(prompt, format_type)
        
        return answer
        
    except Exception as e:
        return f"Error processing PDF: {str(e)}"

✅ Azure OpenAI client configured with deployment: gpt-4
✓ pymupdf already installed
Installing pillow...
✓ pillow installed successfully
✓ pytesseract already installed
✓ pdf2image already installed


In [22]:
# Test the PDF QA System with OCR capabilities
import json

# Define the PDF path
pdf_path = r"d:\KattSafe\harness-gear-operation-manual.pdf"

# Test Questions about Harness Gear
test_questions = [
    "What is this manual about?",
    "What are the safety requirements mentioned in the manual?", 
    "How do you operate the harness gear?",
    "What are the maintenance procedures?",
    "What is the weight capacity or load limits?",
    "What safety checks should be performed before use?"
]

# Test different OCR methods
ocr_methods = ["pymupdf", "pdf2image", "no_ocr"]

print("="*70)
print("PDF QUESTION ANSWERING SYSTEM - OCR COMPARISON TEST")
print("="*70)

# Test a single question with different OCR methods first
test_question = "What is this manual about?"
print(f"\n Testing OCR methods with question: '{test_question}'\n")

for method in ocr_methods:
    print(f"\n{'='*20} OCR Method: {method.upper()} {'='*20}")
    
    try:
        answer = pdf_qa(pdf_path, test_question, format_type="json", ocr_method=method)
        
        try:
            parsed_answer = json.loads(answer)
            print(f" Answer: {parsed_answer.get('answer', 'No answer found')}")
            print(f" Confidence: {parsed_answer.get('confidence', 'Unknown')}")
            
            citations = parsed_answer.get('citations', [])
            if citations:
                print(" Citations:")
                for cite in citations[:2]:  # Show max 2 citations
                    page = cite.get('page', 'Unknown')
                    quote = cite.get('quote', 'No quote')[:80] + "..." if len(cite.get('quote', '')) > 80 else cite.get('quote', 'No quote')
                    print(f"    Page {page}: {quote}")
        except json.JSONDecodeError:
            print(f" Raw Answer: {answer}")
            
    except Exception as e:
        print(f" Error with {method}: {str(e)}")

print(f"\n{'='*70}")
print("COMPREHENSIVE TEST - ALL QUESTIONS (using PyMuPDF with OCR)")
print("="*70)

# Test all questions with the best performing method (pymupdf)
for i, question in enumerate(test_questions, 1):
    print(f"\n Test {i}: {question}")
    print("-" * 50)
    
    try:
        # Get answer in JSON format using PyMuPDF with OCR
        answer = pdf_qa(pdf_path, question, format_type="json", ocr_method="pymupdf")
        
        try:
            parsed_answer = json.loads(answer)
            print(f" Answer: {parsed_answer.get('answer', 'No answer found')}")
            print(f" Language: {parsed_answer.get('language', 'Unknown')}")
            print(f" Confidence: {parsed_answer.get('confidence', 'Unknown')}")
            
            citations = parsed_answer.get('citations', [])
            if citations:
                print(" Citations:")
                for cite in citations[:3]:  # Show max 3 citations
                    page = cite.get('page', 'Unknown')
                    quote = cite.get('quote', 'No quote')[:100] + "..." if len(cite.get('quote', '')) > 100 else cite.get('quote', 'No quote')
                    print(f"    Page {page}: {quote}")
        except json.JSONDecodeError:
            print(f" Raw Answer: {answer}")
            
    except Exception as e:
        print(f" Error: {str(e)}")

print(f"\n{'='*70}")
print(" Testing completed!")

PDF QUESTION ANSWERING SYSTEM - OCR COMPARISON TEST

 Testing OCR methods with question: 'What is this manual about?'


Extracting text using method: pymupdf
✓ Extracted text from 19 pages
 Answer: This manual is about the operation of Kattsafe harness gear and equipment for personnel working at heights using a harness and lanyard fall protection system.
 Confidence: 1.0
 Citations:
    Page 2: Kattsafe harness gear and equipment for personnel working at heights using a har...

Extracting text using method: pdf2image
Converting PDF to images...
✓ OCR completed for 19 pages
 Answer: This manual is about Kattsafe harness gear and equipment for personnel working at heights using a harness and lanyard fall protection system.
 Confidence: 1.0
 Citations:
    Page 2: Kattsafe harness gear and equipment for personnel working at heights using a har...

Extracting text using method: no_ocr
✓ Extracted text from 19 pages
 Answer: This manual is about the operation of Kattsafe harness gear and eq

In [23]:
# Enhanced Interactive Question Testing Function with OCR
def ask_pdf_question(question: str, pdf_file: str = None, format_type: str = "json", ocr_method: str = "pymupdf") -> None:
    """
    Enhanced interactive function to ask questions about PDFs with OCR support
    
    Args:
        question: The question to ask
        pdf_file: Path to PDF file (default: harness gear manual)
        format_type: "json" or "free"
        ocr_method: "pymupdf", "pdf2image", or "no_ocr"
    """
    # Use default PDF if not specified
    if pdf_file is None:
        pdf_file = pdf_path
    
    print(f"\n Question: {question}")
    print(f" PDF: {os.path.basename(pdf_file)}")
    print(f" OCR Method: {ocr_method}")
    print("-" * 50)
    
    try:
        answer = pdf_qa(pdf_file, question, format_type=format_type, ocr_method=ocr_method)
        
        if format_type == "json":
            try:
                parsed = json.loads(answer)
                print(f" Answer: {parsed.get('answer', 'No answer')}")
                print(f" Confidence: {parsed.get('confidence', 'Unknown')}")
                print(f" Language: {parsed.get('language', 'Unknown')}")
                
                citations = parsed.get('citations', [])
                if citations:
                    print(" Sources:")
                    for cite in citations:
                        page = cite.get('page', '?')
                        quote = cite.get('quote', '')[:150] + "..." if len(cite.get('quote', '')) > 150 else cite.get('quote', '')
                        print(f"    Page {page}: {quote}")
            except json.JSONDecodeError:
                print(f" Raw Answer: {answer}")
        else:
            print(f" Answer: {answer}")
            
    except Exception as e:
        print(f" Error: {str(e)}")

# Convenience functions for quick testing
def test_harness_gear(question: str, ocr_method: str = "pymupdf"):
    """Quick test for harness gear manual"""
    ask_pdf_question(question, pdf_path, ocr_method=ocr_method)

def test_skylight_mesh(question: str, ocr_method: str = "pymupdf"):
    """Quick test for skylight mesh PDF"""
    skylight_pdf = r"d:\KattSafe\2920-sp391-sp392-skylight-mesh-fixing-details-with-clip.pdf"
    ask_pdf_question(question, skylight_pdf, ocr_method=ocr_method)


In [24]:
# Enhanced PDF Content Inspection with OCR
def inspect_pdf_content(pdf_path: str, max_pages: int = 3, use_ocr: bool = True) -> None:
    """
    Inspect the content extracted from the PDF to understand what text is available
    
    Args:
        pdf_path: Path to the PDF file
        max_pages: Maximum number of pages to show (default: 3)
        use_ocr: Whether to use OCR for extraction
    """
    print(f" Inspecting PDF content: {os.path.basename(pdf_path)}")
    print(f" OCR enabled: {use_ocr}")
    print("="*60)
    
    try:
        pages = extract_pdf_text_per_page(pdf_path, use_ocr=use_ocr)
        print(f" Total pages found: {len(pages)}")
        
        for i, page_text in enumerate(pages[:max_pages], 1):
            print(f"\n Page {i} (first 500 characters):")
            print("-" * 40)
            if page_text.strip():
                preview = page_text.strip()[:500]
                if len(page_text.strip()) > 500:
                    preview += "..."
                print(preview)
                print(f"\n Total characters on page {i}: {len(page_text.strip())}")
            else:
                print("(No text content found on this page)")
                
        if len(pages) > max_pages:
            print(f"\n... and {len(pages) - max_pages} more pages")
            
    except Exception as e:
        print(f" Error inspecting PDF: {str(e)}")

# Inspect both PDFs with OCR
print("="*70)
print("PDF CONTENT INSPECTION WITH OCR")
print("="*70)

# Inspect the harness gear manual
print("\n🔧 HARNESS GEAR OPERATION MANUAL")
inspect_pdf_content(pdf_path, max_pages=2, use_ocr=True)

# Inspect the skylight mesh fixing details if it exists
skylight_pdf = r"d:\KattSafe\2920-sp391-sp392-skylight-mesh-fixing-details-with-clip.pdf"
if os.path.exists(skylight_pdf):
    print(f"\n SKYLIGHT MESH FIXING DETAILS")
    inspect_pdf_content(skylight_pdf, max_pages=2, use_ocr=True)
else:
    print(f"\n Skylight PDF not found: {skylight_pdf}")

PDF CONTENT INSPECTION WITH OCR

🔧 HARNESS GEAR OPERATION MANUAL
 Inspecting PDF content: harness-gear-operation-manual.pdf
 OCR enabled: True
✓ Extracted text from 19 pages
 Total pages found: 19

 Page 1 (first 500 characters):
----------------------------------------
Manual must be read and
understood prior to use install
HARNESS 
GEAR
OPERATION MANUAL

 Total characters on page 1: 86

 Page 2 (first 500 characters):
----------------------------------------
Kattsafe harness gear and 
equipment for personnel working at 
heights using a harness and lanyard 
fall protection system.
Find all related products and resources on our website 
kattsafe.com.au
Operation manual
Harness gear
Product brochure
Harness gear
2
Kattsafe / Harness gear / Operation manual

 Total characters on page 2: 300

... and 17 more pages

 SKYLIGHT MESH FIXING DETAILS
 Inspecting PDF content: 2920-sp391-sp392-skylight-mesh-fixing-details-with-clip.pdf
 OCR enabled: True
✓ Extracted text from 1 pages
 Total pages

In [25]:
# Test OCR capabilities with the Skylight Mesh Fixing Details PDF
print("="*70)
print("TESTING OCR ON SKYLIGHT MESH FIXING DETAILS PDF")
print("="*70)

# Define the second PDF path
skylight_pdf = r"d:\KattSafe\2920-sp391-sp392-skylight-mesh-fixing-details-with-clip.pdf"

# Test questions specific to skylight mesh fixing
skylight_questions = [
    "What are the fixing details mentioned in this document?",
    "What are the specifications for SP391 and SP392?",
    "What type of clips are used in the skylight mesh fixing?",
    "What are the installation procedures?",
    "What materials or components are required?"
]

# Test with different OCR methods on the skylight PDF
print(f"\n Testing skylight PDF with different OCR methods:")

for method in ["pymupdf", "pdf2image"]:
    print(f"\n{'='*15} OCR Method: {method.upper()} {'='*15}")
    
    # First, inspect the content
    print(f"Inspecting content with {method}...")
    try:
        if method == "pdf2image":
            pages = extract_pdf_text_with_pdf2image(skylight_pdf)
        else:
            pages = extract_pdf_text_per_page(skylight_pdf, use_ocr=True)
        
        print(f" Pages extracted: {len(pages)}")
        
        # Show preview of first page
        if pages and pages[0].strip():
            preview = pages[0].strip()[:300]
            print(f" First page preview: {preview}...")
        else:
            print(" No text content found on first page")
            
        # Test one question
        if pages:
            test_q = "What are the fixing details mentioned in this document?"
            answer = pdf_qa(skylight_pdf, test_q, format_type="json", ocr_method=method)
            
            try:
                parsed = json.loads(answer)
                print(f" Q: {test_q}")
                print(f" A: {parsed.get('answer', 'No answer')}")
                print(f" Confidence: {parsed.get('confidence', 'Unknown')}")
            except json.JSONDecodeError:
                print(f" Raw Answer: {answer[:200]}...")
                
    except Exception as e:
        print(f" Error with {method}: {str(e)}")



TESTING OCR ON SKYLIGHT MESH FIXING DETAILS PDF

 Testing skylight PDF with different OCR methods:

Inspecting content with pymupdf...
✓ Extracted text from 1 pages
 Pages extracted: 1
 First page preview: 8mm Bulb-tite Rivet
S/S Clip
Rubber Seal
Steel Roof Sheet
MIN 6 FIXINGS PER MESH SECTION
SKYLIGHT
METAL ROOF DECK
RIVET MUST BE FIXED TO METAL ROOF DECK
A   1029 Mountain Hwy Boronia VIC 3138
T   +61 1300 301 755
E   sales@kattsafe.com.au
W  kattsafe.com.au
Description
Skylight mesh - fixing details...
Extracting text using method: pymupdf
✓ Extracted text from 1 pages
 Q: What are the fixing details mentioned in this document?
 A: The fixing details mentioned include using an 8mm Bulb-tite Rivet, S/S Clip, and Rubber Seal. The mesh section requires a minimum of 6 fixings and the rivet must be fixed to the metal roof deck.
 Confidence: 1.0

Inspecting content with pdf2image...
Converting PDF to images...
✓ OCR completed for 1 pages
 Pages extracted: 1
 First page preview: 8mm Bulb-ti

In [26]:
# FastAPI Implementation for PDF Question Answering

# Install required packages
def install_fastapi_packages():
    """Install FastAPI and related packages"""
    import subprocess
    import sys
    
    packages = [
        'fastapi',
        'uvicorn[standard]',
        'python-multipart'
    ]
    
    for package in packages:
        try:
            if package == 'uvicorn[standard]':
                __import__('uvicorn')
            else:
                __import__(package.replace('-', '_'))
            print(f"✓ {package} already installed")
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
            print(f"✓ {package} installed successfully")

install_fastapi_packages()

from fastapi import FastAPI, HTTPException, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import Optional, List, Dict
import uvicorn
import asyncio
import threading

# Pydantic models for request/response
class QuestionRequest(BaseModel):
    question: str
    pdf: str = "harness_gear"
    ocr_method: str = "pymupdf"
    format: str = "json"

class Citation(BaseModel):
    page: int
    quote: str

class QuestionResponse(BaseModel):
    success: bool
    pdf: str
    question: str
    ocr_method: str
    answer: str
    confidence: Optional[float] = None
    language: Optional[str] = None
    citations: Optional[List[Citation]] = None
    format: Optional[str] = None

class HealthResponse(BaseModel):
    status: str
    message: str
    available_pdfs: List[str]
    ocr_methods: List[str]

class PDFInfo(BaseModel):
    path: str
    exists: bool
    filename: str

class PDFListResponse(BaseModel):
    available_pdfs: Dict[str, PDFInfo]
    total_count: int

class PDFInspectionResponse(BaseModel):
    pdf_name: str
    filename: str
    total_pages: int
    total_characters: int
    preview_pages: List[Dict]

# Create FastAPI app
fastapi_app = FastAPI(
    title="PDF Question Answering API",
    description="Advanced PDF Question Answering system with OCR capabilities using Azure OpenAI",
    version="2.0.0",
    docs_url="/docs",
    redoc_url="/redoc"
)

# Add CORS middleware
fastapi_app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Global variables for PDF paths
FASTAPI_AVAILABLE_PDFS = {
    "harness_gear": r"d:\KattSafe\harness-gear-operation-manual.pdf",
    "skylight_mesh": r"d:\KattSafe\2920-sp391-sp392-skylight-mesh-fixing-details-with-clip.pdf"
}

@fastapi_app.get("/health", response_model=HealthResponse)
async def health_check():
    """Health check endpoint with API status"""
    return HealthResponse(
        status="healthy",
        message="PDF QA FastAPI is running",
        available_pdfs=list(FASTAPI_AVAILABLE_PDFS.keys()),
        ocr_methods=["pymupdf", "pdf2image", "no_ocr"]
    )

@fastapi_app.post("/ask", response_model=QuestionResponse)
async def ask_question(request: QuestionRequest):
    """
    Ask questions about PDF documents with OCR support
    
    - **question**: The question to ask about the PDF
    - **pdf**: PDF identifier ('harness_gear', 'skylight_mesh') or full path
    - **ocr_method**: OCR method to use ('pymupdf', 'pdf2image', 'no_ocr')
    - **format**: Response format ('json' or 'free')
    """
    try:
        # Determine PDF path
        if request.pdf in FASTAPI_AVAILABLE_PDFS:
            pdf_path = FASTAPI_AVAILABLE_PDFS[request.pdf]
        else:
            pdf_path = request.pdf
        
        # Check if PDF exists
        if not os.path.exists(pdf_path):
            raise HTTPException(
                status_code=404,
                detail={
                    "error": f"PDF not found: {pdf_path}",
                    "available_pdfs": list(FASTAPI_AVAILABLE_PDFS.keys())
                }
            )
        
        # Process the question (run in thread to avoid blocking)
        def process_question():
            return pdf_qa(pdf_path, request.question, 
                         format_type=request.format, 
                         ocr_method=request.ocr_method)
        
        # Run in executor to avoid blocking
        import concurrent.futures
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future = executor.submit(process_question)
            answer = future.result(timeout=60)  # 60 second timeout
        
        # Parse answer if JSON format
        if request.format == "json":
            try:
                parsed_answer = json.loads(answer)
                citations = [
                    Citation(page=cite.get('page', 0), quote=cite.get('quote', ''))
                    for cite in parsed_answer.get('citations', [])
                ]
                
                return QuestionResponse(
                    success=True,
                    pdf=os.path.basename(pdf_path),
                    question=request.question,
                    ocr_method=request.ocr_method,
                    answer=parsed_answer.get('answer', 'No answer'),
                    confidence=parsed_answer.get('confidence', 0),
                    language=parsed_answer.get('language', 'en'),
                    citations=citations
                )
            except json.JSONDecodeError:
                return QuestionResponse(
                    success=True,
                    pdf=os.path.basename(pdf_path),
                    question=request.question,
                    ocr_method=request.ocr_method,
                    answer=answer,
                    format="raw_text"
                )
        else:
            return QuestionResponse(
                success=True,
                pdf=os.path.basename(pdf_path),
                question=request.question,
                ocr_method=request.ocr_method,
                answer=answer,
                format="free_text"
            )
            
    except concurrent.futures.TimeoutError:
        raise HTTPException(status_code=408, detail="Request timeout - processing took too long")
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail={
                "error": f"Processing error: {str(e)}",
                "success": False
            }
        )

@fastapi_app.get("/pdfs", response_model=PDFListResponse)
async def list_pdfs():
    """List all available PDF documents"""
    pdf_info = {}
    for name, path in FASTAPI_AVAILABLE_PDFS.items():
        pdf_info[name] = PDFInfo(
            path=path,
            exists=os.path.exists(path),
            filename=os.path.basename(path)
        )
    
    return PDFListResponse(
        available_pdfs=pdf_info,
        total_count=len(FASTAPI_AVAILABLE_PDFS)
    )

@fastapi_app.get("/inspect/{pdf_name}", response_model=PDFInspectionResponse)
async def inspect_pdf(pdf_name: str):
    """Inspect PDF content and structure"""
    if pdf_name not in FASTAPI_AVAILABLE_PDFS:
        raise HTTPException(
            status_code=404,
            detail=f"PDF '{pdf_name}' not found. Available: {list(FASTAPI_AVAILABLE_PDFS.keys())}"
        )
    
    pdf_path = FASTAPI_AVAILABLE_PDFS[pdf_name]
    if not os.path.exists(pdf_path):
        raise HTTPException(
            status_code=404,
            detail=f"PDF file does not exist: {pdf_path}"
        )
    
    try:
        # Run PDF extraction in thread
        def extract_pages():
            return extract_pdf_text_per_page(pdf_path, use_ocr=True)
        
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future = executor.submit(extract_pages)
            pages = future.result(timeout=30)
        
        preview_pages = []
        for i, page_text in enumerate(pages[:3], 1):  # Show first 3 pages
            preview_pages.append({
                "page_number": i,
                "character_count": len(page_text.strip()),
                "preview": page_text.strip()[:300] + "..." if len(page_text.strip()) > 300 else page_text.strip()
            })
        
        return PDFInspectionResponse(
            pdf_name=pdf_name,
            filename=os.path.basename(pdf_path),
            total_pages=len(pages),
            preview_pages=preview_pages,
            total_characters=sum(len(page.strip()) for page in pages)
        )
        
    except concurrent.futures.TimeoutError:
        raise HTTPException(status_code=408, detail="PDF inspection timeout")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error inspecting PDF: {str(e)}")

@fastapi_app.get("/")
async def root():
    """Root endpoint with API information"""
    return {
        "message": "PDF Question Answering API with FastAPI",
        "version": "2.0.0",
        "docs": "/docs",
        "redoc": "/redoc",
        "endpoints": {
            "health": "/health",
            "ask": "/ask (POST)",
            "pdfs": "/pdfs",
            "inspect": "/inspect/{pdf_name}"
        }
    }



✓ fastapi already installed
✓ uvicorn[standard] already installed
✓ python-multipart already installed


In [27]:
# Start FastAPI Server

import uvicorn
import threading
import time

def run_fastapi_server(host="localhost", port=8000):
    """Run FastAPI server with uvicorn"""
    print(f" Starting FastAPI PDF QA server...")
    print(f" Server will run at: http://{host}:{port}")
    print(f" Interactive API docs: http://{host}:{port}/docs")
    print(f" Alternative docs: http://{host}:{port}/redoc")
    print(f" Health check: http://{host}:{port}/health")
    
    # Configure uvicorn
    config = uvicorn.Config(
        app=fastapi_app,
        host=host,
        port=port,
        reload=False,
        log_level="info"
    )
    server = uvicorn.Server(config)
    server.run()

# Start FastAPI server in a background thread
def start_fastapi_thread():
    """Start FastAPI server in a separate thread"""
    run_fastapi_server(host="localhost", port=8000)



# Start the server in a background thread
fastapi_thread = threading.Thread(target=start_fastapi_thread, daemon=True)
fastapi_thread.start()

# Give the server time to start
time.sleep(3)



 Starting FastAPI PDF QA server...
 Server will run at: http://localhost:8000
 Interactive API docs: http://localhost:8000/docs
 Alternative docs: http://localhost:8000/redoc
 Health check: http://localhost:8000/health


INFO:     Started server process [20376]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
ERROR:    [Errno 10048] error while attempting to bind on address ('::1', 8000, 0, 0): 通常每个套接字地址(协议/网络地址/端口)只允许使用一次。
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.


In [28]:
# FastAPI Test Client and API Documentation

import requests
import json
import time

def test_fastapi_client():
    """Test the FastAPI endpoints"""
    base_url = "http://localhost:8000"
    
    # Test 1: Health Check
    print("\n1️⃣ Testing FastAPI Health Check...")
    try:
        response = requests.get(f"{base_url}/health", timeout=5)
        print(f"Status: {response.status_code}")
        print(f"Response: {response.json()}")
    except Exception as e:
        print(f" Health check failed: {e}")
        return
    
    # Test 2: Root endpoint
    print("\n2️⃣ Testing Root Endpoint...")
    try:
        response = requests.get(f"{base_url}/", timeout=5)
        print(f"Status: {response.status_code}")
        result = response.json()
        print(f"Message: {result.get('message')}")
        print(f"Version: {result.get('version')}")
        print(f"Docs URL: {result.get('docs')}")
    except Exception as e:
        print(f" Root endpoint failed: {e}")
    
    # Test 3: List PDFs
    print("\n3️ Testing List PDFs...")
    try:
        response = requests.get(f"{base_url}/pdfs", timeout=5)
        print(f"Status: {response.status_code}")
        result = response.json()
        print(f"Total PDFs: {result.get('total_count')}")
        for name, info in result.get('available_pdfs', {}).items():
            status = "✅" if info.get('exists') else ""
            print(f"  {name}: {info.get('filename')} {status}")
    except Exception as e:
        print(f" List PDFs failed: {e}")
    
    # Test 4: Ask Question
    print("\n4️⃣ Testing Question Answering...")
    test_data = {
        "question": "What is this manual about?",
        "pdf": "harness_gear",
        "ocr_method": "pymupdf",
        "format": "json"
    }
    
    try:
        start_time = time.time()
        response = requests.post(
            f"{base_url}/ask", 
            json=test_data,
            headers={"Content-Type": "application/json"},
            timeout=30
        )
        end_time = time.time()
        
        print(f"Status: {response.status_code}")
        print(f"Response time: {end_time - start_time:.2f} seconds")
        
        result = response.json()
        if result.get('success'):
            print(f" Answer: {result.get('answer')}")
            print(f" Confidence: {result.get('confidence')}")
            print(f" Citations: {len(result.get('citations', []))}")
            print(f" Language: {result.get('language')}")
        else:
            print(f" Error: {result}")
    except Exception as e:
        print(f" Question answering failed: {e}")
    
    # Test 5: Inspect PDF
    print("\n5️ Testing PDF Inspection...")
    try:
        response = requests.get(f"{base_url}/inspect/harness_gear", timeout=15)
        print(f"Status: {response.status_code}")
        result = response.json()
        print(f" PDF: {result.get('pdf_name')}")
        print(f" Total pages: {result.get('total_pages')}")
        print(f" Total characters: {result.get('total_characters')}")
        print(f" Preview pages: {len(result.get('preview_pages', []))}")
    except Exception as e:
        print(f" PDF inspection failed: {e}")

# FastAPI Features and Benefits
def show_fastapi_features():
    """Display FastAPI features and advantages"""
    print("\n" + "="*70)
    print(" FASTAPI FEATURES & ADVANTAGES")
    print("="*70)
    
    features = {
        " High Performance": "2-3x faster than Flask, built on Starlette/Uvicorn",
        " Auto Documentation": "Interactive Swagger UI at /docs and ReDoc at /redoc",
        " Type Safety": "Pydantic models for request/response validation",
        " Async Support": "Native async/await for concurrent request handling",
        " OpenAPI Standard": "Automatic OpenAPI schema generation",
        " IDE Support": "Excellent autocomplete and type checking",
        " Modern Python": "Uses latest Python features and type hints",
        " Data Validation": "Automatic request/response validation and serialization"
    }
    
    for feature, description in features.items():
        print(f"{feature:<25} {description}")
    
    print(f"\n📡 API Endpoints Available:")
    endpoints = [
        ("GET /", "Root endpoint with API information"),
        ("GET /health", "Health check and server status"),
        ("POST /ask", "Main question answering endpoint"),
        ("GET /pdfs", "List available PDF documents"),
        ("GET /inspect/{pdf_name}", "Inspect PDF content"),
        ("GET /docs", "Interactive Swagger documentation"),
        ("GET /redoc", "Alternative ReDoc documentation")
    ]
    
    for endpoint, description in endpoints:
        print(f"  {endpoint:<25} {description}")

# CURL examples for FastAPI
print("="*70)
print(" CURL EXAMPLES FOR FASTAPI API")
print("="*70)

fastapi_curl_examples = [
    {
        "name": "Health Check",
        "command": 'curl -X GET "http://localhost:8000/health"'
    },
    {
        "name": "Interactive Docs", 
        "command": 'curl -X GET "http://localhost:8000/docs" # Open in browser for interactive testing'
    },
    {
        "name": "Ask Question",
        "command": '''curl -X POST "http://localhost:8000/ask" \\
     -H "Content-Type: application/json" \\
     -d '{
       "question": "What are the safety requirements?",
       "pdf": "harness_gear",
       "ocr_method": "pymupdf",
       "format": "json"
     }' '''
    },
    {
        "name": "List PDFs",
        "command": 'curl -X GET "http://localhost:8000/pdfs"'
    },
    {
        "name": "Inspect PDF",
        "command": 'curl -X GET "http://localhost:8000/inspect/harness_gear"'
    }
]

for example in fastapi_curl_examples:
    print(f"\n📋 {example['name']}:")
    print(f"   {example['command']}")

print(f"\n POSTMAN TESTING:")
print("Base URL: http://localhost:8000")
print(" Best way to test: Open http://localhost:8000/docs in your browser!")
print("   The Swagger UI provides interactive testing for all endpoints")

print(f"\n Ready to test FastAPI! Run:")
print("test_fastapi_client()")
print("show_fastapi_features()")

 CURL EXAMPLES FOR FASTAPI API

📋 Health Check:
   curl -X GET "http://localhost:8000/health"

📋 Interactive Docs:
   curl -X GET "http://localhost:8000/docs" # Open in browser for interactive testing

📋 Ask Question:
   curl -X POST "http://localhost:8000/ask" \
     -H "Content-Type: application/json" \
     -d '{
       "question": "What are the safety requirements?",
       "pdf": "harness_gear",
       "ocr_method": "pymupdf",
       "format": "json"
     }' 

📋 List PDFs:
   curl -X GET "http://localhost:8000/pdfs"

📋 Inspect PDF:
   curl -X GET "http://localhost:8000/inspect/harness_gear"

 POSTMAN TESTING:
Base URL: http://localhost:8000
 Best way to test: Open http://localhost:8000/docs in your browser!
   The Swagger UI provides interactive testing for all endpoints

 Ready to test FastAPI! Run:
test_fastapi_client()
show_fastapi_features()


In [29]:
# PowerShell Commands to Setup and Start React Frontend

import subprocess
import os
import time

def setup_react_frontend():
    """Setup the React frontend by installing dependencies"""
    frontend_path = r"d:\KattSafe\pdf-qa-frontend"
    
    print(" Setting up React Frontend...")
    print(f" Frontend directory: {frontend_path}")
    
    try:
        # Check if Node.js is installed
        result = subprocess.run(["node", "--version"], capture_output=True, text=True, shell=True)
        if result.returncode == 0:
            print(f" Node.js version: {result.stdout.strip()}")
        else:
            print(" Node.js not found. Please install Node.js from https://nodejs.org/")
            return False
            
        # Check if npm is installed
        result = subprocess.run(["npm", "--version"], capture_output=True, text=True, shell=True)
        if result.returncode == 0:
            print(f" npm version: {result.stdout.strip()}")
        else:
            print(" npm not found.")
            return False
        
        # Change to frontend directory and install dependencies
        if os.path.exists(frontend_path):
            print("\n Installing React dependencies...")
            print("This may take a few minutes...")
            
            # Run npm install
            result = subprocess.run(
                ["npm", "install"], 
                cwd=frontend_path, 
                capture_output=True, 
                text=True, 
                shell=True,
                timeout=300  # 5 minute timeout
            )
            
            if result.returncode == 0:
                print(" Dependencies installed successfully!")
                print("\n Frontend is ready!")
                print("\n Next steps:")
                print("1. Run 'start_react_frontend()' to start the development server")
                print("2. Open http://localhost:3000 in your browser")
                print("3. Make sure your FastAPI backend is running (should already be running)")
                return True
            else:
                print(f" Failed to install dependencies: {result.stderr}")
                return False
        else:
            print(f" Frontend directory not found: {frontend_path}")
            return False
            
    except subprocess.TimeoutExpired:
        print(" Installation timed out. Try running manually.")
        return False
    except Exception as e:
        print(f" Error setting up frontend: {str(e)}")
        return False

def start_react_frontend():
    """Start the React development server"""
    frontend_path = r"d:\KattSafe\pdf-qa-frontend"
    
    print(" Starting React Frontend Development Server...")
    print(f"📁 Frontend directory: {frontend_path}")
    
    try:
        if not os.path.exists(os.path.join(frontend_path, "node_modules")):
            print(" Dependencies not installed. Run 'setup_react_frontend()' first.")
            return False
        
        print(" Starting server at http://localhost:3000")
        print(" Backend API at http://localhost:8000")
        print("\n  Note: This will start the server in the background.")
        print("    You can stop it by running 'stop_react_frontend()'")
        
        # Start the development server in the background
        process = subprocess.Popen(
            ["npm", "start"],
            cwd=frontend_path,
            shell=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        
        # Store the process for later termination
        global react_process
        react_process = process
        
        # Give it a moment to start
        time.sleep(3)
        
        if process.poll() is None:  # Process is still running
            print(" React development server started successfully!")
            print(" Frontend available at: http://localhost:3000")
            print(" API documentation at: http://localhost:8000/docs")
            print("\n The chatbot interface should open automatically in your browser.")
            return True
        else:
            print(" Failed to start React server")
            return False
            
    except Exception as e:
        print(f" Error starting frontend: {str(e)}")
        return False

def stop_react_frontend():
    """Stop the React development server"""
    try:
        global react_process
        if 'react_process' in globals() and react_process and react_process.poll() is None:
            react_process.terminate()
            print(" React development server stopped.")
        else:
            print("ℹ  No React server process found to stop.")
    except Exception as e:
        print(f" Error stopping frontend: {str(e)}")

def open_frontend_urls():
    """Open the frontend and API documentation URLs"""
    import webbrowser
    
    print(" Opening URLs in your default browser...")
    
    # Open frontend
    webbrowser.open("http://localhost:3000")
    print(" Opened frontend: http://localhost:3000")
    
    # Open API docs
    time.sleep(1)
    webbrowser.open("http://localhost:8000/docs")
    print(" Opened API docs: http://localhost:8000/docs")

