# Document Processing Pipeline
## Extract and Chunk PDFs using Textractor and LangChain

**Purpose**: Extract text from 6 sample PDF documents with Amazon Textract (called through the `textractcaller` package), then split the text with LangChain's `RecursiveCharacterTextSplitter` to prepare chunks for BM25 indexing.

**Outputs**: 
- Raw extracted text (saved as checkpoint)
- Processed chunks with metadata (ready for indexing)

**Next Step**: `02_indexing.ipynb`

---
## 1. Setup & Imports

In [None]:
import json
import boto3
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Any
import pandas as pd
from textractcaller import call_textract
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Display settings: show up to 100 characters per DataFrame cell
pd.set_option('display.max_colwidth', 100)

# Logging setup: INFO level with timestamp, logger name and level on each record
import logging

LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logger = logging.getLogger(__name__)

print("‚úì Imports completed")

In [None]:
# Initialize AWS clients from one shared session (S3, Textract, STS - created in that order)
session = boto3.Session()
s3_client, textract_client, sts = (
    session.client(service_name) for service_name in ('s3', 'textract', 'sts')
)

# Verify credentials by asking STS who the caller is
identity = sts.get_caller_identity()
print(f"‚úì AWS Identity: {identity['Arn']}")
print(f"‚úì Region: {session.region_name}")

---
## 2. Configuration

Centralize all parameters for reproducibility and easy experimentation.

In [None]:
class ProcessingConfig:
    """Configuration for document processing pipeline.

    All settings are class attributes, so the class itself is passed around
    as the configuration object (it is never instantiated in this notebook).
    """

    # Document sources
    PDF_SOURCE_TYPE = "s3"  # or "local"
    S3_BUCKET = "your-bucket-name"
    S3_PREFIX = "raw-pdfs/"
    LOCAL_PDF_DIR = "./data/pdfs/"

    # Output paths
    CHECKPOINT_DIR = "./checkpoints/"
    RAW_EXTRACTION_FILE = "raw_extractions.json"
    PROCESSED_CHUNKS_FILE = "processed_chunks.json"

    # Textractor settings
    TEXTRACT_FEATURES = []  # Empty for text only, or ["TABLES", "FORMS"] for structured data

    # Chunking parameters
    CHUNK_SIZE = 1000
    CHUNK_OVERLAP = 200
    SEPARATORS = ["\n\n", "\n", ". ", " ", ""]
    LENGTH_FUNCTION = len

    # Processing
    BATCH_SIZE = 6  # Number of documents
    # Evaluated once, when the class body runs; shared by everything processed afterwards
    PROCESSING_TIMESTAMP = datetime.utcnow().isoformat()

    @classmethod
    def to_dict(cls) -> Dict[str, Any]:
        """Export config as dictionary for logging.

        Returns:
            Mapping of public attribute name to value. Methods and other
            callables (including LENGTH_FUNCTION) are left out so the result
            can be serialized to JSON.
        """
        return {
            name: value
            for name, value in vars(cls).items()
            if not name.startswith('_')
            # classmethod/staticmethod wrappers stored in the class __dict__ are
            # not reliably reported by callable(), so exclude them by type.
            and not isinstance(value, (classmethod, staticmethod, property))
            and not callable(value)
        }

    @classmethod
    def save(cls, filepath: str):
        """Save configuration to JSON.

        Args:
            filepath: Destination file; it is passed straight to open(), so a
                path-like object works as well as a string
        """
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(cls.to_dict(), f, indent=2)

# Display current configuration as a two-column table, one row per setting
config_rows = []
for param_name, param_value in ProcessingConfig.to_dict().items():
    config_rows.append({"Parameter": param_name, "Value": param_value})
config_df = pd.DataFrame(config_rows)

print("\nüìã Current Configuration:")
display(config_df)

In [None]:
# Create checkpoint directory (including parents); an existing directory is fine
checkpoint_dir = Path(ProcessingConfig.CHECKPOINT_DIR)
checkpoint_dir.mkdir(parents=True, exist_ok=True)

# Save config next to the other outputs for reproducibility
config_path = checkpoint_dir / "config.json"
ProcessingConfig.save(config_path)
print(f"‚úì Configuration saved to {config_path}")

---
## 3. DocumentProcessor Class

Production-ready class for extraction and chunking.

In [None]:
class DocumentProcessor:
    """Handles PDF extraction via Amazon Textract and chunking via LangChain.

    Failures are reported through the ``status`` field of the returned
    dictionaries (or an empty chunk list) instead of being raised, so a batch
    keeps going when a single document fails.
    """

    def __init__(self, config: "ProcessingConfig"):
        """
        Create the AWS clients and the text splitter.

        Args:
            config: Object exposing CHUNK_SIZE, CHUNK_OVERLAP, SEPARATORS,
                LENGTH_FUNCTION, TEXTRACT_FEATURES and PROCESSING_TIMESTAMP
        """
        self.config = config
        self.logger = logging.getLogger(self.__class__.__name__)
        self.s3_client = boto3.client('s3')
        self.textract_client = boto3.client('textract')

        # Initialize text splitter
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=config.CHUNK_SIZE,
            chunk_overlap=config.CHUNK_OVERLAP,
            separators=config.SEPARATORS,
            length_function=config.LENGTH_FUNCTION,
        )

    def extract_from_pdf(self, pdf_path: str, doc_id: str) -> Dict[str, Any]:
        """
        Extract text from PDF using Amazon Textract (via call_textract).

        Args:
            pdf_path: S3 URI (s3://bucket/key) or local file path
            doc_id: Unique identifier for this document

        Returns:
            Dictionary with extracted text and metadata. On success it has
            status 'success' plus full_text, pages, page_count and
            total_char_count; on any error it has status 'failed' and an
            'error' message instead.
        """
        self.logger.info(f"Extracting text from: {pdf_path}")

        try:
            # NOTE(review): call_textract appears to expect Textract_Features enum
            # members in `features`, not plain strings - confirm before setting
            # TEXTRACT_FEATURES to a non-empty list.
            if pdf_path.startswith('s3://'):
                # Validate the URI up front so a missing key yields a clear error
                bucket, _, key = pdf_path[len('s3://'):].partition('/')
                if not bucket or not key:
                    raise ValueError(f"Invalid S3 URI (expected s3://bucket/key): {pdf_path}")

                response = call_textract(
                    input_document=f"s3://{bucket}/{key}",
                    features=self.config.TEXTRACT_FEATURES,
                    boto3_textract_client=self.textract_client
                )
            else:
                # Local file: read the bytes first so the file is closed before the API call
                with open(pdf_path, 'rb') as f:
                    document_bytes = f.read()
                response = call_textract(
                    input_document=document_bytes,
                    features=self.config.TEXTRACT_FEATURES,
                    boto3_textract_client=self.textract_client
                )

            # Extract text and page information
            pages = self._pages_from_response(response)
            # Pages are separated by a blank line; strip so the stored text and
            # its reported length agree.
            full_text = "\n\n".join(page['text'] for page in pages).strip()

            result = {
                'doc_id': doc_id,
                'doc_name': Path(pdf_path).name,
                'source_path': pdf_path,
                'full_text': full_text,
                'pages': pages,
                'page_count': len(pages),
                'total_char_count': len(full_text),
                'extraction_timestamp': datetime.utcnow().isoformat(),
                'status': 'success'
            }

            self.logger.info(f"‚úì Extracted {len(pages)} pages, {len(full_text):,} characters")
            return result

        except Exception as e:
            # Deliberate catch-all: one bad document must not abort the whole batch
            self.logger.error(f"‚úó Extraction failed: {str(e)}")
            return {
                'doc_id': doc_id,
                'doc_name': Path(pdf_path).name,
                'source_path': pdf_path,
                'status': 'failed',
                'error': str(e),
                'extraction_timestamp': datetime.utcnow().isoformat()
            }

    @staticmethod
    def _pages_from_response(response: Any) -> List[Dict[str, Any]]:
        """
        Convert a Textract result into per-page records.

        Args:
            response: Either the Textract JSON response (a dict with a 'Blocks'
                list, which is what call_textract returns) or an object that
                exposes ``pages`` whose items have ``get_text()``

        Returns:
            List of {'page_number', 'text', 'char_count'} dicts in page order
        """
        if isinstance(response, dict):
            lines_by_page: Dict[int, List[str]] = {}
            for block in response.get('Blocks', []):
                page_number = block.get('Page', 1)  # treat a missing Page as page 1
                block_type = block.get('BlockType')
                if block_type == 'PAGE':
                    # Register the page even if it turns out to contain no text
                    lines_by_page.setdefault(page_number, [])
                elif block_type == 'LINE':
                    lines_by_page.setdefault(page_number, []).append(block.get('Text', ''))
            numbered_texts = [
                (page_number, "\n".join(lines_by_page[page_number]))
                for page_number in sorted(lines_by_page)
            ]
        else:
            numbered_texts = [
                (page_number, page.get_text())
                for page_number, page in enumerate(response.pages, start=1)
            ]

        return [
            {'page_number': page_number, 'text': text, 'char_count': len(text)}
            for page_number, text in numbered_texts
        ]

    def chunk_document(self, extraction_result: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Chunk extracted document text using LangChain splitter.

        Args:
            extraction_result: Output from extract_from_pdf()

        Returns:
            List of chunk dictionaries with text and metadata; empty when the
            extraction had failed or the splitter raised
        """
        if extraction_result['status'] != 'success':
            self.logger.warning(f"Skipping chunking for failed extraction: {extraction_result['doc_id']}")
            return []

        self.logger.info(f"Chunking document: {extraction_result['doc_id']}")

        try:
            # Split text into chunks
            text = extraction_result['full_text']
            chunks = self.text_splitter.split_text(text)
            pages = extraction_result.get('pages', [])

            # Create chunk objects with metadata
            chunk_objects = [
                {
                    'chunk_id': f"{extraction_result['doc_id']}_chunk_{idx}",
                    'doc_id': extraction_result['doc_id'],
                    'doc_name': extraction_result['doc_name'],
                    'chunk_index': idx,
                    'text': chunk_text,
                    'char_count': len(chunk_text),
                    'page_numbers': self._estimate_page_numbers(chunk_text, pages),
                    'processing_timestamp': self.config.PROCESSING_TIMESTAMP
                }
                for idx, chunk_text in enumerate(chunks)
            ]

            self.logger.info(f"‚úì Created {len(chunk_objects)} chunks")
            return chunk_objects

        except Exception as e:
            # Best effort: log and return no chunks rather than stopping the run
            self.logger.error(f"‚úó Chunking failed: {str(e)}")
            return []

    def _estimate_page_numbers(self, chunk_text: str, pages: List[Dict]) -> List[int]:
        """
        Estimate which pages a chunk starts on, based on text matching.

        This is a heuristic: only the first 100 characters of the chunk are
        matched, so repeated text (headers, footers) can match several pages.

        Args:
            chunk_text: Text of one chunk
            pages: Page records with 'page_number' and 'text'

        Returns:
            Matching page numbers; [] when no pages are given, [1] when
            nothing matches
        """
        if not pages:
            return []

        snippet = chunk_text[:100]  # First 100 chars for matching
        matching_pages = [
            page['page_number'] for page in pages if snippet in page['text']
        ]

        if not matching_pages:
            # The snippet may straddle a page boundary (pages are joined with a
            # blank line), in which case no single page contains it. Match the
            # pieces on either side of the boundary instead.
            fragments = [piece.strip() for piece in snippet.split("\n\n")]
            fragments = [piece for piece in fragments if piece]
            if len(fragments) > 1:
                matching_pages = [
                    page['page_number']
                    for page in pages
                    if any(piece in page['text'] for piece in fragments)
                ]

        return matching_pages if matching_pages else [1]  # Default to page 1

    def process_document(self, pdf_path: str, doc_id: str) -> tuple:
        """
        End-to-end processing: extract and chunk.

        Args:
            pdf_path: S3 URI (s3://bucket/key) or local file path
            doc_id: Unique identifier for this document

        Returns:
            Tuple of (extraction_result, chunks)
        """
        extraction_result = self.extract_from_pdf(pdf_path, doc_id)
        chunks = self.chunk_document(extraction_result)
        return extraction_result, chunks

    def process_batch(self, pdf_sources: List[tuple]) -> tuple:
        """
        Process multiple documents.

        Args:
            pdf_sources: List of (pdf_path, doc_id) tuples

        Returns:
            Tuple of (all_extractions, all_chunks); failed documents still
            appear in all_extractions with status 'failed'
        """
        self.logger.info(f"Processing batch of {len(pdf_sources)} documents")

        all_extractions = []
        all_chunks = []

        for pdf_path, doc_id in pdf_sources:
            extraction, chunks = self.process_document(pdf_path, doc_id)
            all_extractions.append(extraction)
            all_chunks.extend(chunks)

        success_count = sum(1 for e in all_extractions if e['status'] == 'success')
        self.logger.info(f"‚úì Batch complete: {success_count}/{len(pdf_sources)} successful")

        return all_extractions, all_chunks

print("‚úì DocumentProcessor class defined")

---
## 4. Document Discovery & Validation

Locate and validate the 6 sample PDFs.

In [None]:
# Define the PDF documents to process
# The number of documents comes from ProcessingConfig.BATCH_SIZE (6 by default).
# Modify the path pattern below based on your actual document locations.

doc_count = ProcessingConfig.BATCH_SIZE

if ProcessingConfig.PDF_SOURCE_TYPE == "s3":
    # S3 sources: doc1.pdf .. docN.pdf under the configured bucket/prefix
    s3_base = f"s3://{ProcessingConfig.S3_BUCKET}/{ProcessingConfig.S3_PREFIX}"
    pdf_sources = [
        (f"{s3_base}doc{i}.pdf", f"doc_{i:03d}")
        for i in range(1, doc_count + 1)
    ]
else:
    # Local sources: first N PDFs in name order
    pdf_dir = Path(ProcessingConfig.LOCAL_PDF_DIR)
    pdf_files = sorted(pdf_dir.glob("*.pdf"))[:doc_count]
    pdf_sources = [
        (str(pdf_path), f"doc_{i:03d}")
        for i, pdf_path in enumerate(pdf_files, start=1)
    ]

# Display document list
doc_list_df = pd.DataFrame([
    {
        "Doc ID": doc_id,
        "Source Path": path,
        "Filename": Path(path).name
    }
    for path, doc_id in pdf_sources
])

print(f"\nüìÑ Found {len(pdf_sources)} documents:")
display(doc_list_df)

In [None]:
# Validation: confirm each source can be reached and report its size
print("\nüîç Validating document access...\n")

validation_results = []

for pdf_path, doc_id in pdf_sources:
    filename = Path(pdf_path).name
    try:
        if not pdf_path.startswith('s3://'):
            # Local file: a missing file is reported, not raised
            path = Path(pdf_path)
            if not path.exists():
                size_mb = 0
                status = "‚úó Not found"
            else:
                size_mb = path.stat().st_size / (1024 * 1024)
                status = "‚úì Accessible"
        else:
            # S3 object: a HEAD request returns the size without downloading
            parts = pdf_path.replace('s3://', '').split('/', 1)
            bucket = parts[0]
            key = parts[1]
            response = s3_client.head_object(Bucket=bucket, Key=key)
            size_mb = response['ContentLength'] / (1024 * 1024)
            status = "‚úì Accessible"

        row = {
            "Doc ID": doc_id,
            "Filename": filename,
            "Size (MB)": f"{size_mb:.2f}",
            "Status": status
        }
    except Exception as e:
        # Any failure becomes a table row; the message is cut to 50 characters
        row = {
            "Doc ID": doc_id,
            "Filename": filename,
            "Size (MB)": "N/A",
            "Status": f"‚úó Error: {str(e)[:50]}"
        }
    validation_results.append(row)

validation_df = pd.DataFrame(validation_results)
display(validation_df)

# A row counts as accessible when its status carries the success marker
accessible_count = sum("‚úì" in result["Status"] for result in validation_results)
if accessible_count != len(pdf_sources):
    print(f"\n‚ö†Ô∏è Warning: Only {accessible_count}/{len(pdf_sources)} documents are accessible")
else:
    print(f"\n‚úì All {len(pdf_sources)} documents are accessible and ready for processing")

---
## 5. Raw Extraction (Textractor)

Extract text from all PDFs and save checkpoint.

In [None]:
# Initialize processor with the class-level configuration
processor = DocumentProcessor(ProcessingConfig)

# Echo the settings that most affect the output
init_summary = [
    "‚úì DocumentProcessor initialized",
    f"  - Chunk size: {ProcessingConfig.CHUNK_SIZE}",
    f"  - Chunk overlap: {ProcessingConfig.CHUNK_OVERLAP}",
    f"  - Textract features: {ProcessingConfig.TEXTRACT_FEATURES or 'Text only'}",
]
print("\n".join(init_summary))

In [None]:
# Extract text from all documents; failed results are kept in the list too
print("\nüîÑ Starting extraction process...\n")
print("=" * 80)

raw_extractions = []

for pdf_path, doc_id in pdf_sources:
    print(f"\nProcessing: {doc_id} - {Path(pdf_path).name}")
    print("-" * 80)

    extraction_result = processor.extract_from_pdf(pdf_path, doc_id)
    raw_extractions.append(extraction_result)

    # Display result summary
    if extraction_result['status'] != 'success':
        print(f"  ‚úó Error: {extraction_result.get('error', 'Unknown error')}")
        continue

    print(f"  ‚úì Pages: {extraction_result['page_count']}")
    print(f"  ‚úì Characters: {extraction_result['total_char_count']:,}")

print("\n" + "=" * 80)
print("\n‚úì Extraction phase complete")

In [None]:
# Save raw extractions checkpoint as indented JSON
extraction_checkpoint_path = checkpoint_dir / ProcessingConfig.RAW_EXTRACTION_FILE
extraction_checkpoint_path.write_text(json.dumps(raw_extractions, indent=2))

# Report where the checkpoint went and how large it is on disk
checkpoint_size_kb = extraction_checkpoint_path.stat().st_size / 1024
print(f"üíæ Raw extractions saved to: {extraction_checkpoint_path}")
print(f"   File size: {checkpoint_size_kb:.2f} KB")

---
## 6. Extraction Quality Review

Inspect extracted text for quality issues.

In [None]:
# Summary statistics: one table row per document, then overall totals
extraction_stats = []

for extraction in raw_extractions:
    if extraction['status'] == 'success':
        page_count = extraction['page_count']
        char_count = extraction['total_char_count']
        # A successful extraction can still have zero pages (e.g. an empty
        # document); avoid dividing by zero in that case.
        avg_chars_per_page = f"{char_count // page_count:,}" if page_count else "N/A"
        extraction_stats.append({
            "Doc ID": extraction['doc_id'],
            "Doc Name": extraction['doc_name'],
            "Pages": page_count,
            "Characters": f"{char_count:,}",
            "Avg Chars/Page": avg_chars_per_page,
            "Status": "‚úì Success"
        })
    else:
        extraction_stats.append({
            "Doc ID": extraction['doc_id'],
            "Doc Name": extraction['doc_name'],
            "Pages": "N/A",
            "Characters": "N/A",
            "Avg Chars/Page": "N/A",
            "Status": "‚úó Failed"
        })

stats_df = pd.DataFrame(extraction_stats)
print("\nüìä Extraction Statistics:\n")
display(stats_df)

# Overall stats (successful documents only)
successful = [e for e in raw_extractions if e['status'] == 'success']
if successful:
    total_pages = sum(e['page_count'] for e in successful)
    total_chars = sum(e['total_char_count'] for e in successful)
    print(f"\nüìà Overall:")
    print(f"   Total pages extracted: {total_pages}")
    print(f"   Total characters: {total_chars:,}")
    print(f"   Average document size: {total_chars // len(successful):,} characters")

In [None]:
# Display sample text from each successfully extracted document
print("\nüìù Sample Text from Each Document:\n")
print("=" * 80)

preview_len = 500  # characters shown per document

for extraction in raw_extractions:
    if extraction['status'] != 'success':
        continue

    document_text = extraction['full_text']
    print(f"\n{extraction['doc_id']} - {extraction['doc_name']}")
    print("-" * 80)
    print(document_text[:preview_len])
    # Tell the reader when the preview is not the whole text
    if len(document_text) > preview_len:
        print("\n[... truncated ...]")
    print()

In [None]:
# Quality checks: flag documents whose extracted text looks suspicious
print("\nüîç Quality Checks:\n")

quality_issues = []

for extraction in raw_extractions:
    if extraction['status'] == 'success':
        text = extraction['full_text']
        doc_id = extraction['doc_id']

        # Check for potential issues
        issues = []

        # Very short extraction
        if len(text) < 100:
            issues.append("Very short text (< 100 chars)")

        # Check for excessive special characters (possible OCR issues).
        # Skipped for empty text, where the ratio would divide by zero.
        if text:
            special_char_count = sum(
                1 for c in text if not c.isalnum() and not c.isspace()
            )
            special_char_ratio = special_char_count / len(text)
            if special_char_ratio > 0.3:
                issues.append(f"High special char ratio ({special_char_ratio:.1%})")

        # Check for repeated characters (OCR artifact)
        if '.....' in text or '-----' in text:
            issues.append("Repeated characters detected")

        if issues:
            quality_issues.append({
                "Doc ID": doc_id,
                "Issues": "; ".join(issues)
            })

if quality_issues:
    quality_df = pd.DataFrame(quality_issues)
    print("‚ö†Ô∏è Potential Quality Issues Detected:\n")
    display(quality_df)
else:
    print("‚úì No obvious quality issues detected")
    print("   Note: Manual review of sample text above is still recommended")

---
## 7. Text Chunking

Apply LangChain RecursiveCharacterTextSplitter to create chunks.

In [None]:
# Chunk all successfully extracted documents; failed extractions are skipped
print("\nüîÑ Starting chunking process...\n")
print("=" * 80)

all_chunks = []

for extraction in raw_extractions:
    if extraction['status'] != 'success':
        continue

    print(f"\nChunking: {extraction['doc_id']} - {extraction['doc_name']}")
    print("-" * 80)

    chunks = processor.chunk_document(extraction)
    all_chunks += chunks

    print(f"  ‚úì Created {len(chunks)} chunks")
    if not chunks:
        # No average to report for a document that produced no chunks
        continue
    avg_chunk_size = sum(c['char_count'] for c in chunks) / len(chunks)
    print(f"  ‚úì Average chunk size: {avg_chunk_size:.0f} characters")

print("\n" + "=" * 80)
print(f"\n‚úì Chunking complete: {len(all_chunks)} total chunks created")

---
## 8. Chunk Analysis

Analyze chunk distribution and quality.

In [None]:
# Chunk distribution by document: summary table plus two plots saved as a PNG
import matplotlib.pyplot as plt

if all_chunks:
    # Group chunks by document, keeping first-seen document order
    chunks_by_doc = {}
    for chunk in all_chunks:
        chunks_by_doc.setdefault(chunk['doc_id'], []).append(chunk)

    # Create distribution table (one row per document)
    distribution_data = []
    for doc_id, doc_chunks in chunks_by_doc.items():
        sizes = [c['char_count'] for c in doc_chunks]
        size_total = sum(sizes)
        distribution_data.append({
            "Doc ID": doc_id,
            "Chunk Count": len(doc_chunks),
            "Avg Size": f"{size_total / len(doc_chunks):.0f}",
            "Min Size": min(sizes),
            "Max Size": max(sizes),
            "Total Chars": f"{size_total:,}"
        })

    dist_df = pd.DataFrame(distribution_data)
    print("\nüìä Chunk Distribution by Document:\n")
    display(dist_df)

    # Visualize chunk size distribution
    all_chunk_sizes = [c['char_count'] for c in all_chunks]
    target_size = ProcessingConfig.CHUNK_SIZE
    plot_path = checkpoint_dir / 'chunk_distribution.png'

    plt.figure(figsize=(12, 5))

    # Left panel: histogram of chunk sizes with the configured target marked
    plt.subplot(1, 2, 1)
    plt.hist(all_chunk_sizes, bins=30, edgecolor='black', alpha=0.7)
    plt.axvline(target_size, color='red', linestyle='--', label=f'Target: {target_size}')
    plt.xlabel('Chunk Size (characters)')
    plt.ylabel('Frequency')
    plt.title('Chunk Size Distribution')
    plt.legend()

    # Right panel: number of chunks per document
    plt.subplot(1, 2, 2)
    doc_ids = [row["Doc ID"] for row in distribution_data]
    chunk_counts = [row["Chunk Count"] for row in distribution_data]
    plt.bar(doc_ids, chunk_counts, alpha=0.7, edgecolor='black')
    plt.xlabel('Document ID')
    plt.ylabel('Number of Chunks')
    plt.title('Chunks per Document')
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.savefig(plot_path, dpi=150, bbox_inches='tight')
    plt.show()

    print(f"\nüìà Visualization saved to: {plot_path}")

In [None]:
# Display example chunks: the first chunk encountered for each document
print("\nüìù Example Chunks:\n")
print("=" * 80)

if all_chunks:
    preview_chars = 300  # characters of chunk text shown
    docs_shown = set()
    for chunk in all_chunks:
        if chunk['doc_id'] in docs_shown:
            continue
        docs_shown.add(chunk['doc_id'])

        chunk_body = chunk['text']
        print(f"\n{chunk['chunk_id']}")
        print(f"Doc: {chunk['doc_name']}, Pages: {chunk['page_numbers']}, Size: {chunk['char_count']} chars")
        print("-" * 80)
        print(chunk_body[:preview_chars])
        if len(chunk_body) > preview_chars:
            print("\n[... truncated ...]")
        print()

In [None]:
# Chunk boundary analysis: size outliers and a spot check that overlap is present
print("\nüîç Chunk Boundary Analysis:\n")

if all_chunks:
    # Check for very small chunks (might indicate issues)
    small_chunks = [c for c in all_chunks if c['char_count'] < ProcessingConfig.CHUNK_SIZE * 0.3]

    # Check for chunks at max size (might be cut mid-sentence)
    large_chunks = [c for c in all_chunks if c['char_count'] >= ProcessingConfig.CHUNK_SIZE * 0.95]

    print(f"Small chunks (< 30% of target): {len(small_chunks)} / {len(all_chunks)} ({len(small_chunks)/len(all_chunks)*100:.1f}%)")
    print(f"Large chunks (>= 95% of target): {len(large_chunks)} / {len(all_chunks)} ({len(large_chunks)/len(all_chunks)*100:.1f}%)")

    # Check overlap effectiveness
    print(f"\nOverlap check: Comparing consecutive chunks...")
    max_pairs = 3    # number of same-document pairs to sample
    probe_len = 20   # length of the second chunk's prefix that is searched for
    # With overlap, the start of a chunk repeats the end of the previous one,
    # so look for the second chunk's prefix anywhere in the overlap-sized tail
    # of the first chunk (plus a little slack).
    tail_window = ProcessingConfig.CHUNK_OVERLAP + probe_len

    pairs_checked = 0
    overlap_examples = 0
    for chunk1, chunk2 in zip(all_chunks, all_chunks[1:]):
        if pairs_checked == max_pairs:
            break
        if chunk1['doc_id'] != chunk2['doc_id']:
            continue  # pairs that span two documents never overlap
        pairs_checked += 1

        probe = chunk2['text'][:probe_len]
        tail = chunk1['text'][-tail_window:]
        # An empty probe would match everything, so require some text
        if probe and probe in tail:
            overlap_examples += 1

    print(f"  Overlap detected in {overlap_examples}/{pairs_checked} sample consecutive chunk pairs")
    print(f"  (Expected behavior with overlap={ProcessingConfig.CHUNK_OVERLAP})")

---
## 9. Final Output Preparation

Validate and save processed chunks for indexing.

In [None]:
# Schema validation: every chunk must carry the fields the output step reads
print("\n‚úÖ Schema Validation:\n")

# 'processing_timestamp' is included because the final output format reads it
required_fields = [
    'chunk_id', 'doc_id', 'doc_name', 'text', 'chunk_index',
    'char_count', 'page_numbers', 'processing_timestamp'
]

# Check every chunk (not just a sample) so the "all chunks" message is true
invalid_chunks = []
for chunk in all_chunks:
    missing_fields = [field for field in required_fields if field not in chunk]
    if missing_fields:
        invalid_chunks.append((chunk.get('chunk_id', 'unknown'), missing_fields))

validation_passed = not invalid_chunks

# Report at most a handful of offenders to keep the output readable
max_reported = 5
for chunk_id, missing_fields in invalid_chunks[:max_reported]:
    print(f"‚úó Chunk {chunk_id} missing fields: {missing_fields}")
if len(invalid_chunks) > max_reported:
    print(f"   ... and {len(invalid_chunks) - max_reported} more chunks with missing fields")

if validation_passed:
    print("‚úì All chunks have required fields")
    print(f"‚úì Schema: {', '.join(required_fields)}")
    if not all_chunks:
        print("   Note: there were no chunks to validate")
else:
    print("\n‚ö†Ô∏è Warning: Some chunks have missing fields")

In [None]:
# Prepare final output format
# Add any additional metadata or formatting needed for OpenSearch

final_chunks = []
for chunk in all_chunks:
    # Create a clean version for indexing: id + text at the top level,
    # everything else nested under 'metadata'
    final_chunk = {
        'id': chunk['chunk_id'],  # Use as document ID in OpenSearch
        'text': chunk['text'],
        'metadata': {
            'doc_id': chunk['doc_id'],
            'doc_name': chunk['doc_name'],
            'chunk_index': chunk['chunk_index'],
            'char_count': chunk['char_count'],
            'page_numbers': chunk['page_numbers'],
            'processing_timestamp': chunk['processing_timestamp']
        }
    }
    final_chunks.append(final_chunk)

print(f"‚úì Prepared {len(final_chunks)} chunks for indexing")
if final_chunks:
    print(f"\nSample output format:")
    print(json.dumps(final_chunks[0], indent=2))
else:
    # Happens when every extraction failed; there is no sample to show
    print("\nWarning: no chunks were produced, so there is nothing to index")

In [None]:
# Save processed chunks as indented JSON in the checkpoint directory
chunks_output_path = checkpoint_dir / ProcessingConfig.PROCESSED_CHUNKS_FILE
chunks_output_path.write_text(json.dumps(final_chunks, indent=2))

# Report location, on-disk size and chunk count
chunks_file_kb = chunks_output_path.stat().st_size / 1024
print(f"üíæ Processed chunks saved to: {chunks_output_path}")
print(f"   File size: {chunks_file_kb:.2f} KB")
print(f"   Total chunks: {len(final_chunks)}")

---
## 10. Summary & Next Steps

In [None]:
# Generate final summary: document counts, chunk statistics, output locations
banner = "=" * 80

print("\n" + banner)
print("üìã PROCESSING SUMMARY")
print(banner)

successful_extractions = [e for e in raw_extractions if e['status'] == 'success']
failed_count = len(pdf_sources) - len(successful_extractions)

print(f"\nüìÑ Document Processing:")
print(f"   Total documents: {len(pdf_sources)}")
print(f"   Successfully extracted: {len(successful_extractions)}")
print(f"   Failed extractions: {failed_count}")

# Page and character totals only make sense when something was extracted
if successful_extractions:
    total_pages = sum(e['page_count'] for e in successful_extractions)
    total_chars = sum(e['total_char_count'] for e in successful_extractions)
    print(f"   Total pages: {total_pages}")
    print(f"   Total characters: {total_chars:,}")

print(f"\nüì¶ Chunk Generation:")
print(f"   Total chunks created: {len(all_chunks)}")
if all_chunks:
    avg_chunk_size = sum(c['char_count'] for c in all_chunks) / len(all_chunks)
    print(f"   Average chunk size: {avg_chunk_size:.0f} characters")
    print(f"   Target chunk size: {ProcessingConfig.CHUNK_SIZE} characters")
    print(f"   Chunk overlap: {ProcessingConfig.CHUNK_OVERLAP} characters")

print(f"\nüíæ Outputs:")
output_files = (
    ("Config", 'config.json'),
    ("Raw extractions", ProcessingConfig.RAW_EXTRACTION_FILE),
    ("Processed chunks", ProcessingConfig.PROCESSED_CHUNKS_FILE),
    ("Visualization", 'chunk_distribution.png'),
)
for output_label, output_name in output_files:
    print(f"   {output_label}: {checkpoint_dir / output_name}")

print(f"\n‚è≠Ô∏è  Next Steps:")
next_steps = (
    "   1. Review the sample chunks above for quality",
    "   2. If satisfied, proceed to: 02_indexing.ipynb",
    "   3. If adjustments needed, modify config and re-run from Section 7",
)
for step in next_steps:
    print(step)

print("\n" + banner)
print("‚úì Notebook execution complete")
print(banner)

---
## Notes & Observations

**Document Quality:**
- [Add observations about extraction quality]
- [Note any problematic documents]

**Chunking Strategy:**
- [Document why you chose these chunk parameters]
- [Note any adjustments made during experimentation]

**Issues Encountered:**
- [List any errors or unexpected behavior]

**Next Experiments:**
- [Ideas for different chunk sizes]
- [Alternative splitting strategies]
- [Metadata enhancements]