# Complete Document Processing System

Self-contained document processing with LLM enhancement and PDF generation.

In [3]:
# Import the processor
from modular_document_processor import main
import os

print("Complete Document Processing System loaded")
print(f"MISTRAL_API_KEY: {'Set' if os.getenv('MISTRAL_API_KEY') else 'Not set'}")
print(f"Current directory: {os.getcwd()}")
print("All outputs will be generated in this directory")

Complete Document Processing System loaded
MISTRAL_API_KEY: Set
Current directory: /Users/shantanumisra/workspace/doc_poc/complete_document_processor
All outputs will be generated in this directory


## 1. Single Document Processing (LLM Reformatting)

In [4]:
# Configuration
source_file = 'bitcoin_whitepaper.tex'
template = 'bitcoin_paper'  # 'bitcoin_paper' or 'academic_paper'

print(f"Processing: {source_file}")
print(f"Template: {template}")
print("This will make LLM API calls and generate both .tex and .pdf files...")

# Process document
result = main(
    source=source_file,
    template=template
)

print(f"\nProcessing completed!")
print(f"LaTeX document: {result['final_document']}")
if result['pdf_document']:
    print(f"PDF document: {result['pdf_document']}")
else:
    print("PDF generation failed (pdflatex not available)")
print(f"Sections processed: {result['processed_sections']}")
print(f"LLM calls made: {result['analysis']['llm_calls']}")
print(f"Quality score: {result['analysis']['quality_score']:.1f}/100")

# Store for next cell
single_result = result

Processing: bitcoin_whitepaper.tex
Template: bitcoin_paper
This will make LLM API calls and generate both .tex and .pdf files...
Loading LaTeX file...
Extracting and chunking content...
Assigning chunks to document skeleton...
Processing sections with LLM...
  Processing: Abstract
  Processing: 1. Introduction
  Processing: 2. Transactions
  Processing: 3. Timestamp Server
  Processing: 4. Proof-of-Work
  Processing: 5. Network
  Processing: 6. Incentive
  Processing: 7. Reclaiming Disk Space
  Processing: 8. Simplified Payment Verification
  Processing: 9. Combining and Splitting Value
  Processing: 10. Privacy
  Processing: 11. Calculations
  Processing: 12. Conclusion
Aggregating final document...
PDF compilation failed: 

Processing complete!
Final document: ./final_document_20250729_222549.tex
Processing log: ./processing_log_20250729_222549.txt

Processing completed!
LaTeX document: ./final_document_20250729_222549.tex
PDF generation failed (pdflatex not available)
Sections proce

## 2. Document Augmentation (Combination)

In [5]:
# Configuration for document combination
base_document = single_result['final_document'] if 'single_result' in locals() else 'bitcoin_whitepaper.tex'
additional_document = 'blockchain_security.tex'
combine_strategy = 'smart_merge'  # 'smart_merge' or 'simple_append'

print(f"Combining documents:")
print(f"  Base: {base_document}")
print(f"  Additional: {additional_document}")
print(f"  Strategy: {combine_strategy}")

# Combine documents
augment_result = main(
    source=base_document,
    source2=additional_document,
    combine_strategy=combine_strategy
)

print(f"\nAugmentation completed!")
print(f"LaTeX document: {augment_result['final_document']}")
if augment_result['pdf_document']:
    print(f"PDF document: {augment_result['pdf_document']}")
print(f"Quality score: {augment_result['analysis']['quality_score']:.1f}/100")

Combining documents:
  Base: ./final_document_20250729_222549.tex
  Additional: blockchain_security.tex
  Strategy: smart_merge
Combining documents: ./final_document_20250729_222549.tex + blockchain_security.tex
PDF compiled successfully: ./combined_document_20250729_222642.pdf
Combined document saved: ./combined_document_20250729_222642.tex
PDF generated: ./combined_document_20250729_222642.pdf

Augmentation completed!
LaTeX document: ./combined_document_20250729_222642.tex
PDF document: ./combined_document_20250729_222642.pdf
Quality score: 85.0/100


## 3. View Generated Files

In [6]:
# List all generated files in current directory
import glob
from datetime import datetime

print("Generated Files in Current Directory:")
print("=" * 50)

# Get all .tex and .pdf files
tex_files = glob.glob("*.tex")
pdf_files = glob.glob("*.pdf")
log_files = glob.glob("*.txt")

print(f"LaTeX files ({len(tex_files)}):")
for file in sorted(tex_files):
    size = os.path.getsize(file)
    mtime = datetime.fromtimestamp(os.path.getmtime(file))
    print(f"  {file}: {size:,} bytes (modified: {mtime.strftime('%H:%M:%S')})")

print(f"\nPDF files ({len(pdf_files)}):")
for file in sorted(pdf_files):
    size = os.path.getsize(file)
    mtime = datetime.fromtimestamp(os.path.getmtime(file))
    print(f"  {file}: {size:,} bytes (modified: {mtime.strftime('%H:%M:%S')})")

print(f"\nLog files ({len(log_files)}):")
for file in sorted(log_files):
    size = os.path.getsize(file)
    mtime = datetime.fromtimestamp(os.path.getmtime(file))
    print(f"  {file}: {size:,} bytes (modified: {mtime.strftime('%H:%M:%S')})")

Generated Files in Current Directory:
LaTeX files (5):
  bitcoin_whitepaper.tex: 19,531 bytes (modified: 18:02:13)
  blockchain_security.tex: 6,063 bytes (modified: 18:02:13)
  combined_document_20250729_222642.tex: 6,090 bytes (modified: 22:26:42)
  final_document_20250729_193940.tex: 30,714 bytes (modified: 19:40:30)
  final_document_20250729_222549.tex: 21,909 bytes (modified: 22:26:28)

PDF files (3):
  combined_document_20250729_222642.pdf: 139,025 bytes (modified: 22:26:43)
  final_document_20250729_193940.pdf: 189,869 bytes (modified: 19:40:32)
  final_document_20250729_222549.pdf: 183,405 bytes (modified: 22:26:31)

Log files (3):
  processing_log_20250729_193940.txt: 654 bytes (modified: 19:40:32)
  processing_log_20250729_222549.txt: 654 bytes (modified: 22:26:31)
  requirements.txt: 83 bytes (modified: 18:04:07)


## 4. Configuration and Setup

In [7]:
print("System Configuration")
print("=" * 50)

# Check configuration
try:
    import yaml
    with open('config.yaml', 'r') as f:
        config = yaml.safe_load(f)
    
    print("Current Configuration:")
    print(f"  LLM Provider: {config['llm']['provider']}")
    print(f"  Model: {config['llm']['model']}")
    print(f"  Temperature: {config['llm']['temperature']}")
    print(f"  Max tokens: {config['llm']['max_tokens']}")
    print(f"  Generate PDF: {config['output']['generate_pdf']}")
except:
    print("Using default configuration")

# Check environment
print(f"\nEnvironment:")
print(f"  MISTRAL_API_KEY: {'✓ Set' if os.getenv('MISTRAL_API_KEY') else '✗ Not set'}")
print(f"  OPENAI_API_KEY: {'✓ Set' if os.getenv('OPENAI_API_KEY') else '✗ Not set'}")
print(f"  HUGGING_FACE_TOKEN: {'✓ Set' if os.getenv('HUGGING_FACE_TOKEN') else '✗ Not set'}")

# Check LaTeX installation
import subprocess
try:
    result = subprocess.run(['pdflatex', '--version'], capture_output=True, text=True)
    if result.returncode == 0:
        print(f"  pdflatex: ✓ Available")
    else:
        print(f"  pdflatex: ✗ Not available")
except:
    print(f"  pdflatex: ✗ Not available")

print(f"\nAvailable Templates:")
print(f"  - bitcoin_paper: Bitcoin whitepaper structure (12 sections)")
print(f"  - academic_paper: Standard academic format (6 sections)")

print(f"\nCombination Strategies:")
print(f"  - smart_merge: Intelligently merge matching sections")
print(f"  - simple_append: Append all sections from second document")

System Configuration
Current Configuration:
  LLM Provider: mistral
  Model: open-mistral-7b
  Temperature: 0.2
  Max tokens: 2048
  Generate PDF: True

Environment:
  MISTRAL_API_KEY: ✓ Set
  OPENAI_API_KEY: ✓ Set
  HUGGING_FACE_TOKEN: ✗ Not set
  pdflatex: ✓ Available

Available Templates:
  - bitcoin_paper: Bitcoin whitepaper structure (12 sections)
  - academic_paper: Standard academic format (6 sections)

Combination Strategies:
  - smart_merge: Intelligently merge matching sections
  - simple_append: Append all sections from second document
