# Modular Document Processing System

This notebook follows the exact workflow from `modular_refactor.py`, with all functionality provided by a single consolidated processor.

In [None]:
# Import the consolidated processor
from modular_document_processor import main
import os

# Announce that the processor imported correctly, then report whether each
# API credential is present in the environment (the values are never shown).
print("Modular Document Processing System loaded")
print("Following exact modular_refactor.py workflow")
for credential_name in ("MISTRAL_API_KEY", "HUGGING_FACE_TOKEN"):
    credential_state = "Set" if os.getenv(credential_name) else "Not set"
    print(f"{credential_name}: {credential_state}")

## 1. Single Document Processing (LLM Reformatting)

In [None]:
# Settings for the single-document run. These stay as plain notebook
# variables so later cells can reuse them.
source_file = 'bitcoin_whitepaper.tex'
template = 'bitcoin_paper'  # 'bitcoin_paper' or 'academic_paper'
output_format = 'latex'
chunking_strategy = 'semantic'

print(f"Processing: {source_file}")
print(f"Template: {template}")
print("This will make actual LLM API calls for each section...")

# Hand the settings to the processor as keyword arguments
# (following exact modular_refactor.py workflow).
processing_options = {
    'source': source_file,
    'template': template,
    'output_format': output_format,
    'chunking_strategy': chunking_strategy,
}
result = main(**processing_options)

# Summarise the run from the returned dictionary.
print("\nProcessing completed!")
print(f"Final document: {result['final_document']}")
print(f"Session path: {result['session_path']}")
print(f"Sections processed: {result['processed_sections']}")
run_analysis = result['analysis']
print(f"LLM calls made: {run_analysis['llm_calls']}")
print(f"Quality score: {run_analysis['quality_score']:.1f}/100")

# Keep the result under a dedicated name for the following cells.
single_result = result

## 2. Document Augmentation (Combination)

In [None]:
# Configuration for document combination.
# When the single-document cell has been run, its output is used as the base;
# otherwise the raw whitepaper is combined directly.
base_document = single_result['final_document'] if 'single_result' in locals() else 'bitcoin_whitepaper.tex'
additional_document = 'blockchain_security.tex'
combine_strategy = 'smart_merge'  # 'smart_merge' or 'simple_append'

# `output_format` is normally defined by the single-document cell. This cell
# is meant to work without that cell (see the fallback above), so supply the
# same default here instead of failing with a NameError.
try:
    output_format
except NameError:
    output_format = 'latex'

print("Combining documents:")
print(f"  Base: {base_document}")
print(f"  Additional: {additional_document}")
print(f"  Strategy: {combine_strategy}")

# Combine documents (following exact modular_refactor.py workflow)
augment_result = main(
    source=base_document,
    source2=additional_document,
    combine_strategy=combine_strategy,
    output_format=output_format
)

# Summarise the combined run from the returned dictionary.
print("\nAugmentation completed!")
print(f"Combined document: {augment_result['final_document']}")
print(f"Session path: {augment_result['session_path']}")
print(f"Quality score: {augment_result['analysis']['quality_score']:.1f}/100")

## 3. Analysis and Results

In [None]:
# Recap of whatever results the earlier cells left in the notebook namespace.
# Each section is skipped when its producing cell has not been run.
print("Processing Analysis")
print("=" * 50)

have_single = 'single_result' in locals()
have_augmented = 'augment_result' in locals()

if have_single:
    print("Single Document Processing:")
    print(f"  Final document: {single_result['final_document']}")
    print(f"  Sections processed: {single_result['processed_sections']}")
    single_analysis = single_result['analysis']
    print(f"  LLM calls: {single_analysis['llm_calls']}")
    print(f"  Quality score: {single_analysis['quality_score']:.1f}/100")
    print(f"  Session path: {single_result['session_path']}")

if have_augmented:
    print("\nDocument Augmentation:")
    print(f"  Combined document: {augment_result['final_document']}")
    print(f"  Quality score: {augment_result['analysis']['quality_score']:.1f}/100")
    print(f"  Session path: {augment_result['session_path']}")

# List the files written to the single-document session directory,
# alphabetically, together with their sizes.
if have_single:
    session_path = single_result['session_path']
    if os.path.exists(session_path):
        entry_names = os.listdir(session_path)
        print(f"\nGenerated files in {session_path}:")
        for entry_name in sorted(entry_names):
            entry_size = os.path.getsize(os.path.join(session_path, entry_name))
            print(f"  {entry_name}: {entry_size:,} bytes")

## 4. Configuration Options

In [None]:
# Static reference of the options the processor accepts.
print("Available Configuration Options")
print("=" * 50)

print("Templates:")
print("  - bitcoin_paper: Bitcoin whitepaper structure (12 sections)")
print("  - academic_paper: Standard academic format (6 sections)")

print("\nLLM Providers:")
print("  - mistral: Mistral AI (fast, reliable)")
print("  - openai: OpenAI (high quality)")
print("  - huggingface: Hugging Face (free tier available)")

print("\nCombination Strategies:")
print("  - smart_merge: Intelligently merge matching sections")
print("  - simple_append: Append all sections from second document")

print("\nChunking Strategies:")
print("  - semantic: LLM-enhanced intelligent chunking")
print("  - regex_only: Pattern-based chunking only")

print("\nOutput Formats:")
print("  - latex: LaTeX document format")
print("  - markdown: Markdown format")

# Show current config. This is best-effort: a missing PyYAML install, a
# missing or malformed config.yaml, or an absent key all fall back to the
# "default configuration" message instead of failing the cell.
try:
    import yaml

    with open('config.yaml', 'r') as f:
        config = yaml.safe_load(f)

    # Resolve every value before printing anything, so a missing key cannot
    # leave a half-printed "Current Configuration" section behind.
    llm_config = config['llm']
    config_lines = [
        f"  LLM Provider: {llm_config['provider']}",
        f"  Model: {llm_config['model']}",
        f"  Temperature: {llm_config['temperature']}",
        f"  Max tokens: {llm_config['max_tokens']}",
    ]
except Exception as exc:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate; report the reason so a
    # broken config file is not silently ignored.
    print("\nUsing default configuration")
    print(f"  (config.yaml not used: {type(exc).__name__}: {exc})")
else:
    print("\nCurrent Configuration:")
    for config_line in config_lines:
        print(config_line)

## 5. Advanced Usage Examples

In [None]:
# Printable cheat sheet: three usage snippets followed by a summary of the
# processing workflow. The text is kept as data and emitted line by line.
usage_guide_lines = [
    "Advanced Usage Examples",
    "=" * 50,
    # Example 1
    "Example 1: Academic paper processing",
    "result = main(",
    "    source='research_paper.tex',",
    "    template='academic_paper',",
    "    output_format='latex'",
    ")",
    # Example 2
    "\nExample 2: Document combination",
    "result = main(",
    "    source='paper1.tex',",
    "    source2='paper2.tex',",
    "    combine_strategy='smart_merge'",
    ")",
    # Example 3
    "\nExample 3: Custom configuration",
    "# Edit config.yaml to change LLM provider:",
    "llm:",
    "  provider: 'huggingface'",
    "  model: 'mistralai/Mistral-7B-Instruct-v0.3'",
    # Workflow recap
    "\nWorkflow Summary:",
    "1. Load LaTeX file",
    "2. Extract and chunk content",
    "3. Assign chunks to document skeleton",
    "4. Process each section with LLM",
    "5. Aggregate final document",
    "6. Generate analysis and reports",
    "\nThis follows the exact modular_refactor.py workflow!",
]

for guide_line in usage_guide_lines:
    print(guide_line)