# LLM-Based Document Processing System

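This notebook reformats a complete document with real LLM API calls: following the modular workflow, each section is sent to the LLM individually and the results are assembled into a final document.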

In [None]:
# Import required libraries
import os
import yaml
import json
from IPython.display import display, Markdown

# Import our LLM document processor
from llm_document_processor import LLMDocumentProcessor

# Startup banner: shown once the imports above have succeeded, and reminds
# the user that an API key must be configured before the processing cell runs.
print("\n".join((
    "LLM Document Processing System loaded successfully",
    "This system will actually call LLM APIs for document reformatting",
    "Make sure you have MISTRAL_API_KEY or OPENAI_API_KEY in your .env file",
)))

## 1. LLM Document Processing (Actual Reformatting)

In [None]:
# Configuration - modify these as needed
input_file = 'bitcoin_whitepaper.tex'
template = 'bitcoin_paper'  # 'bitcoin_paper' or 'academic_paper'

print(f"Processing: {input_file}")
print(f"Template: {template}")
print("WARNING: This will make actual LLM API calls and may take several minutes!")
print("Each section will be processed individually with the LLM.")

# Discard the result of any earlier run of this cell. The cells below decide
# whether to report by checking that `llm_result` exists, so without this a
# failed re-run would leave the previous run's data in place and it would be
# displayed as if it were current.
globals().pop('llm_result', None)

try:
    processor = LLMDocumentProcessor()

    # This will actually call the LLM for each section
    result = processor.process_document(
        source=input_file,
        template=template
    )

    # Summary of the run, read from the dictionary returned by the processor.
    print("\nLLM Processing completed!")
    print(f"Output: {result['final_document']}")
    print(f"Sections processed: {result['processed_sections']}")
    print(f"LLM calls made: {result['analysis']['llm_calls']}")
    print(f"Quality Score: {result['analysis']['quality_score']:.1f}/100")

    # Published only after the summary printed successfully; the analysis
    # cells below read their data from this name.
    llm_result = result

except Exception as e:
    # Top-level notebook boundary: report the failure with its traceback
    # instead of aborting, so the remaining cells can still be executed.
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()

## 2. Processing Analysis

In [None]:
# Summarize what the processing cell stored in `llm_result`: document
# statistics before and after reformatting, run metrics, and the tail of the
# processing log. Without a stored result only a hint is printed.
if 'llm_result' not in locals():
    print("No LLM processing results available. Run the processing cell first.")
else:
    print("LLM Processing Analysis")
    print("=" * 50)

    analysis = llm_result['analysis']

    # Statistics of the document as it was before processing.
    print("Original document:")
    source_stats = analysis['original_elements']
    print("  Characters: " + format(source_stats['characters'], ','))
    print("  Words: " + format(source_stats['words'], ','))
    print("  Equations: {}".format(source_stats['equations']))

    # Statistics of the reformatted document.
    print("\nProcessed document:")
    result_stats = analysis['final_elements']
    print("  Characters: " + format(result_stats['characters'], ','))
    print("  Words: " + format(result_stats['words'], ','))
    print("  Sections: {}".format(result_stats['sections']))

    # Metrics of the run itself.
    print("\nProcessing metrics:")
    print("  LLM API calls: {}".format(analysis['llm_calls']))
    print("  Quality score: " + format(analysis['quality_score'], '.1f') + "/100")
    print("  Session path: {}".format(llm_result['session_path']))

    # Show processing log (only the ten most recent entries)
    print("\nProcessing Log (last 10 entries):")
    recent_entries = analysis['processing_log'][-10:]
    for log_entry in recent_entries:
        print("  {}".format(log_entry))

## 3. Compare Original vs Processed

In [None]:
def length_change_pct(original_length, processed_length):
    """Return the relative length change in percent.

    Args:
        original_length: Number of characters in the original document.
        processed_length: Number of characters in the processed document.

    Returns:
        The change relative to the original as a float percentage, or None
        when the original is empty and the ratio is therefore undefined.
    """
    if original_length == 0:
        return None
    return (processed_length - original_length) / original_length * 100


# Compare original and processed documents
if 'llm_result' in locals():
    print("Document Comparison")
    print("=" * 50)

    # Compare against the file selected in the configuration cell rather than
    # a hard-coded name; fall back to the default if that variable is missing.
    source_path = globals().get('input_file', 'bitcoin_whitepaper.tex')

    # Read both documents. The encoding is pinned so the character counts do
    # not depend on the platform's locale; undecodable bytes are replaced
    # because the text is only used for a preview and a length comparison.
    with open(source_path, 'r', encoding='utf-8', errors='replace') as f:
        original = f.read()

    with open(llm_result['final_document'], 'r', encoding='utf-8', errors='replace') as f:
        processed = f.read()

    print("Original document:")
    print(f"  Length: {len(original):,} characters")
    print(f"  Preview: {original[:200]}...")

    print("\nProcessed document:")
    print(f"  Length: {len(processed):,} characters")
    print(f"  Preview: {processed[:200]}...")

    # Show improvement metrics
    length_change = length_change_pct(len(original), len(processed))
    print("\nTransformation metrics:")
    if length_change is None:
        # An empty original has no meaningful percentage change.
        print("  Length change: n/a (original document is empty)")
    else:
        print(f"  Length change: {length_change:+.1f}%")
    print("  Structure: Original sections -> Organized academic format")
    print("  Enhancement: Raw content -> LLM-refined prose")
else:
    print("No processing results available.")

## 4. Individual Section Analysis

In [None]:
def section_title(filename):
    """Turn a section file name into a display title.

    Only a trailing '.tex' is removed (an occurrence inside the name is kept),
    then underscores become spaces.

    Args:
        filename: Base name of a section file, e.g. 'related_work.tex'.

    Returns:
        The display title, e.g. 'related work'.
    """
    suffix = '.tex'
    stem = filename[:-len(suffix)] if filename.endswith(suffix) else filename
    return stem.replace('_', ' ')


# Analyze individual processed sections
if 'llm_result' in locals():
    print("Individual Section Analysis")
    print("=" * 50)

    session_path = llm_result['session_path']

    # isdir rather than exists: a regular file at this path cannot be listed.
    if os.path.isdir(session_path):
        # Every .tex file in the session directory except the assembled
        # final document, in name order.
        files = sorted(
            name for name in os.listdir(session_path)
            if name.endswith('.tex') and name != 'final_document.tex'
        )

        print(f"Generated section files ({len(files)}):")

        for file in files:
            file_path = os.path.join(session_path, file)
            size = os.path.getsize(file_path)

            # Encoding pinned so word counts do not depend on the platform
            # locale; undecodable bytes are replaced since the text is only
            # used for a word count and a preview.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                content = f.read()

            section_name = section_title(file)
            word_count = len(content.split())

            print(f"\n{section_name}:")
            print(f"  File: {file}")
            print(f"  Size: {size:,} bytes")
            print(f"  Words: {word_count:,}")
            print(f"  Preview: {content[:150]}...")
    else:
        print("Session directory not found.")
else:
    print("No processing results available.")

## 5. Configuration and Settings

In [None]:
def config_value(config, section, key, default='(not set)'):
    """Look up ``config[section][key]`` without raising on missing data.

    Args:
        config: Parsed configuration; may be None (empty YAML file) or any
            non-mapping value.
        section: Name of the top-level section.
        key: Name of the setting inside that section.
        default: Value returned when the section or the key is absent.

    Returns:
        The stored setting, or ``default`` when it cannot be found.
    """
    section_data = config.get(section) if isinstance(config, dict) else None
    if not isinstance(section_data, dict):
        return default
    return section_data.get(key, default)


# Show current configuration and available options
print("System Configuration")
print("=" * 50)

# Load config
try:
    with open('config.yaml', 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
except FileNotFoundError:
    print("Using default configuration (config.yaml not found)")
except yaml.YAMLError as e:
    # A malformed file is reported instead of aborting the cell, so the
    # static information below is still shown.
    print(f"Could not parse config.yaml: {e}")
else:
    # Missing sections or keys (including an empty file, which parses to
    # None) are displayed as '(not set)' rather than raising.
    print("Current LLM Configuration:")
    print(f"  Provider: {config_value(config, 'llm', 'provider')}")
    print(f"  Model: {config_value(config, 'llm', 'model')}")
    print(f"  Max tokens: {config_value(config, 'llm', 'max_tokens')}")
    print(f"  Temperature: {config_value(config, 'llm', 'temperature')}")

    print("\nProcessing Configuration:")
    print(f"  Template: {config_value(config, 'processing', 'template')}")
    print(f"  Enhancement: {config_value(config, 'processing', 'enable_enhancement')}")
    print(f"  Chunk strategy: {config_value(config, 'processing', 'chunk_strategy')}")

print("\nAvailable Templates:")
print("  - bitcoin_paper: Bitcoin whitepaper structure (12 sections)")
print("  - academic_paper: Standard academic format (6 sections)")

print("\nSupported LLM Providers:")
print("  - mistral: Mistral AI (requires MISTRAL_API_KEY)")
print("  - openai: OpenAI (requires OPENAI_API_KEY)")

# Only report whether each key is present; the key values are never printed.
print("\nEnvironment Variables:")
print(f"  MISTRAL_API_KEY: {'Set' if os.getenv('MISTRAL_API_KEY') else 'Not set'}")
print(f"  OPENAI_API_KEY: {'Set' if os.getenv('OPENAI_API_KEY') else 'Not set'}")