# XML Analysis Framework - Documentation Testing

This notebook contains all the examples from the README documentation, allowing you to test them interactively and verify they work correctly.

## Setup

First, install the required packages:

In [None]:
# Install the XML analysis framework
!pip install xml-analysis-framework

# Import required modules
import xml_analysis_framework as xaf
import json
from datetime import datetime
from pathlib import Path

print(f"XML Analysis Framework version: {xaf.__version__}")

## Test Data

We'll use a synthetic KML file for testing. The path below is relative to the notebook's working directory:

In [None]:
# Location of the KML sample, relative to the notebook's working directory.
# Kept as a plain string because later cells pass it on and serialize it to JSON.
test_file = "data/mapbox-example.kml"

# Report whether the sample is present before any analysis cell reads it
if not Path(test_file).exists():
    print(f"❌ Test file not found: {test_file}")
else:
    print(f"✅ Test file found: {test_file}")
    size_kb = Path(test_file).stat().st_size / 1024
    print(f"File size: {size_kb:.1f} KB")

## Simple API Examples

Testing the simple API from the README:

In [None]:
# 🎯 One-line analysis with specialized handlers
result = xaf.analyze(test_file)

# The 'document_type' entry supplies both the type name and the confidence
doc_type = result['document_type']
print(f"Document type: {doc_type.type_name}")
print(f"Handler used: {result['handler_used']}")
print(f"Confidence: {doc_type.confidence}")

In [None]:
# 📊 Basic schema analysis
schema = xaf.analyze_schema(test_file)

# Print the schema summary fields, one line per print argument
print(
    f"Elements: {schema.total_elements}, Depth: {schema.max_depth}",
    f"Root element: {schema.root_element}",
    f"Namespaces: {schema.namespaces}",
    sep="\n",
)

In [None]:
# ✂️ Smart chunking for AI/ML
chunks = xaf.chunk(test_file, strategy="auto")
print(f"Created {len(chunks)} optimized chunks")

# Preview at most three chunks, numbered from 1
for number, chunk in enumerate(chunks[:3], start=1):
    print(f"\nChunk {number}:")
    print(f"  ID: {chunk.chunk_id}")
    content_length = len(chunk.content)
    print(f"  Content length: {content_length} chars")
    print(f"  Element path: {chunk.element_path}")
    print(f"  Token estimate: {chunk.token_estimate}")
    print(f"  Elements included: {chunk.elements_included}")

In [None]:
# 💾 Save chunks to JSON
# Attributes copied from every chunk, in the order they appear in the output
exported_fields = (
    "chunk_id",
    "content",
    "element_path",
    "start_line",
    "end_line",
    "elements_included",
    "metadata",
    "token_estimate",
)
chunks_data = [
    {field: getattr(chunk, field) for field in exported_fields}
    for chunk in chunks
]

# Write to file
with open("chunks_output.json", "w") as output_file:
    json.dump(chunks_data, output_file, indent=2)

saved_count = len(chunks_data)
print(f"✅ Saved {saved_count} chunks to chunks_output.json")

## Advanced API Examples

Testing the advanced API with multiple chunking strategies:

In [None]:
# Enhanced analysis with full results
analysis = xaf.analyze_enhanced(test_file)

print(f"Type: {analysis.type_name} (confidence: {analysis.confidence:.2f})")

# Show how many AI use cases were reported, then the use cases themselves
use_cases = analysis.ai_use_cases
print(f"AI use cases: {len(use_cases)}")
print(f"AI use cases: {use_cases}")

# Fall back to a placeholder line when quality_metrics is empty or None
print(
    f"Quality metrics: {analysis.quality_metrics}"
    if analysis.quality_metrics
    else "Quality metrics: Not available"
)

print(f"\nKey findings: {analysis.key_findings}")
structured_keys = list(analysis.structured_data.keys())
print(f"Structured data keys: {structured_keys}")

In [None]:
# Different chunking strategies
# Run the same file through each named strategy, in this order
hierarchical_chunks, sliding_chunks, content_chunks = (
    xaf.chunk(test_file, strategy=strategy_name)
    for strategy_name in ("hierarchical", "sliding_window", "content_aware")
)

# One count per strategy, each on its own line
print(
    f"Hierarchical chunks: {len(hierarchical_chunks)}",
    f"Sliding window chunks: {len(sliding_chunks)}",
    f"Content-aware chunks: {len(content_chunks)}",
    sep="\n",
)

In [None]:
# Process chunks
print("First 3 hierarchical chunks:")

# Slicing keeps this safe when fewer than three chunks were produced
preview_chunks = hierarchical_chunks[:3]
for chunk in preview_chunks:
    char_count = len(chunk.content)
    print(f"Chunk {chunk.chunk_id}: {char_count} chars")
    element_count = len(chunk.elements_included)
    print(f"Path: {chunk.element_path}, Elements: {element_count}")

In [None]:
# 💾 Save different chunking strategies to separate files
# Helper function to convert chunk to dict
def chunk_to_dict(chunk):
    """Return a dict holding the chunk attributes this notebook exports to JSON.

    The attribute values are copied by reference, not deep-copied, and the
    keys appear in the order listed below.
    """
    exported_fields = (
        "chunk_id",
        "content",
        "element_path",
        "start_line",
        "end_line",
        "elements_included",
        "metadata",
        "token_estimate",
    )
    return {field: getattr(chunk, field) for field in exported_fields}

# Save each strategy's results
strategies = {
    "hierarchical": hierarchical_chunks,
    "sliding_window": sliding_chunks,
    "content_aware": content_chunks
}

# Loop-local names are used on purpose: reusing `chunks` / `chunks_data` here
# would rebind the notebook-level variables created by the earlier "auto"
# chunking cells, so re-running the chunks_output.json export afterwards
# would silently save the content-aware chunks instead.
for strategy_name, strategy_chunks in strategies.items():
    strategy_records = [chunk_to_dict(chunk) for chunk in strategy_chunks]

    # One output file per strategy, named after the strategy key
    output_name = f"chunks_{strategy_name}.json"
    with open(output_name, "w") as f:
        json.dump({
            "strategy": strategy_name,
            "total_chunks": len(strategy_records),
            "chunks": strategy_records
        }, f, indent=2)

    print(f"Saved {len(strategy_records)} chunks to {output_name}")

## Expert API Examples

Testing the expert API with direct class access and custom configuration:

In [None]:
# For advanced customization, use the classes directly
from xml_analysis_framework import XMLDocumentAnalyzer, ChunkingOrchestrator

# The analyzer and the orchestrator are each given their own file-size limit (MB)
analyzer = XMLDocumentAnalyzer(max_file_size_mb=500)
orchestrator = ChunkingOrchestrator(max_file_size_mb=1000)

# Custom analysis
result = analyzer.analyze_document(test_file)

# Print the result's Python type followed by its main attributes
print(
    f"Analysis result type: {type(result)}",
    f"Document type: {result.type_name}",
    f"Confidence: {result.confidence}",
    f"Handler used: {result.handler_used}",
    sep="\n",
)

In [None]:
# Custom chunking with config
from xml_analysis_framework.core.chunking import ChunkingConfig

# Settings handed to ChunkingConfig as keyword arguments
config_options = {
    "max_chunk_size": 2000,
    "min_chunk_size": 300,
    "overlap_size": 150,
    "preserve_hierarchy": True,
}
config = ChunkingConfig(**config_options)

# Chunk the document with the custom config, passing in the existing analysis result
chunks = orchestrator.chunk_document(
    test_file,
    result,
    strategy="auto",
    config=config,
)

chunk_count = len(chunks)
print(f"Custom chunking created {chunk_count} chunks")
print(f"Config - Max: {config.max_chunk_size}, Min: {config.min_chunk_size}, Overlap: {config.overlap_size}")

In [None]:
# 💾 Save with analysis metadata
# Chunking settings, read back from the config object used above
chunking_config = {
    "strategy": "auto",
    "max_chunk_size": config.max_chunk_size,
    "min_chunk_size": config.min_chunk_size,
    "overlap_size": config.overlap_size,
    "preserve_hierarchy": config.preserve_hierarchy
}

# Provenance of this export: source file, timestamp and detection outcome
run_metadata = {
    "file": test_file,
    "processed_at": datetime.now().isoformat(),
    "document_type": result.type_name,
    "confidence": result.confidence,
    "handler_used": result.handler_used,
    "chunking_config": chunking_config
}

# Analysis fields copied from the result object
analysis_summary = {
    "ai_use_cases": result.ai_use_cases,
    "key_findings": result.key_findings,
    "quality_metrics": result.quality_metrics
}

# One plain dict per chunk so the list can be passed to json.dump
chunk_records = []
for chunk in chunks:
    chunk_records.append({
        "chunk_id": chunk.chunk_id,
        "content": chunk.content,
        "element_path": chunk.element_path,
        "start_line": chunk.start_line,
        "end_line": chunk.end_line,
        "elements_included": chunk.elements_included,
        "metadata": chunk.metadata,
        "token_estimate": chunk.token_estimate
    })

output_data = {
    "metadata": run_metadata,
    "analysis": analysis_summary,
    "chunks": chunk_records
}

with open("analysis_and_chunks.json", "w") as output_file:
    json.dump(output_data, output_file, indent=2)

print(f"✅ Saved complete analysis with {len(chunks)} chunks to analysis_and_chunks.json")

## Summary

If every cell above ran without errors, all documentation examples have been tested! The framework successfully:

1. ✅ Analyzed the KML document and detected its type
2. ✅ Generated schema information
3. ✅ Created optimized chunks using different strategies
4. ✅ Exported all data to JSON format
5. ✅ Worked with custom configurations

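Check the generated JSON files (`chunks_output.json`, `chunks_hierarchical.json`, `chunks_sliding_window.json`, `chunks_content_aware.json`, and `analysis_and_chunks.json`) to see the structured output!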