# Test Few-Shot Classification Implementation

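This notebook tests the new `{FEW_SHOT_EXAMPLES}` placeholder functionality in the Classification service. All paths it uses (for example `../lib/idp_common_pkg` and `../config_library/...`) are relative to the notebook's own directory, so run it from there.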

In [1]:
import copy
import json
import os
import sys
from pathlib import Path

import yaml

# Set ROOT_DIR for local file access from notebook directory
os.environ['ROOT_DIR'] = '../'

# Add the idp_common package to the front of the import path.
# Any earlier copy of the entry is dropped first so that re-running this cell
# stays idempotent instead of stacking duplicate entries onto sys.path.
IDP_COMMON_PKG_DIR = '../lib/idp_common_pkg'
sys.path[:] = [entry for entry in sys.path if entry != IDP_COMMON_PKG_DIR]
sys.path.insert(0, IDP_COMMON_PKG_DIR)

from idp_common.classification.service import ClassificationService
from idp_common.classification.models import PageClassification, DocumentClassification

## Load the Few-Shot Configuration

In [2]:
# Read the few-shot example configuration that ships with the config library
config_path = '../config_library/pattern-2/few_shot_example/config.yaml'
with open(config_path, 'r') as config_file:
    config = yaml.safe_load(config_file)

# Report a short summary of what was loaded
print("Configuration loaded successfully!")
class_count = len(config.get('classes', []))
print(f"Number of classes: {class_count}")
classification_settings = config.get('classification', {})
print(f"Classification method: {classification_settings.get('classificationMethod')}")

Configuration loaded successfully!
Number of classes: 11
Classification method: multimodalPageLevelClassification


## Examine the Task Prompt Template

In [3]:
# Pull the classification task prompt out of the config and show it verbatim,
# framed by divider lines so its start and end are easy to spot.
task_prompt = config['classification']['task_prompt']
divider = "=" * 50
for line in ("Task prompt template:", divider, task_prompt, divider):
    print(line)

# Report whether the template carries the few-shot placeholder
has_placeholder = "{FEW_SHOT_EXAMPLES}" in task_prompt
print(f"\nContains {{FEW_SHOT_EXAMPLES}} placeholder: {has_placeholder}")

Task prompt template:
Classify this document into exactly one of these categories:

{CLASS_NAMES_AND_DESCRIPTIONS}

Respond only with a JSON object containing the class label. For example: {{"class": "letter"}}
<few_shot_examples>
{FEW_SHOT_EXAMPLES}
</few_shot_examples>
<<CACHEPOINT>>
<document_ocr_data>
{DOCUMENT_TEXT}
</document_ocr_data>

Contains {FEW_SHOT_EXAMPLES} placeholder: True


## Initialize Classification Service

In [4]:
# Build the service under test from the few-shot config.
# A construction failure is reported as a readable hint rather than a traceback.
# NOTE(review): when construction fails, `service` is never assigned, so the
# later cells that use it cannot work — confirm this best-effort behavior is intended.
service_kwargs = {
    "config": config,
    "backend": "bedrock",
    "region": "us-east-1",  # You may need to adjust this
}
try:
    service = ClassificationService(**service_kwargs)
except Exception as init_error:
    print(f"Error initializing service: {init_error}")
    print("Note: This is expected if AWS credentials are not configured for Bedrock")
else:
    print("Classification service initialized successfully!")

Classification service initialized successfully!


## Examine Class Examples Structure

In [5]:
# Walk every class in the configuration and list its few-shot examples,
# showing for each example image where it would be looked up.


def _report_local_image(label, base_dir, image_path):
    """Print the resolved local path of an example image and whether it exists.

    Args:
        label: Text printed in front of the resolved path (e.g. "ROOT_DIR path").
        base_dir: Directory that the relative ``image_path`` is joined onto.
        image_path: Image path exactly as written in the configuration.
    """
    full_path = os.path.normpath(os.path.join(base_dir, image_path))
    exists = os.path.exists(full_path)
    print(f"    {label}: {full_path}")
    print(f"    Image exists: {exists}")
    if exists:
        print(f"    Image size: {os.path.getsize(full_path)} bytes")


print("Examples found in configuration:")
print("=" * 50)

classes = config.get('classes', [])
total_examples = 0

# The environment does not change while iterating, so read it once up front.
config_bucket = os.environ.get('CONFIGURATION_BUCKET')
root_dir = os.environ.get('ROOT_DIR')

for class_obj in classes:
    class_name = class_obj.get('name', 'Unknown')
    examples = class_obj.get('examples', [])

    print(f"\nClass: {class_name}")
    print(f"Number of examples: {len(examples)}")

    for example_number, example in enumerate(examples, start=1):
        print(f"  Example {example_number}:")
        print(f"    Name: {example.get('name', 'N/A')}")
        print(f"    Class Prompt: {example.get('classPrompt', 'N/A')}")
        print(f"    Image Path: {example.get('imagePath', 'N/A')}")

        # Show where the image would come from. Lookup order used here:
        # explicit s3:// URI, then CONFIGURATION_BUCKET, then ROOT_DIR, then '..'.
        # NOTE(review): presumably mirrors the service's own resolution order — verify.
        image_path = example.get('imagePath')
        if image_path:
            if image_path.startswith('s3://'):
                print(f"    S3 URI: {image_path}")
            elif config_bucket:
                print(f"    Would use S3: s3://{config_bucket}/{image_path}")
            elif root_dir:
                _report_local_image("ROOT_DIR path", root_dir, image_path)
            else:
                # Neither variable is set: fall back to the parent directory
                _report_local_image("Calculated path", '..', image_path)

        total_examples += 1

print(f"\nTotal examples across all classes: {total_examples}")
print("\nEnvironment variables:")
print(f"  ROOT_DIR: {os.environ.get('ROOT_DIR', 'Not set')}")
print(f"  CONFIGURATION_BUCKET: {os.environ.get('CONFIGURATION_BUCKET', 'Not set')}")

Examples found in configuration:

Class: letter
Number of examples: 3
  Example 1:
    Name: Letter1
    Class Prompt: This is an example of the class 'letter'
    Image Path: config_library/pattern-2/few_shot_example/example-images/letter1.jpg
    ROOT_DIR path: ../config_library/pattern-2/few_shot_example/example-images/letter1.jpg
    Image exists: True
    Image size: 106629 bytes
  Example 2:
    Name: Letter2
    Class Prompt: This is an example of the class 'letter'
    Image Path: config_library/pattern-2/few_shot_example/example-images/letter2.jpg
    ROOT_DIR path: ../config_library/pattern-2/few_shot_example/example-images/letter2.jpg
    Image exists: False
  Example 3:
    Name: Email1
    Class Prompt: This is an example of the class 'email'
    Image Path: config_library/pattern-2/few_shot_example/example-images/email1.jpg
    ROOT_DIR path: ../config_library/pattern-2/few_shot_example/example-images/email1.jpg
    Image exists: True
    Image size: 49648 bytes

Class: f

## Test Few-Shot Examples Content Building

In [6]:
# Build only the few-shot portion of the request and describe each item in it
print("Testing _build_few_shot_examples_content method...")

try:
    # Ask the service for the few-shot examples content
    examples_content = service._build_few_shot_examples_content()

    print(f"Generated {len(examples_content)} content items from examples")
    print("\nContent items:")

    for position, entry in enumerate(examples_content, start=1):
        print(f"\nItem {position}:")

        if 'text' in entry:
            # Text items: show at most the first 100 characters
            text = entry['text']
            suffix = '...' if len(text) > 100 else ''
            print("  Type: text")
            print(f"  Content: {text[:100]}{suffix}")
            continue

        if 'image' in entry:
            # Image items: show the format and, when raw bytes are present, their size
            image = entry['image']
            print("  Type: image")
            print(f"  Format: {image.get('format', 'unknown')}")
            if 'source' in image and 'bytes' in image['source']:
                print(f"  Size: {len(image['source']['bytes'])} bytes")
            continue

        # Anything else: list its keys so the shape is at least visible
        print("  Type: unknown")
        print(f"  Keys: {list(entry.keys())}")

except Exception as build_error:
    print(f"Error building few-shot examples: {build_error}")
    print("This might be due to missing image files or path issues")

Failed to load example image from config_library/pattern-2/few_shot_example/example-images/letter2.jpg: [Errno 2] No such file or directory: '../config_library/pattern-2/few_shot_example/example-images/letter2.jpg'


Testing _build_few_shot_examples_content method...
Generated 5 content items from examples

Content items:

Item 1:
  Type: text
  Content: This is an example of the class 'letter'

Item 2:
  Type: image
  Format: jpeg
  Size: 106629 bytes

Item 3:
  Type: text
  Content: This is an example of the class 'letter'

Item 4:
  Type: text
  Content: This is an example of the class 'email'

Item 5:
  Type: image
  Format: jpeg
  Size: 49648 bytes


## Test Complete Content Building with Examples

In [7]:
# Assemble the full request content (prompt text plus few-shot items) for a sample document
print("Testing _build_content_with_few_shot_examples method...")

# Stand-in document text; later cells reuse this variable
sample_document_text = "This is a sample document for testing classification."

try:
    # The prompt template is read from the service's classification config
    classification_config = service._get_classification_config()
    task_prompt_template = classification_config['task_prompt']

    # Combine the template, the sample text and the class list into content items
    builder_args = {
        "task_prompt_template": task_prompt_template,
        "document_text": sample_document_text,
        "class_names_and_descriptions": service._format_classes_list(),
    }
    content = service._build_content_with_few_shot_examples(**builder_args)

    print(f"Generated content array with {len(content)} items")
    print("\nContent structure:")

    for position, entry in enumerate(content, start=1):
        print(f"\nItem {position}:")
        if 'text' in entry:
            # Keep the preview on one line: newlines are shown as a literal \n
            full_text = entry['text']
            text_preview = full_text[:200].replace('\n', '\\n')
            suffix = '...' if len(full_text) > 200 else ''
            print("  Type: text")
            print(f"  Preview: {text_preview}{suffix}")
        elif 'image' in entry:
            image = entry['image']
            print("  Type: image")
            print(f"  Format: {image.get('format', 'unknown')}")
            if 'source' in image and 'bytes' in image['source']:
                print(f"  Size: {len(image['source']['bytes'])} bytes")
        else:
            print("  Type: unknown")
            print(f"  Keys: {list(entry.keys())}")

except Exception as build_error:
    print(f"Error building content with few-shot examples: {build_error}")
    # Imported here because it is only needed on the failure path
    import traceback
    traceback.print_exc()

Failed to load example image from config_library/pattern-2/few_shot_example/example-images/letter2.jpg: [Errno 2] No such file or directory: '../config_library/pattern-2/few_shot_example/example-images/letter2.jpg'


Testing _build_content_with_few_shot_examples method...
Generated content array with 7 items

Content structure:

Item 1:
  Type: text
  Preview: Classify this document into exactly one of these categories:\n\nletter  	[ A formal written correspondence with sender/recipient addresses, date, salutation, body, and closing signature ]\nform  	[ A str...

Item 2:
  Type: text
  Preview: This is an example of the class 'letter'

Item 3:
  Type: image
  Format: jpeg
  Size: 106629 bytes

Item 4:
  Type: text
  Preview: This is an example of the class 'letter'

Item 5:
  Type: text
  Preview: This is an example of the class 'email'

Item 6:
  Type: image
  Format: jpeg
  Size: 49648 bytes

Item 7:
  Type: text
  Preview: \n</few_shot_examples>\n<<CACHEPOINT>>\n<document_ocr_data>\nThis is a sample document for testing classification.\n</document_ocr_data>


## Test Path Resolution Logic

In [8]:
# Exercise few-shot image loading under three environment setups:
#   1. ROOT_DIR set (the notebook default)
#   2. ROOT_DIR removed
#   3. CONFIGURATION_BUCKET set
# The steps mutate os.environ, so they run inside try/finally: the variables
# are put back even when a step is interrupted (for example a KeyboardInterrupt
# during an S3 call, which `except Exception` does not catch).
print("Testing image path resolution logic:")
print("=" * 50)

# Save original environment state
original_bucket = os.environ.get('CONFIGURATION_BUCKET')
original_root_dir = os.environ.get('ROOT_DIR')

print(f"Current ROOT_DIR: {os.environ.get('ROOT_DIR', 'Not set')}")
print(f"Current CONFIGURATION_BUCKET: {os.environ.get('CONFIGURATION_BUCKET', 'Not set')}")

try:
    # Test 1: With ROOT_DIR (current setup)
    print("\n1. WITH ROOT_DIR environment variable (current setup):")
    print("-" * 50)

    try:
        examples_content = service._build_few_shot_examples_content()
        print(f"Successfully built {len(examples_content)} content items using ROOT_DIR")

        # Count successful image loads
        image_items = [item for item in examples_content if 'image' in item]
        print(f"Loaded {len(image_items)} image items from local files")

    except Exception as e:
        print(f"Error building examples with ROOT_DIR: {e}")

    # Test 2: Without ROOT_DIR but with calculated path
    print("\n2. WITHOUT ROOT_DIR (using calculated path):")
    print("-" * 50)

    os.environ.pop('ROOT_DIR', None)

    try:
        # Create a new service instance without ROOT_DIR
        test_service = ClassificationService(
            config=config,
            backend="bedrock",
            region="us-east-1"
        )

        examples_content = test_service._build_few_shot_examples_content()
        print(f"Successfully built {len(examples_content)} content items using calculated path")

        # Count successful image loads
        image_items = [item for item in examples_content if 'image' in item]
        print(f"Loaded {len(image_items)} image items from local files")

    except Exception as e:
        print(f"Error building examples without ROOT_DIR: {e}")

    # Test 3: With CONFIGURATION_BUCKET
    print("\n3. WITH CONFIGURATION_BUCKET environment variable:")
    print("-" * 50)

    # Set a test bucket name (a placeholder, not a bucket this notebook owns)
    os.environ['CONFIGURATION_BUCKET'] = 'test-config-bucket'

    try:
        test_service = ClassificationService(
            config=config,
            backend="bedrock",
            region="us-east-1"
        )

        print(f"CONFIGURATION_BUCKET set to: {os.environ.get('CONFIGURATION_BUCKET')}")
        print("Note: This would attempt to load images from S3, which may fail without proper setup")

        # S3 reads against the placeholder bucket are expected to fail.
        # NOTE(review): the recorded run shows the per-image failures being logged
        # while the call still returns, so the success branch below can run with
        # text items only — check the item count rather than the message.
        try:
            examples_content = test_service._build_few_shot_examples_content()
            print(f"Successfully built {len(examples_content)} content items using S3")
        except Exception as e:
            print(f"Expected error when trying to access S3: {e}")
            print("This is normal - the logic correctly tries to use S3 when CONFIGURATION_BUCKET is set")

    except Exception as e:
        print(f"Error with CONFIGURATION_BUCKET test: {e}")

finally:
    # Restore original environment
    if original_bucket is not None:
        os.environ['CONFIGURATION_BUCKET'] = original_bucket
    else:
        os.environ.pop('CONFIGURATION_BUCKET', None)

    # Put ROOT_DIR back to the value it had on entry. The notebook default
    # ('../') is used only when it was not set at all, so a caller-chosen
    # ROOT_DIR is no longer overwritten after being restored.
    os.environ['ROOT_DIR'] = original_root_dir if original_root_dir is not None else '../'

Failed to load example image from config_library/pattern-2/few_shot_example/example-images/letter2.jpg: [Errno 2] No such file or directory: '../config_library/pattern-2/few_shot_example/example-images/letter2.jpg'
Failed to load example image from config_library/pattern-2/few_shot_example/example-images/letter2.jpg: [Errno 2] No such file or directory: '/home/ec2-user/projects/genaiic-idp-accelerator/config_library/pattern-2/few_shot_example/example-images/letter2.jpg'


Testing image path resolution logic:
Current ROOT_DIR: ../
Current CONFIGURATION_BUCKET: Not set

1. WITH ROOT_DIR environment variable (current setup):
--------------------------------------------------
Successfully built 5 content items using ROOT_DIR
Loaded 2 image items from local files

2. WITHOUT ROOT_DIR (using calculated path):
--------------------------------------------------
Successfully built 5 content items using calculated path
Loaded 2 image items from local files

3. WITH CONFIGURATION_BUCKET environment variable:
--------------------------------------------------
CONFIGURATION_BUCKET set to: test-config-bucket
Note: This would attempt to load images from S3, which may fail without proper setup


Error reading binary content from s3://test-config-bucket/config_library/pattern-2/few_shot_example/example-images/letter1.jpg: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied
Failed to load example image from config_library/pattern-2/few_shot_example/example-images/letter1.jpg: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied
Error reading binary content from s3://test-config-bucket/config_library/pattern-2/few_shot_example/example-images/letter2.jpg: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied
Failed to load example image from config_library/pattern-2/few_shot_example/example-images/letter2.jpg: An error occurred (AccessDenied) when calling the GetObject operation: Access Denied
Error reading binary content from s3://test-config-bucket/config_library/pattern-2/few_shot_example/example-images/email1.jpg: An error occurred (AccessDenied) when calling the GetObject operation

Successfully built 3 content items using S3


## Test Prompt Template Splitting

In [9]:
# Show the two halves the task prompt breaks into around the FEW_SHOT_EXAMPLES placeholder
task_prompt = config['classification']['task_prompt']

print("Testing prompt template splitting:")
print("=" * 50)

parts = task_prompt.split("{FEW_SHOT_EXAMPLES}")
print(f"Number of parts after split: {len(parts)}")

if len(parts) != 2:
    # Zero or several placeholders: the before/after view below would not apply
    print("Unexpected number of parts - prompt may not contain the placeholder correctly")
else:
    section_rule = "-" * 30
    headings = ("\nPart 1 (before examples):", "\nPart 2 (after examples):")
    for heading, section_text in zip(headings, parts):
        print(heading)
        print(section_rule)
        print(section_text)

Testing prompt template splitting:
Number of parts after split: 2

Part 1 (before examples):
------------------------------
Classify this document into exactly one of these categories:

{CLASS_NAMES_AND_DESCRIPTIONS}

Respond only with a JSON object containing the class label. For example: {{"class": "letter"}}
<few_shot_examples>


Part 2 (after examples):
------------------------------

</few_shot_examples>
<<CACHEPOINT>>
<document_ocr_data>
{DOCUMENT_TEXT}
</document_ocr_data>


## Compare Old vs New Content Structure

In [10]:
# Contrast the single-text prompt (placeholder replaced by static text) with
# the multi-item content produced when few-shot examples are expanded.
print("Comparing content structures:")
print("=" * 50)

# Stand-in document text for both variants
sample_document_text = "This is a sample document for testing classification."

# Old approach: the placeholder is swapped for plain text, giving one text item
old_task_prompt = task_prompt.replace("{FEW_SHOT_EXAMPLES}", "[Few shot examples would go here]")
old_substitutions = {
    "DOCUMENT_TEXT": sample_document_text,
    "CLASS_NAMES_AND_DESCRIPTIONS": service._format_classes_list(),
}
old_prepared = service._prepare_prompt_from_template(
    old_task_prompt,
    old_substitutions,
    required_placeholders=["DOCUMENT_TEXT", "CLASS_NAMES_AND_DESCRIPTIONS"],
)
old_content = [{"text": old_prepared}]

print("OLD APPROACH (single text item):")
print(f"Content items: {len(old_content)}")
print(f"Item 1 type: {next(iter(old_content[0]))}")
print(f"Text length: {len(old_content[0]['text'])} characters")

print("\nNEW APPROACH (with few-shot examples):")
try:
    new_content = service._build_content_with_few_shot_examples(
        task_prompt_template=task_prompt,
        document_text=sample_document_text,
        class_names_and_descriptions=service._format_classes_list(),
    )

    print(f"Content items: {len(new_content)}")

    # Tally items by kind
    text_items = len([entry for entry in new_content if 'text' in entry])
    image_items = len([entry for entry in new_content if 'image' in entry])

    print(f"Text items: {text_items}")
    print(f"Image items: {image_items}")

    # Characters across all text items (image items carry no text)
    total_text_length = sum(len(entry['text']) for entry in new_content if 'text' in entry)
    print(f"Total text length: {total_text_length} characters")

except Exception as compare_error:
    print(f"Error with new approach: {compare_error}")

Failed to load example image from config_library/pattern-2/few_shot_example/example-images/letter2.jpg: [Errno 2] No such file or directory: '../config_library/pattern-2/few_shot_example/example-images/letter2.jpg'


Comparing content structures:
OLD APPROACH (single text item):
Content items: 1
Item 1 type: text
Text length: 1683 characters

NEW APPROACH (with few-shot examples):
Content items: 7
Text items: 5
Image items: 2
Total text length: 1769 characters


## Save Content for Inspection

In [11]:
# Write a JSON description of the generated content so it can be inspected offline


def _to_serializable(entry):
    """Return a JSON-friendly summary of one content item.

    Text items keep their text; image items are reduced to format and byte
    count (raw bytes are not JSON-serializable); anything else lists its keys.
    """
    if 'text' in entry:
        return {'type': 'text', 'content': entry['text']}
    if 'image' in entry:
        image = entry['image']
        has_bytes = 'source' in image and 'bytes' in image['source']
        return {
            'type': 'image',
            'format': image.get('format', 'unknown'),
            'size_bytes': len(image['source']['bytes']) if has_bytes else 0,
        }
    return {'type': 'unknown', 'keys': list(entry.keys())}


try:
    content = service._build_content_with_few_shot_examples(
        task_prompt_template=task_prompt,
        document_text=sample_document_text,
        class_names_and_descriptions=service._format_classes_list(),
    )

    # Create a serializable version (binary image data is dropped for JSON)
    serializable_content = []
    for entry in content:
        serializable_content.append(_to_serializable(entry))

    report = {
        'summary': {
            'total_items': len(content),
            'text_items': sum(1 for entry in content if 'text' in entry),
            'image_items': sum(1 for entry in content if 'image' in entry),
        },
        'content': serializable_content,
    }

    # Save to a file in the current working directory
    output_file = 'few_shot_content_structure.json'
    with open(output_file, 'w') as report_file:
        json.dump(report, report_file, indent=2)

    print(f"Content structure saved to {output_file}")
    print(f"You can inspect the detailed structure in that file.")

except Exception as save_error:
    print(f"Error saving content: {save_error}")

Failed to load example image from config_library/pattern-2/few_shot_example/example-images/letter2.jpg: [Errno 2] No such file or directory: '../config_library/pattern-2/few_shot_example/example-images/letter2.jpg'


Content structure saved to few_shot_content_structure.json
You can inspect the detailed structure in that file.


## Test Edge Cases

In [12]:
# Check the fallback path: a task prompt WITHOUT the FEW_SHOT_EXAMPLES
# placeholder should produce a single text content item.
print("Testing fallback behavior without FEW_SHOT_EXAMPLES placeholder:")
print("=" * 60)

# Create a modified config without the placeholder.
# This must be a deep copy: dict.copy() is shallow, so assigning into
# modified_config['classification'] would also rewrite config['classification']
# and strip the placeholder from the original `config` that the earlier cells
# (and the service instances built from it) rely on.
modified_config = copy.deepcopy(config)
modified_config['classification']['task_prompt'] = task_prompt.replace(
    "{FEW_SHOT_EXAMPLES}",
    "[Static examples would be here]"
)

# Initialize service with modified config
try:
    fallback_service = ClassificationService(
        config=modified_config,
        backend="bedrock",
        region="us-east-1"
    )

    # Test content building
    fallback_content = fallback_service._build_content_with_few_shot_examples(
        task_prompt_template=modified_config['classification']['task_prompt'],
        document_text=sample_document_text,
        class_names_and_descriptions=fallback_service._format_classes_list()
    )

    print(f"Fallback content items: {len(fallback_content)}")
    print(f"Should be 1 (fallback to single text item): {len(fallback_content) == 1}")

    if fallback_content:
        print(f"First item type: {'text' if 'text' in fallback_content[0] else 'other'}")

except Exception as e:
    print(f"Error testing fallback: {e}")

Testing fallback behavior without FEW_SHOT_EXAMPLES placeholder:
Fallback content items: 1
Should be 1 (fallback to single text item): True
First item type: text
