In [None]:
 import boto3
import json
import pickle
import time
from tqdm import tqdm

# Bedrock runtime client shared by every embedding call in this notebook.
# Change region_name to the region you want to send requests to.
bedrock_runtime = boto3.client('bedrock-runtime', region_name='us-east-1')

In [None]:
def _invoke_titan_embedding(text, model_id):
    """
    Send one text to a Bedrock Titan embedding model and return its vector.

    Shared request/response handling for the public get_titan_*_embedding
    functions, which differ only in the model id they pass.

    Args:
        text: the string to embed (sent as "inputText")
        model_id: Bedrock model identifier to invoke

    Returns:
        the value of the "embedding" field of the model response

    Raises:
        Whatever bedrock_runtime.invoke_model raises, plus KeyError if the
        response carries no "embedding" field.
    """
    response = bedrock_runtime.invoke_model(
        modelId=model_id,
        body=json.dumps({"inputText": text}),
        contentType='application/json',
        accept='application/json'
    )

    # The response body is a stream: read it fully before parsing the JSON
    response_body = json.loads(response['body'].read())
    return response_body['embedding']


def get_titan_v2_embedding(text):
    """
    Get embedding from Titan V2

    Args:
        text: the string to embed

    Returns: list of 1024 floats
    """
    return _invoke_titan_embedding(text, 'amazon.titan-embed-text-v2:0')


def get_titan_v1_embedding(text):
    """
    Get embedding from Titan V1

    Args:
        text: the string to embed

    Returns: list of 1536 floats
    """
    return _invoke_titan_embedding(text, 'amazon.titan-embed-text-v1')

In [None]:
def embed_all_chunks(chunks, embedding_function, model_name, delay=0.1):
    """
    Generate embeddings for all chunks

    A failure on one chunk never aborts the run: the chunk is still recorded,
    with 'embedding' set to None and an extra 'error' key holding the message.

    Args:
        chunks: list of chunk dicts with 'text' key; 'document_id', 'page',
            'chunk_position' and 'token_count' are copied through when present
        embedding_function: function to call for embedding (takes the chunk text)
        model_name: string identifier for tracking
        delay: seconds to pause after each chunk to avoid rate limiting
            (default 0.1; pass 0 to disable)

    Returns:
        list of dicts, one per input chunk and in input order, with keys
        'chunk_id', 'text', 'embedding', 'document_id', 'page',
        'chunk_position', 'token_count', 'model' — plus 'error' on failures
    """
    embedded_chunks = []

    print(f"Embedding {len(chunks)} chunks with {model_name}...")

    for i, chunk in enumerate(tqdm(chunks)):
        # Build the record up front so success and failure entries share one schema.
        # .get() keeps this safe even when the chunk has no 'text' key.
        record = {
            'chunk_id': i,
            'text': chunk.get('text'),
            'embedding': None,
            'document_id': chunk.get('document_id', 'unknown'),
            'page': chunk.get('page', None),
            'chunk_position': chunk.get('chunk_position', i),
            'token_count': chunk.get('token_count', 0),
            'model': model_name
        }

        try:
            # chunk['text'] is looked up inside the try so a missing key is
            # reported as this chunk's error instead of crashing the loop
            record['embedding'] = embedding_function(chunk['text'])
        except Exception as e:
            print(f"\nError embedding chunk {i}: {e}")
            record['error'] = str(e)

        embedded_chunks.append(record)

        # Pause after failures too: throttling errors are exactly when pacing matters
        if delay and delay > 0:
            time.sleep(delay)

    return embedded_chunks

# Usage: embed every chunk with Titan V2
embedded_chunks_v2 = embed_all_chunks(
    chunks=my_chunks,  # Your chunks list
    embedding_function=get_titan_v2_embedding,
    model_name='titan_v2'
)

# Save to file
with open('embedded_chunks_titan_v2.pkl', 'wb') as f:
    pickle.dump(embedded_chunks_v2, f)

# Failed chunks stay in the list with embedding=None, so count them separately
# rather than reporting every record as embedded.
failed_v2 = sum(1 for c in embedded_chunks_v2 if c['embedding'] is None)

# \u2713 is the check mark; the escape keeps the source ASCII-safe so the
# character cannot be mangled by a wrong file encoding again.
print(f"\u2713 Embedded {len(embedded_chunks_v2) - failed_v2} of {len(embedded_chunks_v2)} chunks")

In [None]:
# Run the same chunk set through each Titan model so the results can be compared.
# Note: titan_v2 is embedded again here, even if an earlier cell already
# produced embedded_chunks_v2.
models_to_test = {
    'titan_v2': get_titan_v2_embedding,
    'titan_v1': get_titan_v1_embedding
}

all_embeddings = {}

for model_name, embed_func in models_to_test.items():
    # Banner separating the progress output of each model
    print(f"\n{'='*60}", f"Testing {model_name}", f"{'='*60}", sep="\n")

    all_embeddings[model_name] = embed_all_chunks(my_chunks, embed_func, model_name)
    embedded = all_embeddings[model_name]

    # One pickle per model, named after the model
    with open(f'embeddings_{model_name}.pkl', 'wb') as f:
        pickle.dump(embedded, f)

# Combined pickle: every model's results keyed by model name
with open('all_embeddings.pkl', 'wb') as f:
    pickle.dump(all_embeddings, f)

In [None]:
# Check embeddings were created
print(f"Total chunks embedded: {len(embedded_chunks_v2)}")

# Inspect the first *successful* record: failed chunks carry embedding=None,
# and the list may be empty, so indexing [0] directly could raise.
first_ok = next((e for e in embedded_chunks_v2 if e['embedding'] is not None), None)
if first_ok is not None:
    print(f"Embedding dimension: {len(first_ok['embedding'])}")
    print(f"Sample embedding (first 5 values): {first_ok['embedding'][:5]}")
else:
    print("No successful embeddings to inspect")

# Check for any errors
errors = [e for e in embedded_chunks_v2 if e['embedding'] is None]
print(f"Chunks with errors: {len(errors)}")

In [None]:
# Restore the Titan V2 records from the pickle file on disk.
# NOTE: unpickling can execute arbitrary code — only load files you created yourself.
with open('embedded_chunks_titan_v2.pkl', 'rb') as f:
    loaded_embeddings = pickle.loads(f.read())

print(f"Loaded {len(loaded_embeddings)} embedded chunks")