# S3 Vectors Large-Scale Test

Comprehensive testing of S3 Vectors with large datasets and IVFPQ indexing.

This notebook demonstrates:
- Large-scale vector operations (50k+ vectors)
- IVFPQ index creation and optimization
- Performance testing with real-world datasets
- Batch processing and efficient data handling

**Note**: This notebook requires significant compute time and memory for large datasets.

In [6]:
# Setup and check existing buckets
import boto3
import time
import requests
import numpy as np

# Connect to S3 Vectors
s3vectors_client = boto3.client(
    's3vectors',
    endpoint_url='http://localhost:8000',
    aws_access_key_id='minioadmin',
    aws_secret_access_key='minioadmin123',
    region_name='us-east-1'
)

print("🔍 Checking existing buckets and their vector counts...")

# List all buckets
try:
    buckets_response = s3vectors_client.list_vector_buckets()
    buckets = buckets_response.get('vectorBuckets', [])
    print(f"Found {len(buckets)} existing buckets:")
    
    for bucket in buckets:
        bucket_name = bucket['vectorBucketName']
        print(f"\n📦 Bucket: {bucket_name}")
        
        # List indexes in this bucket
        try:
            indexes_response = s3vectors_client.list_indexes(vectorBucketName=bucket_name)
            indexes = indexes_response.get('indexes', [])
            
            if indexes:
                for index in indexes:
                    index_name = index['indexName']
                    print(f"  📊 Index: {index_name}")
        except Exception as e:
            print(f"  ❌ Error listing indexes: {e}")
            
except Exception as e:
    print(f"❌ Error listing buckets: {e}")

🔍 Checking existing buckets and their vector counts...
Found 19 existing buckets:

📦 Bucket: basic-demo-1755537293
  📊 Index: index_demo-index_table

📦 Bucket: boto3-test-1755425169

📦 Bucket: boto3-test-1755425216
  📊 Index: index_demo-index_table

📦 Bucket: boto3-test-1755425169

📦 Bucket: boto3-test-1755425216
  📊 Index: index_test-index_table

📦 Bucket: dbpedia-test-bucket
  📊 Index: index_test-index_table

📦 Bucket: dbpedia-test-bucket
  📊 Index: index_dbpedia-index_table

📦 Bucket: final-test-1755531795
  📊 Index: index_final-test-index_table

📦 Bucket: final-test-1755531916
  📊 Index: index_dbpedia-index_table

📦 Bucket: final-test-1755531795
  📊 Index: index_final-test-index_table

📦 Bucket: final-test-1755531916
  📊 Index: index_final-test-index_table

📦 Bucket: simple-test-1755532018
  📊 Index: index_simple-index_table
  📊 Index: index_test-working-index_table

📦 Bucket: test-1755425607
  📊 Index: index_final-test-index_table

📦 Bucket: simple-test-1755532018
  📊 Index: index

In [7]:
# Helper function to get embeddings from LM Studio
def get_embedding(text):
    """Get embedding from LM Studio local server"""
    try:
        response = requests.post(
            "http://localhost:1234/v1/embeddings",
            headers={"Content-Type": "application/json"},
            json={
                "input": text,
                "model": "text-embedding-nomic-embed-text-v1.5"
            },
            timeout=30
        )
        
        if response.status_code == 200:
            return response.json()['data'][0]['embedding']
        else:
            print(f"⚠️ Warning: LM Studio request failed with status {response.status_code}, using fallback")
            return np.random.rand(768).tolist()  # 768-dim fallback
            
    except Exception as e:
        print(f"⚠️ Warning: LM Studio connection failed ({e}), using fallback")
        return np.random.rand(768).tolist()  # 768-dim fallback

# Test the embedding function
test_embedding = get_embedding("test query")
print(f"✅ Embedding function working, dimension: {len(test_embedding)}")

✅ Embedding function working, dimension: 768


In [8]:
# Create bucket for large-scale test (or reuse existing)
bucket_name = "large-scale-test-50k"
index_name = "ivfpq-index"

print(f"🚀 Setting up large-scale test with {bucket_name}")

# Try to create bucket (will succeed if it doesn't exist)
try:
    response = s3vectors_client.create_vector_bucket(
        vectorBucketName=bucket_name
    )
    print(f"✅ Created new bucket: {bucket_name}")
except Exception as e:
    if "already exists" in str(e).lower():
        print(f"📦 Using existing bucket: {bucket_name}")
    else:
        print(f"❌ Error creating bucket: {e}")
        raise

🚀 Setting up large-scale test with large-scale-test-50k
✅ Created new bucket: large-scale-test-50k


In [9]:
# Generate and insert 50k vectors in batches
print("🔢 Generating and inserting 50,000 vectors...")

# Categories for diversity
categories = ["technology", "science", "history", "literature", "sports", "music", "art", "travel", "food", "nature"]

total_vectors = 50000
batch_size = 100
vectors_inserted = 0

start_time = time.time()

for batch_num in range(0, total_vectors, batch_size):
    batch_vectors = []
    
    for i in range(batch_size):
        if vectors_inserted >= total_vectors:
            break
            
        doc_id = f"doc_{vectors_inserted}"
        category = categories[vectors_inserted % len(categories)]
        text = f"This is document {vectors_inserted} about {category} with detailed content and information."
        
        # Get embedding
        embedding = get_embedding(text)
        
        batch_vectors.append({
            "key": doc_id,
            "data": {"float32": embedding},
            "metadata": {
                "text": text,
                "category": category,
                "doc_id": doc_id
            }
        })
        
        vectors_inserted += 1
    
    if batch_vectors:
        # Insert batch
        try:
            s3vectors_client.put_vectors(
                vectorBucketName=bucket_name,
                indexName=index_name,
                vectors=batch_vectors
            )
            
            # Progress update every 1000 vectors
            if vectors_inserted % 1000 == 0 or vectors_inserted == total_vectors:
                elapsed = time.time() - start_time
                rate = vectors_inserted / elapsed if elapsed > 0 else 0
                print(f"📊 Inserted {vectors_inserted:,}/{total_vectors:,} vectors ({rate:.1f} vectors/sec)")
        
        except Exception as e:
            print(f"❌ Error inserting batch at {vectors_inserted}: {e}")
            break

total_time = time.time() - start_time
print(f"\n✅ Completed! Inserted {vectors_inserted:,} vectors in {total_time:.1f} seconds")
print(f"📈 Average rate: {vectors_inserted/total_time:.1f} vectors/second")

🔢 Generating and inserting 50,000 vectors...
📊 Inserted 1,000/50,000 vectors (39.2 vectors/sec)
📊 Inserted 1,000/50,000 vectors (39.2 vectors/sec)
📊 Inserted 2,000/50,000 vectors (40.4 vectors/sec)
📊 Inserted 2,000/50,000 vectors (40.4 vectors/sec)
📊 Inserted 3,000/50,000 vectors (40.8 vectors/sec)
📊 Inserted 3,000/50,000 vectors (40.8 vectors/sec)
📊 Inserted 4,000/50,000 vectors (41.0 vectors/sec)
📊 Inserted 4,000/50,000 vectors (41.0 vectors/sec)
📊 Inserted 5,000/50,000 vectors (41.1 vectors/sec)
📊 Inserted 5,000/50,000 vectors (41.1 vectors/sec)
📊 Inserted 6,000/50,000 vectors (41.3 vectors/sec)
📊 Inserted 6,000/50,000 vectors (41.3 vectors/sec)
📊 Inserted 7,000/50,000 vectors (41.4 vectors/sec)
📊 Inserted 7,000/50,000 vectors (41.4 vectors/sec)
📊 Inserted 8,000/50,000 vectors (41.4 vectors/sec)
📊 Inserted 8,000/50,000 vectors (41.4 vectors/sec)
📊 Inserted 9,000/50,000 vectors (41.5 vectors/sec)
📊 Inserted 9,000/50,000 vectors (41.5 vectors/sec)
📊 Inserted 10,000/50,000 vectors (41.

In [10]:
# Create IVFPQ index for efficient similarity search
print(f"🔧 Creating IVFPQ index '{index_name}' for large-scale search...")

try:
    start_time = time.time()
    
    response = s3vectors_client.create_index(
        vectorBucketName=bucket_name,
        indexName=index_name,
        dimension=768,
        dataType="float32",
        distanceMetric="cosine"
    )
    
    index_time = time.time() - start_time
    print(f"✅ IVFPQ index created successfully in {index_time:.1f} seconds!")
    print(f"📊 Index details: {response}")
    
except Exception as e:
    if "already exists" in str(e).lower():
        print(f"📊 Index '{index_name}' already exists, using existing index")
    else:
        print(f"❌ Error creating index: {e}")
        raise

🔧 Creating IVFPQ index 'ivfpq-index' for large-scale search...
✅ IVFPQ index created successfully in 0.6 seconds!
📊 Index details: {'ResponseMetadata': {'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Thu, 21 Aug 2025 06:18:38 GMT', 'server': 'uvicorn', 'content-length': '170', 'content-type': 'application/json'}, 'RetryAttempts': 0}}
✅ IVFPQ index created successfully in 0.6 seconds!
📊 Index details: {'ResponseMetadata': {'HTTPStatusCode': 200, 'HTTPHeaders': {'date': 'Thu, 21 Aug 2025 06:18:38 GMT', 'server': 'uvicorn', 'content-length': '170', 'content-type': 'application/json'}, 'RetryAttempts': 0}}


In [11]:
# Test similarity search with various queries
print("🔍 Testing similarity search on large dataset...")

# Test queries for different categories
test_queries = [
    "artificial intelligence and machine learning",
    "historical events and ancient civilizations", 
    "classical music and composers",
    "space exploration and astronomy",
    "environmental conservation and nature"
]

for i, query_text in enumerate(test_queries, 1):
    print(f"\n🎯 Query {i}: '{query_text}'")
    
    # Get query embedding
    query_embedding = get_embedding(query_text)
    
    # Search with different top-k values
    for top_k in [5, 10]:
        try:
            start_time = time.time()
            
            response = s3vectors_client.query_vectors(
                vectorBucketName=bucket_name,
                indexName=index_name,
                queryVector=query_embedding,
                topK=top_k,
                returnMetadata=True
            )
            
            search_time = time.time() - start_time
            
            vectors_found = len(response.get('vectors', []))
            print(f"  📊 Found {vectors_found} results (top-{top_k}) in {search_time*1000:.1f}ms")
            
            # Show top 3 results
            for j, result in enumerate(response.get('vectors', [])[:3]):
                metadata = result.get('metadata', {})
                distance = result.get('distance', 0.0)
                similarity = 1 - distance  # Convert distance to similarity
                category = metadata.get('category', 'unknown')
                key = result.get('key', 'unknown')
                print(f"    {j+1}. {key} (similarity: {similarity:.3f}, category: {category})")
        
        except Exception as e:
            print(f"    ❌ Error in search: {e}")

print("\n✅ Large-scale similarity search testing completed!")

🔍 Testing similarity search on large dataset...

🎯 Query 1: 'artificial intelligence and machine learning'
    ❌ Error in search: Parameter validation failed:
Invalid type for parameter queryVector, value: [-0.01944350078701973, 0.03250119462609291, -0.15876196324825287, 0.0023260731250047684, 0.04277420416474342, 0.03695228323340416, 0.023584989830851555, -0.02699401043355465, -0.00547292735427618, 0.06639376282691956, 0.049575075507164, 0.032042693346738815, 0.12299171090126038, 0.029790256172418594, -0.009925048798322678, -0.003692124504595995, -0.06097208335995674, -0.04377865791320801, -0.037606555968523026, -0.008589806035161018, -0.009126558899879456, 0.03535635396838188, 0.022429587319493294, 0.005853317212313414, 0.08373699337244034, 0.01151413656771183, -0.055005479604005814, -0.01846512034535408, 0.046805430203676224, 0.04794007167220116, 0.009393741376698017, -0.03456559777259827, 0.031080568209290504, -0.022807128727436066, -0.02474253810942173, -0.02295578271150589, 0.110