# AWS S3 Vectors Sematic Search 

This notebook demonstrates how to use Amazon S3 Vectors for semantic search by creating an index, uploading vectors, and querying them using a custom text model. It includes steps for data preparation, vector encoding, and querying the S3 Vectors service.


In [None]:

# --------------------------------------------------------------
# Setup Prerequisites for Semantic Search with AWS S3Vector
# --------------------------------------------------------------

%pip install pandas numpy torch boto3 sentence-transformers

In [27]:
# ------------------------------------------
# Load and preprocess the data
# ------------------------------------------
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import boto3 

# Load the CSV file
print('Loading the file...')
df_raw = pd.read_csv("../data/medicaid-drugs.csv")

# Get first 100 records per state
df = df_raw.groupby('State', group_keys=False).head(100).copy()

print('File loaded and filtered.')

# Fill missing values for key numeric fields
df.fillna({
    'Number of Prescriptions': 0,
    'Total Amount Reimbursed': 0
}, inplace=True)

# Build vocabularies for categorical features
state_vocab = {s: i for i, s in enumerate(df['State'].unique())}
ndc_vocab = {s: i for i, s in enumerate(df['NDC'].unique())}
drug_vocab = {s: i for i, s in enumerate(df['Product Name'].unique())}

print('Vocabularies built:')
print(f'- States: {len(state_vocab)}')
print(f'- NDCs: {len(ndc_vocab)}')
print(f'- Drugs: {len(drug_vocab)}')

Loading the file...
File loaded and filtered.
Vocabularies built:
- States: 52
- NDCs: 270
- Drugs: 193


In [21]:
# Encode categorical data as integers
df['state_id'] = df['State'].map(state_vocab)
df['ndc_id'] = df['NDC'].map(ndc_vocab)
df['drug_id'] = df['Product Name'].map(drug_vocab)

print('Encode categorical data')

Encode categorical data


In [22]:
# Normalize numeric features
numeric_feats = df[['Number of Prescriptions', 'Total Amount Reimbursed']].astype(np.float32)
numeric_feats = (numeric_feats - numeric_feats.mean()) / (numeric_feats.std() + 1e-6)

# Convert to tensors
state_ids = torch.tensor(df['state_id'].values, dtype=torch.long)
ndc_ids = torch.tensor(df['ndc_id'].values, dtype=torch.long)
drug_ids = torch.tensor(df['drug_id'].values, dtype=torch.long)
numeric_tensor = torch.tensor(numeric_feats.values, dtype=torch.float32)

print('Convert to tensors')

Convert to tensors


In [23]:

# ------------------------------------------
# Define your encoder
# ------------------------------------------
class MedicaidDrugEncoder(nn.Module):
    def __init__(self, state_vocab_size, ndc_vocab_size, drug_vocab_size, embed_dim, num_inputs):
        super().__init__()
        self.state_embed = nn.Embedding(state_vocab_size, embed_dim)
        self.ndc_embed = nn.Embedding(ndc_vocab_size, embed_dim)
        self.drug_embed = nn.Embedding(drug_vocab_size, embed_dim)
        self.numeric_proj = nn.Linear(num_inputs, embed_dim)

        self.combined_proj = nn.Sequential(
            nn.Linear(embed_dim * 4, 128),   # Combine all input embeddings
            nn.ReLU(),
            nn.Linear(128, 32)               # 🔽 Final projection to 32-d
        )

    def forward(self, state_ids, ndc_ids, drug_ids, numeric_tensor):

        state_vec = self.state_embed(state_ids)
        ndc_vec = self.ndc_embed(ndc_ids)
        drug_vec = self.drug_embed(drug_ids)
        numeric_vec = self.numeric_proj(numeric_tensor)

        # Concatenate all parts
        combined = torch.cat([state_vec, ndc_vec, drug_vec, numeric_vec], dim=-1)

        # Project to final 32-d vector
        output = self.combined_proj(combined)
        return output


In [24]:
# ------------------------------------------
# Generate embeddings
# ------------------------------------------
embed_dim = 32
encoder = MedicaidDrugEncoder(
    len(state_vocab), len(ndc_vocab), len(drug_vocab),
    embed_dim=embed_dim,
    num_inputs=2
)

with torch.no_grad():
    embeddings = encoder(state_ids, ndc_ids, drug_ids, numeric_tensor)

embedding_matrix = embeddings.numpy().astype(np.float32)

print('Embedding completed : Embedding shape: ', embedding_matrix.shape)

Embedding completed : Embedding shape:  (5200, 32)


In [25]:
# ------------------------------------------
# AWS S3 Vectors Infrastructure
# ------------------------------------------
bucket_name = 'medicaid-drug-vectors'
index_name = 'medicaid-2025-index' 
region = 'us-east-1'

client = boto3.client('s3vectors', region_name=region)

In [None]:
# Create index for 32-dimensional vectors

client.create_index(
    vectorBucketName=bucket_name,
    indexName=index_name,
    dataType='float32',
    dimension=32,
    distanceMetric='cosine'
)

print(f'Index {index_name} created in bucket {bucket_name}.')

In [None]:

# Assuming embedding_matrix is shape (batch_size, 32)
vectors_to_upload = []

for i, row in enumerate(embedding_matrix):
    metadata_row = df.iloc[i]
    vectors_to_upload.append({
        "key": f"drug-vector-{i}",
        "data": {
            "float32": row.tolist()  # Convert each vector to a list of floats
        },
        "metadata": {
            "state": metadata_row.get("State", "UNK"),
            "year": int(metadata_row.get("Year", 0)),
            "drugname": metadata_row.get("Product Name", "unknown")
        }
    })

print(f"Prepared {len(vectors_to_upload)} vectors for upload.")

Prepared 5200 vectors for upload.


In [None]:
# Upload in batches

batch_size = 500
for i in range(0, len(vectors_to_upload), batch_size):
    batch = vectors_to_upload[i:i+batch_size]
    response = client.put_vectors(
        vectorBucketName=bucket_name,
        indexName=index_name,
        vectors=batch
    )
    print(f"Uploaded batch {i // batch_size + 1}: {response}")

In [43]:
# -------------------------
# Perform a semantic search
# -------------------------

# ✅ Step 1: Define a text encoder

# Custom filter query
from sentence_transformers import SentenceTransformer

# Load a transformer and reduce to 32-dim
text_model = SentenceTransformer("all-MiniLM-L6-v2") 

# Reduce to 32-dim with a linear projection
reducer = nn.Linear(384, 32)

In [75]:
#✅ Step 2: Encode the user query 

input_text = "diabetes treatment"
embedding_custom = text_model.encode(input_text, convert_to_numpy=True)  # shape: (384,)

# Reduce to 32-dim
embedding_32 = reducer(torch.tensor(embedding_custom)).detach().numpy().astype(np.float32)
query_vector_custom = embedding_32.tolist()

In [76]:
#✅ Step 3: Query the S3 Vector index

response_custom = client.query_vectors(
    vectorBucketName=bucket_name,
    indexName=index_name,
    queryVector={
        "float32": query_vector_custom
    },
    filter={"state": "SC"},
    topK=3,
    returnMetadata=True,
    returnDistance=True
)

In [77]:
# Results for semantic search
for match in response_custom.get("vectors", []):
    print(f"Match key: {match['key']}")
    print(f"Distance: {match['distance']:.4f}")
    print(f"Metadata: {match.get('metadata', {})}")

Match key: drug-vector-4199
Distance: 0.7495
Metadata: {'year': 2022, 'state': 'SC', 'drugname': 'HUMULIN N '}
Match key: drug-vector-4181
Distance: 0.7550
Metadata: {'state': 'SC', 'year': 2022, 'drugname': 'INSULIN LI'}
Match key: drug-vector-4182
Distance: 0.7551
Metadata: {'year': 2022, 'drugname': 'INSULIN LI', 'state': 'SC'}
