# VectrixDB Direct Embedders

Access the embedding models directly for custom use cases.

## Using the Dense Embedder

Get vector embeddings directly, without storing them in a database.

In [1]:
from vectrixdb.models import DenseEmbedder

# Load the dense embedder, using the bundled English model.
embedder = DenseEmbedder(language="en")

# Pass one plain string to embed().
text = "Machine learning is transforming industries."
embedding = embedder.embed(text)

# Show the input, the shape of the result, and a five-value preview of row 0.
print(
    f"Text: {text}",
    f"Embedding shape: {embedding.shape}",
    f"First 5 values: {embedding[0][:5]}",
    sep="\n",
)

Text: Machine learning is transforming industries.
Embedding shape: (1, 384)
First 5 values: [-0.05103514  0.01069377  0.02810179 -0.01240932  0.02174898]


In [2]:
# Batch input: several sentences to embed in one call.
texts = [
    "Python is great for data science.",
    "JavaScript powers modern web apps.",
    "Rust provides memory safety.",
]

# The same embed() call used for a single string is given a list here.
embeddings = embedder.embed(texts)

# Summarize the result: overall shape, number of inputs, and size of axis 1.
print(
    "\n".join(
        (
            f"Batch embeddings shape: {embeddings.shape}",
            f"Number of texts: {len(texts)}",
            f"Embedding dimension: {embeddings.shape[1]}",
        )
    )
)

Batch embeddings shape: (3, 384)
Number of texts: 3
Embedding dimension: 384


## Computing Similarity

In [3]:
import numpy as np

def cosine_similarity(a, b):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        a: First vector (1-D array-like of numbers).
        b: Second vector, with the same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. The cosine is undefined when either
        vector has zero norm; 0.0 is returned in that case instead of
        dividing by zero.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # Avoid the 0/0 division, which would yield nan plus a RuntimeWarning.
        return 0.0
    return np.dot(a, b) / norm_product

# Three sentences to compare: the first is scored against each of the others.
text1 = "I love machine learning."
text2 = "AI and ML are fascinating."
text3 = "The weather is nice today."

# embed() gives back a 2D (n_texts, dim) array even for one string, so row 0
# of each result is taken to obtain a 1D vector.
emb1, emb2, emb3 = (
    embedder.embed(sentence)[0] for sentence in (text1, text2, text3)
)

# Report both pairwise scores, formatted to four decimal places.
print(
    "Similarity between:",
    f"  '{text1}' and '{text2}': {cosine_similarity(emb1, emb2):.4f}",
    f"  '{text1}' and '{text3}': {cosine_similarity(emb1, emb3):.4f}",
    sep="\n",
)

Similarity between:
  'I love machine learning.' and 'AI and ML are fascinating.': 0.8857
  'I love machine learning.' and 'The weather is nice today.': 0.7552


## Using the Sparse Embedder (BM25)

In [4]:
from vectrixdb.models import SparseEmbedder

# Construct the sparse embedder with no arguments.
sparse = SparseEmbedder()

# Encode one short, keyword-style string.
text = "Python machine learning tutorial"
sparse_vec = sparse.embed(text)

# NOTE(review): the recorded output for this cell shows embed() returning a
# list, so the dict-only check below always falls through to 'N/A'. Confirm
# the actual return structure and derive the non-zero entry count from it.
print(
    f"Text: {text}",
    f"Sparse vector type: {type(sparse_vec)}",
    f"Non-zero entries: {len(sparse_vec) if isinstance(sparse_vec, dict) else 'N/A'}",
    sep="\n",
)

Text: Python machine learning tutorial
Sparse vector type: <class 'list'>
Non-zero entries: N/A


## Using the Reranker

In [5]:
from vectrixdb.models import RerankerEmbedder

# Load the reranker, using the bundled English model.
reranker = RerankerEmbedder(language="en")

# One question and four candidate passages to score against it.
query = "What is deep learning?"
documents = [
    "Deep learning is a subset of machine learning using neural networks.",
    "The weather forecast predicts rain tomorrow.",
    "Neural networks have multiple layers for feature extraction.",
    "Pizza is a popular Italian dish.",
]

# The scores are zipped with `documents` below, so they are assumed to come
# back one per document in input order -- TODO confirm against the API.
scores = reranker.score(query, documents)

print(f"Query: {query}", end="\n\n")
print("Reranking scores:")
# Highest score first. Each line shows at most the first 50 characters of the
# document; the "..." suffix is appended unconditionally, even when the
# document is shorter than 50 characters and nothing was cut off.
for doc, score in sorted(
    zip(documents, scores), key=lambda pair: pair[1], reverse=True
):
    print(f"  {score:.4f}: {doc[:50]}...")

Query: What is deep learning?

Reranking scores:
  1.0000: Deep learning is a subset of machine learning usin...
  0.0000: Pizza is a popular Italian dish....
  0.0000: Neural networks have multiple layers for feature e...
  0.0000: The weather forecast predicts rain tomorrow....


## Custom Model with Direct Embedder

In [6]:
# No custom model is loaded in this cell: it constructs the bundled English
# model again under a separate name. To use a custom ONNX model instead, pass
# the model_dir parameter, for example:
#   custom_embedder = DenseEmbedder(model_dir="/path/to/onnx/model")
# A custom model may produce embeddings with a different dimension.
embedder_en = DenseEmbedder(language="en")

# Embed one string and inspect the resulting shape.
text = "Custom model embedding test."
embedding = embedder_en.embed(text)

print(
    f"Embedding shape: {embedding.shape}",
    f"Dimension: {embedding.shape[1]}",
    sep="\n",
)

Embedding shape: (1, 384)
Dimension: 384


## Use Case: Finding Similar Documents

In [7]:
# Small corpus mixing Python/data topics with JavaScript/web topics.
documents = [
    "Python is excellent for data analysis.",
    "JavaScript enables interactive web pages.",
    "Data science uses Python extensively.",
    "React is a JavaScript framework.",
    "Machine learning with Python is powerful.",
]

# A single embed() call covers the whole list.
doc_embeddings = embedder.embed(documents)

# The search query is embedded separately.
query = "Python for ML"
query_emb = embedder.embed(query)

# Score row 0 of the query embedding against each document embedding in turn.
similarities = [
    cosine_similarity(query_emb[0], doc_vector) for doc_vector in doc_embeddings
]

# Print the documents from most to least similar.
print(f"Query: '{query}'", end="\n\n")
print("Most similar documents:")
for doc, sim in sorted(
    zip(documents, similarities), key=lambda pair: pair[1], reverse=True
):
    print(f"  {sim:.4f}: {doc}")

Query: 'Python for ML'

Most similar documents:
  0.8862: Machine learning with Python is powerful.
  0.8706: Python is excellent for data analysis.
  0.8565: Data science uses Python extensively.
  0.7538: React is a JavaScript framework.
  0.7508: JavaScript enables interactive web pages.
