# Content Embeddings


## 0) Helpers

In [1]:
import numpy as np
import math

def l2_normalize(x, axis=1, eps=1e-12):
    """Scale `x` so that every slice along `axis` has unit Euclidean length.

    Args:
        x: array of vectors (rows are vectors when ``axis=1``).
        axis: axis along which the L2 norm is taken.
        eps: lower bound applied to the norm before dividing, so all-zero
            vectors come back as zeros instead of NaN/inf.

    Returns:
        An array shaped like `x` holding the normalized vectors.
    """
    norms = np.linalg.norm(x, axis=axis, keepdims=True)
    # keepdims=True keeps the reduced axis, so the division broadcasts per vector.
    safe_norms = np.maximum(norms, eps)
    return np.divide(x, safe_norms)

def recall_at_k(query_emb, item_emb, true_idx, k=1):
    """Fraction of queries whose true item is among the k most similar items.

    Similarity is cosine similarity: both embedding sets are L2-normalized
    and compared with a dot product.

    Args:
        query_emb: array-like of shape (n_queries, dim).
        item_emb: array-like of shape (n_items, dim).
        true_idx: sequence of length n_queries; ``true_idx[i]`` is the row of
            `item_emb` that is the correct match for query ``i``.
        k: number of top-scoring items considered per query. Values larger
            than n_items are clamped to n_items.

    Returns:
        Recall@k as a Python float in [0, 1]. Returns 0.0 when there are no
        queries, no items, or ``k <= 0`` (nothing can be retrieved).

    Raises:
        ValueError: if ``len(true_idx)`` differs from the number of queries.
    """
    q_raw = np.asarray(query_emb, dtype=np.float32)
    it_raw = np.asarray(item_emb, dtype=np.float32)
    # astype truncates toward zero, matching the int() conversion used per label.
    targets = np.asarray(true_idx).astype(np.int64).reshape(-1)

    if targets.shape[0] != q_raw.shape[0]:
        raise ValueError(
            f"true_idx has {targets.shape[0]} entries but there are "
            f"{q_raw.shape[0]} queries"
        )

    k = min(int(k), it_raw.shape[0])
    # Guard the degenerate cases up front: they would otherwise divide by zero
    # or hand argpartition an out-of-range kth.
    if targets.shape[0] == 0 or k <= 0:
        return 0.0

    q = l2_normalize(q_raw)
    it = l2_normalize(it_raw)
    S = q @ it.T

    # argpartition places the k largest scores (smallest of -S) in the first
    # k columns, in arbitrary order; order does not matter for recall.
    topk = np.argpartition(-S, kth=k - 1, axis=1)[:, :k]
    hit_mask = (topk == targets[:, None]).any(axis=1)
    return int(hit_mask.sum()) / targets.shape[0]

# Smoke check: reaching this line means the helper definitions above executed.
print("ok")


ok


## 1) Minimal text embeddings (TF-IDF)

In [2]:
import re
from collections import Counter

# Toy corpus: six short keyword-style product descriptions, one document per string.
# The first two share the term "bluetooth"; every other term occurs in one document only.
docs = [
    "wireless noise cancelling bluetooth headphones",
    "bluetooth earbuds with good mic",
    "running shoes fitness training",
    "espresso coffee beans grinder",
    "mirrorless camera lens tripod",
    "kitchen knife cookware pan recipe",
]

def tokenize(s):
    """Lower-case `s` and return its runs of ASCII letters/digits, in order.

    Everything else (spaces, punctuation, non-ASCII characters) acts as a
    separator and is dropped.
    """
    lowered = s.lower()
    return [match.group(0) for match in re.finditer(r"[a-z0-9]+", lowered)]

# Tokenize every document once; all statistics below reuse these token lists.
tok_docs = list(map(tokenize, docs))
N = len(docs)

# Document frequency: in how many documents each term appears at least once.
df = Counter()
for doc_tokens in tok_docs:
    df.update(set(doc_tokens))

# Column index of each term, in the order terms were first counted.
vocab = {term: col for col, term in enumerate(df)}
V = len(vocab)

# Raw term counts, one row per document and one column per vocabulary term.
X = np.zeros((N, V), dtype=np.float32)
for row, doc_tokens in enumerate(tok_docs):
    for term, count in Counter(doc_tokens).items():
        X[row, vocab[term]] = count

# Smoothed inverse document frequency: log((N + 1) / (df + 1)) + 1.
idf = np.zeros((V,), dtype=np.float32)
for term, col in vocab.items():
    idf[col] = math.log((N + 1) / (df[term] + 1)) + 1.0

# Weight counts by idf (broadcast across rows), then scale each row to unit length.
X_tfidf = l2_normalize(X * idf[None, :])

# Queries and items are the same matrix, so the correct match for row i is row i.
true = np.arange(N, dtype=np.int32)
print("TF-IDF shape:", X_tfidf.shape)
for top_k, label in ((1, "Recall@1:"), (3, "Recall@3:")):
    print(label, recall_at_k(X_tfidf, X_tfidf, true, k=top_k))


TF-IDF shape: (6, 26)
Recall@1: 1.0
Recall@3: 1.0


## 2) Modern text embeddings (Sentence-BERT)
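A pretrained transformer encoder maps each sentence to a fixed-size vector. Queries and items are the same set here, so Recall@k only checks that each sentence retrieves itself; it is a sanity check, not a measure of retrieval quality.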

In [3]:
from sentence_transformers import SentenceTransformer

# Six short product-style phrases to embed.
texts = [
    "wireless noise cancelling headphones",
    "bluetooth earbuds with good mic",
    "running shoes for training",
    "espresso beans for cappuccino",
    "mirrorless camera with zoom lens",
    "kitchen knife set for cooking",
]

model = SentenceTransformer("all-MiniLM-L6-v2")
# normalize_embeddings=True asks the encoder for unit-length vectors; the
# result is converted to a float32 NumPy array for the recall helper.
emb_txt = np.asarray(
    model.encode(texts, normalize_embeddings=True),
    dtype=np.float32,
)

# Queries and items are the same matrix, so the correct match for row i is row i.
true = np.arange(len(texts), dtype=np.int32)
print("Sentence-BERT shape:", emb_txt.shape)
for top_k, label in ((1, "Recall@1:"), (3, "Recall@3:")):
    print(label, recall_at_k(emb_txt, emb_txt, true, k=top_k))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Sentence-BERT shape: (6, 384)
Recall@1: 1.0
Recall@3: 1.0


## 3) Minimal image embeddings (downsample + gradients)

In [4]:
# Height and width of every synthetic image, in pixels.
H, W = 32, 32


def make_img(kind, seed=0):
    """Build one noisy H x W grayscale test pattern with values in [0, 1].

    Args:
        kind: "vertical" (every other column set to 1), "horizontal" (every
            other row set to 1) or "center" (a filled square covering the
            middle half of each axis). Any other value leaves the canvas
            blank, so the result is noise only.
        seed: seed for the Gaussian noise generator; the same seed always
            yields the same image.

    Returns:
        float32 array of shape (H, W), clipped to [0, 1].
    """
    # Region set to 1.0 for each known pattern, as (row slice, column slice).
    bright_regions = {
        "vertical": (slice(None), slice(None, None, 2)),
        "horizontal": (slice(None, None, 2), slice(None)),
        "center": (slice(H // 4, 3 * H // 4), slice(W // 4, 3 * W // 4)),
    }

    canvas = np.zeros((H, W), dtype=np.float32)
    region = bright_regions.get(kind)
    if region is not None:
        canvas[region] = 1.0

    # Additive Gaussian noise with standard deviation 0.05.
    noise = np.random.default_rng(seed).normal(size=(H, W)).astype(np.float32)
    canvas += 0.05 * noise
    return np.clip(canvas, 0.0, 1.0)


# One image per pattern; the list position doubles as the noise seed.
kinds = ["vertical", "horizontal", "center"]
imgs = np.stack(
    [make_img(pattern, seed=position) for position, pattern in enumerate(kinds)],
    axis=0,
)

def downsample16(x):
    """Average-pool an image down to a 16 x 16 grid.

    A 2-D input of shape (h, w) is split into 16 x 16 equal, non-overlapping
    blocks of size (h // 16, w // 16) and each block is replaced by its mean.
    For the 32 x 32 images used in this notebook that is 2 x 2 pooling.

    Args:
        x: 2-D array whose height and width are positive multiples of 16.
            For backward compatibility, a non-2-D array is reshaped with the
            fixed (16, 2, 16, 2) layout, which requires exactly 1024 elements.

    Returns:
        Array of shape (16, 16) holding the block means.

    Raises:
        ValueError: if a 2-D input has a side that is zero or not a multiple
            of 16, or if a non-2-D input cannot be reshaped to (16, 2, 16, 2).
    """
    x = np.asarray(x)
    if x.ndim != 2:
        # Legacy path: same fixed reshape as before for flat/odd-rank input.
        return x.reshape(16, 2, 16, 2).mean(axis=(1, 3))

    h, w = x.shape
    if h == 0 or w == 0 or h % 16 or w % 16:
        raise ValueError(
            f"downsample16 needs sides that are positive multiples of 16, got {x.shape}"
        )
    # Axes 1 and 3 index the pixels inside each block; averaging over them
    # leaves one value per block.
    return x.reshape(16, h // 16, 16, w // 16).mean(axis=(1, 3))

def img_embed_minimal(img):
    """Hand-crafted image embedding: downsampled pixels plus their gradients.

    The image is reduced with `downsample16`, forward differences are taken
    along both axes, and the three maps are flattened, concatenated and
    scaled to (approximately) unit L2 norm.

    Returns:
        1-D float32 feature vector.
    """
    small = downsample16(img)
    # Forward differences; repeating the last column/row as `append` keeps the
    # shape unchanged and makes the final difference along that axis zero.
    dx = np.diff(small, axis=1, append=small[:, -1:])
    dy = np.diff(small, axis=0, append=small[-1:, :])

    pieces = [small.flatten(), dx.flatten(), dy.flatten()]
    vec = np.concatenate(pieces).astype(np.float32)
    # The small constant keeps the division finite for an all-zero vector.
    vec /= (np.linalg.norm(vec) + 1e-12)
    return vec

# Embed every synthetic image and stack the vectors row-wise.
emb_img = np.stack([img_embed_minimal(img) for img in imgs], axis=0)

# Queries and items are the same matrix, so the correct match for row i is row i.
true = np.arange(len(imgs), dtype=np.int32)
print("Minimal image emb shape:", emb_img.shape)
for top_k, label in ((1, "Recall@1:"), (2, "Recall@2:")):
    print(label, recall_at_k(emb_img, emb_img, true, k=top_k))


Minimal image emb shape: (3, 768)
Recall@1: 1.0
Recall@2: 1.0


## 4) Modern vision embeddings (ViT encoder)
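A pretrained ViT-B/16 with its classification head removed, used as a frozen feature extractor (evaluation mode, no gradients). The embeddings are L2-normalized so that dot products equal cosine similarity.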

In [5]:
import torch
from torchvision import models

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): this fixture re-creates the synthetic images already built in
# section 3 (same generator, same seeds). Keep the copies in sync, or reuse the
# section 3 definitions instead of redefining them here.
# Height and width of every synthetic image, in pixels.
H, W = 32, 32


def make_img(kind, seed=0):
    """Build one noisy H x W grayscale test pattern with values in [0, 1].

    Args:
        kind: "vertical" (every other column set to 1), "horizontal" (every
            other row set to 1) or "center" (a filled square covering the
            middle half of each axis). Any other value leaves the canvas
            blank, so the result is noise only.
        seed: seed for the Gaussian noise generator; the same seed always
            yields the same image.

    Returns:
        float32 array of shape (H, W), clipped to [0, 1].
    """
    # Region set to 1.0 for each known pattern, as (row slice, column slice).
    bright_regions = {
        "vertical": (slice(None), slice(None, None, 2)),
        "horizontal": (slice(None, None, 2), slice(None)),
        "center": (slice(H // 4, 3 * H // 4), slice(W // 4, 3 * W // 4)),
    }

    canvas = np.zeros((H, W), dtype=np.float32)
    region = bright_regions.get(kind)
    if region is not None:
        canvas[region] = 1.0

    # Additive Gaussian noise with standard deviation 0.05.
    noise = np.random.default_rng(seed).normal(size=(H, W)).astype(np.float32)
    canvas += 0.05 * noise
    return np.clip(canvas, 0.0, 1.0)


# One image per pattern; the list position doubles as the noise seed.
kinds = ["vertical", "horizontal", "center"]
imgs = np.stack(
    [make_img(pattern, seed=position) for position, pattern in enumerate(kinds)],
    axis=0,
)

# Pretrained ViT-B/16 used as a frozen feature extractor: replacing the
# classification head with Identity makes the forward pass return the encoder
# features instead of class logits.
vit = models.vit_b_16(weights=models.ViT_B_16_Weights.DEFAULT)
vit.heads = torch.nn.Identity()
vit.eval().to(device)

# Per-channel normalization constants (the standard ImageNet mean/std),
# shaped (1, 3, 1, 1) so they broadcast over a whole batch. Built once
# rather than once per image.
mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 3, 1, 1)
std = torch.tensor([0.229, 0.224, 0.225]).view(1, 3, 1, 1)

# Preprocess the whole batch at once:
#   (N, H, W) grayscale -> (N, 3, H, W) by repeating the single channel,
#   then bilinear upsampling to the 224 x 224 input size, then normalization.
X = torch.tensor(imgs, dtype=torch.float32).unsqueeze(1).repeat(1, 3, 1, 1)
X = torch.nn.functional.interpolate(X, size=(224, 224), mode="bilinear", align_corners=False)
X = ((X - mean) / std).to(device)

# no_grad already keeps autograd out of the forward pass, so the output can be
# moved to NumPy directly.
with torch.no_grad():
    emb_vis = vit(X).cpu().numpy().astype(np.float32)

emb_vis = l2_normalize(emb_vis)

# Queries and items are the same matrix, so the correct match for row i is row i.
true = np.arange(len(imgs), dtype=np.int32)
print("ViT emb shape:", emb_vis.shape)
print("Recall@1:", recall_at_k(emb_vis, emb_vis, true, k=1))
print("Recall@2:", recall_at_k(emb_vis, emb_vis, true, k=2))


Downloading: "https://download.pytorch.org/models/vit_b_16-c867db91.pth" to /root/.cache/torch/hub/checkpoints/vit_b_16-c867db91.pth


100%|██████████| 330M/330M [00:02<00:00, 148MB/s]


ViT emb shape: (3, 768)
Recall@1: 1.0
Recall@2: 1.0


## 5) CLIP (joint text–image embeddings)
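Text and images live in the same vector space, so a caption can be scored directly against images. Evaluate text→image retrieval: caption *i* should retrieve image *i*. Note that with only three candidates, the Recall@1 of 1/3 recorded below is no better than chance, so treat this cell as an API demonstration rather than evidence of retrieval quality on these tiny synthetic images.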

In [6]:
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# One caption per synthetic pattern, listed in the same order as `kinds` below,
# so caption i is the intended match for image i.
texts = ["vertical stripes", "horizontal stripes", "center square"]

# NOTE(review): this fixture re-creates the synthetic images already built in
# section 3 (same generator, same seeds). Keep the copies in sync, or reuse the
# section 3 definitions instead of redefining them here.
# Height and width of every synthetic image, in pixels.
H, W = 32, 32


def make_img(kind, seed=0):
    """Build one noisy H x W grayscale test pattern with values in [0, 1].

    Args:
        kind: "vertical" (every other column set to 1), "horizontal" (every
            other row set to 1) or "center" (a filled square covering the
            middle half of each axis). Any other value leaves the canvas
            blank, so the result is noise only.
        seed: seed for the Gaussian noise generator; the same seed always
            yields the same image.

    Returns:
        float32 array of shape (H, W), clipped to [0, 1].
    """
    # Region set to 1.0 for each known pattern, as (row slice, column slice).
    bright_regions = {
        "vertical": (slice(None), slice(None, None, 2)),
        "horizontal": (slice(None, None, 2), slice(None)),
        "center": (slice(H // 4, 3 * H // 4), slice(W // 4, 3 * W // 4)),
    }

    canvas = np.zeros((H, W), dtype=np.float32)
    region = bright_regions.get(kind)
    if region is not None:
        canvas[region] = 1.0

    # Additive Gaussian noise with standard deviation 0.05.
    noise = np.random.default_rng(seed).normal(size=(H, W)).astype(np.float32)
    canvas += 0.05 * noise
    return np.clip(canvas, 0.0, 1.0)


# One image per pattern; the list position doubles as the noise seed.
kinds = ["vertical", "horizontal", "center"]
imgs = np.stack(
    [make_img(pattern, seed=position) for position, pattern in enumerate(kinds)],
    axis=0,
)

# Convert each float image in [0, 1] to an 8-bit grayscale PIL image, then
# replicate it to RGB for the CLIP processor. Pillow infers mode "L" from a
# 2-D uint8 array, so the deprecated `mode=` argument is not passed.
pil_imgs = [
    Image.fromarray((img * 255).astype(np.uint8)).convert("RGB")
    for img in imgs
]

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
proc = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# One joint forward pass: the processor tokenizes/pads the captions and
# preprocesses the images; the model returns both projected embedding sets.
inputs = proc(text=texts, images=pil_imgs, return_tensors="pt", padding=True).to(device)
with torch.no_grad():
    out = model(**inputs)
    # no_grad already keeps autograd out of the outputs, so they convert
    # to NumPy directly.
    t_emb = out.text_embeds.cpu().numpy().astype(np.float32)
    i_emb = out.image_embeds.cpu().numpy().astype(np.float32)

t_emb = l2_normalize(t_emb)
i_emb = l2_normalize(i_emb)

# Caption i is the intended match for image i.
true = np.arange(len(texts), dtype=np.int32)
print("CLIP text emb:", t_emb.shape, "CLIP image emb:", i_emb.shape)
print("Text→Image Recall@1:", recall_at_k(t_emb, i_emb, true, k=1))
print("Text→Image Recall@2:", recall_at_k(t_emb, i_emb, true, k=2))


  pil_imgs.append(Image.fromarray(arr, mode="L").convert("RGB"))


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

CLIP text emb: (3, 512) CLIP image emb: (3, 512)
Text→Image Recall@1: 0.3333333333333333
Text→Image Recall@2: 0.6666666666666666


## Production Notes
- Item embeddings are computed offline (batch) and refreshed on a fixed cadence.
- Query embeddings are computed online or cached for short windows.
- Text uses transformer encoders, images typically use ViT-style encoders.
- Embeddings are L2-normalized and compared with cosine similarity.
- Vectors, ANN index, and model version are deployed together.
- New versions are validated with shadow traffic before rollout.
- Monitor retrieval coverage, embedding drift, and tail latency.