# User Embeddings

- **Minimal**: user vector = weighted average of recent item embeddings
- **Modern**: learned user encoder (two-tower style) over user history


## 0) Helpers

In [1]:
import numpy as np
import math
from numpy.random import default_rng

rng = default_rng(3)

def l2_normalize(x, axis=1, eps=1e-12):
    n = np.linalg.norm(x, axis=axis, keepdims=True)
    return x / np.maximum(n, eps)

def recall_at_k(query_emb, item_emb, positives, k=20):
    q = l2_normalize(np.asarray(query_emb, dtype=np.float32))
    it = l2_normalize(np.asarray(item_emb, dtype=np.float32))
    S = q @ it.T
    topk = np.argpartition(-S, kth=k-1, axis=1)[:, :k]
    hits = 0
    for i, pos in enumerate(positives):
        if int(pos) in topk[i]:
            hits += 1
    return hits / len(positives)


## 1) Synthetic interactions

In [2]:
n_users = 3000
n_items = 1500
d = 48

U_true = rng.normal(size=(n_users, d)).astype(np.float32)
V_true = rng.normal(size=(n_items, d)).astype(np.float32)

pop = rng.power(a=2.0, size=n_items).astype(np.float32)
pop = pop / pop.sum()

def sigmoid(x): return 1.0 / (1.0 + np.exp(-x))

def sample_item(u_vec):
    scores = (u_vec @ V_true.T).astype(np.float32)
    scores = scores - scores.max()
    p_aff = np.exp(scores).astype(np.float64)
    p_aff = p_aff / p_aff.sum()
    p = 0.85 * p_aff + 0.15 * pop
    return int(rng.choice(n_items, p=p))

T = 30
histories = []
for u in range(n_users):
    drift = (0.4 * rng.normal(size=(d,))).astype(np.float32)
    seq = []
    for t in range(T):
        u_vec = U_true[u] + (drift if t > T//2 else 0.0)
        seq.append(sample_item(u_vec))
    histories.append(seq)

histories = np.array(histories, dtype=np.int32)
print("histories:", histories.shape)
print("example history:", histories[0][:10])


histories: (3000, 30)
example history: [1348  355  880  446 1348 1348 1348 1348 1348  862]


## 2) Item embedding space (given)

In [3]:
item_emb = (V_true + 0.15 * rng.normal(size=V_true.shape).astype(np.float32)).astype(np.float32)
item_emb = l2_normalize(item_emb)

targets = histories[:, -1]
prefix = histories[:, :-1]
print("prefix:", prefix.shape, "targets:", targets.shape)


prefix: (3000, 29) targets: (3000,)


## 3) Minimal user embedding: weighted average

In [4]:
def user_embed_avg(prefix_items, item_emb, decay=0.05):
    U, L = prefix_items.shape
    t = np.arange(L, dtype=np.float32)
    w = np.exp(decay * (t - (L - 1))).astype(np.float32)  # newest ~1
    w = w / (w.sum() + 1e-12)
    X = item_emb[prefix_items]  # [U, L, d]
    u = (X * w[None, :, None]).sum(axis=1)
    return l2_normalize(u)

u_avg = user_embed_avg(prefix, item_emb, decay=0.07)

print("Avg user embedding  Recall@20:", round(recall_at_k(u_avg, item_emb, targets, k=20), 4))
print("Avg user embedding  Recall@50:", round(recall_at_k(u_avg, item_emb, targets, k=50), 4))


Avg user embedding  Recall@20: 0.7387
Avg user embedding  Recall@50: 0.7727


## 4) Minimal upgrade: short-term + long-term mix

In [5]:
u_long = user_embed_avg(prefix, item_emb, decay=0.01)
u_short = user_embed_avg(prefix, item_emb, decay=0.15)
u_mix = l2_normalize(0.4 * u_long + 0.6 * u_short)

print("Mixed user embedding Recall@20:", round(recall_at_k(u_mix, item_emb, targets, k=20), 4))
print("Mixed user embedding Recall@50:", round(recall_at_k(u_mix, item_emb, targets, k=50), 4))


Mixed user embedding Recall@20: 0.738
Mixed user embedding Recall@50: 0.7717


## 5) Modern: learned user encoder (two-tower style)

In [6]:
# Train a small user encoder (two-tower style) using in-batch negatives
L = 20
train_users = 2200

X_hist = histories[:train_users, :L]
y_next = histories[:train_users, L]

hist_vecs = item_emb[X_hist]                # [U, L, d]
mean_vec = hist_vecs.mean(axis=1)
last_vec = hist_vecs[:, -1, :]
weights = np.linspace(0.5, 1.0, L, dtype=np.float32)[None, :, None]
short_vec = l2_normalize((hist_vecs * weights).sum(axis=1))

feat = np.concatenate([mean_vec, last_vec, short_vec], axis=1).astype(np.float32)  # [U, 3d]
feat = l2_normalize(feat)

h = 128
W1 = (0.02 * rng.normal(size=(feat.shape[1], h))).astype(np.float32)
b1 = np.zeros((h,), dtype=np.float32)
W2 = (0.02 * rng.normal(size=(h, d))).astype(np.float32)
b2 = np.zeros((d,), dtype=np.float32)

def relu(x): return np.maximum(x, 0.0)

def forward(feat):
    z1 = relu(feat @ W1 + b1)
    u = z1 @ W2 + b2
    return l2_normalize(u)

def train(epochs=6, lr=0.18, batch=256, seed=4):
    global W1, b1, W2, b2
    rng2 = default_rng(seed)
    n = feat.shape[0]
    idx = np.arange(n)
    for ep in range(epochs):
        rng2.shuffle(idx)
        loss = 0.0
        for start in range(0, n, batch):
            bidx = idx[start:start+batch]
            Xb = feat[bidx]
            yb = y_next[bidx]

            Ub = forward(Xb)            # [B, d]
            Vb = item_emb[yb]           # [B, d]

            S = Ub @ Vb.T               # [B, B]
            S = S - S.max(axis=1, keepdims=True)
            P = np.exp(S).astype(np.float32)
            P = P / (P.sum(axis=1, keepdims=True) + 1e-12)

            diag = np.diag(P)
            loss += float((-np.log(diag + 1e-12)).mean())

            G = P
            G[np.arange(len(bidx)), np.arange(len(bidx))] -= 1.0
            G = G / len(bidx)

            dUb = G @ Vb

            z1 = relu(Xb @ W1 + b1)
            mask = (z1 > 0).astype(np.float32)

            dW2 = (z1.T @ dUb).astype(np.float32)
            db2 = dUb.sum(axis=0).astype(np.float32)

            dz1 = (dUb @ W2.T) * mask
            dW1 = (Xb.T @ dz1).astype(np.float32)
            db1 = dz1.sum(axis=0).astype(np.float32)

            W2 -= lr * dW2
            b2 -= lr * db2
            W1 -= lr * dW1
            b1 -= lr * db1

        print(f"epoch {ep+1}/{epochs} loss≈{loss/(n/batch):.4f}")

train()

eval_users = np.arange(train_users, n_users, dtype=np.int32)
Xe = histories[eval_users, :L]
ye = histories[eval_users, L]

hv = item_emb[Xe]
mean_vec = hv.mean(axis=1)
last_vec = hv[:, -1, :]
weights = np.linspace(0.5, 1.0, L, dtype=np.float32)[None, :, None]
short_vec = l2_normalize((hv * weights).sum(axis=1))

feat_e = l2_normalize(np.concatenate([mean_vec, last_vec, short_vec], axis=1).astype(np.float32))
u_learned = forward(feat_e)

print("Learned user encoder Recall@20:", round(recall_at_k(u_learned, item_emb, ye, k=20), 4))
print("Learned user encoder Recall@50:", round(recall_at_k(u_learned, item_emb, ye, k=50), 4))


epoch 1/6 loss≈5.7526
epoch 2/6 loss≈5.7335
epoch 3/6 loss≈5.7151
epoch 4/6 loss≈5.6972
epoch 5/6 loss≈5.6818
epoch 6/6 loss≈5.6640
Learned user encoder Recall@20: 0.0788
Learned user encoder Recall@50: 0.1388


## Production notes
- Weighted averages are strong and cheap, we can ship them first.
- Learned user encoders help when intent shifts fast or personalization matters at retrieval.
- Keep stable normalization, similarity, versioning with the ANN index.
- Cache user vectors and bound tail latency, so we always have cold-start fallbacks.
