In [60]:
from datasets import load_dataset
import numpy as np
import math
from ddsketch import DDSketch

In [49]:
EMBEDDING_DIM = 768
NUM_TREES = 10
NUM_EMBEDDINGS = 35_167_920

# Byte budget for all the embeddings held by one leaf node.
# Think of it as a 'page size': the client loads a whole leaf in one go.
TARGET_LEAF_SIZE_BYTES = 10 * 1024 ** 2
# Each embedding component is budgeted at 8 bytes.
EMBEDDING_SIZE_BYTES = 8 * EMBEDDING_DIM
TARGET_LEAF_NUM_EMBEDDINGS = TARGET_LEAF_SIZE_BYTES // EMBEDDING_SIZE_BYTES

# Smallest integer d such that NUM_EMBEDDINGS / 2**d <= TARGET_LEAF_NUM_EMBEDDINGS.
_log2_total_embeddings = math.log2(NUM_EMBEDDINGS)
_log2_leaf_embeddings = math.log2(TARGET_LEAF_NUM_EMBEDDINGS)
TREE_DEPTH = math.ceil(_log2_total_embeddings - _log2_leaf_embeddings)

In [50]:
# Show the depth derived from the constants above as a quick sanity check.
TREE_DEPTH

15

In [53]:
# Fixed seed so that Restart & Run All regenerates exactly the same vectors.
RANDOM_SEED = 0
rng = np.random.default_rng(RANDOM_SEED)

# Number of random vectors drawn for each tree: 2 ** (TREE_DEPTH - 1).
vectors_per_tree = math.floor(math.pow(2, TREE_DEPTH - 1))

# Standard-normal vectors, shape (NUM_TREES, vectors_per_tree, EMBEDDING_DIM).
random_vectors = rng.normal(size=(NUM_TREES, vectors_per_tree, EMBEDDING_DIM))

In [54]:
# Check the array dimensions of the generated random vectors.
random_vectors.shape

(10, 16384, 768)

In [59]:
# Total bytes consumed by the elements of random_vectors.
random_vectors.nbytes

1006632960

In [56]:
# Convert one document's "emb" field into a NumPy array to use as a sample embedding.
# NOTE(review): no cell above this one assigns `doc` in the visible notebook; the
# only visible assignment is the `for doc in docs` loop further down, so this cell
# presumably fails with NameError on a fresh top-to-bottom run — confirm and reorder.
sample_emb = np.array(doc["emb"])

In [57]:
# Dot product of the sample embedding with every random vector
# (matrix product over the last axis of random_vectors).
rand_dot_prods = np.matmul(random_vectors, sample_emb)

In [63]:
# Leading two dimensions of random_vectors; a later cell unpacks them as dim_i, dim_j.
random_vectors.shape[:2]

(10, 16384)

In [None]:
# Open the train split in streaming mode (streaming=True) so documents are
# iterated one at a time. The dataset name is a plain string: it has no
# placeholders, so no f-string prefix is needed.
docs = load_dataset("Cohere/wikipedia-22-12-en-embeddings", split="train", streaming=True)

In [74]:
# Grid dimensions: dim_i is the first axis of random_vectors, dim_j the second.
dim_i, dim_j = random_vectors.shape[:2]

# One independent DDSketch per (first-axis, second-axis) position of random_vectors.
sketches = [
    [DDSketch() for _col in range(dim_j)]
    for _row in range(dim_i)
]

In [75]:
# Upper bound on how many streamed documents are folded into the sketches.
MAX_DOCS = 10

# Number of documents actually added to the sketches so far.
num_docs = 0
for doc in docs:
    # Check the limit before doing any work so that num_docs ends up equal to
    # the number of documents processed (it is only bumped after a document
    # has been fully added below).
    if num_docs >= MAX_DOCS:
        break

    doc_emb = np.array(doc["emb"])
    # Dot product of this document's embedding with every random vector;
    # indexed below as a (dim_i, dim_j) grid of scalars.
    doc_dot_prods = random_vectors @ doc_emb
    for i in range(dim_i):
        for j in range(dim_j):
            # Tuple indexing avoids building an intermediate row view per element.
            sketches[i][j].add(doc_dot_prods[i, j])

    num_docs += 1

In [None]:
# Fold the sample embedding's dot products into the matching sketches,
# one value per (first-axis, second-axis) position.
rand_dot_prods = np.matmul(random_vectors, sample_emb)
for row_idx in range(dim_i):
    row_sketches = sketches[row_idx]
    row_prods = rand_dot_prods[row_idx]
    for col_idx in range(dim_j):
        row_sketches[col_idx].add(row_prods[col_idx])