In [1]:
!pip3 install sentence_transformers
from time import perf_counter
from time import time

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Data preparation: build the list of sentences from NLTK's Brown corpus
import nltk

# The corpus files must be downloaded before the corpus reader can be used
nltk.download("brown")
from nltk.corpus import brown

# Join the items of each entry returned by brown.sents() with single spaces
sentences = list(map(" ".join, brown.sents()))

In [4]:
# Load the "all-mpnet-base-v2" model and embed every sentence in one call
embedding_model = SentenceTransformer("all-mpnet-base-v2")
embeddings = embedding_model.encode(
    sentences=sentences,
    convert_to_numpy=True,
    show_progress_bar=True,
)

# The first embedding acts as the query; all remaining rows form the search index
query_embedding = embeddings[0]
index_embeddings = embeddings[1:]

Downloading .gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 2.54MB/s]
Downloading 1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 1.07MB/s]
Downloading README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 42.6MB/s]
Downloading config.json: 100%|██████████| 571/571 [00:00<00:00, 4.96MB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 983kB/s]
Downloading data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 13.7MB/s]
Downloading pytorch_model.bin: 100%|██████████| 438M/438M [00:08<00:00, 53.5MB/s] 
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 155kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 898kB/s]
Downloading tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 6.25MB/s]
Downloading tokenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 2.18MB/s]
Downloading train_script.py: 100%|██████████| 13.1k/13.1k [00:00<00:00, 14.5MB/s]
Downloading vocab

In [None]:
# Sanity check: show the shape of the query vector and of the index matrix
print(*(array.shape for array in (query_embedding, index_embeddings)))

In [None]:
# Semantic search function
def semantic_search(query_embedding, embeddings):
    """Rank ``embeddings`` by cosine similarity to ``query_embedding``.

    Args:
        query_embedding: A single embedding vector used as the query.
        embeddings: The embeddings to search, one per row.

    Returns:
        An array of row indices into ``embeddings``, ordered from the most
        similar row to the least similar one.
    """
    # Wrap the query in a list so it is passed as a one-row batch; the result
    # then has a single row holding the score against every index embedding.
    scores = cosine_similarity([query_embedding], embeddings)[0]
    # argsort is ascending, so sort the negated scores to get a descending ranking
    ranking = np.argsort(-scores)
    return ranking

In [None]:
# Benchmark semantic_search on truncated embeddings: sizes 25, 50, ... plus the
# full dimensionality, with each timing averaged over n_repeat searches.
full_size = query_embedding.shape[0]
embedding_sizes = list(range(25, full_size, 25))
# range() stops before full_size, so add it explicitly; otherwise the
# untruncated embedding (the baseline for the comparison) is never measured.
embedding_sizes.append(full_size)
times = []
n_repeat = 500

for size in embedding_sizes:
    # Keep only the first `size` components of the query vector
    query_embedding_reduced = query_embedding[:size]

    # Truncate every index embedding to the same number of components
    index_embeddings_reduced = index_embeddings[:, :size]

    # Time n_repeat searches with perf_counter: unlike time(), it is a
    # monotonic, high-resolution clock meant for measuring short durations.
    start_time = perf_counter()
    for _ in range(n_repeat):
        semantic_search(query_embedding_reduced, index_embeddings_reduced)
    end_time = perf_counter()

    # Average duration of a single search at this embedding size
    time_taken = (end_time - start_time) / n_repeat
    print(f"Embedding Size: {size}, Time taken: {time_taken:.4f} seconds\n")
    times.append(time_taken)

In [None]:
import matplotlib.pyplot as plt

# Plot the average search time against the embedding size
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(embedding_sizes, times, marker="o", linestyle="-", color="blue")
ax.set_title("Trade-off Between Embedding Size and Search Speed")
ax.set_xlabel("Embedding Size")
ax.set_ylabel("Time Taken (seconds)")
ax.grid(True)

# Display the graph
plt.show()