In [2]:
%pip3 install lancedb sentence_transformers
import shutil

import lancedb
import numpy as np
import requests
from sentence_transformers import SentenceTransformer

In [3]:
# Scale a vector to unit length
def normalize_vector(v):
    """Return `v` divided by its norm as computed by `np.linalg.norm`.

    A vector whose norm is exactly zero cannot be scaled, so it is handed
    back unchanged (the very same object) instead of dividing by zero.
    """
    magnitude = np.linalg.norm(v)
    return v if magnitude == 0 else v / magnitude


# Function to calculate cosine similarity
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between `v1` and `v2`.

    Both norms are recomputed on every call, which is the extra cost that
    pre-normalizing the vectors and using a plain dot product avoids.

    If either vector has zero norm the similarity is undefined; 0.0 is
    returned in that case instead of dividing by zero (which would yield
    NaN and a RuntimeWarning). This mirrors the zero-norm guard in
    `normalize_vector`.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator


# Seed NumPy's global RNG, then draw the two 1000-element sample vectors
# (vector1 is drawn first, vector2 second)
np.random.seed(0)
vector1, vector2 = (np.random.rand(1000) for _ in range(2))

# Unit-length copies, used by the plain dot-product timing
norm_vector1, norm_vector2 = map(normalize_vector, (vector1, vector2))

In [4]:
# Time one cosine-similarity call on the raw vectors; the function computes
# both vector norms on every call.
%time score = cosine_similarity(vector1, vector2)

CPU times: user 58 µs, sys: 23 µs, total: 81 µs
Wall time: 83.2 µs


In [5]:
# Time a bare dot product on the vectors that were normalized ahead of time,
# so no norm has to be computed inside the timed statement.
%time score = np.dot(norm_vector1, norm_vector2)

CPU times: user 21 µs, sys: 11 µs, total: 32 µs
Wall time: 36 µs


In [6]:
# URLs of two books from Project Gutenberg
# Using "Pride and Prejudice" and "The Inverted Pyramid" as an example
text = ""
for url in [
    "https://www.gutenberg.org/cache/epub/72392/pg72392.txt",
    "http://www.gutenberg.org/files/1342/1342-0.txt",
]:
    # Attempting to download the book
    try:
        # Without a timeout a stalled connection would hang the cell forever
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # will raise an HTTPError if the HTTP request returned an unsuccessful status code
        book_text = response.text
    except requests.exceptions.RequestException as err:
        # Downloads stay best-effort, but a skipped book is reported instead
        # of silently shrinking the corpus
        print(f"Skipping {url}: {err}")
        continue

    text += book_text

# With no text at all, every later cell would embed and index nothing useful
if not text:
    raise RuntimeError("None of the books could be downloaded; nothing to index.")

# Each "sentence" is simply one line of the downloaded text (blank lines included)
sentences = text.split("\n")
# The first 95% of the lines get indexed; the remaining 5% are held out as queries
split_index = int(len(sentences) * 0.95)

# Splitting the list
index_sentences = sentences[:split_index]
search_sentences = sentences[split_index:]

len(index_sentences), len(search_sentences)

(25153, 1324)

In [7]:
# Instantiate the sentence-transformers model that the following cells use to
# embed both the indexed lines and the held-out query lines.
embedding_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

In [8]:
# Options shared by both calls: request NumPy output with a progress bar and
# leave the embeddings un-normalized
raw_encode_kwargs = dict(
    normalize_embeddings=False,
    convert_to_numpy=True,
    show_progress_bar=True,
)

# Embed the lines to be indexed first, then the held-out query lines
index_vecs = embedding_model.encode(sentences=index_sentences, **raw_encode_kwargs)
search_vecs = embedding_model.encode(sentences=search_sentences, **raw_encode_kwargs)

Batches: 100%|██████████| 787/787 [02:02<00:00,  6.43it/s]
Batches: 100%|██████████| 42/42 [00:06<00:00,  6.56it/s]

CPU times: user 3min 36s, sys: 1min 1s, total: 4min 38s
Wall time: 2min 9s





In [9]:
# Norm of the first embedding produced with normalize_embeddings=False;
# the recorded output (about 6.14) shows it is not unit length.
np.linalg.norm(index_vecs[0])

6.1352563

In [10]:
# Same options as the un-normalized run, except that the model is asked to
# normalize each embedding
unit_encode_kwargs = dict(
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True,
)

# Embed the lines to be indexed first, then the held-out query lines
index_vecs_norm = embedding_model.encode(sentences=index_sentences, **unit_encode_kwargs)
search_vecs_norm = embedding_model.encode(sentences=search_sentences, **unit_encode_kwargs)

Batches: 100%|██████████| 787/787 [02:05<00:00,  6.27it/s]
Batches: 100%|██████████| 42/42 [00:06<00:00,  6.48it/s]

CPU times: user 3min 37s, sys: 1min 3s, total: 4min 40s
Wall time: 2min 12s





In [17]:
# Norm of the first embedding produced with normalize_embeddings=True;
# the recorded output is 1 up to float rounding.
np.linalg.norm(index_vecs_norm[0])

0.99999994

In [18]:
# Show the shapes of the raw and normalized embedding matrices side by side;
# both were built from the same index_sentences list.
print(index_vecs.shape, index_vecs_norm.shape)

(25153, 768) (25153, 768)


In [19]:
# Remove any tables left over from a previous run so the create_table calls
# start from an empty directory. Done in-process with shutil rather than via
# the `%rm` shell alias: it also works where `rm` is unavailable and does not
# spawn a subprocess. ignore_errors=True keeps `rm -f` semantics, so a
# missing directory is not an error.
shutil.rmtree("data", ignore_errors=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
# Store the un-normalized embeddings: one row per vector, in a table named
# "table" under the first URI
uri = "data/table"
raw_db = lancedb.connect(uri)
tbl = raw_db.create_table(
    "table",
    data=[{"vector": embedding} for embedding in index_vecs],
)

# Store the normalized embeddings the same way under a separate URI
uri = "data/table_normalized"
norm_db = lancedb.connect(uri)
tbl_norm = norm_db.create_table(
    "table",
    data=[{"vector": embedding} for embedding in index_vecs_norm],
)

In [32]:
# Pick one held-out query at random from the normalized embeddings, then time
# a top-10 search with the "dot" metric against the table of normalized vectors.
# NOTE(review): the query index comes from NumPy's global RNG, so the vector
# (and the timing) can differ between runs.
random_vec = search_vecs_norm[np.random.randint(search_vecs_norm.shape[0])]
%timeit -n 50 tbl_norm.search(random_vec).metric("dot").limit(10).to_df()

20.1 ms ± 607 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)


In [33]:
# Pick one held-out query at random from the un-normalized embeddings, then
# time a top-10 search with the "cosine" metric against the table of raw vectors.
# NOTE(review): this draws a fresh random index, so the query is not
# necessarily the same sentence as in the "dot" timing cell.
random_vec = search_vecs[np.random.randint(search_vecs.shape[0])]
%timeit -n 50 tbl.search(random_vec).metric("cosine").limit(10).to_df()

16.2 ms ± 410 µs per loop (mean ± std. dev. of 7 runs, 50 loops each)
