<a target="_blank" href="https://colab.research.google.com/github/shaankhosla/semanticsearch/blob/main/notebooks/Cosine_vs_Dot.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>


In [None]:
%%capture

%pip install sentence_transformers
import requests
from sentence_transformers import SentenceTransformer
import numpy as np
from math import isclose

In [2]:
# Scale a vector to unit length
def normalize_vector(v):
    """Return ``v`` divided by its Euclidean (L2) norm.

    A vector whose norm is exactly zero cannot be scaled, so it is handed
    back unchanged (the very same object, not a copy).
    """
    length = np.linalg.norm(v)
    # Guard clause folded into one expression: skip the division for a zero norm.
    return v if length == 0 else v / length


# Function to calculate cosine similarity
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two 1-D vectors.

    Computed as ``dot(v1, v2) / (||v1|| * ||v2||)`` using Euclidean norms.

    Parameters
    ----------
    v1, v2 : array-like
        Vectors of equal length.

    Returns
    -------
    float
        The cosine similarity. If either vector has zero norm the
        similarity is undefined; ``0.0`` is returned in that case, which
        matches what ``np.dot`` gives on the outputs of ``normalize_vector``
        (a zero vector is left as-is there, so the dot product is 0).
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    # Avoid a 0/0 division (nan plus a RuntimeWarning) for zero-length vectors.
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator


# Build two reproducible random vectors: fixed seed, 1000 uniform samples each.
# The generator is consumed left to right, so vector1 is the first draw.
np.random.seed(0)
vector1, vector2 = (np.random.rand(1000) for _ in range(2))

# Unit-length copies of both vectors, produced by the same helper
norm_vector1, norm_vector2 = map(normalize_vector, (vector1, vector2))

In [3]:
# Time one cosine-similarity call on the raw (un-normalized) vectors;
# cosine_similarity computes both vector norms inside the call.
%time score = cosine_similarity(vector1, vector2)

CPU times: user 63 µs, sys: 0 ns, total: 63 µs
Wall time: 66.5 µs


In [4]:
# Time one plain dot product on the pre-normalized vectors for comparison.
%time score = np.dot(norm_vector1, norm_vector2)

CPU times: user 21 µs, sys: 3 µs, total: 24 µs
Wall time: 29.3 µs


In [31]:
# Sanity check: cosine similarity of the raw vectors should match the
# dot product of their unit-length versions (up to floating-point error).
cosine_score = cosine_similarity(vector1, vector2)
unit_dot_score = np.dot(norm_vector1, norm_vector2)
isclose(cosine_score, unit_dot_score)

True

In [5]:
# Build the text corpus from two Project Gutenberg books
# ("Pride and Prejudice" and "The Inverted Pyramid" are used as examples).
book_urls = [
    "https://www.gutenberg.org/cache/epub/72392/pg72392.txt",
    "http://www.gutenberg.org/files/1342/1342-0.txt",
]

text = ""
for url in book_urls:
    try:
        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # raises HTTPError on an unsuccessful status code
    except requests.exceptions.RequestException as exc:
        # Best-effort download: skip this book, but report it instead of
        # failing silently so a shrunken corpus does not go unnoticed.
        print(f"Skipping {url}: {exc}")
        continue

    text += response.text

# Without any text every later cell would run on an empty corpus and fail
# in a confusing place, so stop here with a clear message.
if not text:
    raise RuntimeError("No book could be downloaded; cannot build the corpus.")

# One entry per line of raw text (blank lines are kept as entries).
sentences = text.split("\n")

# The first 95% of the lines become the index; the final 5% are held out
# as the search set.
split_index = int(len(sentences) * 0.95)
index_sentences = sentences[:split_index]
search_sentences = sentences[split_index:]

len(index_sentences), len(search_sentences)

(25153, 1324)

In [6]:
# Load the sentence-embedding model that produces every embedding in this notebook
model_name = "multi-qa-mpnet-base-dot-v1"
embedding_model = SentenceTransformer(model_name)

.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.66k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [7]:
# Embed both splits WITHOUT normalization; the two calls share their options.
raw_encode_options = dict(
    normalize_embeddings=False,  # Not normalized
    convert_to_numpy=True,
    show_progress_bar=True,
)
index_vecs = embedding_model.encode(sentences=index_sentences, **raw_encode_options)
search_vecs = embedding_model.encode(sentences=search_sentences, **raw_encode_options)

Batches:   0%|          | 0/787 [00:00<?, ?it/s]

Batches:   0%|          | 0/42 [00:00<?, ?it/s]

In [8]:
# Length of the first raw embedding: not 1, because normalization was turned off
first_raw_embedding = index_vecs[0]
np.linalg.norm(first_raw_embedding)

6.1352563

In [9]:
# Embed both splits again, this time asking the model for unit-length outputs;
# the two calls share their options.
unit_encode_options = dict(
    normalize_embeddings=True,  # Normalized
    convert_to_numpy=True,
    show_progress_bar=True,
)
index_vecs_norm = embedding_model.encode(sentences=index_sentences, **unit_encode_options)
search_vecs_norm = embedding_model.encode(sentences=search_sentences, **unit_encode_options)

Batches:   0%|          | 0/787 [00:00<?, ?it/s]

Batches:   0%|          | 0/42 [00:00<?, ?it/s]

In [10]:
# Length of the first normalized embedding: should come out as 1
first_unit_embedding = index_vecs_norm[0]
np.linalg.norm(first_unit_embedding)

1.0

In [11]:
# Both embedding matrices should have identical shapes (rows x embedding size)
raw_shape, unit_shape = index_vecs.shape, index_vecs_norm.shape
print(raw_shape, unit_shape)

(25153, 768) (25153, 768)


In [18]:
# Number of search vectors (taken from the start of the search set) that the
# timing loops compare against the full index.
N = 100

In [19]:
%%time
# Baseline: score the first N search vectors against every index vector using
# cosine_similarity on the un-normalized embeddings, one pair at a time.
# The scores are discarded; only the elapsed time is of interest.
for v1 in search_vecs[:N]:
    for v2 in index_vecs:
        cosine_similarity(v1, v2)

CPU times: user 22.8 s, sys: 177 ms, total: 23 s
Wall time: 23.1 s


In [20]:
%%time
# Comparison: the same pairwise sweep, but with a plain dot product on the
# pre-normalized embeddings. The scores are discarded; only the elapsed
# time is of interest.
for v1 in search_vecs_norm[:N]:
    for v2 in index_vecs_norm:
        np.dot(v1, v2)

CPU times: user 4.68 s, sys: 24.1 ms, total: 4.7 s
Wall time: 5 s
