<a target="_blank" href="https://colab.research.google.com/github/shaankhosla/semanticsearch/blob/main/notebooks/Cosine_vs_Dot.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>


In [1]:
%%capture

%pip install sentence_transformers==2.2.2
import requests
from sentence_transformers import SentenceTransformer
import numpy as np
from math import isclose

In [2]:
def normalize_vector(v):
    """Scale ``v`` so that its Euclidean (L2) length becomes 1.

    A vector whose norm is exactly zero cannot be rescaled, so it is handed
    back unchanged (the very same object) rather than dividing by zero.
    """
    magnitude = np.linalg.norm(v)
    return v if magnitude == 0 else v / magnitude


def cosine_similarity(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    Computed as ``dot(v1, v2) / (||v1|| * ||v2||)``, so both norms are
    recomputed on every call.

    If either vector has zero length the angle is undefined; ``0.0`` is
    returned in that case instead of dividing by zero. This matches what
    ``np.dot`` gives for the outputs of ``normalize_vector``, which leaves a
    zero vector untouched.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        # Avoid the 0/0 division that would yield nan plus a RuntimeWarning
        return 0.0
    return np.dot(v1, v2) / denominator


# Draw two reproducible random 1000-dimensional vectors (fixed seed)
np.random.seed(0)
vector1, vector2 = np.random.rand(1000), np.random.rand(1000)

# Pre-compute unit-length versions of both vectors
norm_vector1, norm_vector2 = map(normalize_vector, (vector1, vector2))

In [3]:
# Euclidean length of the first vector before and after normalization, one per line
print(np.linalg.norm(vector1), np.linalg.norm(norm_vector1), sep="\n")

18.17651933044476
1.0


In [4]:
# Time a single cosine-similarity call; the function recomputes both vector norms on every invocation.
%time score = cosine_similarity(vector1, vector2)

CPU times: user 486 µs, sys: 0 ns, total: 486 µs
Wall time: 482 µs


In [5]:
# Time only the dot product of the already-normalized vectors; the normalization itself happened earlier and is not part of this measurement.
%time score = np.dot(norm_vector1, norm_vector2)

CPU times: user 58 µs, sys: 8 µs, total: 66 µs
Wall time: 51.5 µs


In [6]:
# Score the same pair both ways: full cosine formula vs. dot product of unit vectors
cosine_score = cosine_similarity(vector1, vector2)
dot_score = np.dot(norm_vector1, norm_vector2)
print(cosine_score)
print(dot_score)

# The two values agree up to floating-point rounding
isclose(cosine_score, dot_score)

0.7463645813901374
0.7463645813901376


True

In [7]:
# Build a text corpus from two Project Gutenberg books, in this order:
# "The Inverted Pyramid" (ebook 72392) and "Pride and Prejudice" (ebook 1342).
REQUEST_TIMEOUT_S = 30  # seconds; without a timeout requests can wait on a stalled server indefinitely
INDEX_FRACTION = 0.95  # leading share of lines used as the index; the remainder serve as queries

text = ""
for url in [
    "https://www.gutenberg.org/cache/epub/72392/pg72392.txt",
    "http://www.gutenberg.org/files/1342/1342-0.txt",
]:
    # Attempting to download the book
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
        response.raise_for_status()  # raises an HTTPError on an unsuccessful status code
        book_text = response.text
    except requests.exceptions.RequestException as exc:
        # Best effort: carry on with whichever books did download, but report the failure
        print(f"Skipping {url}: {exc}")
        continue

    text += book_text

if not text:
    # Fail here with a clear message rather than later on an empty index
    raise RuntimeError("No book could be downloaded; cannot build the sentence corpus.")

# Each "sentence" is one line of the raw text
sentences = text.split("\n")
split_index = int(len(sentences) * INDEX_FRACTION)

# Splitting the list: leading part is indexed, trailing part is used for searching
index_sentences = sentences[:split_index]
search_sentences = sentences[split_index:]

len(index_sentences), len(search_sentences)

(25153, 1324)

In [8]:
# Preview the first five lines, each followed by blank lines as a visual separator
for sentence in sentences[:5]:
    print(sentence, "\n\n", sep="\n")

﻿The Project Gutenberg eBook of The inverted pyramid



    



This ebook is for the use of anyone anywhere in the United States and



most other parts of the world at no cost and with almost no restrictions



whatsoever. You may copy it, give it away or re-use it under the terms





In [9]:
# Load the pretrained sentence-embedding model by name; this object produces all embeddings used in the rest of the notebook.
# NOTE(review): the "-dot-" in the model name suggests it was tuned for dot-product scoring — confirm on the model card.
embedding_model = SentenceTransformer("multi-qa-mpnet-base-dot-v1")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.66k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [10]:
# Shared encoder settings for the raw (NOT normalized) embeddings
raw_encode_kwargs = {
    "normalize_embeddings": False,
    "convert_to_numpy": True,
    "show_progress_bar": True,
}

# Embed the index lines first, then the query lines, with identical settings
index_vecs = embedding_model.encode(sentences=index_sentences, **raw_encode_kwargs)
search_vecs = embedding_model.encode(sentences=search_sentences, **raw_encode_kwargs)

Batches:   0%|          | 0/787 [00:00<?, ?it/s]

Batches:   0%|          | 0/42 [00:00<?, ?it/s]

In [15]:
# Peek at the raw (non-normalized) index embeddings; NumPy abbreviates the large array with "..." when printing.
print(index_vecs)

[[ 0.03549176 -0.04622148 -0.3408849  ... -0.40279573  0.01364037
  -0.18062787]
 [-0.5292812  -0.26967177 -0.2199727  ...  0.0802938   0.00908589
  -0.25519413]
 [ 0.15273279 -0.1525886  -0.4436004  ... -0.32564378 -0.2011139
  -0.38183895]
 ...
 [-0.52928144 -0.2696717  -0.2199725  ...  0.0802938   0.00908624
  -0.2551939 ]
 [-0.04850688 -0.11196937 -0.3250154  ...  0.05422214 -0.06920172
  -0.15496542]
 [-0.52928144 -0.26967177 -0.21997264 ...  0.08029383  0.00908582
  -0.2551937 ]]


In [12]:
# Euclidean length of the first raw embedding (encoded with normalize_embeddings=False)
np.linalg.norm(index_vecs[0])  # Norm isn't 1

6.1352563

In [13]:
# Shared encoder settings for the unit-length (normalized) embeddings
unit_encode_kwargs = {
    "normalize_embeddings": True,
    "convert_to_numpy": True,
    "show_progress_bar": True,
}

# Embed the same index and query lines again, this time asking the encoder to normalize
index_vecs_norm = embedding_model.encode(sentences=index_sentences, **unit_encode_kwargs)
search_vecs_norm = embedding_model.encode(sentences=search_sentences, **unit_encode_kwargs)

Batches:   0%|          | 0/787 [00:00<?, ?it/s]

Batches:   0%|          | 0/42 [00:00<?, ?it/s]

In [14]:
# Peek at the normalized index embeddings; NumPy abbreviates the large array with "..." when printing.
print(index_vecs_norm)

[[ 0.00578489 -0.00753375 -0.05556164 ... -0.06565263  0.00222328
  -0.02944097]
 [-0.08130968 -0.04142774 -0.03379283 ...  0.01233496  0.0013958
  -0.03920364]
 [ 0.02736867 -0.02734283 -0.07949015 ... -0.05835313 -0.03603823
  -0.06842292]
 ...
 [-0.08130971 -0.04142773 -0.0337928  ...  0.01233496  0.00139585
  -0.03920361]
 [-0.0080246  -0.01852334 -0.053768   ...  0.00897009 -0.01144819
  -0.02563627]
 [-0.08130971 -0.04142774 -0.03379282 ...  0.01233497  0.00139579
  -0.03920358]]


In [None]:
# Euclidean length of the first embedding produced with normalize_embeddings=True
np.linalg.norm(index_vecs_norm[0])  # Norm is 1

1.0

In [None]:
# Both embedding matrices should have the same shape; only the vector lengths differ.
print(index_vecs.shape, index_vecs_norm.shape)

(25153, 768) (25153, 768)


In [None]:
# Number of query vectors (taken from the front of the search set) used in the timing loops
N = 100

In [None]:
%%time
# Baseline: score the first N raw query vectors against every raw index vector with the
# full cosine formula, which recomputes both norms for each pair. Results are discarded;
# only the elapsed time matters here.
for v1 in search_vecs[:N]:
    for v2 in index_vecs:
        cosine_similarity(v1, v2)

CPU times: user 22.8 s, sys: 177 ms, total: 23 s
Wall time: 23.1 s


In [None]:
%%time
# Comparison: the same N-by-all pairing on the pre-normalized embeddings, using a plain
# dot product per pair. Results are discarded; only the elapsed time matters here.
for v1 in search_vecs_norm[:N]:
    for v2 in index_vecs_norm:
        np.dot(v1, v2)

CPU times: user 4.68 s, sys: 24.1 ms, total: 4.7 s
Wall time: 5 s
