In [1]:
import chromadb

In [3]:
import chromadb
from chromadb.utils import embedding_functions
# Build Chroma's default embedding function. It is called below with a list of
# strings and returns one embedding vector per input string.
default_ef = embedding_functions.DefaultEmbeddingFunction()

In [6]:
# Embed a single throwaway string to inspect what the embedding function returns.
val = default_ef(["foo"])
# val holds one embedding per input string; the length of the first (and only)
# entry is the dimensionality of the embedding space.
print(len(val[0]))

384


#### Similarity

- Calculate the cosine similarity between two sentences: a number between -1 and 1, where values closer to 1 mean the sentences are more alike.
- Try out your own sentences and check if the similarity calculations match your intuition.

In [8]:
# Three probe sentences: the first two are related in meaning, the third is not.
# Each call embeds a one-item list, so element 0 of the result is the sentence's
# embedding. Re-wrapping that embedding in a list gives a single-row, 2-D input,
# which is the form passed to cosine_similarity in the next cell.
emb_1 = default_ef(["What is the meaning of life?"])  # 42!
vec_1 = [emb_1[0]]

emb_2 = default_ef(["How does one spend their time well on Earth?"])
vec_2 = [emb_2[0]]

emb_3 = default_ef(["Would you like a salad?"])
vec_3 = [emb_3[0]]

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Print the pairwise similarity matrix (1x1 here) for each pair of sentences,
# in the order (1, 2), (2, 3), (1, 3).
for left, right in ((vec_1, vec_2), (vec_2, vec_3), (vec_1, vec_3)):
    print(cosine_similarity(left, right))

[[0.28405571]]
[[0.03041392]]
[[0.16022199]]


### From word to sentence embeddings
One possible way to calculate sentence embeddings from word embeddings is to take the average of the word embeddings.
This ignores word order and context, so two sentences with different meanings but the same set of words will end up with the same sentence embedding.

In [11]:
# Two sentences that share the words "kids", "play" and "park" but use "play"
# differently (as a verb in the first, as a noun in the second).
in_1 = "The kids play in the park."
in_2 = "The play was for kids in the park."

In [12]:
# Hand-preprocessed versions of in_1 and in_2: lowercase content words only,
# kept in the order they appear in each sentence. Both lists contain the same
# three words, just in a different order.
in_pp_1 = ["kids", "play", "park"]
in_pp_2 = ["play", "kids", "park"]

In [18]:
# One embedding per word of the first sentence, collected into a plain list.
embeddings_1 = list(default_ef(in_pp_1))

In [19]:
import numpy as np
# Stack the per-word embeddings as rows of a single 2-D array and show its shape
# (rows = words, columns = embedding components).
emb_array_1 = np.stack(embeddings_1, axis=0)
print(emb_array_1.shape)

(3, 384)


In [16]:
import numpy as np
# Same steps for the second sentence: embed each word, then stack the per-word
# embeddings as rows of a 2-D array and show its shape.
embeddings_2 = list(default_ef(in_pp_2))
emb_array_2 = np.stack(embeddings_2, axis=0)
print(emb_array_2.shape)

(3, 384)


- Take the average embedding across the 3 word embeddings 
- You'll get a single embedding of length 384.

In [20]:
# Average over the word axis (axis 0) to collapse the per-word rows into one
# sentence-level vector, then confirm its shape.
emb_1_mean = np.mean(emb_array_1, axis=0)
print(emb_1_mean.shape)

(384,)


In [None]:
# Average the second sentence's word embeddings over the word axis (axis 0).
emb_2_mean = np.mean(emb_array_2, axis=0)

In [23]:
# Show the first four components of each averaged embedding, one per line,
# so the two can be compared by eye.
print(emb_1_mean[:4], emb_2_mean[:4], sep="\n")

[ 0.00600172  0.0263909   0.02692608 -0.00738033]
[ 0.00600172  0.0263909   0.02692608 -0.00738033]


#### Get sentence embeddings from the model.
- These sentence embeddings account for word order and context.
- Verify that the sentence embeddings are not the same.

In [24]:
# Echo the two original sentences, one per line, as a reminder of the inputs.
print(in_1, in_2, sep="\n")

The kids play in the park.
The play was for kids in the park.


In [25]:
# Embed the full sentences (not the hand-picked word lists); each call is given
# a one-item list containing the whole sentence.
embedding_1, embedding_2 = default_ef([in_1]), default_ef([in_2])

In [27]:
# Take the single embedding out of each one-item result, then show the first
# four components of each, one vector per line.
vector_1 = embedding_1[0]
vector_2 = embedding_2[0]
print(vector_1[:4], vector_2[:4], sep="\n")

[0.030205348506569862, -0.042503293603658676, 0.04944058880209923, -0.009381156414747238]
[0.0074311248026788235, 0.06007609888911247, 0.019712315872311592, -0.04297603294253349]
