## Setup and Import Libraries

In [17]:
import numpy as np
import vertexai
from utils import authenticate
from vertexai.language_models import TextEmbeddingModel
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# `authenticate` is imported from the local `utils` module. Its return value is
# unpacked into a credentials object and a project ID; both are passed to
# vertexai.init() further down.
credentials, PROJECT_ID = authenticate()

In [3]:
# Show the project ID returned by authenticate().
# NOTE(review): the printed ID is stored in the saved notebook output; clear the
# output before sharing if the project ID should not be published.
print(PROJECT_ID)

spherical-jetty-465410-u7


In [4]:
# Location handed to vertexai.init() as its `location` argument.
REGION = "us-central1"

In [5]:
# Initialise the Vertex AI SDK once with the project, location and credentials
# obtained in the cells above.
vertexai.init(project=PROJECT_ID, location=REGION, credentials=credentials)

## Use Embedding Model

In [7]:
# Load the pretrained text-embedding model by name. Every get_embeddings() call
# in the rest of the notebook goes through this object.
embedding_model = TextEmbeddingModel.from_pretrained("text-embedding-005")

In [8]:
# get_embeddings() is given a list of texts (here a single word); the result is
# indexed per input text, so the embedding for "life" is at position 0.
embedding = embedding_model.get_embeddings(["life"])

In [9]:
# Pull the numeric values out of the first (and only) result, then report the
# vector's length followed by its first ten components.
vector = embedding[0].values
print(f"Length = {len(vector)}", vector[:10], sep="\n")

Length = 768
[-0.03913220763206482, 0.011950427666306496, -0.033064886927604675, -0.01965772733092308, 0.005785454995930195, 0.0117251668125391, 0.03553176298737526, -0.03931012004613876, -0.021659476682543755, -0.006962527055293322]


In [10]:
# Same steps as for the single word, but with a whole sentence as input.
# `embedding` and `vector` are overwritten with the sentence's results.
embedding = embedding_model.get_embeddings(
    ["What is the meaning of life?"]
)
vector = embedding[0].values
print(f"Length = {len(vector)}", vector[:10], sep="\n")

Length = 768
[-0.07483217120170593, 0.022149646654725075, 0.05012010037899017, -0.024886615574359894, 0.008297066204249859, 0.004097355995327234, -0.0024798335507512093, -0.008898804895579815, 0.012297150678932667, 0.021235981956124306]


## Similarity

In [11]:
# Embed three sentences, each with its own get_embeddings() call, in the order
# emb_1, emb_2, emb_3.
emb_1, emb_2, emb_3 = (
    embedding_model.get_embeddings([sentence])
    for sentence in (
        "What is the meaning of life?",  # 42!
        "How does one spend their time well on Earth?",
        "Would you like a salad?",
    )
)

# Wrap each vector in a one-element list so it is a single-row 2-D input,
# which is the form passed to cosine_similarity() in the next cell.
vec_1, vec_2, vec_3 = ([result[0].values] for result in (emb_1, emb_2, emb_3))

In [12]:
# Pairwise similarities, one per output line: (1, 2), then (2, 3), then (1, 3).
print(
    cosine_similarity(vec_1, vec_2),
    cosine_similarity(vec_2, vec_3),
    cosine_similarity(vec_1, vec_3),
    sep="\n",
)

[[0.60462254]]
[[0.40394714]]
[[0.46909041]]


## From word to sentence embeddings

- One possible way to calculate sentence embeddings from word embeddings is to take the average of the word embeddings.
- This ignores word order and context, so two sentences with different meanings but the same set of words will end up with the same sentence embedding.

In [13]:
# Two example sentences that share the words "kids", "play" and "park" but
# arrange them differently.
in_1, in_2 = (
    "The kids play in the park.",
    "The play was for kids in the park.",
)

- Remove stop words (e.g. "the", "in", "for", "was", "an", "is") and punctuation, keeping only the content words.

In [14]:
# Hand-written, pre-processed word lists for in_1 and in_2.
# Both contain the same three words; only the order differs.
in_pp_1 = [
    "kids",
    "play",
    "park",
]
in_pp_2 = [
    "play",
    "kids",
    "park",
]

- Generate one embedding for each word. The result is a list of three lists, one embedding vector per word.

In [15]:
# One get_embeddings() call for the whole word list; keep only the `.values`
# of each returned item, in the same order as the words.
embeddings_1 = [
    word_embedding.values
    for word_embedding in embedding_model.get_embeddings(in_pp_1)
]

- Use numpy to convert this list of lists into a 2D array of 3 rows and 768 columns.

In [18]:
# Stack the per-word vectors as rows of a single 2-D array (axis 0 is the
# default stacking axis, written out here for clarity), then show its shape.
emb_array_1 = np.stack(embeddings_1, axis=0)
print(emb_array_1.shape)

(3, 768)


In [19]:
# Repeat the word-embedding and stacking steps for the second word list.
embeddings_2 = [
    word_embedding.values
    for word_embedding in embedding_model.get_embeddings(in_pp_2)
]
emb_array_2 = np.stack(embeddings_2, axis=0)
print(emb_array_2.shape)

(3, 768)


- Take the average across the 3 word embeddings (i.e., along the row axis, `axis=0`).
- You'll get a single embedding of length 768.

In [20]:
# Average over axis 0 (the rows, i.e. the words) to collapse the word
# embeddings into one vector, then show its shape.
emb_1_mean = np.mean(emb_array_1, axis=0)
print(emb_1_mean.shape)

(768,)


In [21]:
# Same row-wise average for the second sentence's word embeddings.
emb_2_mean = np.mean(emb_array_2, axis=0)
print(emb_2_mean.shape)

(768,)


In [22]:
# Show the first four components of each averaged vector, one vector per line,
# so the two can be compared by eye.
print(emb_1_mean[:4], emb_2_mean[:4], sep="\n")

[-0.04184873 -0.00628616 -0.00435847 -0.06600513]
[-0.04184873 -0.00628616 -0.00435847 -0.06600513]


**Get sentence embeddings from the model.**

- These sentence embeddings account for word order and context.
- Verify that the sentence embeddings are not the same.

In [23]:
# Echo the two original (un-preprocessed) sentences, one per line.
print(in_1, in_2, sep="\n")

The kids play in the park.
The play was for kids in the park.


In [24]:
# Embed each full sentence with its own get_embeddings() call (in_1 first,
# then in_2), passing the sentence as a one-element list.
embedding_1, embedding_2 = (
    embedding_model.get_embeddings([sentence]) for sentence in (in_1, in_2)
)

In [25]:
# Values of the sentence embedding for in_1. Only the first four components are
# printed, the same slice used for the averaged word embeddings above.
vector_1 = embedding_1[0].values
print(vector_1[:4])

[-0.07484156638383865, -0.04629531502723694, 0.002040080027654767, -0.06604499369859695]


In [26]:
# Values of the sentence embedding for in_2. Only the first four components are
# printed, for comparison with the vector_1 output.
vector_2 = embedding_2[0].values
print(vector_2[:4])

[-0.06621360033750534, -0.00042614544508978724, 0.04016496241092682, -0.08443807810544968]
