In [2]:
#pandas - fast, powerful, flexible and easy to use open source data analysis and manipulation tool
import pandas as pd

# Sample sentences, kept in a Series (default integer index 0..2).
sentences = pd.Series(
    [
        "The book I read was very interesting.",
        "Cooking can be a relaxing hobby.",
        "The scent of flowers filled the air.",
    ]
)

# DataFrame: a 2-dimensional, table-like structure with rows and columns.
# Here it has a single column, "text", holding the sentences above.
df = pd.DataFrame({"text": sentences})

df

Unnamed: 0,text
0,The book I read was very interesting.
1,Cooking can be a relaxing hobby.
2,The scent of flowers filled the air.


In [5]:
from vertexai.preview.language_models import TextEmbeddingModel

# Loaded models keyed by model name, so repeated calls reuse the same object
# instead of calling TextEmbeddingModel.from_pretrained every time.
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name):
    """Return the embedding model for ``model_name``, loading it on first use only."""
    if model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[model_name] = TextEmbeddingModel.from_pretrained(model_name)
    return _EMBEDDING_MODELS[model_name]


def determine_text_embedding(text, model_name="textembedding-gecko@001"):
    """Compute the embedding of a single piece of text.

    Args:
        text: The text to embed. It is sent to the model as a one-element list.
        model_name: Name of the pretrained embedding model to use. Defaults to
            the model this notebook originally hard-coded.

    Returns:
        A list with one entry per embedding returned by the model; each entry
        is that embedding's ``values``. Callers in this notebook index the
        result with ``[0]`` to get the vector for ``text``.
    """
    # Reuse the already-loaded model: this function is applied once per
    # DataFrame row, so reloading on each call would repeat the same setup.
    model = _get_embedding_model(model_name)
    embeddings = model.get_embeddings([text])
    return [embedding.values for embedding in embeddings]

In [6]:
# Try the helper on one sentence; the cell output is the list it returns.
determine_text_embedding("An example Sentence")

[[-0.03703715652227402,
  0.03955046460032463,
  0.02004142664372921,
  -0.0034781747963279486,
  0.028821486979722977,
  -0.044681716710329056,
  0.03389149159193039,
  0.03335868567228317,
  0.020109418779611588,
  -0.05968587473034859,
  0.0028773746453225613,
  0.02122156135737896,
  0.010590668767690659,
  0.010344757698476315,
  -0.03356008976697922,
  -0.037029631435871124,
  -0.03943483158946037,
  -0.04834739491343498,
  0.03661597892642021,
  0.0025168531574308872,
  -0.11210343986749649,
  -0.03341804817318916,
  0.02813202328979969,
  -0.0034307092428207397,
  -0.011287861503660679,
  -0.06882021576166153,
  -0.0008553309598937631,
  0.04806569591164589,
  -0.017958665266633034,
  -0.03027132712304592,
  0.01652245596051216,
  0.03426377847790718,
  -0.022181067615747452,
  0.02553563378751278,
  -0.017098644748330116,
  0.00042176293209195137,
  -0.007199215702712536,
  -0.013973436318337917,
  0.033049046993255615,
  0.0068560512736439705,
  -0.005104040261358023,
  -0.02

In [7]:
# Add an "embedding" column: the helper is applied to each sentence in "text".
# Each cell holds whatever determine_text_embedding returns for that sentence.
df = df.assign(embedding=df["text"].apply(determine_text_embedding))

df

Unnamed: 0,text,embedding
0,The book I read was very interesting.,"[[-0.00655606621876359, -0.0009876351105049253..."
1,Cooking can be a relaxing hobby.,"[[-0.024364925920963287, -0.00290481629781425,..."
2,The scent of flowers filled the air.,"[[-0.03162015601992607, 0.02951684221625328, -..."


In [8]:
# Query text that will be compared against the sentences stored in df.
prompt = "Tell me about Cooking"
# Embed the prompt with the same helper used for the "embedding" column.
prompt_embedding=determine_text_embedding(prompt)

In [None]:
#numpy - support for large, multi-dimensional arrays and matrices, 
#        along with a large collection of high-level mathematical functions to operate on these arrays. 
import numpy as np

# Similarity score between two embeddings, computed as their dot product.
# A higher dot product is treated as "more similar".
def determine_embedding_similarity(vec1, vec2):
    """Return the dot product of two embedding vectors.

    Both arguments are converted to NumPy arrays and squeezed first, so
    length-1 axes (for example a vector wrapped in an outer list) are removed
    before the product is taken.

    NOTE(review): a raw dot product equals cosine similarity only for
    unit-length vectors -- confirm the embeddings are normalized.
    """
    first = np.array(vec1).squeeze()
    second = np.array(vec2).squeeze()
    return np.dot(first, second)

In [None]:
# Score every stored embedding against the prompt embedding and keep the
# result in a new "similarity" column.
# prompt_embedding[0] takes the first vector of the helper's result
# (presumably the only one, since a single prompt was embedded).
df["similarity"] = df["embedding"].apply(
    lambda row_embedding: determine_embedding_similarity(row_embedding, prompt_embedding[0])
)
df

In [None]:
# Index label of the row whose "similarity" value is the largest.
index_of_highest_similarity = df['similarity'].idxmax()
# Fetch the sentence stored in that row's "text" column.
text_with_highest_similarity = df.loc[index_of_highest_similarity, 'text']
# Bare last expression so the notebook displays the best-matching sentence.
text_with_highest_similarity