**Install Vertex AI SDK for Python**

In [None]:
#Install Libraries
! pip3 install --upgrade --user google-cloud-aiplatform

In [1]:
# Authenticate to Google Cloud. This step only runs when the notebook is executed in Google Colab.
import sys

# Additional authentication is required for Google Colab
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth

    auth.authenticate_user()

In [2]:
# Vertex AI SDK entry point
import vertexai

# Project settings; the trailing @param markers expose them as Colab form fields
PROJECT_ID = "genai-387917"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Point the SDK at the project and region defined above
vertexai.init(location=LOCATION, project=PROJECT_ID)

In [3]:
from vertexai.generative_models import GenerationConfig, GenerativeModel, Image, Part

In [4]:
# Upload the file that contains the product reviews (opens the Colab file picker).
from google.colab import files
uploaded = files.upload()

Saving prod_reviews.csv to prod_reviews.csv


In [None]:
# Will use this later.
#model = GenerativeModel("gemini-1.0-pro")
#responses = model.generate_content("Why is the sky blue?", stream=True)

#for response in responses:
    #print(response.text, end="")

In [5]:
# Initialize the text embedding model. `model` is the global that
# get_embeddings_wrapper calls to compute embeddings.
from vertexai.preview.language_models import TextEmbeddingModel
model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")

In [6]:
# Read all the reviews from the CSV file that was uploaded
import pandas as pd
df = pd.read_csv('prod_reviews.csv')

In [19]:
#Get embeddings
import time
import tqdm  # to show a progress bar

# get embeddings for a list of texts
BATCH_SIZE = 5


def get_embeddings_wrapper(texts):
    """Return one embedding vector per entry of ``texts``.

    The texts are sent to the global embedding ``model`` in slices of
    ``BATCH_SIZE`` items, with a one-second pause before every request.

    Args:
        texts: Sequence of strings to embed; it must support ``len()`` and slicing.

    Returns:
        A list with the ``.values`` of each embedding result, in the same
        order as ``texts``. An empty input yields an empty list.
    """
    embs = []
    for start in tqdm.tqdm(range(0, len(texts), BATCH_SIZE)):
        time.sleep(1)  # to avoid the quota error
        result = model.get_embeddings(texts[start : start + BATCH_SIZE])
        # extend in place: `embs = embs + [...]` would re-copy the whole
        # accumulated list on every batch
        embs.extend(e.values for e in result)
    return embs

In [20]:
# Embed every review text and store the vectors in a new "embedding" column
df = df.assign(
    embedding=get_embeddings_wrapper(list(df["reviews"])),
)
df.head(10)

100%|██████████| 5/5 [00:05<00:00,  1.12s/it]


Unnamed: 0,id,reviews,embedding
0,1,The product is good,"[-0.039588287472724915, 0.0002919341204687953,..."
1,2,The product is bad,"[-0.013406762853264809, 0.01671764813363552, 0..."
2,3,Did not work,"[-0.01432005874812603, 0.03364266827702522, -0..."
3,4,Horrible product,"[0.001533816335722804, 0.03917508199810982, 0...."
4,5,Would never buy,"[-0.009193393401801586, 0.0014845000114291906,..."
5,6,Loved it!,"[0.0064478409476578236, -0.021919049322605133,..."
6,7,Please don't buy!,"[-0.02352028526365757, -0.002306506270542741, ..."
7,8,Loved it!,"[0.0064478409476578236, -0.021919049322605133,..."
8,9,Bad customer service,"[0.010052811354398727, -0.013862664811313152, ..."
9,10,Loved it!,"[0.0064478409476578236, -0.021919049322605133,..."


In [21]:
# Short timestamp (month, day, hour, minute) used as a suffix in the bucket,
# index and deployed-index names so that each run gets its own resources.
from datetime import datetime
UID = datetime.now().strftime("%m%d%H%M")

In [23]:
# Save id, review text and embedding as a JSON Lines file (one record per line).
# This is the file that gets copied to the Cloud Storage bucket afterwards.
jsonl_string = df[["id", "reviews", "embedding"]].to_json(orient="records", lines=True)
with open("questions.json", "w") as f:
    f.write(jsonl_string)

# show the first few lines of the json file
! head -n 13 questions.json

{"id":1,"reviews":"The product is good","embedding":[-0.0395882875,0.0002919341,0.0292619653,0.0336313993,0.0248739924,-0.0636084005,-0.0001883781,0.0139750233,-0.0286372378,-0.0322874188,-0.0195468962,0.0299632363,0.0475748442,0.0300907753,-0.0002630139,0.0292436108,-0.0310235657,-0.0518661775,0.0129030077,-0.0401229821,-0.0770297274,0.0196099132,-0.0148634184,-0.025521392,-0.0467524007,-0.0708683059,0.0142272133,0.0657908842,-0.0292616114,-0.0284229908,0.0375614725,0.0068553509,-0.0180993155,-0.0145092681,0.0245220177,0.008315674,-0.0198020432,-0.0129505601,0.0087253256,0.0214294232,0.0475759804,0.0040128017,0.0121406261,-0.0065930886,-0.0360479616,-0.0255533066,-0.0397421531,0.0488053598,0.045827534,-0.0263289716,0.015853662,-0.0284047499,0.0523712784,0.0168017838,-0.0088839587,0.0089062694,-0.0497039668,-0.0281880312,-0.0383746363,-0.032352075,0.007918979,0.0281815697,-0.018905418,-0.0603494644,-0.0085123703,0.0120513793,0.0114198523,-0.0082937917,-0.0044286679,-0.0205152482,0.0139

In [24]:
BUCKET_URI = f"gs://{PROJECT_ID}-embvs-tutorial-{UID}"
! gsutil mb -l $LOCATION -p {PROJECT_ID} {BUCKET_URI}
! gsutil cp questions.json {BUCKET_URI}

Creating gs://genai-387917-embvs-tutorial-04220040/...
ServiceException: 409 A Cloud Storage bucket named 'genai-387917-embvs-tutorial-04220040' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.
Copying file://questions.json [Content-Type=application/json]...
/ [1 files][252.3 KiB/252.3 KiB]                                                
Operation completed over 1 objects/252.3 KiB.                                    


In [25]:
# Initialize the aiplatform package with the project and region defined earlier
from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [None]:
# Build a Tree-AH index from the contents of the bucket at BUCKET_URI
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=f"embvs-tutorial-index-{UID}",
    contents_delta_uri=BUCKET_URI,
    # NOTE(review): presumably the length of each embedding vector — confirm
    dimensions=768,
    approximate_neighbors_count=10,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
)

In [None]:
# Create the IndexEndpoint (with a public endpoint) that the index is deployed to.
# NOTE(review): this display_name is identical to the one used for the index
# itself ("embvs-tutorial-index-{UID}") — confirm that is intended.
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"embvs-tutorial-index-{UID}", public_endpoint_enabled=True
)

In [None]:
#my_index_id = "5836898213537251328"
#my_index = aiplatform.MatchingEngineIndex(my_index_id)

In [None]:
#my_index_endpoint_id = "4019273551428190208"
#my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint(my_index_endpoint_id)

In [28]:
# ID under which the index is deployed; the same ID is passed to find_neighbors when querying
DEPLOYED_INDEX_ID = f"embvs_tutorial_deployed_{UID}"

In [29]:
# Deploy the index to the endpoint under DEPLOYED_INDEX_ID
my_index_endpoint.deploy_index(index=my_index, deployed_index_id=DEPLOYED_INDEX_ID)

INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/95065625117/locations/us-central1/indexEndpoints/1490924573124722688
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/95065625117/locations/us-central1/indexEndpoints/1490924573124722688/operations/7109115114850091008
INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/95065625117/locations/us-central1/indexEndpoints/1490924573124722688


<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7ce1ce0f3e20> 
resource name: projects/95065625117/locations/us-central1/indexEndpoints/1490924573124722688

In [33]:
# Embed the search text; the result is a one-element list that is passed as `queries` to find_neighbors
test_embeddings = get_embeddings_wrapper(["Great products"])

100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


In [34]:
# Test query: fetch the nearest neighbors of the embedded search text
response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_INDEX_ID,
    queries=test_embeddings,
    num_neighbors=15,
)
# show the result
import numpy as np

# response[0] holds the neighbors of the first (and only) query
for neighbor in response[0]:
    # Named neighbor_id rather than `id` so the builtin id() is not shadowed
    # for the rest of the kernel session.
    neighbor_id = np.int64(neighbor.id)
    similar = df.query("id == @neighbor_id", engine="python")
    if similar.empty:
        # The index returned an id that is not present in df; report it
        # instead of failing with an IndexError on values[0].
        print(f"{neighbor.distance:.4f} <id {neighbor.id} not found in df>")
        continue
    print(f"{neighbor.distance:.4f} {similar.reviews.values[0]}")

0.9129 great product
0.8988 fantastic product
0.8468 The product is good
0.7857 Better products are available
0.7734 Horrible product
0.7294 The product is bad
0.6542 service was super
0.6516 Please don't buy!
0.6516 Please don't buy!
0.6516 Please don't buy!
0.6516 Please don't buy!
0.6516 Please don't buy!
0.6516 Please don't buy!
0.6241 service was superlative
0.6151 Would never buy
