In [1]:
! pip install --upgrade --quiet --user google-cloud-aiplatform google-cloud-storage

In [1]:
# Google Cloud project and region used by the gcloud commands and the
# Vertex AI SDK initialisation further down.
PROJECT_ID = "qwiklabs-gcp-00-e6f60a4bc84c"  # NOTE(review): looks like a temporary lab project — replace with your own project ID
LOCATION = "us-central1"

In [2]:
from datetime import datetime
# Month-day-hour-minute stamp (8 digits) that is appended to the names of
# the resources created below, so names differ between runs.
UID = f"{datetime.now():%m%d%H%M}"

In [3]:
import pandas as pd

# Location of the product catalogue CSV.
# NOTE(review): the bucket name in this URL repeats the literal value of
# PROJECT_ID; if PROJECT_ID is changed, check whether this URL must change too.
CSV_URL = "https://storage.googleapis.com/qwiklabs-gcp-00-e6f60a4bc84c/google_merch_shop_items.csv"

# Load the catalogue, then show the product titles as the cell output.
df = pd.read_csv(CSV_URL)
df.title

0                          Google Sticker
1                    Google Cloud Sticker
2                       Android Black Pen
3                   Google Ombre Lime Pen
4                    For Everyone Eco Pen
                      ...                
197        Google Recycled Black Backpack
198    Google Cascades Unisex Zip Sweater
199    Google Cascades Womens Zip Sweater
200         Google Cloud Skyline Backpack
201       Google City Black Tote Backpack
Name: title, Length: 202, dtype: object

In [4]:
# Preview the first five rows to see which columns the catalogue provides.
df.head(n=5)

Unnamed: 0,id,link,title,description,price,currency,availability,condition,image_link,mpn
0,Google_Sticker,https://shop.merch.google/product/google-stick...,Google Sticker,Nothing beats a classic! You can't go wrong w...,1.50 USD,USD,IN_STOCK,NEW,https://ik.imagekit.io/RM/store/20160512512/as...,GGOEGCKA166399
1,Google_Cloud_Sticker,https://shop.merch.google/product/google-cloud...,Google Cloud Sticker,"Brighten up your tumbler, notebook, laptop, an...",1.50 USD,USD,IN_STOCK,NEW,https://ik.imagekit.io/RM/store/20160512512/as...,GGOECCKQ173599
2,Android_Black_Pen,https://shop.merch.google/product/android-blac...,Android Black Pen,Add a delightful addition to your writing stat...,2.00 USD,USD,IN_STOCK,NEW,https://ik.imagekit.io/RM/store/20160512512/as...,GGOEAOAB217899
3,Google_Ombre_Lime_Pen,https://shop.merch.google/product/google-ombre...,Google Ombre Lime Pen,You just can't help but feel a little cheery w...,2.00 USD,USD,IN_STOCK,NEW,https://ik.imagekit.io/RM/store/20160512512/as...,GGOEGOAA172299
4,For_Everyone_Eco_Pen,https://shop.merch.google/product/for-everyone...,For Everyone Eco Pen,Smooth writing and eco-friendly&you've got a w...,2.00 USD,USD,IN_STOCK,NEW,https://ik.imagekit.io/RM/store/20160512512/as...,GGOEGOAQ196699


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# The product titles form the corpus the TF-IDF vocabulary is learned from.
corpus = df["title"].tolist()

# `vectorizer` is kept at module level because later cells reuse it to
# transform new text.
vectorizer = TfidfVectorizer()

# Fit on the corpus; the resulting sparse document-term matrix is only
# displayed as the cell output, it is not stored.
vectorizer.fit_transform(corpus)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 839 stored elements and shape (202, 243)>

In [6]:
def get_sparse_embedding(text):
    """Return the TF-IDF sparse embedding of ``text``.

    The text is transformed with the module-level ``vectorizer``, which must
    already be fitted.

    Args:
        text: The string to embed.

    Returns:
        A dict with two parallel lists: ``"values"`` holds the stored TF-IDF
        weights as floats and ``"dimensions"`` holds the matching column
        indices as ints.
    """
    row = vectorizer.transform([text])

    # `data` and `indices` of the sparse result line up one-to-one, so each
    # can be converted to plain Python numbers independently.
    weights = [float(weight) for weight in row.data]
    positions = [int(position) for position in row.indices]
    return {"values": weights, "dimensions": positions}

In [7]:
# Try the sparse embedding wrapper on a sample product name; the returned
# dict is shown as the cell output.
text_text = "Chrome Dino Pin"
get_sparse_embedding(text_text)

{'values': [0.5212913389979028, 0.5212913389979028, 0.6756557405747007],
 'dimensions': [33, 48, 157]}

In [8]:
from vertexai.preview.language_models import TextEmbeddingModel

# Text embedding model that produces the dense vectors.
model = TextEmbeddingModel.from_pretrained("text-embedding-005")


def get_dense_embedding(text):
    """Return the dense embedding of ``text`` produced by ``model``.

    Args:
        text: The string to embed.

    Returns:
        The ``values`` attribute of the first embedding returned for the
        single-element request.
    """
    embeddings = model.get_embeddings([text])
    return embeddings[0].values



In [9]:
# Try the dense embedding wrapper on the same sample product name; the
# returned list of floats is shown as the cell output.
text_text = "Chrome Dino Pin"
get_dense_embedding(text_text)

[-0.06114290654659271,
 0.017346370965242386,
 -0.004251249600201845,
 -0.02798495627939701,
 -0.0011111712083220482,
 0.012573054060339928,
 -0.05285409837961197,
 -0.030847828835248947,
 -0.003995438572019339,
 -0.05352348834276199,
 -0.08685804903507233,
 0.034621480852365494,
 -0.01601252891123295,
 -0.012502963654696941,
 -0.024926766753196716,
 0.03435458615422249,
 0.0061781443655490875,
 -0.07511552423238754,
 -0.024149153381586075,
 -0.0012593824649229646,
 0.00979007687419653,
 -0.07821597903966904,
 -0.020418530330061913,
 -0.01775938645005226,
 0.023217324167490005,
 0.008083238266408443,
 0.011712702922523022,
 0.020031563937664032,
 -0.013191470876336098,
 -0.019503341987729073,
 0.06235750392079353,
 0.015036839991807938,
 0.03624138981103897,
 0.032345667481422424,
 -0.014319414272904396,
 -0.02620174176990986,
 -0.06564001739025116,
 -0.04058409854769707,
 -0.009316562674939632,
 0.03755204379558563,
 0.050808507949113846,
 -0.11942362040281296,
 0.018268104642629623,


In [10]:
BUCKET_URI = f"gs://{PROJECT_ID}-vs-hybridsearch-{UID}"
! gcloud storage buckets create -l $LOCATION --project $PROJECT_ID $BUCKET_URI

Creating gs://qwiklabs-gcp-00-e6f60a4bc84c-vs-hybridsearch-07220610/...


In [11]:
! gcloud storage cp gs://partner-genai-bucket/genai115/items.json  $BUCKET_URI/items.json

Copying gs://partner-genai-bucket/genai115/items.json to gs://qwiklabs-gcp-00-e6f60a4bc84c-vs-hybridsearch-07220610/items.json
  Completed files 1/1 | 3.3MiB/3.3MiB                                          


In [12]:
from google.cloud import aiplatform
# Point the Vertex AI SDK at the project and region defined in the config cell.
aiplatform.init(project=PROJECT_ID, location=LOCATION)

In [13]:
# Create the index from the data found under BUCKET_URI. The call reports the
# backing long-running operation and the created resource name in its output.
my_hybrid_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=f"vs-hybridsearch-index-{UID}",
    contents_delta_uri=BUCKET_URI,
    dimensions=768,  # NOTE(review): presumably the length of the dense embedding vectors — confirm
    approximate_neighbors_count=10,
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/287012535685/locations/us-central1/indexes/3532390011439677440/operations/8638852654593736704
MatchingEngineIndex created. Resource name: projects/287012535685/locations/us-central1/indexes/3532390011439677440
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/287012535685/locations/us-central1/indexes/3532390011439677440')


In [14]:
# Create the index endpoint, requesting a public endpoint
# (public_endpoint_enabled=True). The index is deployed to it in the next cell.
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"vs-hybridsearch-index-endpoint-{UID}", public_endpoint_enabled=True
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/287012535685/locations/us-central1/indexEndpoints/824829433901744128/operations/642711516197421056
MatchingEngineIndexEndpoint created. Resource name: projects/287012535685/locations/us-central1/indexEndpoints/824829433901744128
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/287012535685/locations/us-central1/indexEndpoints/824829433901744128')


In [None]:
# ID under which the index is deployed; the query cell passes the same ID to
# find_neighbors.
DEPLOYED_HYBRID_INDEX_ID = f"vs_hybridsearch_deployed_{UID}"
# Deploy the index to the endpoint. The output reports a backing long-running
# operation for the deployment.
my_index_endpoint.deploy_index(
    index=my_hybrid_index, deployed_index_id=DEPLOYED_HYBRID_INDEX_ID
)

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/287012535685/locations/us-central1/indexEndpoints/824829433901744128
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/287012535685/locations/us-central1/indexEndpoints/824829433901744128/operations/147315557186666496


In [None]:
from google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint import (
    HybridQuery,
)

# Build one hybrid query: the same text is embedded with both the dense and
# the sparse (TF-IDF) wrapper.
query_text = "Kids"
query_dense_emb = get_dense_embedding(query_text)
query_sparse_emb = get_sparse_embedding(query_text)
query = HybridQuery(
    dense_embedding=query_dense_emb,
    sparse_embedding_dimensions=query_sparse_emb["dimensions"],
    sparse_embedding_values=query_sparse_emb["values"],
    # NOTE(review): presumably balances dense vs. sparse results when the two
    # rankings are merged — confirm the exact meaning in the SDK docs.
    rrf_ranking_alpha=0.5,
)

In [None]:
# Run the hybrid query against the deployed index.
response = my_index_endpoint.find_neighbors(
    deployed_index_id=DEPLOYED_HYBRID_INDEX_ID,
    queries=[query],
    num_neighbors=10,
)

# Print the results of the first (and only) query that was sent.
# The loop index was never used, so the neighbors are iterated directly.
for neighbor in response[0]:
    # Neighbor ids are converted to int and used as row labels of `df`;
    # `.loc` makes the label-based lookup explicit.
    title = df.loc[int(neighbor.id), "title"]
    # Distances that are missing (or otherwise falsy) are shown as 0.0.
    dense_dist = neighbor.distance or 0.0
    sparse_dist = neighbor.sparse_distance or 0.0
    print(f"{title:<40}: dense_dist: {dense_dist:.3f}, sparse_dist: {sparse_dist:.3f}")