In [3]:
# Copyright 2024 Google LLC. This software is provided as-is, without warranty
# or representation for any use or purpose. Your use of it is subject to your
# agreement with Google.

In [4]:
# Make the repository root the working directory so the relative
# "./notebooks/..." paths used in this notebook resolve.
import os

REPO_NAME = "sme_chat"
# os.path.basename honours the platform's path separator, unlike splitting on "/".
if os.path.basename(os.getcwd()) != REPO_NAME:
    # Assumes the kernel started exactly one directory below the repo root
    # (e.g. sme_chat/notebooks) — TODO confirm for other launch locations.
    os.chdir(f"../../{REPO_NAME}")
print(os.getcwd())

/Users/pnallamotu/Desktop/sme_chat


This notebook creates a RAG-based system for identifying malicious or prompt-hijacking queries.
The flow of this notebook is as follows:
1. Create a spreadsheet of malicious queries to block.
2. Embed each query 
3. Create Vector Search Index from created embeddings. 
4. Deploy Vector Search Index to endpoint. 

# Setup

In [5]:
from typing import Any, Dict, List, Optional

from google.cloud import aiplatform
import pandas as pd
import vertexai
from vertexai.preview.language_models import TextEmbeddingInput, TextEmbeddingModel

## Constants

In [6]:
# Google Cloud project and region passed to the aiplatform / vertexai
# initialisation calls in this notebook.
PROJECT_ID = "pnallamotu-test"
LOCATION = "us-central1"

In [7]:
# Initialise both SDK entry points (google.cloud.aiplatform and vertexai) with
# the same project and region.
aiplatform.init(project=PROJECT_ID, location=LOCATION)
vertexai.init(project=PROJECT_ID, location=LOCATION)

# Read in Query Examples

In [8]:
# CSV holding the queries to block; the path is relative to the current
# working directory.
file_path = "./notebooks/data/queries_to_block.csv"

In [9]:
# Load the blocked-query examples and fail fast if the columns this notebook
# relies on ("id" for the exported record ID, "query" for the text to embed)
# are absent, instead of hitting a KeyError several cells later.
df = pd.read_csv(file_path)
missing_columns = {"id", "query"} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{file_path} is missing required columns: {sorted(missing_columns)}"
    )

# Embed Queries

In [10]:
def embed_text(
    texts: List[str],
    task: str = "SEMANTIC_SIMILARITY",
    model_name: str = "text-embedding-004",
    dimensionality: Optional[int] = 256,
    batch_size: int = 5,
) -> List[List[float]]:
    """Embeds a list of texts with a pre-trained text embedding model.

    The texts are sent to the model in consecutive batches of at most
    ``batch_size`` items, and the embeddings of every batch are concatenated in
    batch order.

    Args:
        texts: The list of texts to embed.
        task: The task for which the embeddings will be used; attached to every
            ``TextEmbeddingInput``.
        model_name: The name of the pre-trained text embedding model to use.
        dimensionality: The desired dimensionality of the embeddings. If None
            (or 0), no ``output_dimensionality`` is requested and the default
            dimensionality of the model is used.
        batch_size: The maximum number of texts per ``get_embeddings`` call.
            Must be at least 1.

    Returns:
        A list of lists, where each inner list holds the embedding values
        returned for one input. An empty ``texts`` yields an empty list without
        loading the model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # range() rejects a zero step with an unrelated message and a negative step
    # would silently produce no batches, so validate up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    # Nothing to embed: skip loading the model entirely.
    if len(texts) == 0:
        return []

    # Loop-invariant request options, built once rather than per batch.
    request_kwargs = {"output_dimensionality": dimensionality} if dimensionality else {}
    model = TextEmbeddingModel.from_pretrained(model_name)

    embeddings: List[List[float]] = []
    for start in range(0, len(texts), batch_size):
        batch = texts[start : start + batch_size]
        inputs = [TextEmbeddingInput(text, task) for text in batch]
        result = model.get_embeddings(inputs, **request_kwargs)
        # extend() appends in place; `embeddings + [...]` re-copied the whole
        # list on every batch.
        embeddings.extend(e.values for e in result)
    return embeddings

In [11]:
# Embed every value of the "query" column and attach the vectors as a new
# "embedding" column.
df = df.assign(
    embedding=embed_text(df["query"].tolist()),
)

## Upload Embeddings to GCS

In [13]:
LOCAL_EMBEDDINGS_PATH = "./notebooks/data/embeddings.json"
jsonl_string = df[["id", "embedding"]].to_json(orient="records", lines=True)
with open(LOCAL_EMBEDDINGS_PATH, "w") as f:
    f.write(jsonl_string)

# show the first few lines of the json file
! head -n 3 ./notebooks/data/embeddings.json

{"id":1,"embedding":[-0.0086849649,-0.0094498051,-0.0533416085,-0.0002221287,0.0299373642,0.049645666,0.0555175021,0.0697580799,-0.030261362,0.0011084516,0.0011093153,0.0914345086,-0.0185045246,0.051989574,0.0369685479,-0.0233300049,0.0050664125,0.0399317779,-0.0454415418,-0.0127727892,-0.0037062413,0.0144868558,-0.0172112677,-0.0216604024,0.0073467297,-0.0209816415,0.04758735,-0.0788331628,-0.0013815438,-0.0493790321,0.0745011792,0.0109370556,-0.000250025,-0.0440187454,0.0360942744,0.0023019249,-0.0094008828,-0.0428107269,-0.0097443825,-0.0307801366,-0.0547727384,0.0452350192,-0.0724724382,-0.0042062104,-0.0056527611,-0.0193993393,-0.0267820824,-0.0378545672,0.0486142673,-0.0044723307,0.0180207212,-0.0553663746,-0.0323941596,-0.0369985588,0.025259627,-0.0332302973,-0.0537406392,0.007708489,0.0989010707,-0.0267123468,-0.0382067524,-0.0130893681,-0.024364382,-0.0171621852,0.0264381617,0.007582196,-0.041985333,-0.0094239591,0.0025440657,0.0653168112,0.0190015528,0.0419340581,-0.011757608

In [14]:
EMBEDDINGS_BUCKET_URI = f"gs://sme-malicious-queries"
! gsutil mb -l $LOCATION -p {PROJECT_ID} {EMBEDDINGS_BUCKET_URI}
! gsutil cp ./notebooks/data/embeddings.json {EMBEDDINGS_BUCKET_URI}

Creating gs://sme-malicious-queries/...
Copying file://./notebooks/data/embeddings.json [Content-Type=application/json]...
/ [1 files][135.0 KiB/135.0 KiB]                                                
Operation completed over 1 objects/135.0 KiB.                                    


# Vector Search

## Create Index

In [16]:
# Create a tree-AH Vector Search index from the files under the embeddings
# bucket.
# NOTE(review): dot product only matches cosine similarity for unit-length
# vectors — confirm the 256-dimension embeddings are normalized, or revisit the
# distance measure.
query_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name="sme-queries-to-block",
    contents_delta_uri=EMBEDDINGS_BUCKET_URI,
    # Must equal the length of the exported vectors (embed_text defaults to 256).
    dimensions=256,
    approximate_neighbors_count=20,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/969241382112/locations/us-central1/indexes/6374928835426648064/operations/1645450109091053568
MatchingEngineIndex created. Resource name: projects/969241382112/locations/us-central1/indexes/6374928835426648064
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/969241382112/locations/us-central1/indexes/6374928835426648064')


## Create Endpoint

In [17]:
# Create the index endpoint that will serve the index, with its public
# endpoint enabled.
query_idx_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name="sme-queries-embeddings-endpoint",
    public_endpoint_enabled=True,
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/969241382112/locations/us-central1/indexEndpoints/6596801485818822656/operations/6091628841212575744
MatchingEngineIndexEndpoint created. Resource name: projects/969241382112/locations/us-central1/indexEndpoints/6596801485818822656
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/969241382112/locations/us-central1/indexEndpoints/6596801485818822656')


## Deploy Endpoint

In [18]:
# Deploy the index to the endpoint; "sme_queries" is the deployed index ID that
# the sample query in this notebook passes to find_neighbors.
query_idx_endpoint.deploy_index(
    index=query_index,
    deployed_index_id="sme_queries",
)

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/969241382112/locations/us-central1/indexEndpoints/6596801485818822656
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/969241382112/locations/us-central1/indexEndpoints/6596801485818822656/operations/7789485900731252736


# Run Sample Query 

In [11]:
# Numeric ID of the index endpoint: the trailing segment of the resource name
# printed when the endpoint was created in this notebook.
ENDPOINT_ID = "6596801485818822656"

In [12]:
# Attach to the existing endpoint by ID rather than reusing `query_idx_endpoint`.
# NOTE(review): the execution counts restart from here, so this presumably ran
# in a fresh kernel after the deployment finished — confirm.
sme_query_endpoint = aiplatform.MatchingEngineIndexEndpoint(ENDPOINT_ID)

In [22]:
# Embed one sample query with embed_text's default settings; the result is a
# one-element list of vectors.
test_embeddings = embed_text(["what wines pair well with grilled salmon"])

In [23]:
# Test query: fetch the nearest stored queries for the sample embedding from
# the deployed index.
response = sme_query_endpoint.find_neighbors(
    deployed_index_id="sme_queries",
    queries=test_embeddings,
    num_neighbors=20,
)

# Only the first result list is read, i.e. the neighbours of the first (and
# here only) query embedding.
neighbor_ids = []
for neighbor in response[0]:
    print(neighbor)
    # IDs are returned as strings (e.g. id='8'); store them as integers.
    neighbor_ids.append(int(neighbor.id))

MatchNeighbor(id='8', distance=0.15819087624549866, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[])
MatchNeighbor(id='23', distance=0.15232744812965393, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[])
MatchNeighbor(id='36', distance=0.15166263282299042, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[])
MatchNeighbor(id='16', distance=0.15004907548427582, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[])
MatchNeighbor(id='10', distance=0.14968690276145935, feature_vector=[], crowding_tag='0', restricts=[], numeric_restricts=[], sparse_embedding_values=[], sparse_embedding_dimensions=[])
MatchNeighbor(id='14', distance=0.1464846432209015, feature_vector=[], 