<a href="https://colab.research.google.com/github/priyankaiiit14/LearningAIML/blob/main/Generative_AI_Document_Retrieval_and_Question_Answering_with%C2%A0LLMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generative AI - Document Retrieval and Question Answering with LLMs


## Authenticate

In [None]:
# Sign in through Colab's auth helper (presumably so that the gcloud and
# Vertex AI calls further down run with the user's credentials — TODO confirm).
from google.colab import auth as google_auth
google_auth.authenticate_user()

In [None]:
# Point the gcloud CLI at the project used throughout this notebook.
!gcloud config set project sascha-playground-doit

Updated property [core/project].


## Dependencies

In [None]:
!pip install google-cloud-aiplatform==1.25.0
!pip install langchain==0.0.187
!pip install xmltodict==0.13.0
!pip install unstructured==0.7.0 # used by langchain
!pip install pdf2image==1.16.3 #used by langchain
!pip install requests==2.31.0
!pip install beautifulsoup4==4.12.2

In [None]:
# Google Cloud project and region handed to aiplatform.init().
PROJECT_ID = 'sascha-playground-doit'
REGION = 'us-central1'

# Cloud Storage prefix passed as contents_delta_uri when the batch index is created.
BUCKET = 'gs://doit-llm/embeddings'
# Vector size passed as `dimensions` when the batch index is created.
DIMENSIONS = 768
# Display name of the index and the endpoint; the deployed index id is derived from it.
DISPLAY_NAME = 'palm-2-langchain-document-answering'
# Regional Vertex AI API hostname.
ENDPOINT = 'us-central1-aiplatform.googleapis.com'
# Model name loaded through TextGenerationModel.from_pretrained().
TEXT_GENERATION_MODEL = 'text-bison@001'

# Sitemap whose <loc> entries list the pages to be indexed.
sitemap = 'https://cloud.google.com/vertex-ai/sitemap.xml'

In [None]:
import os
from google.cloud import aiplatform

# Initialise the Vertex AI SDK with the project and region from the config cell.
aiplatform.init(project=PROJECT_ID, location=REGION)

# Documents
## Parse the sitemap

In [None]:
import requests
from bs4 import BeautifulSoup

def parse_sitemap(url, timeout=30):
    """Download a sitemap and return the URLs listed in its ``<loc>`` elements.

    Args:
        url: Address of the sitemap XML document.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        list[str]: Text of every ``<loc>`` element in document order, with
        surrounding whitespace removed.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly rather than parsing an error page into an empty URL list.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "xml")
    # Pretty-printed sitemaps can wrap the URL in whitespace; strip it so the
    # result is directly usable as a URL.
    return [element.text.strip() for element in soup.find_all("loc")]

sites = parse_sitemap(sitemap)

In [None]:
# Keep only pages that are neither under /reference/ nor carry an ?hl query
# parameter (presumably language variants of the same page — confirm).
sites_filtered = [
    url
    for url in sites
    if not ('/reference/' in url or '?hl' in url)
]
sites_filtered

['https://cloud.google.com/vertex-ai/docs/general/iam-permissions',
 'https://cloud.google.com/vertex-ai/sla',
 'https://cloud.google.com/vertex-ai/docs/samples/aiplatform-get-model-evaluation-tabular-regression-sample',
 'https://cloud.google.com/vertex-ai/docs/tutorials/tabular-bq-prediction',
 'https://cloud.google.com/vertex-ai/docs/experiments/tensorboard-profiler',
 'https://cloud.google.com/vertex-ai/docs/tutorials/image-recognition-automl/training',
 'https://cloud.google.com/vertex-ai/docs/start/automl-users',
 'https://cloud.google.com/vertex-ai/docs/training/neural-architecture-search/nas-tutorials',
 'https://cloud.google.com/vertex-ai/docs/tutorials/image-recognition-custom',
 'https://cloud.google.com/vertex-ai/docs/text-data/sentiment-analysis/interpret-results',
 'https://cloud.google.com/vertex-ai/docs/core-release-notes',
 'https://cloud.google.com/vertex-ai/docs/generative-ai/code/test-code-generation-prompts',
 'https://cloud.google.com/vertex-ai/docs/generative-ai/

In [None]:
len(sites_filtered)

597

## Load page content using LangChain's UnstructuredURLLoader

In [None]:
from langchain.document_loaders import UnstructuredURLLoader
# Load the content of every URL in sites_filtered into `documents`
# (597 entries in the recorded run). Presumably each URL is fetched over the
# network, so expect this cell to take a while — TODO confirm.
loader = UnstructuredURLLoader(urls=sites_filtered)
documents = loader.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
documents[42]

Document(page_content='Home\n\nDocs\n\nVertex AI\n\nDocumentation\n\nGuides\n\nService accounts for Tabular Workflows\n\nStay organized with collections\n\nSave and categorize content based on your preferences.\n\nThis page explains the service accounts for the following Tabular Workflows:\n\nTabular Workflow for End-to-End AutoML\n\nTabular Workflow for TabNet\n\nTabular Workflow for Wide & Deep\n\nProphet\n\nARIMA+\n\nService accounts for Tabular Workflow for End-to-End AutoML\n\nThis workflow uses the following service accounts:\n\nService account for Vertex AI Pipelines\n\nThe service account that runs the pipeline\n\nPROJECT_NUMBER-compute@developer.gserviceaccount.com\n\nCompute Engine default service account\n\nYes\n\nService account for Dataflow worker\n\nThe service account that runs the Dataflow workers\n\nPROJECT_NUMBER-compute@developer.gserviceaccount.com\n\nCompute Engine default service account\n\nYes\n\nAI Platform Service Agent\n\nThe service account that runs the trai

In [None]:
len(documents)

## Chunking

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Split each page on line breaks using a chunk size of 1000 and an overlap of
# 100, so that neighbouring chunks share some text.
text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=100)
document_chunks = text_splitter.split_documents(documents)

print(f"Number documents {len(documents)}")
print(f"Number chunks {len(document_chunks)}")

# From here on document_chunks holds plain strings: the chunk text followed by
# the URL it was loaded from, so the source travels together with the text.
document_chunks = [
    f"Context: {chunk.page_content} Source: {chunk.metadata['source']}"
    for chunk in document_chunks
]



Number documents 597
Number chunks 5827


# Embeddings for documents



## Create embedding for all document chunks

In [None]:
# Start from an empty ./documents directory so chunk files written by an
# earlier run are not left behind.
!rm -rf ./documents
!mkdir ./documents

In [None]:
import time
from langchain.embeddings import VertexAIEmbeddings
import pandas as pd
import json

def handle_quota_errors(func, *args, retry_delay=5, backoff_factor=2, max_retries=5, **kwargs):
    """Call ``func(*args, **kwargs)`` and retry with exponential backoff on failure.

    The previous version slept once after a failure and then returned ``None``
    without ever calling ``func`` again, so a single quota error silently
    dropped the item. This version actually retries.

    Args:
        func: Callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        retry_delay: Base delay in seconds.
        backoff_factor: Multiplier applied per retry; the n-th retry waits
            ``retry_delay * backoff_factor ** n`` seconds.
        max_retries: How many times to call ``func`` again after the first
            failed attempt. Values below zero are treated as zero.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        Whatever ``func`` returns, or ``None`` if every attempt raised.
        ``None`` (rather than re-raising) is kept so that callers can go on
        skipping failed items with an ``is not None`` check.
    """
    max_retries = max(0, max_retries)

    for attempt in range(max_retries + 1):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Deliberately broad: any failure of the call is treated as retryable.
            print(f"error: {e}")
            if attempt == max_retries:
                # Out of retries; no point in sleeping before giving up.
                break
            wait = retry_delay * (backoff_factor ** (attempt + 1))
            print(f"wait for {wait} seconds")
            time.sleep(wait)

    return None


embeddings = VertexAIEmbeddings()

df = pd.DataFrame(document_chunks, columns =['text'])

index_embeddings = []

# For every chunk: request its embedding, queue one JSON line for the index
# file and write the chunk text to documents/<id>. The same id is used for
# both, so a vector can be traced back to its text file.
for index, doc in df.iterrows():
  print(f"Get embedding and write document for document {index} of {len(df)-1}")
  embedding = handle_quota_errors(embeddings.embed_query, doc['text'])

  # The helper returns None when the embedding call failed; such chunks get
  # neither an index entry nor a document file.
  if embedding is None:
    continue

  doc_id = f"{index}.txt"
  embedding_dict = {
            "id": doc_id,
            "embedding": [str(value) for value in embedding],
  }
  index_embeddings.append(json.dumps(embedding_dict) + "\n")

  # Explicit encoding so page text with non-ASCII characters is written the
  # same way regardless of the platform default.
  with open(f"documents/{doc_id}", "w", encoding="utf-8") as document:
    document.write(doc['text'])


# One JSON object per line, written once after all chunks are processed.
with open("embeddings.json", "w", encoding="utf-8") as f:
    f.writelines(index_embeddings)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Get embedding and write document for document 827 of 5826
Get embedding and write document for document 828 of 5826
Get embedding and write document for document 829 of 5826
Get embedding and write document for document 830 of 5826
Get embedding and write document for document 831 of 5826
Get embedding and write document for document 832 of 5826
Get embedding and write document for document 833 of 5826
Get embedding and write document for document 834 of 5826
Get embedding and write document for document 835 of 5826
Get embedding and write document for document 836 of 5826
Get embedding and write document for document 837 of 5826
Get embedding and write document for document 838 of 5826
Get embedding and write document for document 839 of 5826
Get embedding and write document for document 840 of 5826
Get embedding and write document for document 841 of 5826
Get embedding and write document for document 842 of 5826
Get emb

## Copy document chunks and embeddings to Google Cloud Storage for later indexing and retrieval steps

In [None]:
# Upload the chunk text files and the embeddings file to the doit-llm bucket.
# The embeddings land under gs://doit-llm/embeddings, the prefix the index
# definitions below read from.
!gsutil cp -r documents  gs://doit-llm/documents
!gsutil cp embeddings.json gs://doit-llm/embeddings/embeddings.json

# Index

## Option 1 - Create Streaming Index

In [None]:
%%writefile index.json
{
    "displayName": "palm-langchain-document-answering",
    "description": "palm-langchain-document-answering",
    "metadata": {
       "contentsDeltaUri": "gs://doit-llm/embeddings",
       "config": {
          "dimensions": "768",
          "approximateNeighborsCount": 150,
          "distanceMeasureType": "DOT_PRODUCT_DISTANCE",
          "algorithmConfig": {
             "bruteForceConfig": {}
          }
       }
    },
    "indexUpdateMethod": "STREAM_UPDATE"
}

In [None]:
# POST index.json to the Vertex AI indexes collection of the project,
# authenticating with the access token of the current gcloud login.
!curl -X POST -H "Content-Type: application/json" \
-H "Authorization: Bearer `gcloud auth print-access-token`" \
https://us-central1-aiplatform.googleapis.com/v1/projects/sascha-playground-doit/locations/us-central1/indexes \
-d @index.json

## Option 2 - Create Batch Index

In [None]:
# Create a tree-AH index from the embeddings stored under BUCKET. The call
# waits on a long-running operation (see the LRO in the cell output); in the
# recorded run the cell was interrupted while waiting.
index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=DISPLAY_NAME,
    contents_delta_uri=BUCKET,
    dimensions=DIMENSIONS,
    approximate_neighbors_count=150,
    distance_measure_type="DOT_PRODUCT_DISTANCE",
)

Creating MatchingEngineIndex


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:Creating MatchingEngineIndex


Create MatchingEngineIndex backing LRO: projects/234439745674/locations/us-central1/indexes/1788448021554397184/operations/6668007164503654400


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index:Create MatchingEngineIndex backing LRO: projects/234439745674/locations/us-central1/indexes/1788448021554397184/operations/6668007164503654400


KeyboardInterrupt: ignored

## Create Endpoint

In [None]:
# REGION and PROJECT_ID repeat the values from the config cell; ENDPOINT is
# rebuilt from REGION and yields the same hostname as before.
REGION = "us-central1"
ENDPOINT = "{}-aiplatform.googleapis.com".format(REGION)

PROJECT_ID = "sascha-playground-doit"
PARENT = "projects/{}/locations/{}".format(PROJECT_ID, REGION)

# The shell capture returns a list of output lines; the first line is taken
# as the project number.
PROJECT_NUMBER = !gcloud projects list --filter="PROJECT_ID:'{PROJECT_ID}'" --format='value(PROJECT_NUMBER)'
PROJECT_NUMBER = PROJECT_NUMBER[0]

NETWORK_NAME = "matching-engine-vpc-network"

# Full network resource path, built from the project number (not the project id).
VPC_NETWORK_NAME = "projects/{}/global/networks/{}".format(PROJECT_NUMBER, NETWORK_NAME)
VPC_NETWORK_NAME

'projects/234439745674/global/networks/matching-engine-vpc-network'

In [None]:
# Create the index endpoint attached to the VPC network resolved above.
index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=DISPLAY_NAME,
    description=DISPLAY_NAME,
    network=VPC_NETWORK_NAME,
    # IMPORTANT: if you want to use a public endpoint instead, you need to use
    # aiplatform_v1beta1 when querying or inserting vectors.
    # https://cloud.google.com/vertex-ai/docs/matching-engine/deploy-index-public
    # public_endpoint_enabled=True
)

Creating MatchingEngineIndexEndpoint


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Creating MatchingEngineIndexEndpoint


Create MatchingEngineIndexEndpoint backing LRO: projects/234439745674/locations/us-central1/indexEndpoints/4797978472544731136/operations/8752047892069351424


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Create MatchingEngineIndexEndpoint backing LRO: projects/234439745674/locations/us-central1/indexEndpoints/4797978472544731136/operations/8752047892069351424


MatchingEngineIndexEndpoint created. Resource name: projects/234439745674/locations/us-central1/indexEndpoints/4797978472544731136


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:MatchingEngineIndexEndpoint created. Resource name: projects/234439745674/locations/us-central1/indexEndpoints/4797978472544731136


To use this MatchingEngineIndexEndpoint in another session:


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:To use this MatchingEngineIndexEndpoint in another session:


index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/234439745674/locations/us-central1/indexEndpoints/4797978472544731136')


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/234439745674/locations/us-central1/indexEndpoints/4797978472544731136')


## Deploy Index

In [None]:
# Reattach to an already existing index and endpoint by resource name.
# Run this cell instead of the create cells above when reusing resources,
# e.g. in a new session; replace the resource names with your own.

INDEX_RESOURCE_NAME = "projects/234439745674/locations/us-central1/indexes/1788448021554397184"
index = aiplatform.MatchingEngineIndex(index_name=INDEX_RESOURCE_NAME)

ENDPOINT_RESOURCE_NAME = "projects/234439745674/locations/us-central1/indexEndpoints/4797978472544731136"
index_endpoint = aiplatform.MatchingEngineIndexEndpoint(index_endpoint_name=ENDPOINT_RESOURCE_NAME)

In [None]:
# Deploy the index to the endpoint. The deployed index id is DISPLAY_NAME with
# hyphens replaced by underscores (presumably because hyphens are not accepted
# in deployed index ids — confirm against the Vertex AI docs).
deployed_index = index_endpoint.deploy_index(
    index=index,
    deployed_index_id=DISPLAY_NAME.replace('-','_'),
)

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/234439745674/locations/us-central1/indexEndpoints/4797978472544731136


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/234439745674/locations/us-central1/indexEndpoints/4797978472544731136


Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/234439745674/locations/us-central1/indexEndpoints/4797978472544731136/operations/4082378028439568384


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/234439745674/locations/us-central1/indexEndpoints/4797978472544731136/operations/4082378028439568384


MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/234439745674/locations/us-central1/indexEndpoints/4797978472544731136


INFO:google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint:MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/234439745674/locations/us-central1/indexEndpoints/4797978472544731136


# The fun part (needs to be inside the same VPC)
Matching Engine now supports public endpoints in beta. Stay tuned for a dedicated article on that.


In [None]:
# Ids of the index and index endpoint queried through LangChain below.
# NOTE(review): ENDPOINT held the API hostname in earlier cells and is
# overwritten here with an index endpoint id. These ids also differ from the
# resource names used in the "Deploy Index" section — confirm they point at
# the resources you actually deployed.
INDEX='8624771518414454784'
ENDPOINT='1017628797832265728'
# Bucket name handed to the vector store as gcs_bucket_name.
DOCS_BUCKET='doit-llm'

from langchain.embeddings import VertexAIEmbeddings
embeddings = VertexAIEmbeddings()

In [None]:
from langchain.vectorstores.matching_engine import MatchingEngine
from langchain.agents import Tool

def matching_engine_search(question):
    """Return the text of the chunks most similar to ``question``.

    Builds a Matching Engine vector store from the notebook-level settings
    (PROJECT_ID, REGION, INDEX, ENDPOINT, DOCS_BUCKET, embeddings), runs a
    similarity search for the 8 nearest documents and joins their
    ``page_content`` with newlines.

    Args:
        question: Query text to search for.

    Returns:
        str: The joined page contents, cut off after 10000 characters.
    """
    store = MatchingEngine.from_components(
        project_id=PROJECT_ID,
        region=REGION,
        gcs_bucket_name=DOCS_BUCKET,
        embedding=embeddings,
        index_id=INDEX,
        endpoint_id=ENDPOINT,
    )

    hits = store.similarity_search(question, k=8)
    page_texts = [hit.page_content for hit in hits]
    joined = "\n".join(page_texts)
    return str(joined[:10000])

In [None]:
question = "what is the latency for Vertex AI Matching Engine?"

In [None]:
from vertexai.preview.language_models import TextGenerationModel

# Retrieve the context for the question from the vector index.
matching_engine_response=matching_engine_search(question)

# Prompt template: the retrieved context and the user question are
# interpolated into fixed instructions that tell the model to answer only
# from that context and to show its sources.
prompt=f"""
Follow exactly those 3 steps:
1. Read the context below and aggregrate this data
Context : {matching_engine_response}
2. Answer the question using only this context
3. Show the source for your answers
User Question: {question}


If you don't have any context and are unsure of the answer, reply that you don't know about this topic.
"""

# Load the text model named in the config cell and generate the answer.
model = TextGenerationModel.from_pretrained(TEXT_GENERATION_MODEL)
response = model.predict(
        prompt,
        temperature=0.2,
        top_k=40,
        top_p=.8,
        max_output_tokens=1024,
)

print(f"Question: \n{question}")
print(f"Response: \n{response.text}")

## Direct usage of Vertex AI Matching Engine (only to showcase the direct usage for document retrieval)

In [None]:
# Embed the question and query the deployed index directly, without LangChain.
embedding = embeddings.embed_query("what is the latency for Vertex AI Matching Engine?")
print(embedding)
response = index_endpoint.match(
    # Must be the id the index was deployed under in the "Deploy Index"
    # section, i.e. DISPLAY_NAME with hyphens replaced by underscores. The
    # previous literal "palm-langchain-document-answering" matched no
    # deployment made by this notebook.
    deployed_index_id=DISPLAY_NAME.replace('-', '_'),
    queries=[embedding]
)

response

# Little Index Helper

In [None]:
# List the index endpoints of the project in us-central1.
!gcloud ai index-endpoints list --project="sascha-playground-doit" --region="us-central1"

In [None]:
# Undeploy a deployed index from an index endpoint.
# NOTE(review): the endpoint id and the deployed index id "image_similarity_vpc"
# are not the ones created in this notebook — replace both before running.
!gcloud ai index-endpoints undeploy-index "2199120012575244288" \
  --deployed-index-id="image_similarity_vpc" \
  --project="sascha-playground-doit" \
  --region="us-central1"

In [None]:
# List the indexes of the project in us-central1.
!gcloud ai indexes list \
  --project="sascha-playground-doit" \
  --region="us-central1"

In [None]:
# Delete an index without a confirmation prompt (-q).
# NOTE(review): the hard-coded index id differs from the one used in the
# "Deploy Index" section — double-check it before running.
!gcloud -q ai indexes delete "projects/234439745674/locations/us-central1/indexes/1910889636423532544" \
  --project="sascha-playground-doit" \
  --region="us-central1"

In [None]:
# Delete an index endpoint without a confirmation prompt (--quiet).
# NOTE(review): the hard-coded endpoint id differs from the one created
# above — double-check it before running.
!gcloud ai index-endpoints delete 5731859668712554496 \
  --project="sascha-playground-doit" \
  --region="us-central1" \
  --quiet