# Project Setup In Vertex AI

Install the dependencies for Vertex AI

In [None]:
%pip install -U -q google-cloud-aiplatform langchain-core langchain-google-vertexai langchain-text-splitters langchain-community "unstructured[all-docs]" pypdf pydantic lxml pillow matplotlib opencv-python tiktoken

Restart runtime to use installed packages

In [None]:
import IPython

# Shut the kernel down with the restart flag set, so the packages installed above
# are the ones picked up by the imports that follow.
IPython.Application.instance().kernel.do_shutdown(True)

Authenticate user

In [None]:
import sys

# Only run the interactive sign-in when the notebook is executing inside Colab,
# which is detected by the `google.colab` module already being loaded.
if "google.colab" in sys.modules:
    # Imported inside the branch so the import is never attempted outside Colab.
    from google.colab import auth

    auth.authenticate_user()

Set the Google Cloud project, region, and staging bucket

In [None]:
# Google Cloud project and region handed to `aiplatform.init` and to the vectorstore.
PROJECT_ID = ""  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Bucket used for Vector Search staging: enter the bare bucket name here,
# the `gs://` URI form is derived from it on the next line.
GCS_BUCKET = ""  # @param {type:"string"}
GCS_BUCKET_URI = "gs://" + GCS_BUCKET

Initialize the Vertex AI SDK

In [None]:
from google.cloud import aiplatform

# Initialise the Vertex AI SDK with the project, region and staging bucket
# configured in the previous cell.
aiplatform.init(
    staging_bucket=GCS_BUCKET_URI,
    location=LOCATION,
    project=PROJECT_ID,
)

Import libraries used for the project

In [None]:
import base64
import os
import uuid
import re
import pandas as pd
import json
import ast

from typing import List, Tuple

from IPython.display import display, Image, Markdown

from langchain.prompts import PromptTemplate
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore

from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_core.messages import AIMessage, HumanMessage
from langchain_core.output_parsers import StrOutputParser

from langchain_text_splitters import CharacterTextSplitter

from langchain_google_vertexai import (
    VertexAI,
    ChatVertexAI,
    VertexAIEmbeddings,
    VectorSearchVectorStore,
)

from unstructured.partition.pdf import partition_pdf

Set up model information

In [None]:
# Embedding model used by the vectorstore, with its token limit.
EMBEDDING_MODEL_NAME = "text-embedding-004"
EMBEDDING_TOKEN_LIMIT = 2048

# Chat model used to answer questions, with its output-token limit.
# NOTE(review): confirm this model version is still served in the chosen region.
MODEL_NAME = "gemini-1.0-pro"
GEMINI_OUTPUT_TOKEN_LIMIT = 8192

# Smaller of the two limits; later passed to the chat model as `max_output_tokens`.
# NOTE(review): this caps generation at the embedding model's limit, although the
# two limits presumably measure different things — confirm this is intended.
TOKEN_LIMIT = min(EMBEDDING_TOKEN_LIMIT, GEMINI_OUTPUT_TOKEN_LIMIT)

# Document Storage Setup

These cells have to be run to set up the vectorstore.

First bring data to local directory

In [None]:
!gsutil -m rsync -r <path> .

Read in data file and convert to list of documents to add

In [None]:
# Load the exported documents; the `id` column supplies the docstore keys.
documents = pd.read_csv("documents.csv")
doc_ids = list(documents["id"])

# Round-trip through JSON to get one plain dict of title/question/answer per row,
# then keep the string form of each dict as the text that gets embedded.
documents = json.loads(documents[["title", "question", "answer"]].T.to_json())
texts = [str(record) for record in documents.values()]

Create index to store all of the documents

In [None]:
# Vector size the index is created with.
# NOTE(review): presumably this must equal the output size of EMBEDDING_MODEL_NAME — confirm.
DIMENSIONS = 768

# Create the tree-AH Vector Search index that will hold the document embeddings.
index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name="llm_documents_index",
    description="Document Storage for LLM",
    dimensions=DIMENSIONS,
    # Index tuning parameters.
    leaf_node_embedding_count=500,
    leaf_nodes_to_search_percent=7,
    approximate_neighbors_count=150,
)

Create endpoint to access documents

In [None]:
# NOTE(review): despite its name, this constant is only used below as the endpoint's
# display name; the deploy cell passes its own literal as `deployed_index_id` —
# confirm the naming is intended.
DEPLOYED_INDEX_ID = "llm_documents_endpoint"

# Create the index endpoint with its public endpoint enabled.
index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    public_endpoint_enabled=True,
    description="Index Endpoint for LLM",
    display_name=DEPLOYED_INDEX_ID,
)

Deploy index to endpoint

In [None]:
# Deploy the index onto the endpoint and keep whatever handle the call returns
# under the same `index_endpoint` name for the cells that follow.
index_endpoint = index_endpoint.deploy_index(
    deployed_index_id="llm_documents_index",
    index=index,
)

Create vectorstore to save embedding vectors of the documents

In [None]:
# Embedding model the vectorstore is configured with.
embedding_model = VertexAIEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Bind the freshly created index and endpoint into a LangChain vectorstore.
vectorstore = VectorSearchVectorStore.from_components(
    embedding=embedding_model,
    index_id=index.name,
    endpoint_id=index_endpoint.name,
    gcs_bucket_name=GCS_BUCKET,
    project_id=PROJECT_ID,
    region=LOCATION,
    stream_update=False,
)

Create retriever to get documents by embeddings and docstore to map documents to embeddings

In [None]:
# Metadata key that links each embedded vector to its entry in the docstore.
id_key = "doc_id"

# In-memory store for the original documents, looked up by that key.
docstore = InMemoryStore()

retriever = MultiVectorRetriever(
    id_key=id_key,
    docstore=docstore,
    vectorstore=vectorstore,
)

Build the documents to embed, tagged with their IDs, and store the original texts in the docstore under the same IDs for retrieval later

In [None]:
# One Document per text, tagged with the id found at the same position in `doc_ids`,
# so a vector hit can be traced back to its docstore entry.
all_docs = [
    Document(page_content=str(text), metadata={id_key: doc_ids[position]})
    for position, text in enumerate(texts)
]

# Store the original texts under those same ids.
id_text_pairs = list(zip(doc_ids, texts))
retriever.docstore.mset(id_text_pairs)

Add document embeddings to the vectorstore

In [None]:
# Add the documents built above to the vectorstore behind the retriever.
retriever.vectorstore.add_documents(all_docs)

# Create LLM with RAG from vectorstore

Get the index ID and endpoint ID from the Vector Search tab of the Vertex AI section in the Google Cloud console, then use them to create the vectorstore object

In [None]:
# IDs of the already-created index and endpoint; both are empty placeholders
# that must be filled in before this cell is run.
index_id = ""
endpoint_id = ""

# Embedding model the vectorstore is configured with.
embedding_model = VertexAIEmbeddings(model_name=EMBEDDING_MODEL_NAME)

# Reconnect to the existing index and endpoint as a LangChain vectorstore.
vectorstore = VectorSearchVectorStore.from_components(
    embedding=embedding_model,
    index_id=index_id,
    endpoint_id=endpoint_id,
    gcs_bucket_name=GCS_BUCKET,
    project_id=PROJECT_ID,
    region=LOCATION,
    stream_update=False,
)

Create retriever to get documents from vectorstore

In [None]:
# Metadata key that links each embedded vector to its entry in the docstore.
id_key = "doc_id"

# Fresh in-memory store; it starts empty and is filled by a later cell.
docstore = InMemoryStore()

retriever = MultiVectorRetriever(
    id_key=id_key,
    docstore=docstore,
    vectorstore=vectorstore,
)

Populate the docstore so that the document IDs attached to the embeddings can be mapped back to the original documents

In [None]:
# Re-read the source CSV; the `id` column supplies the docstore keys.
df = pd.read_csv("documents.csv")
doc_ids = list(df["id"])

# Keep only the fields that make up a document, then round-trip through JSON to get
# one plain dict per row; the string form of each dict is what the docstore holds.
df = df[["title", "question", "answer"]]
text = json.loads(df.T.to_json())
texts = [str(record) for record in text.values()]

retriever.docstore.mset(list(zip(doc_ids, texts)))

Create chain pipeline to retrieve documents then generate answer to the question

In [None]:
def combine_context_question(inputs):
    """
    Build the LLM prompt from the retrieved context and the user's question.

    Args:
        inputs: Mapping that may carry "context" and "question" entries; a
            missing entry is treated as an empty string.

    Returns:
        str: The context line and the question line, separated by a blank line.
    """
    context_part = "Context: {}".format(inputs.get("context", ""))
    question_part = "Question: {}".format(inputs.get("question", ""))
    return "\n\n".join((context_part, question_part))

# Inputs gathered for every question: the retriever supplies the context and the
# question itself is passed through untouched.
chain_inputs = {
    "context": retriever,
    "question": RunnablePassthrough(),
}

# Chat model that answers from the combined prompt.
answer_model = ChatVertexAI(
    model_name=MODEL_NAME,
    max_output_tokens=TOKEN_LIMIT,
    temperature=0,
)

# Retrieve -> build prompt -> ask the model -> parse the reply with StrOutputParser.
llm_chain = (
    chain_inputs
    | RunnableLambda(combine_context_question)
    | answer_model
    | StrOutputParser()
)

Run llm chain to get answer to question from retrieved documents

In [None]:
# Question to ask; it is an empty placeholder, so fill it in before running.
query = ""

# Run the whole chain on the question string (retrieval, prompt building, model call,
# output parsing).
result = llm_chain.invoke(query)

# Last expression of the cell: render the answer as Markdown in the cell output.
Markdown(result)