In [1]:
from google.cloud import storage
import os
from langchain.vectorstores.chroma import Chroma
from langchain_google_vertexai import VertexAI, VertexAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# One GCS client for the whole notebook; the download helper below reuses it.
storage_client = storage.Client()

# Where the persisted Chroma files live remotely and where they are copied to.
BUCKET_NAME = "nl-llm"                     # bucket holding the persisted index
GCS_PERSIST_PATH = "chroma/"               # object-name prefix inside the bucket
LOCAL_PERSIST_PATH = "./local_chromadb/"   # local directory the files are written into

def _is_within_directory(root, candidate):
    """Return True when ``candidate`` resolves to ``root`` or to a path beneath it."""
    root_abs = os.path.abspath(root)
    candidate_abs = os.path.abspath(candidate)
    try:
        return os.path.commonpath([root_abs, candidate_abs]) == root_abs
    except ValueError:
        # commonpath refuses to compare e.g. paths on different Windows drives;
        # such a candidate cannot be nested under root.
        return False


def download_directory_from_gcs(gcs_directory, local_directory, bucket_name, client=None):
    """Download every object under a GCS "directory" prefix into a local directory.

    Args:
        gcs_directory: Object-name prefix to copy. It is treated as a directory:
            a trailing "/" is enforced so that ``"chroma"`` does not also match
            sibling prefixes such as ``"chroma-old/"``. An empty string selects
            the whole bucket.
        local_directory: Destination directory. Sub-directories are created as
            needed, mirroring the object names below the prefix.
        bucket_name: Name of the bucket to read from.
        client: Optional client object exposing ``bucket(name)``. When omitted,
            the module-level ``storage_client`` is used, as before.

    Returns:
        None. One "Downloaded ..." line is printed per file written.

    Notes:
        Objects whose name ends with "/" (directory placeholders) are ignored.
        Objects whose relative name would resolve outside ``local_directory``
        (for example names containing "..") are skipped with a printed notice
        instead of being written.
    """
    import posixpath  # object names are "/"-separated regardless of the host OS

    gcs_client = storage_client if client is None else client
    bucket = gcs_client.bucket(bucket_name)

    # Normalise the prefix to "<dir>/" so only objects inside that directory match.
    stripped = gcs_directory.rstrip("/")
    prefix = f"{stripped}/" if stripped else ""

    for blob in bucket.list_blobs(prefix=prefix):
        if blob.name.endswith("/"):  # directory placeholder object, nothing to fetch
            continue

        # Path of the object relative to the prefix, collapsed ("a//b" -> "a/b").
        relative_name = posixpath.normpath(blob.name[len(prefix):])
        local_file_path = os.path.join(local_directory, *relative_name.split("/"))

        # Never write outside the requested destination directory.
        if not _is_within_directory(local_directory, local_file_path):
            print(f"Skipped {blob.name}: resolves outside {local_directory}")
            continue

        parent_dir = os.path.dirname(local_file_path)
        if parent_dir:  # empty when the file lands directly in the current directory
            os.makedirs(parent_dir, exist_ok=True)

        blob.download_to_filename(local_file_path)
        print(f"Downloaded {blob.name} to {local_file_path}")

# Embedding configuration used when opening the persisted store.
EMBEDDING_MODEL = "textembedding-gecko@003"
EMBEDDING_NUM_BATCH = 5

# Copy the persisted Chroma files from the bucket into the local directory
# before anything tries to open them.
download_directory_from_gcs(
    gcs_directory=GCS_PERSIST_PATH,
    local_directory=LOCAL_PERSIST_PATH,
    bucket_name=BUCKET_NAME,
)

# Embedding function handed to the vector store.
embeddings = VertexAIEmbeddings(
    model_name=EMBEDDING_MODEL,
    batch_size=EMBEDDING_NUM_BATCH,
)

# Open the vector store from the directory that was just populated.
db = Chroma(
    persist_directory=LOCAL_PERSIST_PATH,
    embedding_function=embeddings,
)

# Similarity retriever over the store, limited to the 3 closest documents.
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

Downloaded chroma/52207b07-059c-4577-8ff3-3388276bedaa/data_level0.bin to ./local_chromadb/52207b07-059c-4577-8ff3-3388276bedaa/data_level0.bin
Downloaded chroma/52207b07-059c-4577-8ff3-3388276bedaa/header.bin to ./local_chromadb/52207b07-059c-4577-8ff3-3388276bedaa/header.bin
Downloaded chroma/52207b07-059c-4577-8ff3-3388276bedaa/length.bin to ./local_chromadb/52207b07-059c-4577-8ff3-3388276bedaa/length.bin
Downloaded chroma/52207b07-059c-4577-8ff3-3388276bedaa/link_lists.bin to ./local_chromadb/52207b07-059c-4577-8ff3-3388276bedaa/link_lists.bin
Downloaded chroma/chroma.sqlite3 to ./local_chromadb/chroma.sqlite3


In [2]:
# Prompt text with two placeholders: {context} and {input}.
template = """
    You are a helpful AI assistant. You're tasked to answer the question given below, but only based on the context provided.
    context:

    {context}


    question:

    {input}


    If you cannot find an answer ask the user to rephrase the question.
    answer:
"""
prompt = PromptTemplate.from_template(template)

# Generation settings for the Vertex AI model, kept together for easy tuning.
llm_settings = dict(
    model="gemini-1.5-pro",
    max_output_tokens=2048,
    temperature=0.2,
    top_p=0.8,
    top_k=40,
    verbose=True,
)
llm = VertexAI(**llm_settings)

# Wire the pieces together: the documents chain combines the LLM with the
# prompt, and the retrieval chain puts the retriever in front of it.
combine_docs_chain = create_stuff_documents_chain(llm, prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

# Example query, passed under the "input" key.
question = "Tell me about Figuring the EIC."
response = retrieval_chain.invoke({"input": question})

In [3]:
# Show only the "answer" entry of the response returned by retrieval_chain.invoke.
response["answer"]

'To figure out the amount of your Earned Income Credit, you have two options:\n\n1. **Let the IRS calculate it for you:** This is the simpler option. Refer to "IRS Will Figure the EIC for You" in Publication 596 for guidance.\n\n2. **Calculate it yourself:** If you prefer to calculate the EIC on your own,  complete the front of Schedule R (Form 1040) and then proceed to Part III of the same form.  "How To Figure the EIC Yourself" in Publication 596 will provide detailed instructions. \n'