In [None]:
# !pip install google-cloud-aiplatform==1.46.0 \
# 'bigframes<1.0.0' \
# langchain==0.1.14 \
# langchain_google_vertexai==0.1.2 \
# chromadb==0.4.24 \
# unstructured==0.12.6 \
# pillow-heif==0.15.0 \
# unstructured-inference==0.7.25 \
# pypdf==4.1.0 \
# pdf2image==1.17.0 \
# unstructured_pytesseract==0.3.12 \
# pikepdf==8.14.0 \
# --upgrade \
# --user

In [4]:
import os
import langchain
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.document_loaders import TextLoader, UnstructuredPDFLoader
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain_google_vertexai import VertexAI, VertexAIEmbeddings
import vertexai

# Google Cloud project and region used for every Vertex AI call in this notebook.
PROJECT_ID = "no-latency-labs"
LOCATION = "us-central1"

# Model identifiers: MODEL generates answers, EMBEDDING_MODEL embeds the chunks.
MODEL = "gemini-1.5-pro"
EMBEDDING_MODEL = "textembedding-gecko@003"

# Local folders: INDEX_PATH holds the source documents, PERSIST_PATH is the
# directory handed to Chroma as its persist_directory.
INDEX_PATH = "./dataset/"
PERSIST_PATH = "./persistentdb/"

# Point the Vertex AI SDK at the project and region configured above.
vertexai.init(location=LOCATION, project=PROJECT_ID)

In [5]:
def get_split_documents(
    index_path: str, chunk_size: int = 8192, chunk_overlap: int = 128
) -> list:
    """
    Load every file in a folder and split the contents into chunks.

    Files whose name ends in ".pdf" (case-insensitive) are read with
    UnstructuredPDFLoader; every other regular file is read with TextLoader.
    Entries that are not regular files (for example a ".ipynb_checkpoints"
    sub-directory) are skipped instead of being handed to a loader.

    Args:
    index_path: Path of the dataset folder containing the documents. A
        trailing path separator is optional.
    chunk_size: chunk_size value passed to CharacterTextSplitter.
    chunk_overlap: chunk_overlap value passed to CharacterTextSplitter.

    Returns:
    List of chunked, or split documents: the objects returned by
    CharacterTextSplitter.split_documents (not plain strings).
    """

    # The splitter does not depend on the file, so build it once.
    text_splitter = CharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    split_docs = []

    # Sorted so the chunk order does not depend on directory listing order.
    for file_name in sorted(os.listdir(index_path)):
        # os.path.join works whether or not index_path ends with a separator.
        file_path = os.path.join(index_path, file_name)
        if not os.path.isfile(file_path):
            continue

        print(f"file_name : {file_name}")
        if file_name.lower().endswith(".pdf"):
            loader = UnstructuredPDFLoader(file_path)
        else:
            loader = TextLoader(file_path)

        split_docs.extend(text_splitter.split_documents(loader.load()))

    return split_docs

# Value forwarded to VertexAIEmbeddings as batch_size.
EMBEDDING_NUM_BATCH = 5

# Embedding client used when the Chroma index is built.
embeddings = VertexAIEmbeddings(batch_size=EMBEDDING_NUM_BATCH, model_name=EMBEDDING_MODEL)

In [6]:
def _stable_chunk_ids(documents):
    """Return one deterministic id per chunk: "<source>#<position within source>".

    The source is read from each chunk's metadata ("source" key, empty string
    when absent); the position counts chunks that share the same source, in the
    order they appear in `documents`.
    """
    next_position = {}
    chunk_ids = []
    for document in documents:
        source = str(document.metadata.get("source", ""))
        position = next_position.get(source, 0)
        next_position[source] = position + 1
        chunk_ids.append(f"{source}#{position}")
    return chunk_ids


split_docs = get_split_documents(INDEX_PATH)

# Explicit, repeatable ids keep this cell safe to re-run: without them every
# run adds the same chunks again under fresh random ids, and the persisted
# index fills with duplicates. NOTE(review): relies on LangChain's Chroma
# wrapper upserting by id - confirm against the pinned langchain version.
db = Chroma.from_documents(
    documents=split_docs,
    embedding=embeddings,
    ids=_stable_chunk_ids(split_docs),
    persist_directory=PERSIST_PATH,
)
# No db.persist() call here: with the chromadb 0.4.x line pinned above, data is
# written to persist_directory automatically.

file_name : tax-guidance.pdf


In [7]:
# Text-generation client for MODEL; the sampling settings are passed straight
# through to VertexAI.
llm = VertexAI(
    model=MODEL, max_output_tokens=2048,
    temperature=0.2, top_p=0.8, top_k=40,
    verbose=True,
)

# Similarity-search retriever over the Chroma index, configured with k=3.
retriever = db.as_retriever(search_kwargs={"k": 3}, search_type="similarity")

# Prompt text for the answering step. It has two placeholders: {context} and
# {input}; {input} matches the "input" key passed to the chain when it is
# invoked. The model is told to answer only from the supplied context and to
# ask for a rephrased question otherwise.
template = """
    You are a helpful AI assistant. You're tasked to answer the question given below, but only based on the context provided.
    context:

    {context}


    question:

    {input}


    If you cannot find an answer ask the user to rephrase the question.
    answer:

"""
# Build the PromptTemplate from the text above.
prompt = PromptTemplate.from_template(template)

In [8]:
# Document chain: combines the LLM with the prompt defined earlier.
combine_docs_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Retrieval chain: pairs the retriever with the document chain above.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=combine_docs_chain,
)

In [9]:
# Run the retrieval chain on one sample question; the result is kept in
# `response` for the next cell.
response = retrieval_chain.invoke(
    dict(input="Tell me about Figuring the EIC.")
)

In [10]:
# Show only the value stored under the "answer" key of the chain's response.
print(response["answer"])

To figure out the amount of your Earned Income Credit, you have two options:

1. **Let the IRS calculate it for you:** This is the simpler option. Refer to "IRS Will Figure the EIC for You" in Publication 596 for guidance.

2. **Calculate it yourself:** If you prefer to calculate the EIC on your own,  complete the front of Schedule R (Form 1040) and then proceed to Part III of the same schedule.  "How To Figure the EIC Yourself" in Publication 596 will provide detailed instructions.



<h2>Upload ChromaDB to GCS</h2>

In [13]:
from google.cloud import storage
import os


# Destination bucket, object-name prefix inside it, and the local folder whose
# files are uploaded (same path the Chroma index was persisted to).
BUCKET_NAME = "nl-llm"
GCS_PERSIST_PATH = "chroma/"
LOCAL_PERSIST_PATH = "./persistentdb/"

# Initialize GCS client (module-level; read by upload_directory_to_gcs)
storage_client = storage.Client()

def upload_directory_to_gcs(
    local_directory: str, bucket_name: str, gcs_directory: str, client=None
) -> None:
    """Upload all files in a local directory to a GCS directory.

    The directory tree under `local_directory` is walked recursively and each
    file is uploaded under `gcs_directory`, keeping its relative path.

    Args:
        local_directory: Existing local folder to upload.
        bucket_name: Name of the destination bucket.
        gcs_directory: Object-name prefix inside the bucket. A trailing "/" is
            optional; an empty string uploads to the bucket root.
        client: Optional storage client exposing `bucket(name)`. When omitted,
            the module-level `storage_client` is used.

    Raises:
        FileNotFoundError: If `local_directory` is not an existing directory
            (os.walk would otherwise silently upload nothing).
    """
    import posixpath

    if not os.path.isdir(local_directory):
        raise FileNotFoundError(
            f"Local directory to upload does not exist: {local_directory}"
        )

    gcs_client = client if client is not None else storage_client
    bucket = gcs_client.bucket(bucket_name)

    for root, _, files in os.walk(local_directory):
        for file_name in files:
            local_file_path = os.path.join(root, file_name)
            relative_path = os.path.relpath(local_file_path, local_directory)
            # Object names always use "/" regardless of the local OS separator.
            blob_name = posixpath.join(
                gcs_directory, relative_path.replace(os.sep, "/")
            )
            bucket.blob(blob_name).upload_from_filename(local_file_path)
            # Log the real object name so the message is right even when
            # gcs_directory has no trailing "/".
            print(f"Uploaded {local_file_path} to gs://{bucket_name}/{blob_name}")

# Push the locally persisted Chroma files to the bucket under the GCS prefix.
upload_directory_to_gcs(
    local_directory=LOCAL_PERSIST_PATH,
    bucket_name=BUCKET_NAME,
    gcs_directory=GCS_PERSIST_PATH,
)

Uploaded ./persistentdb/chroma.sqlite3 to gs://nl-llm/chroma/chroma.sqlite3
Uploaded ./persistentdb/52207b07-059c-4577-8ff3-3388276bedaa/header.bin to gs://nl-llm/chroma/52207b07-059c-4577-8ff3-3388276bedaa/header.bin
Uploaded ./persistentdb/52207b07-059c-4577-8ff3-3388276bedaa/data_level0.bin to gs://nl-llm/chroma/52207b07-059c-4577-8ff3-3388276bedaa/data_level0.bin
Uploaded ./persistentdb/52207b07-059c-4577-8ff3-3388276bedaa/length.bin to gs://nl-llm/chroma/52207b07-059c-4577-8ff3-3388276bedaa/length.bin
Uploaded ./persistentdb/52207b07-059c-4577-8ff3-3388276bedaa/link_lists.bin to gs://nl-llm/chroma/52207b07-059c-4577-8ff3-3388276bedaa/link_lists.bin
