In [1]:
from typing import List
import shutil
import os
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain_community.document_loaders import PyPDFLoader 
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnableMap, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableConfig

In [2]:
load_dotenv(dotenv_path='.env', verbose=True)
NVIDIA_E5_EMBEDDING_API_KEY = os.getenv('NVIDIA_E5_EMBEDDING_API_KEY')

In [3]:
def load_documents(DATA_PATH: str) -> List[Document]:
    documents = []
    
    if not os.path.exists(DATA_PATH):
        raise FileNotFoundError(f"Directory not found: {DATA_PATH}")
    
    pdf_files = [f for f in os.listdir(DATA_PATH) if f.endswith('.pdf')]
    print(f"Found {len(pdf_files)} PDF files in {DATA_PATH}")
    
    for filename in pdf_files:
        filepath = os.path.join(DATA_PATH, filename)
        try:
            loader = PyPDFLoader(filepath)
            docs = loader.load()
            
            for doc in docs:
                doc.metadata['source'] = filename
            
            documents.extend(docs)
            print(f"Successfully loaded {len(docs)} pages from {filename}")
            
        except Exception as e:
            print(f"Error loading {filename}: {str(e)}")
    
    return documents

In [4]:
def split_text(documents: List[Document]) -> List[Document]:
    if not documents:
        raise ValueError("No documents to split")
        
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    
    chunks = text_splitter.split_documents(documents)
    print(f"{len(documents)} documents are split into {len(chunks)} chunks")
    
    chunks = [chunk for chunk in chunks if chunk.page_content.strip()]
    print(f"After removing empty chunks: {len(chunks)} chunks remain")
    
    return chunks

In [None]:
def save_data_to_db(data_chunks: List[Document], CHROMA_PATH: str):
    if not data_chunks:
        raise ValueError("No chunks to save to database")
        
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)
    
    try:
        # embeddings = NVIDIAEmbeddings(
        #     model="nvidia/nv-embedqa-e5-v5",
        #     api_key=NVIDIA_E5_EMBEDDING_API_KEY,
        #     truncate="NONE"
        # )   
        # embeddings = NVIDIAEmbeddings(
        #     model="nvidia/nv-embedqa-e5-v5", 
        #     api_key="", 
        #     truncate="NONE", 
        # ) 

        embeddings = NVIDIAEmbeddings(
                model="nvidia/nv-embedqa-mistral-7b-v2", 
                api_key="", 
                truncate="NONE", 
                )    
        
        vector_store = Chroma(
            collection_name="NLP_Project_embedding",
            embedding_function=embeddings,
            persist_directory=CHROMA_PATH
        )

        vector_store.add_documents(documents=data_chunks)

        print(f"Saved {len(data_chunks)} chunks to chroma db")
        return vector_store
        
    except Exception as e:
        print(f"Error creating embeddings: {str(e)}")
        raise

In [6]:
def getEmbeddings(CHROMA_PATH: str, DATA_PATH: str):
    try:
        documents = load_documents(DATA_PATH)
        if not documents:
            raise ValueError("No documents were successfully loaded")
            
        data_chunks = split_text(documents)
        print(f"Length of data chunks is {len(data_chunks)}")
        

        if not data_chunks:
            raise ValueError("No chunks were created from the documents")
                    
        db = save_data_to_db(data_chunks, CHROMA_PATH)
        return db
        
    except Exception as e:
        print(f"Error in getEmbeddings: {str(e)}")
        raise

In [7]:
if __name__ == "__main__":
    try:
        if not NVIDIA_E5_EMBEDDING_API_KEY:
            raise ValueError("Nvidia embedding api key not found in environment variables")
        else:
            print(f"NVIDIA EMBEDDING API KEY IS {NVIDIA_E5_EMBEDDING_API_KEY}")
            
        CHROMA_PATH = "chroma"
        DATA_PATH = "pdfs/"
        
        os.makedirs(CHROMA_PATH, exist_ok=True)
        
        db = getEmbeddings(CHROMA_PATH, DATA_PATH)
        
        if db is not None:
            test_results = db.similarity_search("test query", k=1)
            print("\nDatabase test successful")
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")

NVIDIA EMBEDDING API KEY IS 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
Found 2 PDF files in pdfs/
Successfully loaded 24 pages from AWS Customer Agreement.pdf
Successfully loaded 90 pages from AWS Service Terms.pdf
114 documents are split into 387 chunks
After removing empty chunks: 387 chunks remain
Length of data chunks is 387
Saved 387 chunks to chroma db

Database test successful


In [11]:
db.get(
    include = ["embeddings"]
)

{'ids': ['e32b1d09-8bf6-4bd8-8458-c9a09590475d',
  'a6747cc4-9dd7-4d31-9730-9adfb010d5da',
  'f5491f31-016c-4571-9158-05e841b3f09a',
  '81d829fc-eefa-4e85-b306-765ed53151bf',
  '94f4d005-e56d-4dd2-8ce2-38a311a3ced4',
  '41628022-9a05-4966-9ff7-7b3cbc1ac7e0',
  'ae1819b5-3555-47b3-b785-2255a6e09687',
  '9b69cfc9-a1c0-4b0d-b221-d7ad5150d86d',
  'ed3125dd-a85e-4a5e-9ea9-68d78e5f0259',
  '43cf9b12-720b-4375-b16a-540a70e4ce25',
  'e1e56966-1781-4864-b99a-67c625415b49',
  '936c9f3f-865d-449f-b9b4-22b60bf55961',
  '645ec57f-f1c8-454a-91f7-bfc22f98645a',
  '3a5f1f13-366d-46d4-8e86-eb7549124066',
  '59b0b200-6e1c-444f-b0b8-5f24d5aa33d2',
  'a91cb984-fd7a-4486-8eed-0161a5a5b5f3',
  '31b28c7f-10bb-4c49-91f1-4fc4d88abda9',
  '5ab72c42-f890-4591-93c9-a1c5aebc7ece',
  'ff55cf1f-8b4c-471a-b000-99fe70f4f1b5',
  '29218ae0-daee-4a5e-acf5-d22bf509c90b',
  'aa53d611-27d0-49e5-90df-92de73f849a6',
  '49152da3-dc81-4208-bc0d-8ef1cc2f9cf0',
  '691f0604-2c21-4ccd-abd7-43e407302259',
  '06c40f55-753d-41f1-ae46-

In [None]:
import chromadb
from chromadb.config import Settings



{'ids': ['4316af71-8a46-4c9c-9bc7-69ba08157555',
  'bd12e665-ed07-4235-aa95-818cde22808a',
  '3ca02e7a-de7a-4e96-8e98-5380af33b9c2',
  '94e18f72-faed-4701-9aff-00d25fbe97c1',
  '964d7c95-5475-453f-a637-99b1395d2fb4',
  '6790fc5a-e39a-42ef-af37-d338b9e63b6b',
  '798a2aca-736b-434e-b22c-3fb3a4fbb789',
  'd8374e17-7462-41b5-94b8-0b50e7deba16',
  'e1e4293a-2f06-4c62-93b1-1013ea0eed8f',
  '6908fabb-c72d-4840-9c61-c655982938e2',
  '70bf4cce-3c7f-47aa-9819-ad59a700d8b1',
  'b1c77f3b-ec4a-4851-aeab-583f68d34a92',
  '8174bdd9-282f-45e1-a9b2-d29550d6c49b',
  '87017dde-a459-4b48-a38f-320e01bc9cbc',
  '7b3a82c4-35ab-4674-910a-baebe4fe99d8',
  '070990a8-80e2-4076-a138-1bedc07e0324',
  '097a893c-5fa0-48d1-b95e-db3beda57e23',
  'd8607afb-a1d4-461a-85d9-91361af64e12',
  '3fcdfbe5-0765-4572-b638-95b5d2eabbad',
  'd05ff497-c9e2-4ea4-bd41-61c44650f372',
  '6fe92059-0aaa-4aec-bb98-bd0c69d06bb0',
  '72907d88-31ce-4fdb-a982-3c1b8d61b15e',
  'b51f523b-9a66-4bac-ab46-1585e42add7b',
  '72bb942d-26c9-4803-82c6-

In [12]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a virtual chat assistant for answering AWS customer and AWS service agreement questions. Answer the question based on the given context. If you don't know the answer, just say you don't know and don't make up an answer. The answer needs to be factual and based on the context given."),
    MessagesPlaceholder(variable_name="context"),
    ("user", "{question}")
])

In [None]:
llm = ChatOpenAI(
    model = "gpt-3.5-turbo",
    temperature=0.2,
    api_key="",
    max_retries=2,
    max_tokens=None
)

In [None]:
embeddings = NVIDIAEmbeddings(
                model="nvidia/nv-embedqa-mistral-7b-v2", 
                api_key="", 
                truncate="NONE", 
                )

vectordb = Chroma(
    persist_directory=CHROMA_PATH,
    embedding_function=embeddings
)

doc_count = vectordb._collection.count()
print(f"Vector store contains {doc_count} documents.")

retriever = vectordb.as_retriever()

Vector store contains 0 documents.


In [15]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = RunnableMap({
    "context": retriever,
    "question": RunnablePassthrough()
}) | prompt | llm | StrOutputParser()

In [16]:
query = "What is Amazon Fraud Detector terms mentioned in service terms?"

retrieved_docs = retriever.invoke(query)
print(f"Retrieved {len(retrieved_docs)} documents")

Retrieved 0 documents


In [17]:
for doc in retrieved_docs:
    print(doc.metadata['source'])
    print(doc.page_content[:200])


In [18]:
result = rag_chain.invoke(query)

print("Answer:", result)

Answer: Amazon Fraud Detector is a fully managed service that makes it easy to identify potentially fraudulent online activities such as online payment fraud and the creation of fake accounts. The terms related to Amazon Fraud Detector can be found in the AWS Service Terms. These terms outline the usage policies, pricing, data protection, and other important information related to using Amazon Fraud Detector within the AWS environment. For specific details, it is recommended to refer to the AWS Service Terms documentation for Amazon Fraud Detector.
