### Let's see RAG in Action

#### We will be using the LangChain library. It abstracts away a lot of complexity and helps build chains that make working with LLMs easy.

In [None]:
# Import the necessary libraries
import os
from dotenv import load_dotenv, find_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai

# Load environment variables from the .env file located by find_dotenv(), so
# credentials such as OPENAI_API_KEY do not have to be hard-coded in the notebook.
load_dotenv(find_dotenv())
# Explicit key assignment is left disabled. NOTE(review): presumably the
# OpenAI/LangChain clients read OPENAI_API_KEY from the environment themselves —
# confirm before deleting this line.
#openai.api_key = os.environ["OPENAI_API_KEY"]

### Create Vector Store (persisted)

In [None]:
# Directory on disk that backs the Chroma collection between notebook runs.
PERSIST_DIRECTORY = "../vector_db"

# Build the vector store handle: texts are embedded with OpenAI embeddings and
# the collection is stored under PERSIST_DIRECTORY.
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    persist_directory=PERSIST_DIRECTORY,
)

### Vectorize the data

In [None]:

# Load every matching file from a folder and add it to the vector store.
folder_path = "../test_data"

print("loading documents...")
loader = DirectoryLoader(folder_path, glob="*.txt")  # Adjust glob for other file types if needed
documents = loader.load()

# Optional chunking step, currently disabled: whole files are embedded as-is.
# Re-enable (and add `docs` instead of `documents`) if files are too large to
# embed or retrieve well as single documents.
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
#docs = text_splitter.split_documents(documents)

# NOTE(review): re-running this cell appears to add the same files again, since
# no stable ids are supplied — confirm whether duplicates matter for your use.
if documents:
    # Add documents to the vector store and persist it
    vectorstore.add_documents(documents)
    vectorstore.persist()
else:
    # An empty batch would otherwise be handed to the store and fail with an
    # unhelpful error; report the real cause (nothing was loaded) instead.
    print(f"❌ No documents matched '*.txt' in {folder_path}; nothing was added to the vector store.")

### Let's validate the vector store data

In [None]:
def test_documents_retrieval():
    """Smoke-test retrieval against the module-level ``vectorstore``.

    Runs one fixed similarity query asking for a single result, prints how
    many documents came back, and, when at least one did, prints the first
    150 characters of the top hit. Returns ``None`` in every case.
    """
    probe_query = "What information do you have?"
    top_one = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 1})
    hits = top_one.get_relevant_documents(probe_query)

    print(f"Retrieved {len(hits)} documents for test query")

    # Guard clause: nothing came back, so there is no sample to show.
    if not hits:
        print("❌ No documents retrieved for test query. Check embedding function and collection.")
        return

    print("✅ Document retrieval is working")
    print("\nSample document content:")
    preview = hits[0].page_content[:150]
    print(f"Document 1: {preview}...")

In [None]:
# Check that the vector store exists and actually contains documents.
# count() asks the collection for its size directly, rather than fetching every
# stored record with get() only to measure the length of the id list.
collection_count = vectorstore._collection.count()
print(f"Found {collection_count} documents in the collection")

if collection_count == 0:
    print("❌ No documents found in the collection. Please check the folder path and document format.")
else:
    # Only exercise retrieval when there is something to retrieve.
    test_documents_retrieval()

### Create Conversational Chain

In [None]:
# Wire the chat model and the vector-store retriever into one conversational chain.

# temperature=0 is passed to the chat model along with the model name.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Adjust k to change how many documents are retrieved per question.
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever)

### Let's chat with our personal bot

In [None]:
def chat_with_bot():
    """Run an interactive question/answer loop against the retrieval chain.

    Reads questions from standard input, sends each one together with the
    accumulated chat history to the module-level ``chain``, and prints the
    answer. Each (question, answer) pair is appended to the history so that
    later questions are asked with the earlier turns as context.

    The loop ends when the user types 'exit' (case-insensitive, surrounding
    whitespace ignored) or when input ends or is interrupted at the prompt.
    Blank input is skipped without calling the chain.

    Returns:
        None
    """
    print("Bot is ready. Type 'exit' to quit.")
    chat_history = []

    while True:
        try:
            user_input = input("User: ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt at the prompt means "quit";
            # finish the prompt line and leave without a traceback.
            print()
            break

        if not user_input:
            # Nothing to ask: do not spend a model call on an empty question.
            continue
        if user_input.lower() == "exit":
            break

        response = chain({"question": user_input, "chat_history": chat_history})
        answer = response["answer"]
        print(f"Bot: {answer}")

        chat_history.append((user_input, answer))

# Start the interactive chat session; this blocks until the user types 'exit'.
chat_with_bot()