# RAG Pipeline with OpenAI and FAISS

This section demonstrates a simple Retrieval-Augmented Generation (RAG) pipeline that:
  - Loads text documents (`.txt` files) from a folder (e.g. `data`).
  - Creates document embeddings using OpenAI.
  - Builds a FAISS vector store for similarity search.
  - Sets up a RetrievalQA chain to answer a sample query.

**File Name:** `rag_pipeline.py`

In [None]:
import os
import glob
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# Fail fast when no OpenAI API key is configured: every later step that
# talks to OpenAI is given this value explicitly.
openai_api_key = os.environ.get('OPENAI_API_KEY')
if not openai_api_key:  # covers both "unset" and "set to an empty string"
    raise ValueError('Please set the OPENAI_API_KEY environment variable.')

def load_documents(directory):
    """
    Load every ``.txt`` file found directly inside ``directory``.

    Files are visited in sorted path order so the returned list (and any
    index built from it) is the same on every run; ``glob.glob`` on its own
    returns matches in arbitrary order.

    Args:
        directory: Path of the folder to scan. Sub-folders are not searched.

    Returns:
        A list holding the documents produced by ``TextLoader.load()`` for
        each matching file. The list is empty when the folder does not exist
        or contains no ``.txt`` files.
    """
    pattern = os.path.join(directory, '*.txt')
    docs = []
    # Sort the matches: glob makes no ordering guarantee.
    for file_path in sorted(glob.glob(pattern)):
        docs.extend(TextLoader(file_path).load())
    return docs

# 1. Load documents from the 'data' folder
docs = load_documents('data')
print(f"Loaded {len(docs)} documents.")

# Stop early when nothing was loaded: a vector store cannot be built from an
# empty document list, and failing here gives a clear message instead of an
# opaque error raised from inside the vector store construction.
if not docs:
    raise ValueError("No .txt documents found in the 'data' folder.")

# 2. Initialize OpenAI embeddings (texts are sent to the OpenAI API when they
#    are embedded, i.e. while building the index and when querying)
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# 3. Build a FAISS vector store from the documents
vector_store = FAISS.from_documents(docs, embeddings)
print("Vector store created with FAISS.")

# 4. Create a RetrievalQA chain using OpenAI LLM
#    - retriever: similarity search returning the 3 closest documents
#    - chain_type "stuff": the retrieved documents are concatenated into a
#      single prompt
#    - temperature=0 is passed to the LLM
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(openai_api_key=openai_api_key, temperature=0),
    chain_type="stuff",
    retriever=retriever,
)

# 5. Run a sample query
query = "What is the main idea discussed in these documents?"
answer = qa_chain.run(query)
print("Query:", query)
print("Answer:", answer)

# Document QA with Local Embeddings

This section demonstrates a document question-answering system that uses local embeddings from a Sentence Transformer model. It:
  - Loads documents recursively from a folder (e.g. `docs`).
  - Creates embeddings using the `all-MiniLM-L6-v2` model.
  - Builds a FAISS vector store for similarity search.
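  - Uses OpenAI’s LLM to answer a question based on the retrieved documents (only the embeddings are local; the `OPENAI_API_KEY` environment variable is still required for the LLM).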

**File Name:** `local_document_qa.py`

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import DirectoryLoader
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# Load documents recursively from the 'docs' directory (all .txt files)
loader = DirectoryLoader('docs', glob='**/*.txt')
docs = loader.load()
print(f"Loaded {len(docs)} documents from the 'docs' directory.")

# Stop early when nothing was loaded: a vector store cannot be built from an
# empty document list, and failing here gives a clear message instead of an
# opaque error raised from inside the vector store construction.
if not docs:
    raise ValueError("No .txt documents found in the 'docs' directory.")

# Initialize local embeddings using Sentence Transformers
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

# Build a FAISS vector store from the documents
vector_store = FAISS.from_documents(docs, embeddings)
print("FAISS vector store created using local embeddings.")

# Create a RetrievalQA chain with OpenAI LLM
#  - retriever: similarity search returning the 5 closest documents
#  - chain_type "stuff": the retrieved documents are placed into one prompt
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)
qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0),  # Uses the OPENAI_API_KEY from your environment
    chain_type="stuff",
    retriever=retriever,
)

# Run a sample question
question = "Can you summarize the key points in the documents?"
answer = qa_chain.run(question)

print("Question:", question)
print("Answer:", answer)

# Conversational Chatbot with Retrieval-Augmented Generation

This section builds an interactive chatbot that uses a conversational retrieval chain. The chatbot:
  - Loads documents from a folder (e.g. `chat_data`).
  - Creates embeddings using OpenAI’s model.
  - Builds a FAISS vector store for retrieval.
  - Uses conversation memory (`ConversationBufferMemory`) to maintain context across interactions.

**File Name:** `conversational_chatbot.py`

In [None]:
import os
import glob
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.chains import ConversationalRetrievalChain
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory

# Fail fast when no OpenAI API key is configured: the embeddings and the LLM
# below are both given this value explicitly.
openai_api_key = os.environ.get('OPENAI_API_KEY')
if not openai_api_key:  # covers both "unset" and "set to an empty string"
    raise ValueError('Please set the OPENAI_API_KEY environment variable.')

def load_documents(directory):
    """
    Load every ``.txt`` file found directly inside ``directory``.

    Matching paths are processed in sorted order so the resulting list is
    reproducible between runs; ``glob.glob`` itself returns matches in
    arbitrary order.

    Args:
        directory: Path of the folder to scan. Sub-folders are not searched.

    Returns:
        A list holding the documents produced by ``TextLoader.load()`` for
        each matching file. The list is empty when the folder does not exist
        or contains no ``.txt`` files.
    """
    docs = []
    txt_paths = glob.glob(os.path.join(directory, '*.txt'))
    # Sort the matches: glob makes no ordering guarantee.
    for file_path in sorted(txt_paths):
        loader = TextLoader(file_path)
        docs.extend(loader.load())
    return docs

# Load documents from the 'chat_data' folder
docs = load_documents('chat_data')
print(f"Loaded {len(docs)} documents for chatbot retrieval.")

# Stop early when nothing was loaded: a vector store cannot be built from an
# empty document list, and failing here gives a clear message instead of an
# opaque error raised from inside the vector store construction.
if not docs:
    raise ValueError("No .txt documents found in the 'chat_data' folder.")

# Create embeddings using OpenAI
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Build a FAISS vector store
vector_store = FAISS.from_documents(docs, embeddings)
print("FAISS vector store created for retrieval.")

# Initialize conversation memory to store chat history.
# The history is kept under the 'chat_history' key.
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Create a conversational retrieval chain that integrates:
#  - OpenAI LLM (with some creativity via temperature)
#  - The FAISS vector store retriever (3 most similar documents per question)
#  - Conversational memory
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
conv_chain = ConversationalRetrievalChain.from_llm(
    llm=OpenAI(openai_api_key=openai_api_key, temperature=0.7),
    retriever=retriever,
    memory=memory,
    verbose=True,
)

# Simulate a conversation (in a real-world scenario, you might use input())
print("Chatbot is ready! Simulated conversation:")

# The second question only makes sense with the first one as context, so the
# turns are run in order through the same chain (and therefore the same memory).
simulated_turns = (
    "Tell me about the documents.",
    "Can you provide more details?",
)
for user_input in simulated_turns:
    response = conv_chain.run(user_input)
    print("User:", user_input)
    print("Bot:", response)