In [None]:
import os
from dotenv import load_dotenv 
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate
import gradio as gr

In [None]:
# Load variables from a local .env file (if any) into the process environment,
# then read the OpenAI key so later cells can hand it to the workflow.
load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
def set_openai_api_key(api_key: str):
    """
    Export the OpenAI API key through the OPENAI_API_KEY environment variable.

    The key passed by the caller takes precedence. When it is None, empty or
    whitespace-only, the value already present in the environment is used
    instead.

    Parameters:
    - api_key: the OpenAI API key to use, or None/"" to rely on the environment.

    Raises:
    - ValueError: if no key is supplied and OPENAI_API_KEY is not set.
    """
    # Honour the argument first; previously it was overwritten by os.getenv,
    # so an explicitly supplied key was silently ignored.
    candidate = api_key if api_key and api_key.strip() else os.getenv("OPENAI_API_KEY")
    resolved = candidate.strip() if candidate else ""

    if not resolved:
        # Neither the caller nor the environment provided a usable key
        raise ValueError("OPENAI_API_KEY is not set in the environment variables. Please check your .env file.")

    os.environ["OPENAI_API_KEY"] = resolved

In [None]:
def load_and_split_pdf(pdf_path: str, chunk_size: int = 1000, chunk_overlap: int = 100):
    """
    Load a PDF file and split it into smaller chunks for processing.
    The chunks will be used for embedding and vector search.

    Parameters:
    - pdf_path: path to the PDF document.
    - chunk_size: size of each chunk (passed to RecursiveCharacterTextSplitter).
    - chunk_overlap: how much overlap there should be between chunks to preserve context.

    Returns:
    - A non-empty list of chunks split from the document.

    Raises:
    - FileNotFoundError: if the loader reports that the file does not exist.
    - RuntimeError: for any other failure, including a PDF that yields no
      documents or no chunks. The underlying exception is attached as
      __cause__.
    """
    try:
        # Load the PDF using PyPDFLoader
        loader = PyPDFLoader(pdf_path)
        documents = loader.load()

        if not documents:
            # The loader returned nothing at all
            raise ValueError("No content found in the PDF file.")

        # Use RecursiveCharacterTextSplitter to split the documents into smaller chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        )
        chunks = text_splitter.split_documents(documents)

        if not chunks:
            # Documents were loaded but produced no chunks (e.g. pages without
            # extractable text). Fail here with a clear message instead of
            # letting the vector store step fail on an empty list.
            raise ValueError("No extractable text found in the PDF file.")

        return chunks

    except FileNotFoundError as e:
        # Handle the case where the PDF file is not found
        raise FileNotFoundError(f"PDF file not found at: {pdf_path}") from e
    except Exception as e:
        # Wrap every other error (including the ValueErrors above) in a
        # RuntimeError, keeping the original as the explicit cause
        raise RuntimeError(f"An error occurred while loading or splitting the PDF: {e}") from e


In [None]:
def create_vectorstore(chunks, persist_directory=None):
    """
    Create a FAISS vector store from the document chunks.

    Parameters:
    - chunks: List of document chunks to index. Must not be empty.
    - persist_directory: Path where the FAISS index should be saved. If None
      (or empty), the index is only kept in memory.

    Returns:
    - A FAISS vector store for performing similarity-based searches.

    Raises:
    - RuntimeError: if `chunks` is empty or if embedding, index creation or
      saving fails. The underlying exception is attached as __cause__.
    """
    try:
        if not chunks:
            # Fail before any embedding work; an empty list would otherwise
            # surface as an obscure error from index construction.
            raise ValueError("No document chunks were provided to index.")

        embedding = OpenAIEmbeddings()  # Create OpenAI embeddings to convert text into vector form

        # Create the FAISS vector store from the document chunks and embeddings
        vectorstore = FAISS.from_documents(
            documents=chunks,
            embedding=embedding
        )

        if persist_directory:
            # Save the FAISS index to a directory if persist_directory is provided
            vectorstore.save_local(persist_directory)

        return vectorstore
    except Exception as e:
        # Wrap every failure in a RuntimeError, keeping the original as the cause
        raise RuntimeError(f"An error occurred while creating the vector store: {e}") from e

In [None]:
def build_qa_chain(vectorstore):
    """
    Assemble a RetrievalQA chain on top of the given vector store.

    The chain uses the store's retriever to fetch context and a ChatOpenAI
    model ("gpt-4o-mini", temperature 0) with a fixed HR-policy prompt to
    produce the answer. Source documents are not returned.

    Parameters:
    - vectorstore: The FAISS vector store used to retrieve relevant context.

    Returns:
    - A RetrievalQA chain for answering queries.

    Raises:
    - RuntimeError: if any step of building the chain fails.
    """
    # Prompt text; {context} and {question} are the template's input variables
    template_text = (
        "\n"
        "You are a helpful assistant answering questions about Nestlé's HR Policy.\n"
        "\n"
        "Use only the information from the provided document context.\n"
        "\n"
        "Context:\n"
        "{context}\n"
        "\n"
        "Question: {question}\n"
        "Helpful Answer:"
    )

    try:
        qa_prompt = PromptTemplate(
            input_variables=["context", "question"],
            template=template_text,
        )

        # Model first, retriever second, matching the original evaluation order
        chat_model = ChatOpenAI(model_name="gpt-4o-mini", temperature=0)
        context_retriever = vectorstore.as_retriever()

        # "stuff" chain type with the custom prompt
        return RetrievalQA.from_chain_type(
            llm=chat_model,
            retriever=context_retriever,
            chain_type="stuff",
            chain_type_kwargs={"prompt": qa_prompt},
            return_source_documents=False,
        )
    except Exception as exc:
        # Report any construction failure as a RuntimeError
        raise RuntimeError(f"An error occurred while building the QA chain: {exc}")

In [None]:
def answer_query(query: str, qa_chain):
    """
    Run a user question through the QA chain and return text for display.

    Parameters:
    - query: The user's question to be answered.
    - qa_chain: Object exposing `run(query)`; it produces the answer.

    Returns:
    - The chain's answer, a request for a more specific question when the
      query is empty or shorter than 5 characters after trimming, or an
      error message string if the chain raises.
    """
    # Guard: reject missing or very short questions without calling the chain
    trimmed = (query or "").strip()
    if len(trimmed) < 5:
        return "Please enter a more specific question."

    try:
        # The original (untrimmed) query is what gets sent to the chain
        return qa_chain.run(query)
    except ValueError as err:
        # Input-related problems get their own message
        return f"Error with the provided input: {err}"
    except Exception as err:
        # Everything else is reported as text rather than raised
        return f"An error occurred while processing your query: {err}"

In [None]:
def launch_chatbot_interface(qa_chain, share: bool = True):
    """
    Launch a Gradio interface for the chatbot.
    Users can interact with the chatbot through this interface.

    Parameters:
    - qa_chain: The QA chain that will handle the user's queries.
    - share: Forwarded to Gradio's `launch(share=...)`. The default True keeps
      the previous behaviour of launching with a public link; pass False to
      avoid creating one (e.g. when the document should stay internal).

    Returns:
    - None.
    """
    def respond(query):
        # Bind the chain so Gradio only needs to supply the question text
        return answer_query(query, qa_chain)

    interface = gr.Interface(
        fn=respond,  # Function to handle user input and provide answers
        inputs=gr.Textbox(label="Ask a question about Nestlé's HR Policy"),  # User input for questions
        outputs=gr.Textbox(label="Answer"),  # The answer displayed to the user
        title="Nestlé HR Policy Chatbot",  # Title of the interface
        description="Ask any question related to the HR policy. The chatbot will find and return the best answer from the document."
    )
    interface.launch(share=share)

In [None]:
def run_chatbot_workflow(api_key: str, pdf_path: str):
    """
    Run the whole chatbot pipeline from API key setup to the Gradio UI.

    Parameters:
    - api_key: value handed to set_openai_api_key.
    - pdf_path: path of the PDF to load, split and index.

    Returns:
    - None; the last step launches the interface.
    """
    # Configure credentials before any step that constructs OpenAI clients
    set_openai_api_key(api_key)

    # PDF -> chunks -> vector store -> QA chain (evaluated innermost first)
    chain = build_qa_chain(
        create_vectorstore(
            load_and_split_pdf(pdf_path)
        )
    )

    # Hand the finished chain to the UI
    launch_chatbot_interface(chain)


In [None]:
# Entry point: choose the document, read the key from the environment and
# start the chatbot. The PDF path is relative to the working directory.
pdf_path = "Nestle HR Policy.pdf"
api_key = os.environ.get("OPENAI_API_KEY")
run_chatbot_workflow(api_key, pdf_path)