In [62]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_community.llms import HuggingFaceHub


In [63]:
DATA_FOLDER = "data"

In [64]:
# Load PDF documents
def load_and_split_pdf(pdf_path):
    """Loads a PDF and splits it into chunks."""
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    texts = text_splitter.split_documents(documents)
    return texts

In [65]:
# Embeddings and Vector Store (ChromaDB)
def create_vectorstore(
    texts,
    embeddings_model_name="sentence-transformers/all-mpnet-base-v2",
    persist_directory="chroma_db",
):
    """Creates a Chroma vector store from text chunks."""
    embeddings = HuggingFaceEmbeddings(model_name=embeddings_model_name)
    vectorstore = Chroma.from_documents(
        documents=texts, embedding=embeddings, persist_directory=persist_directory
    )
    vectorstore.persist()  # Persist the database to disk.
    return vectorstore

In [None]:
# RAG Pipeline
def create_rag_pipeline(
    vectorstore, model_id="HuggingFaceTB/SmolLM2-1.7B-Instruct"
):  # Or other HF LLMs
    """Creates a RetrievalQA pipeline for RAG."""
    llm = HuggingFacePipeline.from_model_id(
        model_id=model_id,
        task="text-generation",
        pipeline_kwargs={"temperature": 0.1, "max_length": 512},
    )  # Adjust temperature and max_length as needed.
    qa = RetrievalQA.from_chain_type(
        llm=llm, chain_type="stuff", retriever=vectorstore.as_retriever()
    )
    return qa

In [67]:
# Main function
def rag_pdf_qa(
    pdf_path,
    question,
    embeddings_model="sentence-transformers/all-mpnet-base-v2",
    llm_model="HuggingFaceTB/SmolLM2-1.7B-Instruct",
):
    """Performs RAG on a PDF and answers a question."""
    texts = load_and_split_pdf(pdf_path)
    vectorstore = create_vectorstore(texts, embeddings_model)
    qa = create_rag_pipeline(vectorstore, llm_model)
    result = qa.invoke(question)
    return result["result"]

In [68]:
pdf_path = "data/aws-overview.pdf"
question = "What are the main topics discussed in this document?"

In [69]:
try:
    answer = rag_pdf_qa(pdf_path, question)
    print(f"Question: {question}")
    print(f"Answer: {answer}")

except FileNotFoundError:
    print(f"Error: PDF file not found at {pdf_path}")
except Exception as e:
    print(f"An error occurred: {e}")

An error occurred: Got device==0, device is required to be within [-1, 0)
