# 🧠 PDF Q&A Chatbot with RAG
This notebook lets you upload a PDF, convert it to text chunks, embed them, store in Chroma DB, and use LangChain + OpenAI to chat with it using Retrieval-Augmented Generation (RAG).

In [None]:
# Install necessary packages
%pip install -q langchain openai chromadb pypdf gradio tiktoken sentence-transformers langchain-groq



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.12 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [6]:
%pip install python-dotenv langchain-community



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3.12 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
#Set GROQ API key
from dotenv import load_dotenv
import os

load_dotenv()  # Loads the .env file from the root directory

api_key = os.getenv("GROQ_API_KEY")

if not api_key:
    raise ValueError("GROQ_API_KEY not found in .env file")


In [None]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

pdf_path = "./1-s2.0-S1773224725003533-main (1).pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()
print(f"✅ Loaded {len(docs)} pages from the PDF")

# Split into chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
print("🔪 Splitting the document into chunks...")
chunks = text_splitter.split_documents(docs)
print(f"✅ Loaded and split into {len(chunks)} chunks")


✅ Loaded 10 pages from the PDF
🔪 Splitting the document into chunks...
✅ Loaded and split into 191 chunks


In [None]:
# Create Chroma vector store using GROQ embeddings
from langchain.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
# Instantiate the chosen embedding model
# Using 'all-MiniLM-L6-v2' as a commonly used and efficient model
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
print("Embedding model initialized:", embedding_model)
# Replace the GROQEmbeddings() instance with the new embedding model
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory="./chroma_db"
)
retriever = vectorstore.as_retriever()


  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


Embedding model initialized: client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
) model_name='all-MiniLM-L6-v2' cache_folder=None model_kwargs={} encode_kwargs={} multi_process=False show_progress=False


In [None]:
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA

# Replace ChatOpenAI with ChatGroq
llm = ChatGroq(groq_api_key=api_key, model_name="meta-llama/llama-4-scout-17b-16e-instruct") # Using a Groq model
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)



In [15]:
response = qa_chain.run("Who is the first author of this scientific paper?")
print(response)

The first author of this scientific paper is Ana R.M. Ribeiro.
