In [1]:
!pip install -q pypdf faiss-cpu langchain langchain-google-genai PyPDF2 python-dotenv

In [2]:
!pip install langchain_community



In [3]:
from PyPDF2 import PdfReader
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
import textwrap
import os

# Never store the Gemini API key in the notebook itself: anything saved here is
# shared with everyone who can read the file. The key is taken from the
# GOOGLE_API_KEY environment variable and is requested interactively (without
# echoing it) only when that variable is missing or empty.
# NOTE(review): the key that was previously hard-coded here must be revoked.
if not os.environ.get("GOOGLE_API_KEY"):
    from getpass import getpass  # only needed for the interactive fallback

    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Gemini API key: ")

In [4]:
def load_pdfs(pdf_paths):
    """Extract per-page text from PDF files, tagging each page with its origin.

    Args:
        pdf_paths: Iterable of filesystem paths to PDF files.

    Returns:
        A list of ``Document`` objects, one for every page that yielded
        non-blank text. Each document's metadata holds ``source`` (the base
        name of the file) and ``page`` (1-based page number).

    Loading is best-effort and never raises: problems are reported with
    ``print``. A file that cannot be read is skipped, and a page whose text
    extraction fails is skipped without discarding the remaining pages of the
    same file.
    """
    documents = []
    for path in pdf_paths:
        try:
            pdf_reader = PdfReader(path)
            source = os.path.basename(path)
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                try:
                    # A missing extraction result is treated as an empty page.
                    text = page.extract_text() or ""
                except Exception as e:
                    # Keep going: one bad page must not cost the whole file.
                    print(f"Error processing {path} (page {page_num}): {str(e)}")
                    continue
                if text.strip():
                    metadata = {"source": source, "page": page_num}
                    documents.append(Document(page_content=text, metadata=metadata))
        except Exception as e:
            # File-level failure (unreadable or malformed PDF): report and move on.
            print(f"Error processing {path}: {str(e)}")
    return documents

def chunk_documents(documents, chunk_size=2000, chunk_overlap=300):
    """Split documents into overlapping chunks suitable for embedding.

    Args:
        documents: List of ``Document`` objects to split.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
            Defaults to 2000.
        chunk_overlap: Number of characters shared between consecutive chunks.
            Defaults to 300.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns for
        ``documents`` with the given settings.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_documents(documents)

In [5]:
def create_vector_store(chunks, embedding_model="models/embedding-001"):
    """Create a FAISS vector store from document chunks using Gemini embeddings.

    Args:
        chunks: Non-empty list of document chunks to embed and index.
        embedding_model: Name of the Google Generative AI embedding model.
            Defaults to ``"models/embedding-001"``.

    Returns:
        The vector store produced by ``FAISS.from_documents``.

    Raises:
        ValueError: If ``chunks`` is empty. This typically means no PDF could
            be loaded or none contained extractable text; failing here gives a
            clear message instead of an obscure error from the indexing call.
    """
    if not chunks:
        raise ValueError(
            "Cannot build a vector store from an empty list of chunks; "
            "check that the PDFs were loaded and contain extractable text."
        )
    embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model)
    return FAISS.from_documents(chunks, embeddings)

In [6]:
def setup_qa_chain(vector_store):
    """Create a retrieval QA chain backed by a Gemini chat model.

    Args:
        vector_store: Vector store exposing ``as_retriever``; the five most
            similar chunks are retrieved for every question.

    Returns:
        A ``RetrievalQA`` "stuff" chain that also returns the retrieved
        source documents alongside the answer.
    """
    # Custom prompt for research paper QA
    prompt_template = """
    You are an AI research assistant. Answer the question based only on the following context which comes from AI research papers.
    Provide detailed, technical answers and always cite your sources using the document metadata.

    Context:
    {context}

    Question: {question}

    Answer in markdown format with clear section references:
    """

    PROMPT = PromptTemplate(
        template=prompt_template,
        input_variables=["context", "question"]
    )

    # The prompt asks the model to cite document metadata, so every retrieved
    # chunk must be rendered together with its file name and page number; the
    # default rendering inserts the chunk text only. Requires each document to
    # carry "source" and "page" metadata.
    document_prompt = PromptTemplate(
        template="Source: {source} (page {page})\n{page_content}",
        input_variables=["page_content", "source", "page"]
    )

    # Initialize the Gemini chat model
    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash-exp", temperature=0.3)

    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": 5}),
        chain_type_kwargs={"prompt": PROMPT, "document_prompt": document_prompt},
        return_source_documents=True
    )

In [7]:
def _wrap_preserving_lines(text, width=100):
    """Wrap every line of ``text`` to ``width`` columns, keeping its line breaks."""
    return "\n".join(
        textwrap.fill(line, width=width) if line.strip() else ""
        for line in text.splitlines()
    )


def main(pdf_paths=None):
    """Build the PDF question-answering pipeline and run an interactive session.

    Args:
        pdf_paths: Optional list of PDF paths to index. When omitted, the
            three default Colab upload paths under ``/content`` are used.

    The function loads and chunks the PDFs, builds the vector store and QA
    chain, then repeatedly reads questions from standard input until the user
    types ``exit``. For each question it prints the answer and the distinct
    source pages the answer was retrieved from. Returns ``None``.
    """
    if pdf_paths is None:
        # Upload PDFs first (use Colab file upload)
        pdf_paths = [
            "/content/paper1.pdf",
            "/content/paper2.pdf",
            "/content/paper3.pdf"
        ]

    print("⏳ Loading and processing PDFs...")
    raw_docs = load_pdfs(pdf_paths)
    chunks = chunk_documents(raw_docs)
    print(f"✅ Processed {len(raw_docs)} documents into {len(chunks)} chunks")

    if not chunks:
        # Nothing to index: stop here instead of failing inside the vector store.
        print("No text could be extracted from the given PDFs; nothing to search.")
        return

    print("🧠 Creating vector database...")
    vector_store = create_vector_store(chunks)
    print("✅ Vector database created")

    qa_chain = setup_qa_chain(vector_store)

    # Interactive question answering
    while True:
        print("\n" + "="*50)
        question = input("\nAsk a question about the research (type 'exit' to quit):\n").strip()

        if question.lower() == 'exit':
            break
        if not question:
            # Do not spend an API call on an empty query.
            continue

        print("\n🔍 Searching and generating answer...")
        result = qa_chain.invoke({"query": question})

        # Print formatted answer. The answer is markdown, so it is wrapped line
        # by line: wrapping it as one paragraph would merge lists and headings.
        print("\n" + "="*50)
        print("💡 ANSWER:")
        print(_wrap_preserving_lines(result["result"], width=100))

        # Print each distinct source once, in retrieval order
        print("\n📚 SOURCES:")
        unique_sources = set()
        for doc in result["source_documents"]:
            source = doc.metadata.get("source", "unknown source")
            page = doc.metadata.get("page", "?")
            source_info = f"{source} (page {page})"
            if source_info not in unique_sources:
                unique_sources.add(source_info)
                print(f"- {source_info}")

        print("="*50)

In [None]:
# Entry point: start the interactive Q&A session. The guard keeps main() from
# running when this code is imported as a module instead of executed directly.
if __name__ == "__main__":
    main()

⏳ Loading and processing PDFs...
✅ Processed 109 documents into 249 chunks
🧠 Creating vector database...
✅ Vector database created


Ask a question about the research (type 'exit' to quit):
Main components of RAG model and how they interact?

🔍 Searching and generating answer...

💡 ANSWER:
The RAG model consists of a retriever and a generator that work together to produce text. The
retriever identifies relevant documents based on the input, and the generator uses these documents
to generate the output text. ([2.1 Models])  The interaction between these components is as follows:
1.  **Retrieval:** Given an input x, the retriever identifies the top-K documents, denoted as z,
based on  p(z|x). ([2.1 Models]) 2.  **Generation:** The generator, conditioned on the input x and
the retrieved document z, generates the output sequence y. The probability of generating a token is
given by p(y\_i|x, z, y\_{1:i-1}), where y\_{1:i-1} represents the tokens generated before the
current t

# New Section