In [2]:
%%capture
!pip install google-generativeai langchain langchain_google_genai langchain_text_splitters pypdf annoy


In [4]:
%%capture
!pip install langchain_community
!pip install python-dotenv

In [5]:
%%capture
!pip install pdfplumber


In [6]:
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import Annoy
from langchain.schema.document import Document
import google.generativeai as genai

In [9]:
from dotenv import load_dotenv
import os

load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")

In [11]:
genai.configure(api_key=api_key)

In [14]:
import pdfplumber

data_path = "data"

def load_pdf():
    documents = []
    for pdf_file in os.listdir(data_path):
        if pdf_file.endswith(".pdf"):
            with pdfplumber.open(os.path.join(data_path, pdf_file)) as pdf:
                text = "\n".join(page.extract_text() for page in pdf.pages if page.extract_text())
                documents.append(Document(page_content=text))
    return documents

In [16]:
print(os.listdir())

['.DS_Store', 'requirements.txt', 'anaconda_projects', '.gitignore', '.env', '.virtual_documents', '.ipynb_checkpoints', 'RAG(geminiAI).ipynb', 'demo.py', 'data', 'RAG(geminiAI)-Copy1.ipynb', 'Untitled.ipynb 10-30-05-375\u202fPM-Copy1.ipynb']


In [18]:
# print(f"API Key: {api_key}")

In [20]:
documents = load_pdf()
# documents[0],len(documents)

In [21]:
# Check extracted text
for i, doc in enumerate(documents[:5]):  # Print first 5 docs
    print(f"\nDocument {i+1}:\n{doc.page_content[:500]}...\n")  # Print first 500 chars



Document 1:
THE KITE
RUNNER
by KHALED HOSSEINI
Published 2003
Afghan Mellat Online Library
www.afghan-­‐mellat.org.uk
_December 2001_
I became what I am today at the age of twelve, on a frigid overcast day in the
winter of 1975. I remember the precise moment, crouching behind a crumbling
mud wall, peeking into the alley near the frozen creek. That was a long time ago,
but it's wrong what they say about the past, I've learned, about how you can bury
it. Because the past claws its way out. Looking back now, I...


Document 2:
MONOPOLY
Property Trading Game from Parker Brothers"
AGES 8+
2 to 8 Players
Contents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance
and Community Chest cards, Title Deed cards, play money and a Banker's tray.
Now there's a faster way to play MONOPOLY. Choose to play by
the classic rules for buying, renting and selling properties or use the
Speed Die to get into the action faster. If you've never played the classic
MONOPOLY game, refer to the Classic Rules

In [24]:
def split_documents(documents: list[Document]):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_documents(documents)

In [26]:
chunks = split_documents(documents)
print(chunks[0])

page_content='THE KITE
RUNNER
by KHALED HOSSEINI
Published 2003
Afghan Mellat Online Library
www.afghan-­‐mellat.org.uk
_December 2001_
I became what I am today at the age of twelve, on a frigid overcast day in the
winter of 1975. I remember the precise moment, crouching behind a crumbling
mud wall, peeking into the alley near the frozen creek. That was a long time ago,
but it's wrong what they say about the past, I've learned, about how you can bury
it. Because the past claws its way out. Looking back now, I realize I have been
peeking into that deserted alley for the last twenty-­‐six years.
One day last summer, my friend Rahim Khan called from Pakistan. He
asked me to come see him. Standing in the kitchen with the receiver to my ear, I'


In [28]:
# Extract plain text from the Document objects
chunk_texts = [chunk.page_content for chunk in chunks]
chunk_texts[0]

"THE KITE\nRUNNER\nby KHALED HOSSEINI\nPublished 2003\nAfghan Mellat Online Library\nwww.afghan-\xad‐mellat.org.uk\n_December 2001_\nI became what I am today at the age of twelve, on a frigid overcast day in the\nwinter of 1975. I remember the precise moment, crouching behind a crumbling\nmud wall, peeking into the alley near the frozen creek. That was a long time ago,\nbut it's wrong what they say about the past, I've learned, about how you can bury\nit. Because the past claws its way out. Looking back now, I realize I have been\npeeking into that deserted alley for the last twenty-\xad‐six years.\nOne day last summer, my friend Rahim Khan called from Pakistan. He\nasked me to come see him. Standing in the kitchen with the receiver to my ear, I"

In [30]:
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=api_key
)
annoy_store = Annoy.from_texts(chunk_texts, embeddings)
print("Annoy vector store successfully created!")


Annoy vector store successfully created!


In [31]:
from IPython.display import display, Markdown


In [33]:
while True:
    query = input("\nAsk a question about the PDF (or type 'exit' to quit): ")
    if query.lower() == "exit":
        print("Goodbye!")
        break

    query_embedding = embeddings.embed_query(query)
    similar_docs = annoy_store.similarity_search_by_vector(query_embedding, k=5)

    if similar_docs:
        pdf_context = " ".join(doc.page_content for doc in similar_docs)
    else:
        pdf_context = "No relevant context found."
        
    print("\n Most Relevant Text from PDF:\n", pdf_context)

    full_prompt = f"""
    You are an AI assistant. Answer the user's question based on the extracted PDF context.

    **Question:** {query}

    **PDF Context:** {pdf_context}

    Provide a detailed and informative response. Explain thoroughly, provide examples, and ensure clarity. If needed, break your response into sections for better readability.
    """

    model = genai.GenerativeModel("gemini-pro")
    generation_config = genai.types.GenerationConfig(
        temperature=0.7,
        max_output_tokens=1024,
        top_p=0.9,
    )
    response = model.generate_content(full_prompt)

    if response and hasattr(response, "candidates") and response.candidates:
        best_candidate = response.candidates[0]  # Assuming first is the best
        full_text = " ".join(part.text for part in best_candidate.content.parts)  # Combine all parts
        display(Markdown(full_text))
    else:
        print("\n No valid response received from Gemini AI.")


Ask a question about the PDF (or type 'exit' to quit):  How does the self-attention mechanism in the Transformer model improve upon traditional sequence-to-sequence architectures like RNNs and LSTMs?



 Most Relevant Text from PDF:
 to averaging attention-weighted positions, an effect we counteract with Multi-Head Attention as
describedinsection3.2.
Self-attention,sometimescalledintra-attentionisanattentionmechanismrelatingdifferentpositions
ofasinglesequenceinordertocomputearepresentationofthesequence. Self-attentionhasbeen
usedsuccessfullyinavarietyoftasksincludingreadingcomprehension,abstractivesummarization,
textualentailmentandlearningtask-independentsentencerepresentations[4,22,23,19].
End-to-endmemorynetworksarebasedonarecurrentattentionmechanisminsteadofsequence-
alignedrecurrenceandhavebeenshowntoperformwellonsimple-languagequestionansweringand
languagemodelingtasks[28].
To the best of our knowledge, however, the Transformer is the first transduction model relying 3.2.3 ApplicationsofAttentioninourModel
TheTransformerusesmulti-headattentioninthreedifferentways:
• In"encoder-decoderattention"layers,thequeriescomefromthepreviousdecoderlayer,
andthememorykeysandvaluescomefromt

NotFound: 404 models/gemini-pro is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.