In [None]:
import os

In [None]:
# Never hard-code the API key in the notebook (it ends up in version control and
# in shared copies). Reuse a key already exported in the environment and only
# prompt for one, without echoing it, when none is set. The previous unconditional
# assignment also clobbered a real key with a placeholder value.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass

    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

**Step 1: Load and Parse PDF (Text + OCR)**

In [None]:
# Step 1: PDF Loading (Text + OCR Hybrid Method)
# This step loads both text-based and scanned PDFs.
# It first tries simple text extraction (PyPDFLoader),
# and if OCR is needed, it uses UnstructuredPDFLoader with hi_res mode.

In [None]:
from langchain_community.document_loaders import PyPDFLoader, UnstructuredPDFLoader

In [None]:
pdf_path = "/content/NIPS-2017-attention-is-all-you-need-Paper.pdf" # change path as needed

# Below this many extracted characters (summed over all pages, whitespace-trimmed)
# the PDF is treated as scanned / image-only and is sent through OCR instead.
MIN_TEXT_CHARS = 100

# Fail fast on a wrong path. Otherwise the broad handler below would report a
# misleading "text extraction failed" and start the OCR pipeline on a file that
# does not exist.
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(f"PDF not found: {pdf_path}")

# Simple text extraction first. `ocr_reason` stays None when the text layer is
# usable; otherwise it records why the OCR fallback is needed.
try:
    print("Attempting text extraction using PyPDFLoader")
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()

    # PyPDF sometimes returns empty text for scanned PDFs, so measure what we got.
    total_text = sum(len(doc.page_content.strip()) for doc in documents)
    ocr_reason = (
        "Empty or scanned PDF detected, switching to OCR loader"
        if total_text < MIN_TEXT_CHARS
        else None
    )
except Exception as e:
    # Deliberately best-effort: any parser failure falls through to OCR.
    ocr_reason = e

if ocr_reason is None:
    print(f"Text-based PDF successfully loaded! Extracted {len(documents)} pages.")
else:
    print(f"Text extraction failed ({ocr_reason}), switching to OCR mode using UnstructuredPDFLoader...")
    loader = UnstructuredPDFLoader(
        pdf_path,
        mode="elements",       # keeps layout elements (better context)
        strategy="hi_res"      # enables image-based OCR parsing internally
    )
    documents = loader.load()
    print(f"OCR-based PDF successfully loaded using Unstructured loader. Extracted {len(documents)} elements.")

# Stop here with a clear message instead of an IndexError on the preview below
# (and instead of silently chunking/embedding nothing in the later steps).
if not documents:
    raise ValueError(f"No content could be extracted from {pdf_path}")

# Preview one document object
print("\n--- Sample Extracted Text ---\n")
print(documents[0].page_content[:800])  # preview first 800 characters

Attempting text extraction using PyPDFLoader
Text-based PDF successfully loaded! Extracted 11 pages.

--- Sample Extracted Text ---

Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aidan@cs.toronto.edu
Łukasz Kaiser ∗
Google Brain
lukaszkaiser@google.com
Illia Polosukhin∗‡
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and c


**Step 2 — Recursive & Semantic Chunking**

In [None]:
# Step 2: Chunking (Recursive & Semantic)
from langchain.text_splitter import RecursiveCharacterTextSplitter
# (Optional) Semantic chunker if you want deeper contextual splits
# from langchain_experimental.text_splitter import SemanticChunker
# from langchain_openai import OpenAIEmbeddings

# Recursive Chunking Setup
recursive_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,      # each chunk will have up to 800 characters
    chunk_overlap=100,    # overlap helps preserve context between chunks
    separators=["\n\n", "\n", ".", "!", "?", " ", ""]
)

# Split the loaded documents into chunks. `chunks` is the name the embedding and
# retrieval cells read, so the summary below must use the same variable
# (the previous `docs_chunks` was never defined and raised NameError on a fresh kernel).
chunks = recursive_splitter.split_documents(documents)

print(f"Total chunks created: {len(chunks)}")
if chunks:
    # Guarded so an empty split reports "0 chunks" instead of raising IndexError.
    print(f"Example chunk preview:\n{chunks[0].page_content[:800]}")


Total chunks created: 51
Example chunk preview:
Attention Is All You Need
Ashish Vaswani∗
Google Brain
avaswani@google.com
Noam Shazeer∗
Google Brain
noam@google.com
Niki Parmar∗
Google Research
nikip@google.com
Jakob Uszkoreit∗
Google Research
usz@google.com
Llion Jones∗
Google Research
llion@google.com
Aidan N. Gomez∗†
University of Toronto
aidan@cs.toronto.edu
Łukasz Kaiser ∗
Google Brain
lukaszkaiser@google.com
Illia Polosukhin∗‡
illia.polosukhin@gmail.com
Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,


**Step 3: Embedding + Vector Database Setup**

In [None]:
# STEP 3: Create Embeddings and Store in ChromaDB
import hashlib

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Initialize embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Deterministic ids (chunk position + content hash) make this cell idempotent:
# re-running it overwrites the same records instead of appending a second copy of
# every chunk to the persisted collection under new random ids.
# NOTE(review): relies on the LangChain Chroma wrapper upserting by id - confirm
# for the installed version. Records written by earlier runs without explicit ids
# are not removed; delete ./chroma_db once to start from a clean store.
chunk_ids = [
    f"{position}-{hashlib.sha256(chunk.page_content.encode('utf-8')).hexdigest()[:16]}"
    for position, chunk in enumerate(chunks)
]

# Create ChromaDB vector store
vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./chroma_db"  # Folder to save your vectors
)

# Persist data (saves locally).
# NOTE(review): this call emits a deprecation warning on recent versions, where
# persistence is reportedly automatic when persist_directory is set - kept for
# older Chroma releases; confirm before removing.
vectorstore.persist()

print(f"Stored {len(chunks)} chunks in ChromaDB successfully!")


Stored 51 chunks in ChromaDB successfully!


  vectorstore.persist()


In [None]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [None]:
# STEP 4: Hybrid Retrieval (Semantic + Keyword)
# Builds a keyword retriever and a vector retriever over the same chunks, blends
# them with fixed weights, and wraps the blend in a question-answering chain.

from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# --- 1. Semantic side: reopen the Chroma store persisted in Step 3 ---
# The embedding model must match the one used when the vectors were written.
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")
vectorstore = Chroma(embedding_function=embedding_model, persist_directory="./chroma_db")
semantic_retriever = vectorstore.as_retriever(search_kwargs=dict(k=4))

# --- 2. Keyword side: BM25 index built from the in-memory chunks ---
bm25_retriever = BM25Retriever.from_documents(chunks)
bm25_retriever.k = 4

# --- 3. Blend both retrievers ---
# Weights are positional: 0.4 applies to BM25 (keyword), 0.6 to Chroma (semantic).
hybrid_retriever = EnsembleRetriever(
    weights=[0.4, 0.6],
    retrievers=[bm25_retriever, semantic_retriever],
)

# --- 4. LLM and QA chain for final answer generation ---
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# return_source_documents=True is passed so the chain output can carry the
# retrieved chunks alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    return_source_documents=True,
    retriever=hybrid_retriever,
    llm=llm,
)

In [None]:
# 5. Ask Questions from the PDF ---
# Question: what is the paper's main contribution?
query = "Explain the main contribution of the Attention Is All You Need paper"
result = qa_chain.invoke(dict(query=query))

# The generated text is stored under the "result" key of the chain output.
answer = result["result"]

print("\n Query:", query)
print("\n Answer:\n", answer)


 Query: Explain the main contribution of the Attention Is All You Need paper

 Answer:
 The main contribution of the "Attention Is All You Need" paper is the introduction of a new network architecture called the Transformer. This architecture relies solely on attention mechanisms without using recurrent or convolutional neural networks. The Transformer model includes a multi-head attention mechanism that allows the model to jointly attend to information from different representation subspaces at different positions. This approach improves the efficiency and performance of sequence transduction models, making them more effective for tasks involving text and potentially other modalities like images, audio, and video.


In [None]:
# 5. Ask Questions from the PDF ---
# Spelling matters here: the BM25 half of the hybrid retriever matches on exact
# keywords, so the former misspelling "Tranformer" could not match the paper's
# "Transformer" tokens and left retrieval to the semantic half alone.
query = "Explain about Transformer in detail"
result = qa_chain.invoke({"query": query})

# The generated text is stored under the "result" key of the chain output.
print("\n Query:", query)
print("\n Answer:\n", result["result"])


 Query: Explain about Tranformer in detail if possible with figure

 Answer:
 The Transformer is a neural sequence transduction model that relies entirely on self-attention mechanisms, eliminating the need for recurrence and convolutions. It consists of an encoder-decoder structure where the encoder maps input sequences to continuous representations, and the decoder generates output sequences. The model architecture includes stacked self-attention and point-wise fully connected layers for both the encoder and decoder.

In the Transformer architecture, the encoder and decoder are composed of a stack of identical layers. Each layer in the encoder has two sub-layers: a multi-head self-attention mechanism and a point-wise fully connected feed-forward network. The output of each sub-layer is passed through a residual connection and layer normalization. The decoder, in addition to these two sub-layers, inserts a third sub-layer that performs multi-head attention over the output of the encod

In [None]:
# 5. Ask Questions from the PDF ---
# Question: request a summary of the whole paper.
query = "Give the summary of the Attention Is All You Need paper"
result = qa_chain.invoke(dict(query=query))

# The generated text is stored under the "result" key of the chain output.
answer = result["result"]

print("\n Query:", query)
print("\n Answer:\n", answer)


 Query: Give the summary of the Attention Is All You Need paper

 Answer:
 The "Attention Is All You Need" paper introduces a new network architecture called the Transformer, which simplifies sequence transduction models by using attention mechanisms. The paper discusses the concept of attention, including Scaled Dot-Product Attention and Multi-Head Attention. It explains how attention functions map queries and key-value pairs to outputs, emphasizing the importance of jointly attending to information from different representation subspaces. The paper also describes modifications to the self-attention sub-layer in the decoder stack to prevent positions from attending to subsequent positions.


**THANKS**