## Multimodal PDF-RAG with LangChain and OpenSearch Vector Database

In [19]:
# =========================
# 🔹 Imports and Setup
# =========================
import os
import time
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import OpenSearchVectorSearch
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from docx import Document
from dotenv import load_dotenv
from opensearchpy import OpenSearch



In [20]:
# =========================
# 🔹 Load Environment Variables
# =========================
# Load any variables defined in a .env file, then read the settings this
# notebook relies on from the process environment.
load_dotenv()

# No fallback is given for the API key, so this is None when it is unset.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# OpenSearch connection settings; defaults are localhost:9200 and the
# "pdf_chunks" index. The port is converted to int for use in the client config.
OPENSEARCH_HOST = os.environ.get("OPENSEARCH_HOST", "localhost")
OPENSEARCH_PORT = int(os.environ.get("OPENSEARCH_PORT", 9200))
OPENSEARCH_INDEX = os.environ.get("OPENSEARCH_INDEX", "pdf_chunks")


PDF Data Extraction:
Extracts raw textual content from a PDF file. This forms the base input for downstream processing in the pipeline.

In [24]:
# =========================
# 🔹 Step 1: Data Ingestion from PDF using PyMuPDFLoader
# =========================
def extract_text_and_charts(pdf_path):
    """Load a PDF with PyMuPDFLoader and return the text of each loaded document.

    Args:
        pdf_path: Path of the PDF file handed to ``PyMuPDFLoader``.

    Returns:
        list: The ``page_content`` of every document the loader produced, in
        loader order. Only text is collected; nothing chart-specific is
        extracted here despite the function's name.
    """
    page_texts = []
    for page in PyMuPDFLoader(pdf_path).load():
        page_texts.append(page.page_content)

    print(f"✅ Extracted {len(page_texts)} pages from PDF")
    return page_texts

Semantic Chunking:
Splits the extracted content into semantically meaningful text chunks. This ensures better context preservation for both vectorization and LLM processing.

In [26]:
# =========================
# 🔹 Step 2: Semantic Chunking
# =========================
def semantic_chunking(texts, chunk_size=800, chunk_overlap=100):
    """Split every input text into overlapping chunks and flatten the result.

    Args:
        texts: Iterable of strings to split (one entry per extracted page in
            this notebook's pipeline).
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter.

    Returns:
        list: All chunks from all texts, in input order, as one flat list.
    """
    # Separators are tried from coarse (paragraph breaks) to fine (single spaces).
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", ".", "!", "?", ",", " "],
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    all_chunks = [
        piece
        for text in texts
        for piece in text_splitter.split_text(text)
    ]
    print(f"✅ Generated {len(all_chunks)} semantic chunks")
    return all_chunks


Embedding and Indexing:
Converts text chunks into vector embeddings using an OpenAI model and stores them in an OpenSearch vector index. This enables fast and scalable semantic search.



In [27]:
# =========================
# 🔹 Step 3: Embedding and Storing in OpenSearch
# =========================
# Module-level OpenAI embeddings object (model "text-embedding-ada-002"); it is
# handed to OpenSearchVectorSearch as the embedding function when chunks are indexed.
embedding_model = OpenAIEmbeddings(model="text-embedding-ada-002")

def embed_and_store_chunks_opensearch(chunks):
    """Embed text chunks and index them in the configured OpenSearch index.

    The vector store is built from the module-level ``OPENSEARCH_HOST``,
    ``OPENSEARCH_PORT`` and ``OPENSEARCH_INDEX`` settings and the module-level
    ``embedding_model``. It connects over plain HTTP without authentication.

    Args:
        chunks: Sized collection of strings to embed and store.

    Returns:
        The ``OpenSearchVectorSearch`` instance the chunks were added to, for
        use in later similarity searches.
    """
    vectorstore = OpenSearchVectorSearch(
        index_name=OPENSEARCH_INDEX,
        embedding_function=embedding_model,  # the full embeddings object, not a bare callable
        opensearch_url=f"http://{OPENSEARCH_HOST}:{OPENSEARCH_PORT}",
        http_auth=None,
        use_ssl=False,
    )

    # add_texts rejects a batch with more embeddings than bulk_size (500 by
    # default in the library), so raise the limit when a document is larger.
    vectorstore.add_texts(chunks, bulk_size=max(len(chunks), 500))
    print(f"✅ Embedded and stored {len(chunks)} chunks in OpenSearch")
    return vectorstore

Semantic Search:
Uses a natural language query to search the vector index and retrieve the most relevant text chunks based on semantic similarity.

In [28]:
# =========================
# 🔹 Step 4: Semantic Search with OpenSearch
# =========================
def search_opensearch(vectorstore, query, top_k=5):
    """Run a similarity search against the vector store and time it.

    Args:
        vectorstore: Object exposing ``similarity_search(query, k=...)``.
        query: Natural-language query string.
        top_k: Number of results requested from the store.

    Returns:
        tuple: ``(results, duration)`` where ``results`` is whatever
        ``similarity_search`` returned and ``duration`` is the elapsed time
        of that call in seconds.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    results = vectorstore.similarity_search(query, k=top_k)
    duration = time.perf_counter() - start
    print(f"✅ Search completed in {duration:.2f} seconds, top {top_k} results retrieved")
    return results, duration

LLM Response Generation:
Constructs a structured prompt using the retrieved context and query, and generates a coherent, well-formed response using OpenAI's GPT-4o model.

In [32]:
# =========================
# 🔹 Step 5: Prompt Template and LLM Output Generation
# =========================
def generate_response(context, query, paper_title="Attention Is All You Need",
                      model_name="gpt-4o", temperature=0.2):
    """Answer a question about a paper from retrieved context using a chat LLM.

    Args:
        context: Retrieved text inserted into the prompt as supporting context.
        query: The user's question.
        paper_title: Title of the source paper named in the prompt. The default
            reproduces the original hard-coded prompt; pass another title when
            the pipeline runs on a different PDF.
        model_name: Chat model identifier passed to ``ChatOpenAI``.
        temperature: Sampling temperature passed to ``ChatOpenAI``.

    Returns:
        The value returned by ``LLMChain.run`` for the filled-in prompt.
    """
    prompt = PromptTemplate(
        input_variables=["context", "query", "paper_title"],
        template=(
            "You are an AI assistant helping to summarize and explain research papers.\n"
            "Based on the following context extracted from the paper '{paper_title}':\n\n"
            "{context}\n\n"
            "Please provide a clear and concise answer to the following question:\n"
            "{query}\n"
            "Your response should be accurate, well-structured, and use appropriate technical terminology."
        )
    )
    llm = ChatOpenAI(model_name=model_name, temperature=temperature)
    chain = LLMChain(llm=llm, prompt=prompt)
    response = chain.run(
        {"context": context, "query": query, "paper_title": paper_title}
    )
    print("✅ LLM response generated")
    return response


Save Output:
Saves the generated response into a DOCX document, allowing users to archive or share results.

In [33]:
# =========================
# 🔹 Step 6: Save LLM Response to DOCX
# =========================
def save_response_to_docx(response_text, filename="response.docx"):
    """Write the response as a single paragraph of a new DOCX document.

    Args:
        response_text: Text placed in the document's only paragraph.
        filename: Path the document is saved to.

    Returns:
        None. The function saves the file and prints a confirmation line.
    """
    document = Document()
    document.add_paragraph(response_text)
    document.save(filename)

    print(f"✅ Response saved to {filename}")

Full Pipeline Orchestration:
Chains all the above steps into a seamless end-to-end flow — from PDF ingestion to LLM-powered answer generation and export.

In [36]:
# =========================
# 🔹 Step 7: Full Pipeline Execution
# =========================
def run_pipeline(pdf_path, user_query, top_k=5, output_filename="response.docx"):
    """Run the full PDF question-answering pipeline end to end.

    Steps: extract text from the PDF, chunk it, embed and index the chunks in
    OpenSearch, retrieve the chunks most similar to the query, ask the LLM to
    answer from that context, then save the answer to a DOCX file and print it.

    Args:
        pdf_path: Path of the PDF to ingest.
        user_query: Question to answer from the PDF's content.
        top_k: Number of chunks retrieved as context (default 5, the value the
            search step used before this parameter existed).
        output_filename: Path of the DOCX file the answer is written to.

    Returns:
        The generated response, so callers can reuse it instead of relying
        only on the printed output.
    """
    print("🔹 Extracting data from PDF...")
    page_texts = extract_text_and_charts(pdf_path)

    print("🔹 Performing semantic chunking...")
    chunks = semantic_chunking(page_texts)

    print("🔹 Embedding chunks and storing in OpenSearch...")
    vectorstore = embed_and_store_chunks_opensearch(chunks)

    print("🔹 Performing similarity search...")
    # The search duration is already reported by search_opensearch itself.
    results, _ = search_opensearch(vectorstore, user_query, top_k=top_k)

    # Retrieved chunks are concatenated, one per line, to form the LLM context.
    top_context = "\n".join(doc.page_content for doc in results)

    print("🔹 Generating response using LLM...")
    response = generate_response(top_context, user_query)

    save_response_to_docx(response, filename=output_filename)
    print(response)
    return response



In [37]:
# =========================
# 🔹 Run Example
# =========================
if __name__ == "__main__":
    # Example run: ask one question about the PDF named below, which is
    # resolved relative to the current working directory.
    demo_pdf = "attention.pdf"
    demo_question = "Can you explain the core idea behind the self-attention mechanism proposed in the paper?"
    run_pipeline(pdf_path=demo_pdf, user_query=demo_question)


🔹 Extracting data from PDF...
✅ Extracted 11 pages from PDF
🔹 Performing semantic chunking...
✅ Generated 51 semantic chunks
🔹 Embedding chunks and storing in OpenSearch...
✅ Embedded and stored 51 chunks in OpenSearch
🔹 Performing similarity search...
✅ Search completed in 0.33 seconds, top 5 results retrieved
🔹 Generating response using LLM...
✅ LLM response generated
✅ Response saved to response.docx
The core idea behind the self-attention mechanism, as proposed in the paper "Attention Is All You Need," is to compute a representation of a sequence by relating different positions within that sequence. Self-attention, also known as intra-attention, allows the model to weigh the importance of each position in the sequence relative to others, enabling it to capture dependencies between distant positions efficiently.

In traditional sequence models, capturing long-range dependencies can be computationally expensive and challenging. However, self-attention reduces this complexity to a con