In [1]:
pip install pymupdf langchain_core langchain_community

Collecting pymupdf
  Downloading pymupdf-1.26.6-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Collecting langchain_community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain_core
  Downloading langchain_core-1.0.3-py3-none-any.whl.metadata (3.5 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain_community)
  Downloading langchain_classic-1.0.0-py3-none-any.whl.metadata (3.9 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain_community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain_community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7.0,>=0.6.7->langchain_community)
  Downloading typing_insp

In [2]:
# --- PDF and image processing ---
import fitz  # PyMuPDF
from PIL import Image
import io
import os
import base64

# --- Core libraries ---
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# --- LangChain ecosystem ---
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import HumanMessage
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain.chat_models import init_chat_model

# --- Multimodal embedding / vision models ---
from transformers import CLIPProcessor, CLIPModel


In [3]:
# Initialize the CLIP (Contrastive Language-Image Pre-Training) model used for unified embeddings.
#
# - clip_model bridges computer vision and natural language processing by
#   mapping images and text into one shared embedding space.
# - clip_processor prepares inputs (images and text) for the model: image
#   resizing and normalization, and text tokenization.

# Single source of truth for the checkpoint so model and processor always match.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# Inference only: put the module in evaluation mode (the cell displays the returned model).
clip_model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [4]:
def embed_image(image_data):
  """Return the unit-length CLIP embedding of an image.

  Args:
    image_data: Either a filesystem path (``str`` or ``os.PathLike``) to an
      image file, or an already-loaded image object that ``clip_processor``
      accepts (the notebook passes PIL images).

  Returns:
    The output of ``clip_model.get_image_features`` divided by its L2 norm
    along the last dimension, squeezed and converted to a NumPy array.
  """
  if isinstance(image_data, (str, os.PathLike)):
    # convert() returns an independent in-memory copy, so the file handle can
    # be released as soon as the conversion is done.
    with Image.open(image_data) as source_image:
      image = source_image.convert("RGB")
  else:
    image = image_data

  inputs = clip_processor(images=image, return_tensors="pt")

  # No gradients are needed for inference.
  with torch.no_grad():
    features = clip_model.get_image_features(**inputs)
    # Normalize so that dot products between embeddings equal cosine similarity.
    features = features / features.norm(dim=-1, keepdim=True)

  return features.squeeze().numpy()

def embed_text(text):
  """Return the unit-length CLIP embedding of ``text``.

  The text is tokenized by ``clip_processor`` with padding enabled and
  truncated to 77 tokens (the size of the model's position embedding), run
  through ``clip_model.get_text_features`` without gradient tracking, scaled
  to unit L2 norm along the last dimension, squeezed, and returned as a NumPy
  array.
  """
  encoded = clip_processor(
      text=text,
      return_tensors="pt",
      padding=True,
      truncation=True,
      max_length=77,
  )

  with torch.no_grad():
    raw_vector = clip_model.get_text_features(**encoded)
    unit_vector = raw_vector.div(raw_vector.norm(dim=-1, keepdim=True))
    return unit_vector.squeeze().numpy()


In [5]:
# Source PDF to index.
path = "/content/multimodal_sample (1).pdf"
doc = fitz.open(path)

# Accumulators filled by the extraction cell below:
#   all_docs   - Document objects (text chunks and image placeholders)
#   all_embeds - one embedding per entry of all_docs, in the same order
#   all_images - image_id -> base64-encoded PNG
all_docs, all_embeds = [], []
all_images = {}

In [6]:
# Text splitter used for page text: chunk size 500 with an overlap of 20.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=20,
)

In [7]:
# Walk every page of the PDF and index both its text chunks and its embedded
# images. Each item contributes one entry to all_docs and one embedding to
# all_embeds, appended together so the two lists stay aligned.

# Reopen the PDF if it was closed earlier in the session.
if doc.is_closed:
    doc = fitz.open(path)

for i, page in enumerate(doc):
    # --- Text: split the page into chunks and embed each one ---
    text = page.get_text().strip()
    if text:
        temp_doc = Document(page_content=text, metadata={"page": i, "type": "text"})
        for chunk in splitter.split_documents([temp_doc]):
            all_embeds.append(embed_text(chunk.page_content))
            all_docs.append(chunk)

    # --- Images: must run inside the page loop so every page is processed,
    # not just the last one ---
    for img_index, img in enumerate(page.get_images(full=True)):
        try:
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]

            # Convert to PIL Image
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

            # Create unique identifier
            image_id = f"page_{i}_img_{img_index}"

            # Encode the image as base64 PNG for later use in the chat message
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_base64 = base64.b64encode(buffered.getvalue()).decode()

            # Embed image using CLIP
            embedding = embed_image(pil_image)

            # Create document for image
            image_doc = Document(
                page_content=f"[Image: {image_id}]",
                metadata={"page": i, "type": "image", "image_id": image_id}
            )

            # Mutate the shared collections only after every step succeeded,
            # so a failure cannot leave them out of sync.
            all_images[image_id] = img_base64
            all_embeds.append(embedding)
            all_docs.append(image_doc)

        except Exception as e:
            # Best effort: report the problem and move on to the next image.
            print(f"Error processing image {img_index} on page {i}: {e}")


In [8]:
# Inspect the collected Document objects (text chunks and image placeholders).
all_docs

[Document(metadata={'page': 0, 'type': 'text'}, page_content='Annual Revenue Overview\nThis document summarizes the revenue trends across Q1, Q2, and Q3. As illustrated in the chart\nbelow, revenue grew steadily with the highest growth recorded in Q3.\nQ1 showed a moderate increase in revenue as new product lines were introduced. Q2 outperformed\nQ1 due to marketing campaigns. Q3 had exponential growth due to global expansion.'),
 Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]')]

In [9]:
# Stack the collected embeddings into a single NumPy array; the order follows
# all_embeds, which is appended in step with all_docs.
embeddings_array = np.array(all_embeds)
embeddings_array

array([[-0.00267245,  0.01282999, -0.05183139, ..., -0.00385086,
         0.02977718, -0.00010685],
       [ 0.01732335, -0.01327693, -0.02427032, ...,  0.0899405 ,
        -0.00272154,  0.03253041]], dtype=float32)

In [10]:
# Display the documents next to their embeddings as a sanity check.
(all_docs,embeddings_array)

([Document(metadata={'page': 0, 'type': 'text'}, page_content='Annual Revenue Overview\nThis document summarizes the revenue trends across Q1, Q2, and Q3. As illustrated in the chart\nbelow, revenue grew steadily with the highest growth recorded in Q3.\nQ1 showed a moderate increase in revenue as new product lines were introduced. Q2 outperformed\nQ1 due to marketing campaigns. Q3 had exponential growth due to global expansion.'),
  Document(metadata={'page': 0, 'type': 'image', 'image_id': 'page_0_img_0'}, page_content='[Image: page_0_img_0]')],
 array([[-0.00267245,  0.01282999, -0.05183139, ..., -0.00385086,
          0.02977718, -0.00010685],
        [ 0.01732335, -0.01327693, -0.02427032, ...,  0.0899405 ,
         -0.00272154,  0.03253041]], dtype=float32))

In [14]:
# Build the FAISS store from the precomputed CLIP embeddings. Each entry pairs
# a document's page_content with its embedding; metadata is passed alongside
# in the same order.
text_embedding_pairs = [
    (item.page_content, vector)
    for item, vector in zip(all_docs, embeddings_array)
]
item_metadatas = [item.metadata for item in all_docs]

vector_store = FAISS.from_embeddings(
    text_embeddings=text_embedding_pairs,
    embedding=None,  # Embeddings are already computed, so no embedding function is supplied
    metadatas=item_metadatas,
)
vector_store



<langchain_community.vectorstores.faiss.FAISS at 0x7dd95f32fd40>

In [23]:
import os
from getpass import getpass

# Never store the key in the notebook. Prompt for it only when the environment
# does not already provide one, so an existing real key is not overwritten.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [24]:
# Create the chat model that answers questions; the "openai:" prefix selects
# the provider and "gpt-4.1" the model. The bare name displays it for inspection.
llm = init_chat_model("openai:gpt-4.1")
llm

ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x7dd95e3377d0>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x7dd95d3293a0>, root_client=<openai.OpenAI object at 0x7dd95e7c69c0>, root_async_client=<openai.AsyncOpenAI object at 0x7dd95e3367b0>, model_name='gpt-4.1', model_kwargs={}, openai_api_key=SecretStr('**********'), stream_usage=True)

In [25]:
def retrieve_multimodal(query, k=5):
    """Return up to ``k`` stored documents most similar to ``query``.

    The query is embedded with ``embed_text`` (CLIP) and looked up in the
    unified ``vector_store``, which holds both text and image entries.
    """
    return vector_store.similarity_search_by_vector(
        embedding=embed_text(query),
        k=k,
    )

In [26]:
def create_multimodal_message(query, retrieved_docs, all_images):
    """Assemble a single HumanMessage holding the question and its context.

    Args:
        query: The user's question.
        retrieved_docs: Documents whose ``metadata["type"]`` is ``"text"`` or
            ``"image"``; other types are ignored. ``metadata["page"]`` is read
            for every text document and every image that gets attached.
        all_images: Mapping of image_id to base64-encoded PNG data.

    Returns:
        A HumanMessage whose content lists, in order: the question, the text
        excerpts (if any), a label plus data URL for each image found in
        ``all_images``, and the closing instruction.
    """

    def _text_part(body):
        # Every textual segment of the message shares this shape.
        return {"type": "text", "text": body}

    parts = [_text_part(f"Question: {query}\n\nContext:\n")]

    # Text excerpts, each labelled with its page.
    excerpts = [
        f"[Page {item.metadata['page']}]: {item.page_content}"
        for item in retrieved_docs
        if item.metadata.get("type") == "text"
    ]
    if excerpts:
        joined_excerpts = "\n\n".join(excerpts)
        parts.append(_text_part(f"Text excerpts:\n{joined_excerpts}\n"))

    # Images: skip anything that is not an image or whose data is unavailable.
    for item in retrieved_docs:
        if item.metadata.get("type") != "image":
            continue
        image_id = item.metadata.get("image_id")
        if not image_id or image_id not in all_images:
            continue
        parts.append(_text_part(f"\n[Image from page {item.metadata['page']}]:\n"))
        parts.append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{all_images[image_id]}"
            },
        })

    # Closing instruction for the model.
    parts.append(_text_part(
        "\n\nPlease answer the question based on the provided text and images."
    ))

    return HumanMessage(content=parts)

In [29]:
def multimodal_pdf_rag_pipeline(query):
    """Answer ``query`` using retrieved text and images from the indexed PDF.

    Retrieves up to five documents, packs them with the question into one
    multimodal message, sends it to ``llm``, prints a short summary of what
    was retrieved, and returns the content of the model's response.
    """
    hits = retrieve_multimodal(query, k=5)
    prompt = create_multimodal_message(query, hits, all_images)
    reply = llm.invoke([prompt])

    # Report what was retrieved so the answer can be traced to its sources.
    print(f"\nRetrieved {len(hits)} documents:")
    for hit in hits:
        page = hit.metadata.get("page", "?")
        if hit.metadata.get("type", "unknown") != "text":
            print(f"  - Image from page {page}")
            continue
        body = hit.page_content
        # Long excerpts are cut to 100 characters and marked with an ellipsis.
        snippet = body if len(body) <= 100 else body[:100] + "..."
        print(f"  - Text from page {page}: {snippet}")
    print("\n")

    return reply.content

In [30]:
if __name__ == "__main__":
    # Demo: run a few sample questions through the pipeline.
    queries = [
        "What does the chart on page 1 show about revenue trends?",
        "Summarize the main findings from the document",
        "What visual elements are present in the document?",
    ]

    for query in queries:
        # Header: the question followed by a thin rule.
        print(f"\nQuery: {query}", "-" * 50, sep="\n")
        answer = multimodal_pdf_rag_pipeline(query)
        # Footer: the answer followed by a heavy rule separating queries.
        print(f"Answer: {answer}", "=" * 70, sep="\n")


Query: What does the chart on page 1 show about revenue trends?
--------------------------------------------------

Retrieved 2 documents:
  - Text from page 0: Annual Revenue Overview
This document summarizes the revenue trends across Q1, Q2, and Q3. As illust...
  - Image from page 0


Answer: The chart on page 1 shows a clear upward trend in revenue across three quarters. Each bar represents a different quarter, with the height increasing from left to right:

- The first bar (Q1, blue) shows the lowest revenue, but with some growth as new products were introduced.
- The second bar (Q2, green) is higher, indicating further growth driven by marketing campaigns.
- The third bar (Q3, red) is the tallest, showing the highest revenue and reflecting exponential growth due to global expansion.

Overall, the chart demonstrates that revenue increased steadily each quarter, with the fastest growth occurring in Q3.

Query: Summarize the main findings from the document
-------------------------