In [1]:
# !pip install pymupdf

In [2]:
import fitz # for PyMuPDF
from langchain_core.documents import Document
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch
import numpy as np
from langchain.chat_models import init_chat_model
from langchain.prompts import PromptTemplate
from langchain.schema.messages import HumanMessage
from sklearn.metrics.pairwise import cosine_similarity
import os
import base64
import io
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

In [3]:
from google.colab import userdata

# Load the API key from Colab secrets
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

In [4]:
# Initialize Clip Model for unified Embeddings

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model.eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e-05,

In [5]:
def embed_image(image_data):
  """Embed image using CLIP"""
  if isinstance(image_data, str): #path
    image = Image.open(image_data).convert("RGB")
  else:
    image = image_data

  inputs = clip_processor(images=image, return_tensors="pt")
  with torch.no_grad():
    features = clip_model.get_image_features(**inputs)
    features = features / features.norm(dim=-1, keepdim = True)
  return features.squeeze().numpy()


def embed_text(text):
  """Embed text using CLIP"""
  inputs = clip_processor(text=text, return_tensors="pt", padding = True, truncation = True, max_length = 77)
  with torch.no_grad():
    features = clip_model.get_text_features(**inputs)
    features = features / features.norm(dim = -1, keepdim = True)
    return features.squeeze().numpy()

In [7]:
from google.colab import drive
drive.mount('/content/drive')

pdf_path = '/content/drive/My Drive/Multimodal_RAG/LectureNotesCNN.pdf'


doc = fitz.open(pdf_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
doc

Document('/content/drive/My Drive/Multimodal_RAG/LectureNotesCNN.pdf')

In [9]:
all_docs = []
all_embeddings = []
image_data_store = {}

splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 100)


In [10]:
for i, page in enumerate(doc):
  #process text
  text = page.get_text()
  if text.strip():
    #create temporary document for splitting
    temp_doc = Document(page_content = text, metadata = {"page": f"{i}", "type": "text"})
    text_chunks = splitter.split_documents([temp_doc])

    for chunk in text_chunks:
      embedding = embed_text(chunk.page_content)
      all_embeddings.append(embedding)
      all_docs.append(chunk)

  #Process images
  for img_index, img in enumerate(page.get_images(full=True)):
    try:
        xref = img[0]
        base_image = doc.extract_image(xref)
        image_bytes = base_image["image"]

        # Convert to PIL Image
        pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

        # Create unique identifier
        image_id = f"page_{i}_img_{img_index}"

        # Store image as base64 for later use with GPT-4V
        buffered = io.BytesIO()
        pil_image.save(buffered, format="PNG")
        img_base64 = base64.b64encode(buffered.getvalue()).decode()
        image_data_store[image_id] = img_base64

        # Embed image using CLIP
        embedding = embed_image(pil_image)
        all_embeddings.append(embedding)

        # Create document for image
        image_doc = Document(
            page_content=f"[Image: {image_id}]",
            metadata={"page": i, "type": "image", "image_id": image_id}
        )
        all_docs.append(image_doc)

    except Exception as e:
        print(f"Error processing image {img_index} on page {i}: {e}")
        continue

doc.close()

The channel dimension is ambiguous. Got image shape (1, 1, 3). Assuming channels are the first dimension. Use the [input_data_format](https://huggingface.co/docs/transformers/main/internal/image_processing_utils#transformers.image_transforms.rescale.input_data_format) parameter to assign the channel dimension.


Error processing image 2 on page 9: mean must have 1 elements if it is an iterable, got 3


In [11]:
all_embeddings

[array([ 4.28104661e-02,  4.59232507e-03, -2.15550233e-03, -1.03625329e-02,
        -3.96510437e-02,  7.29156454e-05,  4.31593321e-02,  2.26110090e-02,
         1.10872686e-01,  1.04269516e-02, -7.16207223e-03,  2.07392555e-02,
         5.42129064e-03, -2.97503453e-03,  2.58059651e-02, -5.18025123e-02,
         1.83949322e-02,  7.27241337e-02, -8.78350437e-03,  9.69066191e-03,
         3.12640145e-02,  4.73436601e-02, -5.70419105e-03,  2.84244847e-02,
        -1.02167092e-02,  6.02104217e-02,  2.04029679e-02,  6.55344874e-03,
         3.63309635e-03, -2.55881324e-02,  2.29776575e-04, -3.26842666e-02,
         3.28388400e-02, -1.63185038e-02, -2.48296484e-02, -6.37119263e-02,
        -6.80562109e-02,  5.13326435e-04,  2.72696093e-02, -2.12057959e-02,
        -4.83106971e-02, -1.89933069e-02, -2.36234274e-02, -7.16174208e-03,
        -7.65075092e-04, -1.71351638e-02, -2.18463577e-02,  1.48987742e-02,
        -1.71262771e-02,  2.75707268e-03, -4.44535092e-02,  3.80306244e-02,
        -2.4

In [13]:
len(all_docs)

81

In [17]:
embedding_array = np.array(all_embeddings)

vector_store = FAISS.from_embeddings(
    text_embeddings = [(doc.page_content, emb) for doc, emb in zip(all_docs, embedding_array)],
    embedding = None,
    metadatas = [doc.metadata for doc in all_docs]
)

vector_store



<langchain_community.vectorstores.faiss.FAISS at 0x7fce1535e790>

In [20]:
llm = init_chat_model("openai:gpt-4.1")
llm

ChatOpenAI(client=<openai.resources.chat.completions.completions.Completions object at 0x7fce56d19810>, async_client=<openai.resources.chat.completions.completions.AsyncCompletions object at 0x7fcde90ac8d0>, root_client=<openai.OpenAI object at 0x7fce56d18fd0>, root_async_client=<openai.AsyncOpenAI object at 0x7fcde90ac450>, model_name='gpt-4.1', model_kwargs={}, openai_api_key=SecretStr('**********'))

In [21]:
def retrieve_multimodal(query, k = 5):
  query_embedding = embed_text(query)

  results = vector_store.similarity_search_by_vector(
      embedding = query_embedding,
      k = k
  )
  return results

In [22]:
def create_multimodal_message(query, retrieved_docs):
    """Create a message with both text and images for GPT-4V."""
    content = []

    # Add the query
    content.append({
        "type": "text",
        "text": f"Question: {query}\n\nContext:\n"
    })

    # Separate text and image documents
    text_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "text"]
    image_docs = [doc for doc in retrieved_docs if doc.metadata.get("type") == "image"]

    # Add text context
    if text_docs:
        text_context = "\n\n".join([
            f"[Page {doc.metadata['page']}]: {doc.page_content}"
            for doc in text_docs
        ])
        content.append({
            "type": "text",
            "text": f"Text excerpts:\n{text_context}\n"
        })

    # Add images
    for doc in image_docs:
        image_id = doc.metadata.get("image_id")
        if image_id and image_id in image_data_store:
            content.append({
                "type": "text",
                "text": f"\n[Image from page {doc.metadata['page']}]:\n"
            })
            content.append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{image_data_store[image_id]}"
                }
            })

    # Add instruction
    content.append({
        "type": "text",
        "text": "\n\nPlease answer the question based on the provided text and images."
    })

    return HumanMessage(content=content)

In [23]:
def multimodal_pdf_rag_pipeline(query):
    """Main pipeline for multimodal RAG."""
    # Retrieve relevant documents
    context_docs = retrieve_multimodal(query, k=5)

    # Create multimodal message
    message = create_multimodal_message(query, context_docs)

    # Get response from GPT-4V
    response = llm.invoke([message])

    # Print retrieved context info
    print(f"\nRetrieved {len(context_docs)} documents:")
    for doc in context_docs:
        doc_type = doc.metadata.get("type", "unknown")
        page = doc.metadata.get("page", "?")
        if doc_type == "text":
            preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
            print(f"  - Text from page {page}: {preview}")
        else:
            print(f"  - Image from page {page}")
    print("\n")

    return response.content

In [25]:
if __name__ == "__main__":
    # Example queries
    queries = [
        "What does the chart on page 1 show about what it says about CNN?",
        "Summarize the main findings from the document",
        "What visual elements are present in the document?"
    ]

    for query in queries:
        print(f"\nQuery: {query}")
        print("-" * 50)
        answer = multimodal_pdf_rag_pipeline(query)
        print(f"Answer: {answer}")
        print("=" * 70)


Query: What does the chart on page 1 show about what it says about CNN?
--------------------------------------------------

Retrieved 5 documents:
  - Text from page 15: dataset is large, then you can fine-tune the model as you have enough data and need not to worry 
ab...
  - Text from page 7: product of wT and p. The dot product is computed at every patch to get a (3, 3) output array, as sho...
  - Text from page 15: So far, we have discussed CNN based networks which were trained on millions of images of various cla...
  - Text from page 14: might be unsurprising) and then degrades rapidly. Unexpectedly, such degradation is not caused by 
o...
  - Text from page 0: CNNs - A Specialised Architecture for Visual Data 
 
 
 
 
Convolutional Neural Networks, or CNNs, a...


Answer: Based on the provided context, particularly the text from **page 1** (actually page 0 in your excerpts), here’s what the chart on that page shows about CNNs (Convolutional Neural Networks):

**Key points from 