<a href="https://colab.research.google.com/github/preetamjumech/LLM/blob/main/Multimodal_RAG_with_Qwen_2_and_ColPali_Ask_Questions_from_Images_04_11_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install/upgrade the Python dependencies: transformers straight from its
# GitHub repository, plus byaldi, accelerate, flash-attn, qwen_vl_utils and
# pdf2image.
!pip install --upgrade git+https://github.com/huggingface/transformers.git byaldi accelerate flash-attn qwen_vl_utils pdf2image
# Install the poppler-utils system package; pdf2image relies on Poppler's
# command-line tools to rasterize PDF pages.
!sudo apt-get install -y poppler-utils

In [None]:
from byaldi import RAGMultiModalModel
from transformers import Qwen2VLForConditionalGeneration,AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch
from pdf2image import convert_from_path
import os

In [None]:
# Retriever: load the "vidore/colpali" checkpoint through byaldi's
# multimodal RAG wrapper.
rag_engine = RAGMultiModalModel.from_pretrained("vidore/colpali")

# Generator: load Qwen2-VL-7B-Instruct with bfloat16 weights, the
# FlashAttention-2 attention implementation, and a "cuda" device map.
vlm_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="cuda",
)
vlm = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", **vlm_load_kwargs
)


In [None]:
# Index docs.pdf under the index name "index", with overwrite enabled and
# store_collection_with_index turned off.
index_options = {
    "input_path": "docs.pdf",
    "index_name": "index",
    "store_collection_with_index": False,
    "overwrite": True,
}
rag_engine.index(**index_options)

In [None]:
# The question used both for page retrieval and as the text prompt to the
# vision-language model. The leading and trailing newlines are part of the
# string.
text_query = (
    "\n"
    "What all Lymph node stations are visible in the case study?"
    "\n"
)

In [None]:
# Ask the retriever for the best-matching pages for the question.
top_k = 3
results = rag_engine.search(text_query, k=top_k)

# Bare expression so the notebook renders the retrieved hits as cell output.
results

In [None]:
# Rasterize the PDF into a list of page images.
images = convert_from_path("docs.pdf")

# Fail with a clear message when retrieval produced nothing, rather than the
# opaque "list index out of range" that results[0] would raise.
if not results:
    raise IndexError("Retrieval returned no results; cannot select a page image.")

# The top hit's page_num is treated as 1-based here, so shift it down to get
# a 0-based position in `images`.
image_index = results[0]["page_num"] - 1

# Reject out-of-range pages explicitly; in particular an index of -1 would
# otherwise silently select the last page.
if not 0 <= image_index < len(images):
    raise IndexError(
        f"Retrieved page index {image_index} is outside the "
        f"{len(images)} page image(s) converted from docs.pdf."
    )

In [None]:
# Bare expression so the notebook shows the selected page index as cell output.
image_index

In [None]:
# Load the processor published with the Qwen2-VL-7B-Instruct checkpoint.
processor = AutoProcessor.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct",
    trust_remote_code=True,
)

# Single user turn: the retrieved page image followed by the question text.
page_image_part = {"type": "image", "image": images[image_index]}
question_part = {"type": "text", "text": text_query}
messages = [{"role": "user", "content": [page_image_part, question_part]}]

# Render the conversation to a prompt string (left untokenized) with the
# generation prompt appended.
text = processor.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Extract the image/video inputs referenced by the messages, then build
# padded PyTorch tensors and move them to the GPU.
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
).to("cuda")

# Generate at most 512 new tokens for the prompt.
generated_ids = vlm.generate(**inputs, max_new_tokens=512)

# Drop the leading prompt-length slice from each generated sequence so only
# the newly generated part is decoded.
generated_ids_trimmed = [
    full_ids[len(prompt_ids):]
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids)
]

# Decode to text, skipping special tokens and leaving tokenization spaces
# untouched; the result is printed as a list of strings.
output_text = processor.batch_decode(
    generated_ids_trimmed,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
print(output_text)