In [2]:
import torch
import pandas as pd
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Load processor and model
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForVision2Seq.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    attn_implementation="eager",
    output_attentions=True,
    return_dict=True
).to("cpu")

print("✅ Model & Processor Loaded!")



Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

✅ Model & Processor Loaded!


ValueError: The number of image token (0) should be the same as in the number of provided images (1)

In [10]:
# Load image
image_path = "image.jpg"  # Replace with your image path
image = Image.open(image_path).convert("RGB")

# Prepare inputs
# prompt = "<image> What is the famous thing in the image?"
# prompt = processor.tokenizer.apply_chat_template([
#     {"role": "user", "content": "<image>", "add_generation_prompt": False},
#     {"role": "user", "content": "Who invented electricity?"}
# ], tokenize=False)
 messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is the famous thing in the image?"}
        ]
    },
]

# ✅ Tokenize text prompt
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(images=[image], text=prompt, return_tensors="pt")

# Forward pass
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)

# Decode output
generated_texts = processor.batch_decode(outputs.logits.argmax(dim=-1), skip_special_tokens=True)
print("📝 Generated Answer:", generated_texts[0])

ValueError: The number of image token (0) should be the same as in the number of provided images (1)

In [None]:
cross_attn_layers = [3, 8, 13, 18, 23, 28, 33, 38]

# Get token strings
tokens = processor.tokenizer.convert_ids_to_tokens(inputs.input_ids[0])

# Create directory to store maps
output_dir = "./cross_attention_maps"
os.makedirs(output_dir, exist_ok=True)

# Visualize cross-attention
for layer in cross_attn_layers:
    cross_attn = outputs.attentions[layer][0]  # shape: [num_heads, text_seq_len, image_seq_len]
    attn_mean = cross_attn.mean(dim=0)         # [text_seq_len, image_seq_len]

    plt.figure(figsize=(10, 6))
    sns.heatmap(attn_mean.numpy(), cmap="YlGnBu", annot=False)
    plt.title(f"Cross-Attention: Text Tokens → Image Embeddings (Layer {layer})")
    plt.xlabel("Image Patches")
    plt.ylabel("Text Tokens")
    plt.yticks(ticks=range(len(tokens)), labels=tokens, rotation=0)
    plt.tight_layout()
    plt.savefig(f"{output_dir}/cross_attention_layer_{layer}.png")
    plt.close()

In [None]:
model