In [13]:
import os
from dotenv import load_dotenv
from datasets import load_dataset
import random
from unsloth import FastVisionModel
import torch

0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.


In [3]:
load_dotenv()
if os.environ.get("HUGGINGFACE_TOKEN") is None:
    raise ValueError("HUGGINGFACE_TOKEN not set")


In [4]:
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

In [5]:
# First, let's try to download the dataset from HuggingFace
# We don't need to use a token, because it's a public dataset.
dataset_name = "UCSC-Admire/idiom-dataset-100-2024-11-11_14-37-58"
dataset = load_dataset(dataset_name, split="train")


In [12]:
def convert_to_conversation(sample: dict):
    # 1. Create list of (original_position, image) pairs
    original_images = [
        (i, sample[f"image_{i}"])
        for i in range(1, 6)
    ]
    
    # 2. Create a shuffled version of this list
    shuffled_images = original_images.copy()
    random.shuffle(shuffled_images)
    
    # 3. Assign letters A-E to the shuffled images
    letters = list("ABCDE")
    images_with_letters = [
        (letter, image)
        for letter, (_, image) in zip(letters, shuffled_images)
    ]
    
    # 4. Create mapping of original_position to assigned letter
    original_to_letter = {
        orig_pos: letter
        for (orig_pos, _), (letter, _) in zip(shuffled_images, images_with_letters)
    }
    
    # Now the correct order is the letters assigned to positions 1,2,3,4,5
    correct_order = [original_to_letter[i] for i in range(1, 6)]

    # print(f"Original order: {original_images}\n")
    # print(f"Shuffled order: {shuffled_images}\n")
    # print(f"Shuffled order with letters: {images_with_letters}\n")
    # print(f"Original to letter: {original_to_letter}\n")
    # print(f"Correct order: {correct_order}\n")
    
    instruction = f"""You are given a compound, its use in a sentence (which determines whether a compound should be interpreted literally or idiomatically), and five images.
    The images have been given aliases of {', '.join(letters)}, respectively.
    Rank the images from most to least relevant, based on how well they represent the compound (in either a literal or idiomatic sense, based on how it's used in the sentence).
    Return the ranking of the images as a comma-separated list of the aliases, from most to least relevant.
    
    As an example, if your predicted ranking from most to least relevant is B, C, A, E, D, then you should respond with "B, C, A, E, D"."""

    correct_response = f"{', '.join(correct_order)}"

    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": instruction},
                *[{"type": "image", "image": img} for _, img in shuffled_images]
            ]
        },
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": correct_response}
            ]
        }
    ]

    return {"messages": conversation}

In [14]:
# Load our model
model, tokenizer = FastVisionModel.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",
    use_gradient_checkpointing="unsloth",
    dtype=torch.bfloat16,  # Load in bfloat16
)

# It looks like "We also support finetuning ONLY the vision part of the model, or ONLY the langauge part"
# Or you can select both! You can also select to finetune the attention or the MLP layers
model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers=False,  # Leave the vision layers frozen
    finetune_language_layers=True,  # Finetune the language layers
    finetune_attention_modules=True,  # Finetune the attention modules
    finetune_mlp_modules=True,  # Finetune the MLP modules

    r=16,  # Rank of the LoRA matrices; Larger, the higher the acc, but might overfit
    lora_alpha=16,  # Recommneded alpha == r at least
    lora_dropout=0,
    bias="none",
    random_state=42,
    use_rslora=False,  # They support rank stabilized LoRA
    loftq_config = None,  # They also support LoftQ
    # target_modules = "all-linear"  # Optional now; can specify a list if needed
)



==((====))==  Unsloth 2024.11.10: Fast Qwen2_Vl vision patching. Transformers: 4.46.2.
   \\   /|    GPU: NVIDIA GeForce RTX 4090. Max memory: 23.546 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu118. CUDA: 8.9. CUDA Toolkit: 11.8. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


In [32]:
# Let's try for one sample!
from PIL import Image

def resize_images(images, max_size=224):
    """Resize a list of PIL images while maintaining aspect ratio"""
    resized_images = []
    for i, img in enumerate(images, 1):
        # Print original size
        print(f"Image {i} original size: {img.size}")
        
        # Calculate new size maintaining aspect ratio
        ratio = max_size/max(img.size)
        new_size = tuple([int(x*ratio) for x in img.size])
        
        # Print new size
        print(f"Image {i} resized to: {new_size}")
        print("-" * 40)
        
        resized_images.append(img.resize(new_size, Image.Resampling.LANCZOS))
    return resized_images


record = dataset[0]
images = [record[f"image_{i}"] for i in range(1, 6)]
images = resize_images(images)
messages = convert_to_conversation(record)["messages"][:1]
answer = convert_to_conversation(record)["messages"][1]
input_text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False  # Don't tokenize yet to avoid double tokenization
).replace("<|im_end|>", "")  # Remove the im_end token

inputs = tokenizer(
    images,
    input_text,
    add_special_tokens=True,  # Changed to True to properly handle special tokens
    return_tensors="pt",
).to("cuda")

from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=256,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
    # pad_token_id=tokenizer.pad_token_id,
    # eos_token_id=tokenizer.eos_token_id  # Explicitly set EOS token
)

print(answer)

Image 1 original size: (1024, 1024)
Image 1 resized to: (224, 224)
----------------------------------------
Image 2 original size: (1024, 1024)
Image 2 resized to: (224, 224)
----------------------------------------
Image 3 original size: (1024, 1024)
Image 3 resized to: (224, 224)
----------------------------------------
Image 4 original size: (1024, 1024)
Image 4 resized to: (224, 224)
----------------------------------------
Image 5 original size: (1024, 1024)
Image 5 resized to: (224, 224)
----------------------------------------
B, C, A, E, D<|im_end|>
{'role': 'assistant', 'content': [{'type': 'text', 'text': 'C, D, E, B, A'}]}


: 