In [12]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
# default: Load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
)

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2.5-VL-7B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# default processer
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

# The default range for the number of visual tokens per image in the model is 4-16384.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


["The image depicts a serene beach scene during what appears to be either sunrise or sunset, as indicated by the warm, golden light illuminating the sky and casting long shadows on the sand. A woman is sitting on the sandy beach, wearing a plaid shirt and dark pants, with her legs crossed. She has long hair and is smiling warmly at a light-colored dog, possibly a Labrador Retriever, which is sitting in front of her. The dog is wearing a harness and is extending its paw towards the woman's hand, suggesting a playful interaction between them. The ocean is visible in the background, with gentle waves rolling onto the shore"]


In [19]:
import pandas as pd
from tqdm import tqdm

In [2]:
 test_df = pd.read_parquet('data/deep_chal_multitask_dataset_test.parquet')

In [3]:
test_df

Unnamed: 0,input_type,task,input,question
0,image,captioning,https://pulpcovers.com/wp-content/uploads/2020...,
1,image,captioning,https://pulpcovers.com/wp-content/uploads/2010...,
2,image,captioning,https://pulpcovers.com/wp-content/uploads/2020...,
3,image,captioning,https://pulpcovers.com/wp-content/uploads/2011...,
4,image,captioning,https://pulpcovers.com/wp-content/uploads/2023...,
...,...,...,...,...
2495,text,text_qa,Alan worked in an office in the city. He worke...,Why was Alan going to the farm to begin with?
2496,text,text_qa,The kitchen comes alive at night in the Sander...,Does she believe him?
2497,text,text_qa,A440 or A4 (also known as the Stuttgart pitch)...,What is one instrument A4 is used to tune?
2498,text,text_qa,"The dog, called Prince, was an intelligent ani...",Who found it?


In [38]:

results=[]
# short_test_df= test_df.iloc[:10]
for index, row in tqdm(test_df.iterrows()):
    messages=[{
        "role": "user",
        "content": [
            {
                "type": row['input_type'],
                row['input_type']: row['input'],
            },
            {"type": "text", "text": "Do the task : "+ row['task']},
        ],
    }]
    text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to("cuda")
    
    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    results.append({"ID" : index , "Output":output_text})
    

69it [06:17,  5.48s/it]


UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7fbb5a226700>

In [43]:
batches = []
for index, row in tqdm(test_df.iterrows()):
    messages = [{
        "role": "user",
        "content": [
            {
                "type": row['input_type'],
                row['input_type']: row['input'],
            },
            {"type": "text", "text": "Do the task : " + row['task']},
        ],
    }]
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    try:
        image_inputs, video_inputs = process_vision_info(messages)
    except Exception as e:
        print(f"Error processing vision info for row {index}: {str(e)}")
        messages = [{
        "role": "user",
        "content": [
            {
                "type": "text",
                "text": row['input'],
            },
            {"type": "text", "text": "Do the task : " + row['task']},
        ],
        }]
        image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    batches.append(inputs)

71it [00:24,  3.89it/s]

Error processing vision info for row 69: cannot identify image file <_io.BytesIO object at 0x7fbb3a599d00>


89it [00:34,  1.12s/it]

Error processing vision info for row 88: cannot identify image file <_io.BytesIO object at 0x7fbb3a999d00>


268it [02:01,  1.13it/s]

Error processing vision info for row 267: cannot identify image file <_io.BytesIO object at 0x7fbb3a932430>


358it [02:41,  1.40it/s]

Error processing vision info for row 357: cannot identify image file <_io.BytesIO object at 0x7fbb3a998f40>


378it [02:53,  1.11s/it]

Error processing vision info for row 377: cannot identify image file <_io.BytesIO object at 0x7fbb3a9324d0>


533it [03:50, 25.03it/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

564it [03:51, 44.23it/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)

627it [03:51, 83.35it/s]IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
Serve

In [46]:


# short_test_df= test_df.iloc[:10]
for index, batch in tqdm(enumerate(batches):
    inputs=batch
    inputs = inputs.to("cuda")
    
    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    results.append({"ID" : index , "Output":output_text})

2500

In [24]:
from tqdm import tqdm
import torch

def process_batch_data(test_df, batch_size, processor, model, device="cuda"):
    results = []
    
    # Iterate over the dataframe in batches
    for batch_start in tqdm(range(0, len(test_df), batch_size)):
        batch_end = min(batch_start + batch_size, len(test_df))
        batch_df = test_df.iloc[batch_start:batch_end]
        
        # Prepare batch messages
        batch_messages = []
        batch_indices = []
        for index, row in batch_df.iterrows():
            messages = [{
                "role": "user",
                "content": [
                    {
                        "type": row['input_type'],
                        row['input_type']: row['input'],
                    },
                    {"type": "text", "text": "Do the task : " + row['task']},
                ],
            }]
            batch_messages.append(messages)
            batch_indices.append(index)
        
        # Apply chat template to all messages in the batch
        batch_texts = [
            processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
            for messages in batch_messages
        ]
        
        # Process vision inputs for the batch
        batch_image_inputs = []
        batch_video_inputs = []
        for messages in batch_messages:
            image_inputs, video_inputs = process_vision_info(messages)
            batch_image_inputs.append(image_inputs)
            batch_video_inputs.append(video_inputs)
        
        # Prepare inputs for the model
        inputs = processor(
            text=batch_texts,
            images=batch_image_inputs if any(batch_image_inputs) else None,
            videos=batch_video_inputs if any(batch_video_inputs) else None,
            padding=True,
            return_tensors="pt",
        )
        inputs = inputs.to(device)
        
        # Inference: Generate outputs for the batch
        generated_ids = model.generate(**inputs, max_new_tokens=128)
        
        # Trim generated IDs to remove input tokens
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        
        # Decode the outputs
        output_texts = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        
        # Collect results
        for idx, output_text in zip(batch_indices, output_texts):
            results.append({"ID": idx, "Output": output_text})
    
    return results

# Example usage:
results = process_batch_data(test_df, batch_size=2, processor=processor, model=model, device="cuda")

  0%|          | 0/1250 [00:00<?, ?it/s]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  0%|          | 1/1250 [00:06<2:11:43,  6.33s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  0%|          | 2/1250 [00:14<2:34:31,  7.43s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  0%|          | 3/1250 [00:22<2:40:00,  7.70s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  0%|          | 4/1250 [00:35<3:22:03,  9.73s/it]A decoder-only architecture is being used, but right-padding was det

KeyboardInterrupt: 