In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
import requests

# Select the compute device: the first CUDA GPU when PyTorch reports one,
# otherwise the CPU. The choice is announced on stdout either way.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device('cpu')
    print("CUDA is not available. Please ensure you have a compatible GPU and the necessary drivers installed.")

# Checkpoint identifier shared by the processor and the model so the two
# cannot drift apart.
MODEL_ID = 'allenai/Molmo-7B-D-0924'

# Load the processor (trust_remote_code=True is passed because the loading
# call is asked to use code shipped with the checkpoint).
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)

# Load the model onto the selected device, letting the checkpoint decide the dtype
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    torch_dtype='auto',
    device_map={'': device},
)

# Define your batch of images and texts
image_urls = [
    "https://picsum.photos/id/237/536/354",
    "https://picsum.photos/id/238/536/354",
    "https://picsum.photos/id/239/536/354"
]

# Seconds to wait for the image host before giving up; without a timeout a
# stalled connection would block the notebook indefinitely.
REQUEST_TIMEOUT_S = 30


def _fetch_rgb_image(url, timeout=REQUEST_TIMEOUT_S):
    """Download the image at ``url`` and return it as an RGB PIL image.

    Args:
        url: HTTP(S) address of the image.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        The downloaded image converted to mode ``'RGB'``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    # The context manager closes the streamed connection even if decoding fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail here with the HTTP status instead of handing an error page to PIL.
        response.raise_for_status()
        # The raw stream is only content-decoded (gzip/deflate) when asked to.
        response.raw.decode_content = True
        # convert() returns a fully decoded copy, so nothing is read from the
        # stream after the connection is closed.
        return Image.open(response.raw).convert('RGB')


images = [_fetch_rgb_image(url) for url in image_urls]

texts = [
    "Describe this image 1.",
    "Describe this image 2.",
    "Describe this image 3."
]

# Images and prompts are paired positionally later on; a length mismatch would
# otherwise be silently truncated by zip().
assert len(images) == len(texts), (
    f"Got {len(images)} images but {len(texts)} texts; they must pair one-to-one."
)

# Run the processor once per (text, image) pair, then relocate every tensor it
# returned to the selected device, reporting the resulting placement.
processed_inputs = []
for text, image in zip(texts, images):
    input_data = processor.process(images=image, text=text)

    # Snapshot the tensor-valued keys first, then overwrite them in place.
    tensor_keys = [
        name for name, value in input_data.items()
        if isinstance(value, torch.Tensor)
    ]
    for name in tensor_keys:
        moved = input_data[name].to(device)
        input_data[name] = moved
        print(f"Input {name} device after to(device): {moved.device}")

    processed_inputs.append(input_data)

# Sanity check: every tensor in every processed sample must live on `device`
for idx, input_data in enumerate(processed_inputs):
    for k, v in input_data.items():
        if not isinstance(v, torch.Tensor):
            # Non-tensor entries have no device to check
            continue
        assert v.device == device, f"Processed input {idx}, tensor {k} is not on the GPU."

# Collate the per-sample dicts into one batch dict. The first sample decides,
# key by key, whether values are stacked as tensors or gathered into a list.
# NOTE(review): torch.stack requires every sample to have the same shape for a
# given key — confirm this holds for the prompts and images being used.
batched_inputs = {}
for key, first_value in processed_inputs[0].items():
    per_sample = [sample[key] for sample in processed_inputs]

    if not isinstance(first_value, torch.Tensor):
        # For non-tensor data, collect in a list
        batched_inputs[key] = per_sample
        continue

    # Confirm placement before stacking
    devices = [t.device for t in per_sample]
    print(f"Devices for {key} before stacking: {devices}")
    assert all(d == device for d in devices), f"Not all tensors for {key} are on the GPU."

    # New leading batch dimension
    stacked = torch.stack(per_sample, dim=0)
    batched_inputs[key] = stacked

    # Confirm placement after stacking
    print(f"Batched input {key} device after stacking: {stacked.device}")
    assert stacked.device == device, f"Batched input {key} is not on the GPU."

# Generation settings: at most 200 newly generated tokens per sample
generation_config = GenerationConfig(max_new_tokens=200)

# Run generation over the whole batch in one call
generate_kwargs = {
    'generation_config': generation_config,
    'tokenizer': processor.tokenizer,
}
output = model.generate_from_batch(batched_inputs, **generate_kwargs)

# Each row of `output` is sliced below as "prompt tokens, then generated
# tokens". The batch was assembled with torch.stack and contains no padding, so
# every prompt spans the full width of `input_ids` and the generated part
# starts at that column. Counting tokens != pad_token_id (the previous
# approach) is unsafe here: any prompt token that equals the pad id shortens
# the offset and leaks prompt tokens into the decoded text, and the count
# breaks outright when the tokenizer defines no pad token.
prompt_length = batched_inputs['input_ids'].size(1)

# Iterate over each item in the batch to extract and decode the generated tokens
for i, sequence in enumerate(output):
    # Keep only what was generated after the prompt for this batch item
    generated_tokens = sequence[prompt_length:]
    # Decode the tokens to text, dropping special tokens
    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
    print(f"Generated text for input {i+1}: {generated_text}")


  from .autonotebook import tqdm as notebook_tqdm


Using GPU: NVIDIA L40S


Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:05<00:00,  1.33it/s]


Input input_ids device after to(device): cuda:0
Input images device after to(device): cuda:0
Input image_input_idx device after to(device): cuda:0
Input image_masks device after to(device): cuda:0
Input input_ids device after to(device): cuda:0
Input images device after to(device): cuda:0
Input image_input_idx device after to(device): cuda:0
Input image_masks device after to(device): cuda:0
Input input_ids device after to(device): cuda:0
Input images device after to(device): cuda:0
Input image_input_idx device after to(device): cuda:0
Input image_masks device after to(device): cuda:0
Devices for input_ids before stacking: [device(type='cuda', index=0), device(type='cuda', index=0), device(type='cuda', index=0)]
Batched input input_ids device after stacking: cuda:0
Devices for images before stacking: [device(type='cuda', index=0), device(type='cuda', index=0), device(type='cuda', index=0)]
Batched input images device after stacking: cuda:0
Devices for image_input_idx before stacking: [d