In [2]:
import tensorflow as tf

# Method 1: Simply check the number of available GPUs
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Method 2: More detailed approach
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Set memory growth to avoid TensorFlow from allocating all memory at once
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPU is available! Number of GPUs: {len(gpus)}")
        print("Available GPU(s):", gpus)
    except RuntimeError as e:
        print(e)
else:
    print("No GPU found. Using CPU instead.")

Num GPUs Available:  1
GPU is available! Number of GPUs: 1
Available GPU(s): [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [1]:
import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor


# Reduce batch size
batch_size = 1

# Use mixed precision
torch.set_default_dtype(torch.float16)

model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model = MllamaForConditionalGeneration.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained(model_id)

messages = [
    [
        {
            "role": "user", 
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What does the image show?"}
            ]
        }
    ],
]
text = processor.apply_chat_template(messages, add_generation_prompt=True)

url = "https://llava-vl.github.io/static/images/view.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=text, images=image, return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=25)
print(processor.decode(output[0]))

The model weights are not tied. Please use the `tie_weights` method before using the `infer_auto_device` function.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the disk.


<|begin_of_text|><|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|>What does the image show?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

The image depicts a serene lake scene, featuring a long wooden dock extending into the water, surrounded by lush trees and a majestic


In [4]:
print(text)

['<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n<|image|>What does the image show?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n']


In [5]:
url = 'https://camo.githubusercontent.com/f50424cb2c6fd29570c2ea59afd4eeb93f5a9fa665223a9f6f313b67e9f21845/68747470733a2f2f692e706f7374696d672e63632f704c3137597447342f575832303234303530382d3232303233302d32782e706e67'
image2 = Image.open(requests.get(url, stream=True).raw)

# Update the text to include another image token
text_with_two_images = text[0].replace("<|image|>", "<|image|><|image|>")

print(text_with_two_images)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

<|image|><|image|>What does the image show?<|eot_id|><|start_header_id|>assistant<|end_header_id|>




In [8]:
import os

# Set the environment variable
os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

# Verify that the environment variable is set
print(os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'])

0.0


In [10]:
%env PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0

inputs2 = processor(text=[text_with_two_images], images=(image, image2), return_tensors="pt").to(model.device)
output = model.generate(**inputs2, max_new_tokens=25)
print(processor.decode(output[0]))

env: PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0


RuntimeError: MPS backend out of memory (MPS allocated: 14.49 GB, other allocations: 3.92 MB, max allowed: 18.13 GB). Tried to allocate 4.93 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).