In [2]:
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
#Load the model
model = AutoModelForVision2Seq.from_pretrained("models/paligemma", local_files_only=True).eval()
processor = AutoProcessor.from_pretrained("models/paligemma", local_files_only=True)

Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.66it/s]


In [None]:
# 假设你有一张图片（可以替换为你自己的图片路径）
image = Image.open("./test0.png")
image = image.convert("RGB")

# 处理图片和文本输入
promopt = "how many serial number in the picture?"  # 替换为你自己的任务描述

# 使用processor对图像和文本进行处理
model_inputs = processor(text=promopt, images=image, return_tensors="pt").to(model.device)
input_len = model_inputs["input_ids"].shape[-1]

with torch.inference_mode():
    generation = model.generate(**model_inputs, max_new_tokens=100, do_sample=False)
    generation = generation[0][input_len:]
    decoded = processor.decode(generation, skip_special_tokens=True)
    print(decoded)

You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.


0


In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"  # 检查是否有可用的GPU
print(device)

cuda


In [8]:
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor
from PIL import Image

In [9]:
# 加载模型并确保使用GPU（如果可用）
device = "cuda" if torch.cuda.is_available() else "cpu"  # 检查是否有可用的GPU
model = AutoModelForVision2Seq.from_pretrained("models/paligemma", local_files_only=True).to(device).eval()
processor = AutoProcessor.from_pretrained("models/paligemma", local_files_only=True)

Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.53it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [38]:
# 假设你有一张图片（可以替换为你自己的图片路径）
image = Image.open("./airpods.jpg")
image = image.convert("RGB")
#display(image)
print(type(image))

<class 'PIL.Image.Image'>


In [39]:
# 处理图片和文本输入
prompt = "<image>Describe the picture."  # 在文本开始处添加 <image> token
model_inputs = processor(text= prompt,images=image, return_tensors="pt").to(device)
# 获取输入长度
input_len = model_inputs["input_ids"].shape[-1]



decoded_text = processor.decode(model_inputs["input_ids"][0].tolist())

# 打印解码结果
#print(decoded_text)

model_inputs

{'input_ids': tensor([[257152, 257152, 257152,  ...,   5642, 235265,    108]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], device='cuda:0'), 'pixel_values': tensor([[[[ 0.0039,  0.0039,  0.0039,  ...,  0.4039,  0.3961,  0.4118],
          [ 0.0039,  0.0039,  0.0039,  ...,  0.3961,  0.3961,  0.4118],
          [ 0.0039,  0.0039,  0.0039,  ...,  0.3961,  0.3961,  0.4118],
          ...,
          [-0.0196, -0.0196, -0.0118,  ...,  0.3333,  0.3412,  0.3412],
          [-0.0196, -0.0196, -0.0118,  ...,  0.3333,  0.3255,  0.3255],
          [-0.0196, -0.0196, -0.0118,  ...,  0.3412,  0.3098,  0.2941]],

         [[ 0.0039,  0.0039,  0.0039,  ...,  0.3961,  0.3882,  0.4039],
          [ 0.0039,  0.0039,  0.0039,  ...,  0.3882,  0.3882,  0.4039],
          [ 0.0039,  0.0039,  0.0039,  ...,  0.3882,  0.3882,  0.4039],
          ...,
          [-0.0353, -0.0353, -0.0275,  ...,  0.3255,  0.3333,  0.3333],
          [-0.0353, -0.0353, -0.0275,  ...,  0.3255,  0.3

In [40]:

# 使用GPU进行推理
with torch.inference_mode():
    generation = model.generate(**model_inputs, max_new_tokens=50, do_sample=False)
    generation = generation[0][input_len:]
    decoded = processor.decode(generation, skip_special_tokens=True)
    print(decoded)


airpods


In [1]:
import torch
from PIL import Image
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

# 检查并选择设备（GPU 优先）
device = "cuda" if torch.cuda.is_available() else "cpu"  # 检查是否有可用的GPU
print(f"Using device: {device}")

# 加载模型并确保使用GPU（如果可用）
model = PaliGemmaForConditionalGeneration.from_pretrained("models/paligemma", local_files_only=True).to(device).eval()
processor = AutoProcessor.from_pretrained("models/paligemma", local_files_only=True)

# 处理图片和文本输入
prompt = "How many serial number in the picture?"
image = Image.open("./test0.png")
image = image.convert("RGB")

# 使用 processor 对图像和文本进行处理，并确保所有张量都在相同的设备上
inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)

# 使用GPU进行推理
with torch.no_grad():  # 使用 no_grad() 来避免计算梯度，节省内存
    output = model.generate(**inputs, max_new_tokens=50, cache_implementation="static")

# 输出结果
decoded_text = processor.decode(output[0], skip_special_tokens=True)
print(decoded_text)


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.47it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.


How many serial number in the picture?
0
