In [None]:
import torch
from PIL import Image
from transformers import ViTImageProcessor, AutoTokenizer, VisionEncoderDecoderModel

In [None]:
# --- Configuration ---
# Hugging Face hub identifiers. ENCODER_ID is also used to load the matching
# image processor, and TOKENIZER_NAME to load the caption tokenizer.
ENCODER_ID = "google/vit-base-patch16-224-in21k"
DECODER_ID = "gpt2"
TOKENIZER_NAME = "gpt2"

# Local directory of the saved captioning checkpoint, relative to the notebook.
MODEL_PATH = "./image-captioning-model/epoch_decoder_only_baseline_3"

# NOTE(review): presumably the maximum caption length in tokens; it is not
# referenced by the cells visible here -- confirm before relying on it.
MAX_LEN = 48

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [None]:
# Build the two preprocessing objects used for inference: the image processor
# (loaded from the encoder checkpoint id) and the caption tokenizer.
img_processor = ViTImageProcessor.from_pretrained(ENCODER_ID)
cap_tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

# Register dedicated padding and beginning-of-sequence tokens on the tokenizer.
# NOTE(review): these presumably mirror the tokens added at training time, so
# their ids line up with the saved model's embeddings -- confirm.
extra_special_tokens = {'pad_token': '<PAD>', 'bos_token': '<BOS>'}
cap_tokenizer.add_special_tokens(extra_special_tokens)

In [None]:
# Restore the saved encoder-decoder checkpoint and move it to the chosen device.
model = VisionEncoderDecoderModel.from_pretrained(MODEL_PATH).to(DEVICE)
model  # last expression: the cell still displays the loaded module

In [None]:
# Read the sample image from disk and force three-channel RGB.
image_path = r"./Flicker8k_images/667626_18933d713e.jpg"
image = Image.open(image_path).convert("RGB")

# Run the image processor to obtain PyTorch tensors, then move the pixel
# tensor onto the same device as the model.
encoded_image = img_processor(images=image, return_tensors="pt")
pixel_values = encoded_image.pixel_values.to(DEVICE)

# Display the tensor shape as the cell output for a quick sanity check.
pixel_values.shape

In [None]:
# Generate a caption for the preprocessed image using nucleus (top-p) sampling,
# then decode and print it.

# Sampling is stochastic: seed PyTorch's RNG so that Restart & Run All yields
# the same caption. Change or remove the seed to draw a different sample.
torch.manual_seed(42)

generated_ids = model.generate(
    pixel_values=pixel_values,
    # Special-token ids are taken from the tokenizer configured earlier in the
    # notebook (custom <BOS> / <PAD> tokens plus the tokenizer's EOS token).
    decoder_start_token_id=cap_tokenizer.bos_token_id,
    pad_token_id=cap_tokenizer.pad_token_id,
    eos_token_id=cap_tokenizer.eos_token_id,
    # Length limits for the generated caption.
    min_length=5,
    max_new_tokens=20,
    # Sampling settings. `early_stopping` was removed on purpose: it only
    # applies to beam search, so with do_sample=True and a single beam it had
    # no effect and merely produced a generation-config warning.
    do_sample=True,
    top_p=0.9,
    temperature=0.7,
    repetition_penalty=3.0,
)

# Only one image was passed in, so decode the first (and only) sequence and
# drop special tokens such as <BOS>/<PAD>/EOS from the text.
preds = cap_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print()
print('====================================')
print('=========== image caption ==========\n')
print(preds)