In [1]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image

In [2]:
# Load the pretrained image-captioning checkpoint by its Hugging Face Hub id.
# The first run downloads the config and weights (see the progress output
# below); the checkpoint name suggests a ViT encoder paired with a GPT-2 decoder.
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

Downloading (…)lve/main/config.json:   0%|          | 0.00/4.61k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/982M [00:00<?, ?B/s]

In [3]:
# Load the image preprocessor and the tokenizer from the same checkpoint id that
# the model was loaded from, so both come from one consistent source.
feature_extractor = ViTFeatureExtractor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

Downloading (…)rocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]



Downloading (…)okenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# Move the model onto the selected device. The return value is not assigned, so
# as the cell's last expression it is echoed as the module repr shown below.
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, 

In [6]:
# Generation settings: cap on the generated sequence length and beam-search width.
max_length, num_beams = 16, 4

# Keyword arguments that predict_step forwards to model.generate().
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

In [7]:
def predict_step(img_path):
    """Generate one caption per input image.

    Args:
        img_path: An iterable of image file paths. A single path given as a
            ``str``, ``bytes`` or path-like object (anything exposing
            ``__fspath__``) is also accepted and treated as a one-image batch.

    Returns:
        list[str]: The decoded captions, stripped of surrounding whitespace,
        in the same order as the inputs. An empty input yields ``[]``.

    Relies on the notebook-level ``feature_extractor``, ``model``,
    ``tokenizer``, ``device`` and ``gen_kwargs`` defined in other cells.
    """
    # A bare path would otherwise be iterated character by character.
    if isinstance(img_path, (str, bytes)) or hasattr(img_path, "__fspath__"):
        img_path = [img_path]

    image_paths = list(img_path)
    if not image_paths:
        # Nothing to caption; skip preprocessing and generation altogether.
        return []

    images = []
    for image_path in image_paths:
        # Image.open is lazy and keeps the file handle open. convert() decodes
        # the pixels into a new in-memory RGB image, so the handle can be
        # released as soon as the ``with`` block exits.
        with Image.open(image_path) as i_image:
            images.append(i_image.convert("RGB"))

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values.to(device)
    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip() for pred in preds]

In [8]:
# Smoke-test the captioner on one local image, passed as a one-element list.
# The path is absolute and specific to the author's machine; adjust it before
# re-running the notebook elsewhere.
predict_step(["D:/projects/Caption generator/dataset/images/8227381437.jpg"])

['a man sitting in front of a pile of boxes']

In [10]:
# Save the model, tokenizer and feature extractor to local directories.
# The target paths are absolute and specific to the author's machine.
model_path = "D:/projects/Caption generator/model"
tokenizer_path = "D:/projects/Caption generator/tokenizer"
feature_extractor_path = "D:/projects/Caption generator/feature_extractor"

# Each component is written to its own directory.
model.save_pretrained(model_path)
tokenizer.save_pretrained(tokenizer_path)
# As the cell's last expression, this call's return value (the list of written
# files) is what the cell displays.
feature_extractor.save_pretrained(feature_extractor_path)

['D:/projects/Caption generator/feature_extractor\\preprocessor_config.json']