# Imports

In [3]:
from transformers import (
    GPT2Config,
    ViTConfig,
    VisionEncoderDecoderConfig,
    VisionEncoderDecoderModel,
    ViTFeatureExtractor,
    GPT2Tokenizer,
    PreTrainedTokenizerFast,
    TrOCRProcessor,
    AutoTokenizer)
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset
from PIL import Image

In [4]:
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
pretrained_tokenizer = AutoTokenizer.from_pretrained("gpt2") # Uncomment to use a pre-trained tokenizer
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") # wrap function from TrOCR that combines feature_extractor and tokenizer

# Load architectures in the model
config_encoder = ViTConfig()
config_decoder = GPT2Config()

# Group architectures and define model
config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
model = VisionEncoderDecoderModel(config=config)

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [28]:
tokenizer = PreTrainedTokenizerFast(tokenizer_file="/home/philko/Documents/Uni/WiSe2223/Consulting/mlw-consulting-project/models/tokenizer/MLW_Tokenizer.json")
# tokenizer.model_max_length = 32

In [16]:
model.config.eos_token_id = tokenizer.sep_token_id
model.config.max_length = 64 # 32 if it were characters
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4
model.decoder.resize_token_embeddings(len(tokenizer))

model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# Make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size

# Test Input

In [8]:
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

In [9]:
image = Image.open("/home/philko/Documents/Uni/WiSe2223/Consulting/mlw-consulting-project/data/interim/lemmata_img/images/956.jpg").convert("RGB")
pixel_values = feature_extractor(image, return_tensors="pt").pixel_values

In [14]:
model.pad_token_id = 4 # EOS

In [17]:
generated_ids = model.generate(pixel_values)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


IndexError: index out of range in self

In [11]:
feature_extractor(pixel_values)

ValueError: axes don't match array

In [22]:
import requests
from PIL import Image

from transformers import GPT2TokenizerFast, ViTImageProcessor, VisionEncoderDecoderModel, PreTrainedTokenizerFast

# Load architectures in the model
config_encoder = ViTConfig()
config_decoder = GPT2Config()

# Group architectures and define model
config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
model = VisionEncoderDecoderModel(config=config)

# load a fine-tuned image captioning model and corresponding tokenizer and image processor
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
# tokenizer = GPT2TokenizerFast.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = PreTrainedTokenizerFast(tokenizer_file="/home/philko/Documents/Uni/WiSe2223/Consulting/mlw-consulting-project/models/tokenizer/MLW_Tokenizer.json")
tokenizer.model_max_length = 32
image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# let's perform inference on an image
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
pixel_values = image_processor(image, return_tensors="pt").pixel_values
image = Image.open("/home/philko/Documents/Uni/WiSe2223/Consulting/mlw-consulting-project/data/interim/lemmata_img/images/956.jpg").convert("RGB")
pixel_values = image_processor(image, return_tensors="pt").pixel_values

# autoregressively generate caption (uses greedy decoding by default)
generated_ids = model.generate(pixel_values)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(generated_text)




In [46]:
import requests
from PIL import Image

from transformers import GPT2TokenizerFast, ViTImageProcessor, VisionEncoderDecoderModel, PreTrainedTokenizerFast

# Load architectures in the model
config_encoder = ViTConfig()
config_decoder = GPT2Config()

# Group architectures and define model
config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
model = VisionEncoderDecoderModel(config=config)

# load a fine-tuned image captioning model and corresponding tokenizer and image processor
tokenizer = PreTrainedTokenizerFast(tokenizer_file="/home/philko/Documents/Uni/WiSe2223/Consulting/mlw-consulting-project/models/tokenizer/MLW_Tokenizer.json")
tokenizer.model_max_length = 32
tokenizer.eos_token_id
image_processor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')

# let's perform inference on an image
image = Image.open("/home/philko/Documents/Uni/WiSe2223/Consulting/mlw-consulting-project/data/interim/lemmata_img/images/956.jpg").convert("RGB")
pixel_values = image_processor(image, return_tensors="pt").pixel_values

# autoregressively generate caption (uses greedy decoding by default)
generated_ids = model.generate(pixel_values, pad_token_id=tokenizer.eos_token_id)
generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=False)[0]
print("Generated Text: <", str(generated_text), ">")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text: <  >


In [35]:
tokenizer.decode(tokenizer('aaa')['input_ids'])

'aaa'

In [37]:
tokenizer.eos_token_id

In [43]:
tokenizer.eos_token = 4

In [44]:
tokenizer.eos_token

'4'