In [1]:
import os
import datasets

In [2]:
args = {}
args["data_dir"] = "/Volumes/T7 Shield/Datasets/Flickr30"
args["output_directory"] = "/Volumes/T7 Shield/Model-Runs/"
args["experiment_name"] = "1"

args["encoder"] = "facebook/dinov2-small"
args["decoder"] = "google-bert/bert-base-uncased"
# args["decoder"] = "openai-community/gpt2"

args["lr"] = 1e-2
args["lr_scheduler_step_size"] = 10
args["lr_scheduler_gamma"] = 0.1

args["experiment_path"] = os.path.join(
    args["output_directory"], args["encoder"].split("/")[1]+"@"+args["decoder"].split("/")[1], args["experiment_name"])
if not os.path.exists(args["experiment_path"]):
    os.makedirs(args["experiment_path"])
args["log_directory"] = os.path.join(args["experiment_path"], "logs")
if not os.path.exists(args["log_directory"]):
    os.makedirs(args["log_directory"])

In [3]:
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel

image_processor = AutoImageProcessor.from_pretrained(args["encoder"])
tokenizer = AutoTokenizer.from_pretrained(args["decoder"])
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    args["encoder"], args["decoder"], return_dict=True
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertLMHeadModel were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight'

In [4]:
import torch


def get_device(device_type):
    if device_type == "gpu" and torch.cuda.is_available():
        return torch.device("cuda")
    else:
        return torch.device(device_type)


device = get_device("cpu")

In [5]:
# set special tokens used for creating the decoder_input_ids from the labels
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# make sure vocab size is set correctly
model.config.vocab_size = model.config.decoder.vocab_size

# set beam search parameters
model.config.eos_token_id = tokenizer.sep_token_id
model.config.max_length = 128
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [6]:
# # model.config.decoder_start_token_id = tokenizer.cls_token_id
# model.config.decoder_start_token_id = tokenizer.bos_token_id
# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# model.config.pad_token_id = tokenizer.pad_token_id

# # tokenizer.pad_token = tokenizer.eos_token
# # tokenizer.pad_token = tokenizer.pad_token
# model.config.eos_token_id = tokenizer.eos_token_id

In [7]:
tokenizer.sep_token_id, model.config.decoder.vocab_size

(102, 30522)

In [8]:
train_dataset = datasets.Flickr30kDataset(args["data_dir"], split="train",
                                          image_processor=image_processor, tokenizer=tokenizer)
#   image_processor=image_processor, tokenizer=None)
val_dataset = datasets.Flickr30kDataset(args["data_dir"], split="val",
                                        image_processor=image_processor, tokenizer=tokenizer)
# image_processor=image_processor, tokenizer=None)

In [27]:
X = train_dataset[0]

In [28]:
X["pixel_values"].shape

torch.Size([3, 224, 224])

In [29]:
X["pixel_values"].shape, X["decoder_attention_mask"].shape, X["labels"].shape

(torch.Size([3, 224, 224]), torch.Size([128]), torch.Size([128]))

In [9]:
tokenizer.max_len_sentences_pair

509

In [None]:
# model(X["pixel_values"], labels=X["labels"], decoder_attention_mask=X["decoder_attention_mask"])
model(**X)

In [10]:
from torch.utils.data import DataLoader

batch_size = 16
num_workers = 4

train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size,
                            shuffle=False, num_workers=num_workers)

In [11]:
inp = next(iter(train_dataloader))

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [12]:
inp["decoder_attention_mask"].shape

torch.Size([16, 128])

In [13]:
from torch.optim import lr_scheduler
import torch.optim as optim
optimizer = optim.AdamW(model.parameters(), lr=args["lr"])
scheduler = lr_scheduler.StepLR(
    optimizer, step_size=args["lr_scheduler_step_size"], gamma=args["lr_scheduler_gamma"])
model = model.to(device)

In [14]:
import os


def save_checkpoint(state, filename='model'):
    saved_model_path = os.path.join(args["experiment_path"], "model.pth.tar")
    torch.save(state, saved_model_path)
    print(f"Model saved successfully at {saved_model_path}")

In [24]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    output_dir=args["experiment_path"],
    logging_dir=args["log_directory"],
    use_cpu=True,
    use_mps_device=False,
)

In [25]:
from transformers import default_data_collator

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    # tokenizer=tokenizer,
    args=training_args,
    # compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=default_data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [26]:
trainer.train()

  0%|          | 0/95376 [07:00<?, ?it/s]
  0%|          | 0/95376 [01:10<?, ?it/s]
  0%|          | 29/95376 [01:33<82:36:48,  3.12s/it]

KeyboardInterrupt: 

In [17]:
x = train_dataset[0]
x["pixel_values"].shape

0


torch.Size([1, 3, 224, 224])

In [22]:
from nltk.translate.bleu_score import sentence_bleu
reference = [[10, 4, 1, 5], [10, 2, 5]]
candidate = [1, 4, 1, 5]
score = sentence_bleu(reference, candidate)
print(score)

8.636168555094496e-78


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
# two references for one document
from nltk.translate.bleu_score import corpus_bleu
references = [[['this', 'is', 'a', 'test'], ['this', 'is' 'test']]]
candidates = [['this', 'is', 'a', 'test']]
score = corpus_bleu(references, candidates)
print(score)