In [1]:
import os
import datasets

In [2]:
args = {}
args["data_dir"] = "/Volumes/T7 Shield/Datasets/Flickr30"
args["output_directory"] = "/Volumes/T7 Shield/Model-Runs/"
args["experiment_name"] = "1"

args["encoder"] = "facebook/dinov2-small"
args["decoder"] = "google-bert/bert-base-uncased"
args["decoder"] = "openai-community/gpt2"

args["lr"] = 1e-2
args["lr_scheduler_step_size"] = 10
args["lr_scheduler_gamma"] = 0.1

args["experiment_path"] = os.path.join(
    args["output_directory"], args["encoder"].split("/")[1]+"@"+args["decoder"].split("/")[1], args["experiment_name"])
# if not os.path.exists(args["experiment_path"]):
#     os.makedirs(args["experiment_path"])
args["log_directory"] = os.path.join(args["experiment_path"], "logs")
# if not os.path.exists(args["log_directory"]):
#     os.makedirs(args["log_directory"])

In [5]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
output.pooler_output.shape

torch.Size([1, 768])

In [8]:
encoded_input

{'input_ids': tensor([[ 101, 5672, 2033, 2011, 2151, 3793, 2017, 1005, 1040, 2066, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [3]:
from transformers import AutoImageProcessor, AutoTokenizer, VisionEncoderDecoderModel

image_processor = AutoImageProcessor.from_pretrained(args["encoder"])
tokenizer = AutoTokenizer.from_pretrained(args["decoder"])
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
    args["encoder"], args["decoder"], return_dict=True
)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['h.0.crossattention.c_attn.bias', 'h.0.crossattention.c_attn.weight', 'h.0.crossattention.c_proj.bias', 'h.0.crossattention.c_proj.weight', 'h.0.crossattention.q_attn.bias', 'h.0.crossattention.q_attn.weight', 'h.0.ln_cross_attn.bias', 'h.0.ln_cross_attn.weight', 'h.1.crossattention.c_attn.bias', 'h.1.crossattention.c_attn.weight', 'h.1.crossattention.c_proj.bias', 'h.1.crossattention.c_proj.weight', 'h.1.crossattention.q_attn.bias', 'h.1.crossattention.q_attn.weight', 'h.1.ln_cross_attn.bias', 'h.1.ln_cross_attn.weight', 'h.10.crossattention.c_attn.bias', 'h.10.crossattention.c_attn.weight', 'h.10.crossattention.c_proj.bias', 'h.10.crossattention.c_proj.weight', 'h.10.crossattention.q_attn.bias', 'h.10.crossattention.q_attn.weight', 'h.10.ln_cross_attn.bias', 'h.10.ln_cross_attn.weight', 'h.11.crossattenti

In [4]:
model.encoder.config.hidden_size, model.decoder.config.hidden_size

(384, 768)

In [5]:
import torch


def get_device(device_type):
    if device_type == "gpu" and torch.cuda.is_available():
        return torch.device("cuda")
    else:
        return torch.device(device_type)


device = get_device("mps")

In [6]:
# model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token = tokenizer.pad_token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.config.pad_token_id = tokenizer.pad_token_id
model.config.eos_token_id = tokenizer.eos_token_id

In [7]:
train_dataset = datasets.Flickr30kDataset(args["data_dir"], split="train",
                                          image_processor=image_processor, tokenizer=tokenizer)
#   image_processor=image_processor, tokenizer=None)
val_dataset = datasets.Flickr30kDataset(args["data_dir"], split="val",
                                        image_processor=image_processor, tokenizer=tokenizer)
# image_processor=image_processor, tokenizer=None)

In [8]:
X = train_dataset[0]

0


In [9]:
X["input_ids"].shape

torch.Size([1, 1024])

In [10]:
model(X["pixel_values"], decoder_input_ids=X["input_ids"], labels=X["input_ids"])

IndexError: index out of range in self

In [26]:
from torch.utils.data import DataLoader

batch_size = 16
num_workers = 4

train_dataloader = DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size,
                            shuffle=False, num_workers=num_workers)

In [30]:
from torch.optim import lr_scheduler
import torch.optim as optim
optimizer = optim.AdamW(model.parameters(), lr=args["lr"])
scheduler = lr_scheduler.StepLR(
    optimizer, step_size=args["lr_scheduler_step_size"], gamma=args["lr_scheduler_gamma"])
model = model.to(device)

In [31]:
import os


def save_checkpoint(state, filename='model'):
    saved_model_path = os.path.join(args["experiment_path"], "model.pth.tar")
    torch.save(state, saved_model_path)
    print(f"Model saved successfully at {saved_model_path}")

In [14]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    output_dir=args["experiment_path"],
    logging_dir=args["log_directory"],
)

In [None]:
X

In [18]:
from transformers import default_data_collator

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=model,
    # tokenizer=tokenizer,
    args=training_args,
    # compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=default_data_collator,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [19]:
trainer.train()

  0%|          | 0/95376 [03:14<?, ?it/s]
  0%|          | 0/95376 [00:00<?, ?it/s]

292
32976
82948
77827
88627
115900
63646
73809


ValueError: too many values to unpack (expected 4)

In [17]:
x = train_dataset[0]
x["pixel_values"].shape

0


torch.Size([1, 3, 224, 224])

In [22]:
from nltk.translate.bleu_score import sentence_bleu
reference = [[10, 4, 1, 5], [10, 2, 5]]
candidate = [1, 4, 1, 5]
score = sentence_bleu(reference, candidate)
print(score)

8.636168555094496e-78


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
# two references for one document
from nltk.translate.bleu_score import corpus_bleu
references = [[['this', 'is', 'a', 'test'], ['this', 'is' 'test']]]
candidates = [['this', 'is', 'a', 'test']]
score = corpus_bleu(references, candidates)
print(score)