In [1]:
import json
import numpy as np
#from utils_data import ScienceQADatasetImg
from model import T5ForMultimodalGeneration
import torch
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, T5ForConditionalGeneration
from torch.utils.data import Dataset
import evaluate
import nltk

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shark\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Map keys, add features, add captions

In [3]:
problems = json.load(open("data/problems.json"))
name_maps = json.load(open("data/name_map.json"))
captions = json.load(open("data/instruct_captions.json"))["captions"]

In [4]:
image_features = np.load("vision_features/vision_features/clip.npy")

In [5]:
len(problems)

21208

In [6]:
idx=0
problems_s = dict()
for qid in problems:
    idx+=1
    problems_s[qid]=problems[qid]
    problems_s[qid]['caption'] = captions[qid] if qid in captions else ""
    if qid in image_features and str(qid) in name_maps:
        problems_s[qid]['image_feature'] = image_features[int(name_maps[str(qid)])]
    else:
        problems_s[qid]['image_feature'] = np.zeros((49, 2048))
    if idx%5 == 0:
        print(10)
        break

10


  if qid in image_features and str(qid) in name_maps:


In [7]:
def create_one_example(question, context, choice, solution, test_example=True, WithOutput = False, curr_le_data=None):

    input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n"

    # Outputs
    output = f"Solution: {solution}"
    
    text = input + f'Solution:'
    text = text.replace("  ", " ").strip()
    output = output.replace("  ", " ").strip()
    return text, output

In [8]:
options=["A","B","C","D","E"]
target_texts = []
source_texts = []
image_ids = []
for k in problems_s:
    question = problems_s[k]["question"]

    txt_context = problems_s[k]['hint']
    img_context = problems_s[k]['caption']
    context = " ".join([txt_context, img_context]).strip()
    if context == "":
        context = "N/A"

    choices = problems_s[k]['choices']
    choice_list = []
    for i, c in enumerate(problems_s[k]['choices']):
        choice_list.append("({}) {}".format(options[i], c))
    choice_txt = " ".join(choice_list)

    solution = problems_s[k]["solution"]

    image=problems_s[k]["image_feature"]
    
    prompt,target=create_one_example(question, context, choice_txt, solution)
    
    target_texts.append(target)
    source_texts.append(prompt)
    image_ids.append(image)

In [9]:
#testings = ScienceQADatasetImg(problems_s, list(problems_s.keys()), name_maps, tokenizer, 512, 64)

In [10]:
tokenizer=AutoTokenizer.from_pretrained("models/mm-cot-large-rationale/")
datacollator = DataCollatorForSeq2Seq(tokenizer)

In [11]:
class Data(Dataset):
    def __init__(self, target_texts, source_texts, image_ids):
        self.target_texts = target_texts
        self.source_texts = source_texts
        self.image_ids = image_ids
    
    def __getitem__(self,index):
        image_id = self.image_ids[index]
        target_text = str(self.target_texts[index])
        source_text = str(self.source_texts[index])

        source_text = " ".join(source_text.split())
        target_text = " ".join(target_text.split())


        source=tokenizer.batch_encode_plus([source_text],
            max_length=512,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",)

        target=tokenizer.batch_encode_plus([target_text],
            max_length=64,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",)

        source_ids = source["input_ids"].squeeze()
        source_mask = source["attention_mask"].squeeze()
        target_ids = target["input_ids"].squeeze().tolist()

        
        return {"input_ids": source_ids,
                "attention_mask": source_mask,
                "image_ids": torch.tensor(image_id).squeeze(),
                "labels": target_ids,}
    
    def __len__(self):
        return len(self.target_texts)

In [12]:
data=Data(target_texts,source_texts,image_ids)

In [13]:
#for i in data[4]["image_ids"]:
#    for j in i:
#        if type(j)==float:
#            print(j)

In [14]:
#data[4]["image_ids"]

In [15]:
#tokenizer.batch_decode(data[0]["image_ids"], skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [16]:
#data[1]["input_ids"]

In [17]:
#utils_data.ScienceQADatasetImg(problems,
#            train_qids,
#            name_maps,
#            tokenizer,
#            512,
#            64,
#            args,
#            image_features,
#        )

In [18]:
metric = evaluate.load("rouge")
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
    return preds, labels



def compute_metrics_rougel(eval_preds):
    preds, targets = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    
    #Preds gets padded with -100. tokenizer can't handle negative numbers, so they get replaced with 0
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    preds = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    targets = tokenizer.batch_decode(targets, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    decoded_preds, decoded_labels = postprocess_text(preds, targets)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [19]:
model=T5ForMultimodalGeneration.from_pretrained("models/mm-cot-large-rationale",patch_size=(49, 2048), ignore_mismatched_sizes=True)

  return torch.load(checkpoint_file, map_location="cpu")
Some weights of T5ForMultimodalGeneration were not initialized from the model checkpoint at models/mm-cot-large-rationale and are newly initialized because the shapes did not match:
- encoder.image_dense.weight: found shape torch.Size([1024, 1024]) in the checkpoint and torch.Size([1024, 2048]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
training_args = Seq2SeqTrainingArguments(
            "Saving",
            do_train=False,
            do_eval=False,
            evaluation_strategy="no",
            logging_strategy="steps",
            save_strategy="epoch",
            save_total_limit = 2,
            learning_rate= 5e-5,
            eval_accumulation_steps=None,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=4,
            weight_decay=0.01,
            num_train_epochs=50,
            predict_with_generate=True,
            generation_max_length=64,
            report_to="none",
        )

In [21]:
trainer = Seq2SeqTrainer(
                        model=model,
                        args=training_args,
                        data_collator=datacollator,
                        tokenizer=tokenizer,
                        compute_metrics = compute_metrics_rougel
    )

In [22]:
predict_results = trainer.predict(test_dataset = data, max_length=64)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [24]:
preds, targets = predict_results.predictions, predict_results.label_ids
preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
preds = tokenizer.batch_decode(
            preds, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
targets = tokenizer.batch_decode(
            targets, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
preds = [pred.strip() for pred in preds]

In [25]:
output_data = {"preds": preds,
                "labels": targets}
#output_prediction_file = os.path.join("Saving","predictions_ans_eval.json")
with open("Saving/predictions_ans_eval.json", "w") as writer:
        writer.write(json.dumps(output_data, indent=4))

In [26]:
targets

['Solution: To find the answer, look at the compass rose. Look at which way the north arrow is pointing. West Virginia is farthest north.',
 'Solution:',
 'Solution:',
 'Solution: The text uses apostrophe, a direct address to an absent person or a nonhuman entity. O goddess is a direct address to a goddess, a nonhuman entity.',
 'Solution:']