In [1]:
import json
import numpy as np
#from utils_data import ScienceQADatasetImg
from model import T5ForMultimodalGeneration
import torch
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, T5ForConditionalGeneration
from torch.utils.data import Dataset
import evaluate

Map keys, add features, add captions

In [2]:
problems = json.load(open("data/problems.json"))
name_maps = json.load(open("data/name_map.json"))
captions = json.load(open("data/instruct_captions.json"))["captions"]

In [3]:
image_features = np.load("vision_features/vision_features/clip.npy")

In [4]:
len(problems)

21208

In [5]:
idx=0
problems_s = dict()
for qid in problems:
    idx+=1
    problems_s[qid]=problems[qid]
    problems_s[qid]['caption'] = captions[qid] if qid in captions else ""
    if qid in image_features and str(qid) in name_maps:
        problems_s[qid]['image_feature'] = image_features[int(name_maps[str(qid)])]
    else:
        problems_s[qid]['image_feature'] = np.zeros((49, 2048))
    if idx%10 == 0:
        print(10)
        break

10


  if qid in image_features and str(qid) in name_maps:


In [6]:
def create_one_example(question, context, choice, solution, test_example=True, WithOutput = False, curr_le_data=None):

    input = f"Question: {question}\nContext: {context}\nOptions: {choice}\n"

    # Outputs
    output = f"Solution: {solution}"
    
    text = input + f'Solution:'
    text = text.replace("  ", " ").strip()
    output = output.replace("  ", " ").strip()
    return text, output

In [7]:
options=["A","B","C","D","E"]
target_texts = []
source_texts = []
image_ids = []
for k in problems_s:
    question = problems_s[k]["question"]

    txt_context = problems_s[k]['hint']
    img_context = problems_s[k]['caption']
    context = " ".join([txt_context, img_context]).strip()
    if context == "":
        context = "N/A"

    choices = problems_s[k]['choices']
    choice_list = []
    for i, c in enumerate(problems_s[k]['choices']):
        choice_list.append("({}) {}".format(options[i], c))
    choice_txt = " ".join(choice_list)

    solution = problems_s[k]["solution"]

    image=problems_s[k]["image_feature"]
    
    prompt,target=create_one_example(question, context, choice_txt, solution)
    
    target_texts.append(target)
    source_texts.append(prompt)
    image_ids.append(image)

In [8]:
#testings = ScienceQADatasetImg(problems_s, list(problems_s.keys()), name_maps, tokenizer, 512, 64)

In [9]:
tokenizer=AutoTokenizer.from_pretrained("models/mm-cot-large-rationale/")
datacollator = DataCollatorForSeq2Seq(tokenizer)

In [10]:
class Data(Dataset):
    def __init__(self, target_texts, source_texts, image_ids):
        self.target_texts = target_texts
        self.source_texts = source_texts
        self.image_ids = image_ids
    
    def __getitem__(self,index):
        image_id = self.image_ids[index]
        target_text = str(self.target_texts[index])
        source_text = str(self.source_texts[index])

        source_text = " ".join(source_text.split())
        target_text = " ".join(target_text.split())


        source=tokenizer.batch_encode_plus([source_text],
            max_length=512,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",)

        target=tokenizer.batch_encode_plus([target_text],
            max_length=64,
            pad_to_max_length=True,
            truncation=True,
            padding="max_length",
            return_tensors="pt",)

        source_ids = source["input_ids"].squeeze()
        source_mask = source["attention_mask"].squeeze()
        target_ids = target["input_ids"].squeeze().tolist()

        
        return {"input_ids": source_ids,
                "attention_mask": source_mask,
                "image_ids": torch.tensor(image_id).squeeze(),
                "labels": target_ids,}
    
    def __len__(self):
        return len(self.target_texts)

In [11]:
data=Data(target_texts,source_texts,image_ids)

In [12]:
data[0]["input_ids"][0]

tensor(11860)

In [13]:
problems_s["1"]["solution"]

'To find the answer, look at the compass rose. Look at which way the north arrow is pointing. West Virginia is farthest north.'

In [14]:
#utils_data.ScienceQADatasetImg(problems,
#            train_qids,
#            name_maps,
#            tokenizer,
#            512,
#            64,
#            args,
#            image_features,
#        )

In [16]:
metric = evaluate.load("rouge")
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
    return preds, labels



def compute_metrics_rougel(eval_preds):
    if args.use_generate:
        preds, targets = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]
    else:
        preds = eval_preds.predictions[0]
        targets = eval_preds.label_ids
        preds = preds.argmax(axis=2)
    preds = tokenizer.batch_decode(preds, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    targets = tokenizer.batch_decode(targets, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    decoded_preds, decoded_labels = postprocess_text(preds, targets)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [17]:
model=T5ForMultimodalGeneration.from_pretrained("models/mm-cot-large-rationale",patch_size=(49, 2048), ignore_mismatched_sizes=True)

Some weights of T5ForMultimodalGeneration were not initialized from the model checkpoint at models/mm-cot-large-rationale and are newly initialized because the shapes did not match:
- encoder.image_dense.weight: found shape torch.Size([1024, 1024]) in the checkpoint and torch.Size([1024, 2048]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
training_args = Seq2SeqTrainingArguments("Saving",
                                        do_train = False,
                                        do_eval = False,
                                        eval_strategy = "no",
                                        logging_strategy = "steps",
                                        save_strategy = "epoch",
                                        save_total_limit = 2,
                                        learning_rate= 5e-5,
                                        eval_accumulation_steps=None,
                                        per_device_train_batch_size=2,
                                        per_device_eval_batch_size=4,
                                        weight_decay=0.01,
                                        num_train_epochs=50,
                                        predict_with_generate=True,
                                        generation_max_length=64,
                                        report_to="none",
                                        )


In [19]:
trainer = Seq2SeqTrainer(
                        model=model,
                        args=training_args,
                        data_collator=datacollator,
                        processing_class=tokenizer,
                        compute_metrics = compute_metrics_rougel
    )

In [20]:
trainer.predict(test_dataset = data, max_length=64)

TypeError: 'NoneType' object is not subscriptable

In [76]:
for v in datum.values():
    print(type(v))

<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'list'>
