In [2]:
import json
import random
import re

import evaluate
import nltk
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, T5ForConditionalGeneration

#from utils_data import ScienceQADatasetImg
from src.original.model import T5ForMultimodalGeneration

In [3]:
# Fetch the Punkt sentence-tokenizer data; nltk.sent_tokenize is called in postprocess_text.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Shark\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Map keys, add features, add captions

In [3]:
def _read_json(path):
    """Parse the JSON file at `path`, closing the handle as soon as parsing finishes."""
    # Explicit UTF-8 keeps the result independent of the platform's default encoding.
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)


# problems: dict keyed by question id (string).
problems = _read_json("data/problems.json")
# name_maps: question id -> row position inside the image feature array.
name_maps = _read_json("data/name_map.json")
# captions: question id -> caption text; ids without a caption are simply absent.
captions = _read_json("data/instruct_captions.json")["captions"]

In [13]:
# Precomputed image features; rows are looked up through name_maps when the subset is built.
image_features = np.load("vision_features/vision_features/clip.npy")

In [14]:
# Number of problems loaded from data/problems.json.
len(problems)

21208

In [16]:
# Sanity check: problem "7" carries a truthy "image" entry.
if problems["7"]["image"]:
    print("ok")

ok


# Subset of data

In [79]:
# Build a small working subset: walk `problems` in order, attach a caption and an image
# feature to each entry, and stop once 10 entries have been collected.
# The entries are the same dict objects as in `problems`, so `problems` is annotated too.
problems_s = {}
idx = 0
for idx, qid in enumerate(problems, start=1):
    entry = problems[qid]
    entry['caption'] = captions[qid] if qid in captions else ""

    feature_key = str(qid)
    if feature_key in name_maps:
        feature = image_features[int(name_maps[feature_key])]
    else:
        # No image for this question: use an all-zero placeholder.
        feature = np.zeros((49, 2048))
    entry['image_feature'] = feature

    problems_s[qid] = entry
    if idx % 10 == 0:
        print(idx)
        break

10


In [80]:
# Confirm how many problems ended up in the working subset.
print(len(problems_s))

10


In [84]:
def create_one_example(question, context, choice, solution, answer = None, curr_le_data=None):
    """Build the (prompt, target) text pair for one problem.

    When `curr_le_data` is truthy it is appended to the prompt and the pair is for
    the answer stage: the prompt ends in "Answer:" and the target states `answer`.
    Otherwise the pair is for the rationale stage: the prompt ends in "Solution:"
    and the target is `solution`.

    Both strings get one pass of double-space -> single-space replacement and are
    stripped of surrounding whitespace.

    Returns:
        tuple[str, str]: (prompt, target).
    """
    prompt = f"Question: {question}\nContext: {context}\nOptions: {choice}\n"
    if curr_le_data:
        prompt += f"{curr_le_data}\nAnswer:"
        expected = f"Answer: The answer is {answer}."
    else:
        prompt += "Solution:"
        expected = f"Solution: {solution}"
    return tuple(piece.replace("  ", " ").strip() for piece in (prompt, expected))

In [64]:
# Turn every problem in the subset into a rationale-stage (prompt, target) pair and
# keep its image feature alongside; the three lists stay index-aligned.
options=["A","B","C","D","E"]
target_texts = []
source_texts = []
image_ids = []
for k in problems_s:
    problem = problems_s[k]
    question = problem["question"]

    # Context = textual hint followed by the image caption; "N/A" when both are empty.
    txt_context = problem['hint']
    img_context = problem['caption']
    context = " ".join([txt_context, img_context]).strip()
    if context == "":
        context = "N/A"

    # Render the choices as "(A) first (B) second ...".
    choices = problem['choices']
    choice_list = ["({}) {}".format(options[i], c) for i, c in enumerate(choices)]
    choice_txt = " ".join(choice_list)

    solution = problem["solution"]
    image = problem["image_feature"]

    prompt, target = create_one_example(question, context, choice_txt, solution)

    source_texts.append(prompt)
    target_texts.append(target)
    image_ids.append(image)

In [65]:
# Tokenizer of the rationale checkpoint. Data.__getitem__ and the compute_metrics
# functions read this module-level `tokenizer`, so it must match the model in use.
tokenizer=AutoTokenizer.from_pretrained("models/mm-cot-large-rationale/")
datacollator = DataCollatorForSeq2Seq(tokenizer)

In [66]:
class Data(Dataset):
    """Map-style dataset of (prompt, target, image feature) triples.

    Tokenisation uses the module-level `tokenizer`, so that variable must already
    point at the tokenizer matching the model being run.

    Args:
        target_texts: target strings, index-aligned with `source_texts`.
        source_texts: prompt strings.
        image_ids: per-example image feature arrays (anything torch.tensor accepts).
        max_length: length every prompt and target is truncated/padded to.
    """

    def __init__(self, target_texts, source_texts, image_ids, max_length=512):
        self.target_texts = target_texts
        self.source_texts = source_texts
        self.image_ids = image_ids
        self.max_length = max_length

    def _encode(self, text):
        """Collapse whitespace runs in `text` and tokenise it to exactly `max_length` ids."""
        text = " ".join(str(text).split())
        # padding="max_length" alone selects fixed-length padding; the deprecated
        # pad_to_max_length flag is ignored whenever `padding` is given, so it is dropped.
        return tokenizer.batch_encode_plus(
            [text],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self,index):
        """Return the model inputs for example `index`.

        Returns:
            dict with "input_ids" and "attention_mask" (tensors, batch dimension
            squeezed away), "image_ids" (tensor built from the stored feature) and
            "labels" (plain list of token ids; padded positions keep the
            tokenizer's pad id).
        """
        source = self._encode(self.source_texts[index])
        target = self._encode(self.target_texts[index])

        return {"input_ids": source["input_ids"].squeeze(),
                "attention_mask": source["attention_mask"].squeeze(),
                "image_ids": torch.tensor(self.image_ids[index]).squeeze(),
                "labels": target["input_ids"].squeeze().tolist(),}

    def __len__(self):
        """Number of examples."""
        return len(self.target_texts)

In [67]:
# Wrap the rationale-stage prompts, targets and image features for the trainer.
data=Data(target_texts,source_texts,image_ids)

In [68]:
# NOTE(review): the recorded output for this cell disagrees with the subset size printed
# earlier; it appears to come from an earlier, out-of-order run. Re-run top to bottom.
len(data)

2

# Generate rationale

In [55]:
# ROUGE scorer used by compute_metrics_rougel.
metric = evaluate.load("rouge")
def postprocess_text(preds, labels):
    """Strip each text and put every sentence on its own line.

    Returns:
        tuple[list[str], list[str]]: the reformatted (preds, labels).
    """
    def _one_sentence_per_line(text):
        return "\n".join(nltk.sent_tokenize(text.strip()))

    formatted_preds = [_one_sentence_per_line(pred) for pred in preds]
    formatted_labels = [_one_sentence_per_line(label) for label in labels]
    return formatted_preds, formatted_labels



def compute_metrics_rougel(eval_preds):
    """Compute ROUGE scores (scaled to 0-100) and the mean generated length.

    Args:
        eval_preds: (predictions, label_ids) pair of token-id arrays; predictions
            may arrive wrapped in a tuple, in which case the first element is used.

    Returns:
        dict: the ROUGE metrics rounded to 4 decimals plus "gen_len", the mean
        number of non-pad tokens per prediction.
    """
    preds, targets = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is used as a padding marker and cannot be decoded, so swap it for the
    # pad id on both predictions and labels.
    pred_ids = np.where(np.asarray(preds) != -100, preds, pad_id)
    target_ids = np.where(np.asarray(targets) != -100, targets, pad_id)

    # Count generated tokens on the id arrays. Counting after decoding compares
    # strings to an int and always yields 1.
    prediction_lens = [np.count_nonzero(row != pad_id) for row in pred_ids]

    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    target_texts = tokenizer.batch_decode(target_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    decoded_preds, decoded_labels = postprocess_text(pred_texts, target_texts)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [69]:
# NOTE(review): the load log for this cell reports that encoder.image_dense.weight was
# re-initialised: the checkpoint stores a [1024, 1024] matrix while patch_size=(49, 2048)
# builds a [1024, 2048] one. ignore_mismatched_sizes=True lets the load succeed, but that
# layer then holds untrained weights, so anything relying on the image features is suspect.
# Presumably the checkpoint was trained on 1024-dimensional image features - TODO confirm
# and load features / patch_size that match the checkpoint.
model=T5ForMultimodalGeneration.from_pretrained("models/mm-cot-large-rationale/",patch_size=(49, 2048), ignore_mismatched_sizes=True)

Some weights of T5ForMultimodalGeneration were not initialized from the model checkpoint at models/mm-cot-large-rationale/ and are newly initialized because the shapes did not match:
- encoder.image_dense.weight: found shape torch.Size([1024, 1024]) in the checkpoint and torch.Size([1024, 2048]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [70]:
# Inference-only configuration for the rationale stage: training and evaluation are
# switched off and predictions are produced with generate() (predict_with_generate=True),
# up to 512 tokens. The optimiser-related values (learning_rate, weight_decay,
# num_train_epochs, train batch size) are not exercised by the predict() call made in
# this notebook. "Saving" is the trainer's output directory.
training_args = Seq2SeqTrainingArguments(
            "Saving",
            do_train=False,
            do_eval=False,
            evaluation_strategy="no",
            logging_strategy="steps",
            save_strategy="epoch",
            save_total_limit = 2,
            learning_rate= 5e-5,
            eval_accumulation_steps=None,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=4,
            weight_decay=0.01,
            num_train_epochs=50,
            predict_with_generate=True,
            generation_max_length=512,
            report_to="none",
        )

In [71]:
# Trainer used purely as a batched generation driver; no train/eval datasets are attached.
# ROUGE is computed on the predict() output through compute_metrics_rougel.
trainer = Seq2SeqTrainer(
                        model=model,
                        args=training_args,
                        data_collator=datacollator,
                        tokenizer=tokenizer,
                        compute_metrics = compute_metrics_rougel
    )

In [72]:
# Generate rationales for every example in `data`; max_length caps the generated sequence.
predict_results = trainer.predict(test_dataset = data, max_length=512)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [73]:
# Decode generated ids and reference ids back to text.
preds, targets = predict_results.predictions, predict_results.label_ids
pad_id = tokenizer.pad_token_id
# -100 is a padding marker the tokenizer cannot decode; replace it in BOTH arrays
# (label_ids can carry it as well as the predictions).
preds = np.where(preds != -100, preds, pad_id)
targets = np.where(targets != -100, targets, pad_id)
preds = tokenizer.batch_decode(
            preds, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
targets = tokenizer.batch_decode(
            targets, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
preds = [pred.strip() for pred in preds]

In [74]:
# Persist the rationale-stage output.
# The answer stage reads this file as rationale[qid]["generated_rationale"], so each
# prediction is also stored under its question id. The flat "preds"/"labels" lists are
# kept for any consumer of the previous layout (question ids are numeric strings, so
# they cannot collide with those two keys).
if not (len(preds) == len(targets) == len(problems_s)):
    raise ValueError(
        "preds/targets are not aligned with problems_s; re-run the rationale stage from the top"
    )
output_data = {"preds": preds,
                "labels": targets}
for qid, pred, target in zip(problems_s, preds, targets):
    output_data[str(qid)] = {"generated_rationale": pred, "reference": target}

with open("Saving/predictions_ans_eval.json", "w") as writer:
    writer.write(json.dumps(output_data, indent=4))

In [27]:
# Inspect the decoded rationale predictions.
preds

['Solution: Will these magnets attract or repel? To find out, look at which poles are closest to each other.nThe north pole of one magnet is closest to the south pole of the other magnet. Poles that are different attract. So, these magnets will attract each other.']

In [28]:
# Inspect the decoded reference solutions for comparison with `preds`.
targets

['Solution: Will these magnets attract or repel? To find out, look at which poles are closest to each other. The north pole of one magnet is closest to the south pole of the other magnet. Poles that are different attract. So, these magnets will attract each other.']

# Generate Answers

In [39]:
# Load the rationale-stage output. Later cells index it as
# rationale[qid]["generated_rationale"].
with open("Saving/predictions_ans_eval.json") as handle:
    rationale = json.load(handle)

In [42]:
# Spot-check the generated rationale stored for question "7".
rationale["7"]["generated_rationale"]

"Solution: Look at the picture of the sturgeon.nThe sturgeon's mouth is located on the underside of its head and points downward. Its mouth is adapted for bottom feeding. The sturgeon uses its mouth to find food hidden in the sediment of the ocean floor.nNow look at each animal. Figure out which animal has a similar adaptation.nThe armored catfish's mouth is located on the underside of its head. Its mouth points downward. Its mouth is adapted for bottom feeding.nThe discus's mouth is not located on the underside of its head. Its mouth is not adapted for bottom feeding."

In [47]:
# Answer choices of question "7", to read against the rationale shown above.
problems["7"]["choices"]

['discus', 'armored catfish']

In [117]:
# Switch the module-level tokenizer (read by Data and the compute_metrics functions)
# and the collator over to the answer checkpoint.
tokenizer=AutoTokenizer.from_pretrained("models/mm-cot-large-answer/")
datacollator = DataCollatorForSeq2Seq(tokenizer)

In [137]:
# Build answer-stage (prompt, target) pairs. Only problems that have an entry in
# `rationale` are kept; their generated rationale is appended to the prompt.
options=["A","B","C","D","E"]
target_texts = []
source_texts = []
image_ids = []
for k in problems_s:
    problem = problems_s[k]
    question = problem["question"]

    # Context = textual hint followed by the image caption; "N/A" when both are empty.
    txt_context = problem['hint']
    img_context = problem['caption']
    context = " ".join([txt_context, img_context]).strip()
    if context == "":
        context = "N/A"

    # Render the choices as "(A) first (B) second ...".
    choices = problem['choices']
    choice_list = ["({}) {}".format(options[i], c) for i, c in enumerate(choices)]
    choice_txt = " ".join(choice_list)

    solution = problem["solution"]
    image = problem["image_feature"]
    # Gold answer as a bracketed letter, e.g. "(B)".
    answer = "(" + options[problem['answer']] + ")"

    if k not in rationale:
        continue
    prompt, target = create_one_example(
        question, context, choice_txt, solution, answer, rationale[k]["generated_rationale"]
    )

    target_texts.append(target)
    source_texts.append(prompt)
    image_ids.append(image)

In [139]:
# Rebind `data` to the answer-stage examples (replaces the rationale-stage dataset).
data=Data(target_texts,source_texts,image_ids)

In [140]:
def extract_ans(ans):
    """Pull the option letter out of a "The answer is (X)" statement.

    Returns the single capital letter when the text contains exactly one such
    statement, and the string "FAILED" when it contains none or more than one.
    """
    letters = re.findall(r'The answer is \(([A-Z])\)', ans)
    return letters[0] if len(letters) == 1 else "FAILED"

def compute_metrics_acc(eval_preds):
    """Exact-match accuracy of the extracted answer letters.

    Predictions and references are decoded with the module-level `tokenizer`, the
    option letter is extracted from each with extract_ans, and a prediction counts
    as correct when both extractions are equal.

    Returns:
        dict: {'accuracy': fraction of matching pairs}.
    """
    preds, targets = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 padding cannot be decoded; map it to the pad token first.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decode_options = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True)
    preds = tokenizer.batch_decode(preds, **decode_options)
    targets = tokenizer.batch_decode(targets, **decode_options)

    assert len(preds) == len(targets)
    correct = sum(
        1
        for pred, reference in zip(preds, targets)
        if extract_ans(reference) == extract_ans(pred)
    )
    return {'accuracy': 1.0*correct/len(targets)}


In [141]:
# NOTE(review): the load log for this cell reports that encoder.image_dense.weight was
# re-initialised: the checkpoint stores a [1024, 1024] matrix while patch_size=(49, 2048)
# builds a [1024, 2048] one. ignore_mismatched_sizes=True lets the load succeed, but that
# layer then holds untrained weights, so anything relying on the image features is suspect.
# Presumably the checkpoint was trained on 1024-dimensional image features - TODO confirm
# and load features / patch_size that match the checkpoint.
model=T5ForMultimodalGeneration.from_pretrained("models/mm-cot-large-answer/",patch_size=(49, 2048), ignore_mismatched_sizes=True)

Some weights of T5ForMultimodalGeneration were not initialized from the model checkpoint at models/mm-cot-large-answer/ and are newly initialized because the shapes did not match:
- encoder.image_dense.weight: found shape torch.Size([1024, 1024]) in the checkpoint and torch.Size([1024, 2048]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [142]:
# Same inference-only configuration as the rationale stage, except that generation is
# capped at 64 tokens (generation_max_length=64). The optimiser-related values are not
# exercised by the predict() call made in this notebook.
training_args = Seq2SeqTrainingArguments(
            "Saving",
            do_train=False,
            do_eval=False,
            evaluation_strategy="no",
            logging_strategy="steps",
            save_strategy="epoch",
            save_total_limit = 2,
            learning_rate= 5e-5,
            eval_accumulation_steps=None,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=4,
            weight_decay=0.01,
            num_train_epochs=50,
            predict_with_generate=True,
            generation_max_length=64,
            report_to="none",
        )

In [143]:
# Trainer for the answer stage; accuracy is computed through compute_metrics_acc.
trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        data_collator=datacollator,
        tokenizer=tokenizer,
        compute_metrics = compute_metrics_acc
    )

In [144]:
# Generate answers for every example in `data`; max_length caps the generated sequence.
predict_results = trainer.predict(test_dataset = data, max_length=64)

In [168]:
# Token-id arrays: generated answers and their references (decoded in the next cell).
preds, targets = predict_results.predictions, predict_results.label_ids

In [169]:
# Decode generated ids and reference ids back to text.
pad_id = tokenizer.pad_token_id
# -100 is a padding marker the tokenizer cannot decode; replace it in BOTH arrays
# (label_ids can carry it as well as the predictions).
preds = np.where(preds != -100, preds, pad_id)
targets = np.where(targets != -100, targets, pad_id)
preds = tokenizer.batch_decode(
    preds, skip_special_tokens=True, clean_up_tokenization_spaces=True
)
targets = tokenizer.batch_decode(
    targets, skip_special_tokens=True, clean_up_tokenization_spaces=True
)

In [200]:
def get_acc_with_contion(res_pd, key, values):
    """Accuracy (percent, two decimals) over the rows of `res_pd` selected by `key`.

    Args:
        res_pd: DataFrame with a 'true_false' column marking correct predictions.
        key: column used to select rows.
        values: a single value (rows where the column equals it) or a list of
            values (rows where the column is one of them).

    Returns:
        str: the accuracy formatted as "{:.2f}"; "0.00" when no row is selected.
    """
    if isinstance(values, list):
        total_pd = res_pd[res_pd[key].isin(values)]
    else:
        total_pd = res_pd[res_pd[key] == values]

    # Test the row count. Comparing the DataFrame itself to 0 yields another
    # DataFrame, whose truth value is ambiguous and raises ValueError.
    if len(total_pd) == 0:
        return "0.00"

    # `== True` (not a bare truth test) because the column may have object dtype.
    correct_pd = total_pd[total_pd['true_false'] == True]
    return "{:.2f}".format(len(correct_pd) / len(total_pd) * 100)

def get_scores(result_data, rationale_data, results_reference, data_file, expected_num=None):
    """Score predicted answer indices against the test split of the problem file.

    Only test-split questions that have an entry in `result_data` are scored, so
    the function works for the full split as well as for a small subset.

    Args:
        result_data: dict mapping question id (str) to the predicted option index.
        rationale_data: unused; kept for interface compatibility.
        results_reference: unused; kept for interface compatibility.
        data_file: path of the problems JSON file.
        expected_num: optional number of predictions that must be present
            (e.g. 4241 for a full test-split run); no check when None.

    Returns:
        dict: {"answer": {...}} with per-subject, per-context, per-grade and
        average accuracies, each formatted as a string by "{:.2f}".

    Raises:
        ValueError: when `expected_num` is given and does not match, or when none
            of the predictions belongs to the test split.
    """
    results = result_data
    if expected_num is not None and len(results) != expected_num:
        raise ValueError(
            "expected {} predictions, got {}".format(expected_num, len(results))
        )

    # read data file
    with open(data_file, encoding="utf-8") as handle:
        sqa_data = json.load(handle)

    # One row per question; the index holds the question ids.
    sqa_pd = pd.DataFrame(sqa_data).T
    is_test = sqa_pd['split'] == 'test'
    has_prediction = sqa_pd.index.isin(list(results))
    # .copy() so the columns added below go to an independent frame, not a slice of sqa_pd.
    res_pd = sqa_pd[is_test & has_prediction].copy()

    num = len(res_pd)
    if num == 0:
        raise ValueError("none of the supplied predictions belongs to the test split")

    # Context flags, derived from the truthiness of the 'hint' and 'image' fields.
    has_text = res_pd['hint'].map(bool)
    has_image = res_pd['image'].map(bool)
    res_pd['no_context'] = ~has_text & ~has_image
    res_pd['has_text'] = has_text
    res_pd['has_image'] = has_image
    res_pd['has_text_image'] = has_text & has_image

    res_pd['pred'] = [int(results[qid]) for qid in res_pd.index]
    res_pd['true_false'] = [
        label == pred for label, pred in zip(res_pd['answer'], res_pd['pred'])
    ]

    # accuracy scores
    acc_average = len(res_pd[res_pd['true_false'] == True]) / num * 100

    scores = {
            "answer":{
                'acc_natural':
                get_acc_with_contion(res_pd, 'subject', 'natural science'),
                'acc_social':
                get_acc_with_contion(res_pd, 'subject', 'social science'),
                'acc_language':
                get_acc_with_contion(res_pd, 'subject', 'language science'),
                'acc_has_text':
                get_acc_with_contion(res_pd, 'has_text', True),
                'acc_has_image':
                get_acc_with_contion(res_pd, 'has_image', True),
                'acc_no_context':
                get_acc_with_contion(res_pd, 'no_context', True),
                'acc_grade_1_6':
                get_acc_with_contion(res_pd, 'grade', ['grade1', 'grade2', 'grade3', 'grade4', 'grade5', 'grade6']),
                'acc_grade_7_12':
                get_acc_with_contion(res_pd, 'grade', ['grade7', 'grade8', 'grade9', 'grade10', 'grade11', 'grade12']),
                'acc_average':
                "{:.2f}".format(acc_average),
            }}

    return scores

In [201]:
# Convert each decoded prediction into an option index, score the result and save it.
results_ans = {}
results_rationale = {}
results_reference = {}

# Question ids being scored, in the same order as `preds` / `targets`.
eval_qids = ["7"]

num_fail = 0
for idx, qid in enumerate(eval_qids):
    pred = preds[idx]
    ref = targets[idx]
    extract_pred = extract_ans(pred)
    if extract_pred in options:
        extract_pred = options.index(extract_pred)
    else:
        # No usable letter: count extraction failures and fall back to a random option.
        # NOTE: the fallback is unseeded, so scores can vary between runs when it triggers.
        if extract_pred == "FAILED":
            num_fail += 1
        extract_pred = random.choice(range(len(options)))
    results_ans[str(qid)] = extract_pred
    results_rationale[str(qid)] = pred
    results_reference[str(qid)] = ref

scores = get_scores(results_ans, results_rationale, results_reference, "data/problems.json")
preds = [pred.strip() for pred in preds]
output_data = {
        "num_fail": num_fail,
        "scores": scores,
        "preds": preds,
        "labels": targets}

with open("Saving/ans_eval.json", "w") as writer:
    writer.write(json.dumps(output_data, indent=4))