In [2]:
%load_ext autoreload
%autoreload 2

import os
os.chdir("/nfs/turbo/coe-chaijy/sstorks/simulation_informed_pcr4nlu/TRAVEl")
from travel import init_travel
init_travel()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Test VQG

Load LM:

In [3]:
import torch
from transformers import pipeline, BitsAndBytesConfig
from travel.constants import HF_TOKEN

# Load LM(s)
print("Setting up LM(s)...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
model_kwargs = {"quantization_config": bnb_config}
lm = pipeline("text-generation", 
            model="meta-llama/Llama-2-7b-hf", 
            token=HF_TOKEN,
            model_kwargs=model_kwargs)
lm.tokenizer.padding_side = "left"
lm.tokenizer.pad_token_id = lm.model.config.eos_token_id

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Setting up LM(s)...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from pprint import pprint
from transformers import PhrasalConstraint, DisjunctiveConstraint
from travel.data.vqa import get_vqa_response_token_ids, VQAResponse

# kwargs to force question generations to have a "?" and start with "Is" or "Are"
question_generation_constraints = [    
    PhrasalConstraint(
        [vlm_processor.tokenizer("Is it blue?", add_special_tokens=False).input_ids[-1]]
    ),
]
yes_no_q_tokens = [
    vlm_processor.tokenizer("Is it blue?", add_special_tokens=False).input_ids[0], 
    vlm_processor.tokenizer("Are they blue?", add_special_tokens=False).input_ids[0],
    vlm_processor.tokenizer("Does it look blue?", add_special_tokens=False).input_ids[0],
    vlm_processor.tokenizer("Do they look blue?", add_special_tokens=False).input_ids[0],
]
begin_suppress_tokens = [t for t in list(range(vlm_processor.tokenizer.vocab_size)) if t not in yes_no_q_tokens]
question_generation_kwargs = {
    "constraints": question_generation_constraints,
    "begin_suppress_tokens": begin_suppress_tokens,
}

In [15]:
from pprint import pprint

from travel.data.vqg import generate_vqg_prompt_icl

lm.model.generation_config.do_sample = False
lm.model.generation_config.num_beams = 4
lm.model.generation_config.num_beam_groups = 4
lm.model.generation_config.diversity_penalty = 1.0
lm.model.generation_config.num_return_sequences = 4

step = "In a bowl, add the cut cherry tomatoes"
# step = "Chop the onions on the cutting board with a knife"
# step = "Pour the tomatoes from the bowl into the cup"
# step = "Take the onions from the bowl"
# step = "Dump out the cup of water into the sink"

prompt = generate_vqg_prompt_icl(step, n_demonstrations=20)

vqg_output = lm(prompt, max_new_tokens=64, return_full_text=False)

pprint(vqg_output)

[{'generated_text': '1. Are there cherry tomatoes in the bowl? Yes\n'
                    '2. Are there any cherry tomatoes that are not in the '
                    'bowl? No\n'
                    '\n'
                    'The instructions say "Put the pizza in the oven". To '
                    'visually verify that this procedure is complete, what are '
                    '2 questions we'},
 {'generated_text': '1. Are the tomatoes in the bowl? Yes\n'
                    '2. Are there any tomatoes that are not in the bowl? No\n'
                    '\n'
                    'The instructions say "Put the pizza in the oven". To '
                    'visually verify that this procedure is complete, what are '
                    '2 questions we could ask about an'},
 {'generated_text': '1. Is there a bowl? Yes\n'
                    '2. Is there a cut cherry tomato in the bowl? Yes\n'
                    '\n'
                    'The instructions say "Put the pizza in the oven". To 

In [19]:
from pprint import pprint

from travel.data.vqg import VQGOutputs, parse_vqg_outputs
from travel.model.metrics import consistency_metrics_vqg

all_vqg_outputs = {si: parse_vqg_outputs(s['generated_text'], si, step) for si, s in enumerate(vqg_output)}
metrics = consistency_metrics_vqg(all_vqg_outputs)

for output in all_vqg_outputs.values():
    pprint(output)
    pprint(metrics['metrics_by_output'][output.procedure_id])
    print("")

`low_cpu_mem_usage` was None, now set to True since model is quantized.
running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 13.73it/s]
running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 14.71it/s]

VQGOutputs(procedure_id=0,
           procedure_description='In a bowl, add the cut cherry tomatoes',
           questions=['Are there cherry tomatoes in the bowl?',
                      'Are there any cherry tomatoes that are not in the '
                      'bowl?'],
           answers_str=['Yes', 'No'],
           answers=[<VQAResponse.Yes: 1>, <VQAResponse.No: 0>],
           target_object=None)
{'consistency': [0.889, 0.398],
 'informativeness': [0.961, 0.636],
 'relevance': [0.924, 0.626]}

VQGOutputs(procedure_id=1,
           procedure_description='In a bowl, add the cut cherry tomatoes',
           questions=['Are the tomatoes in the bowl?',
                      'Are there any tomatoes that are not in the bowl?'],
           answers_str=['Yes', 'No'],
           answers=[<VQAResponse.Yes: 1>, <VQAResponse.No: 0>],
           target_object=None)
{'consistency': [0.876, 0.397],
 'informativeness': [0.982, 0.642],
 'relevance': [0.892, 0.62]}

VQGOutputs(procedure_id=2,
     




# Test self-reflection step

In [98]:
from travel.data.vqa import get_vqa_response_token_ids, VQAOutputs, VQAResponse
import torch

lm.model.generation_config.temperature = None
lm.model.generation_config.do_sample = False
lm.model.generation_config.top_p = None

VQG_REFLECTION_TEMPLATE_TEMPLATE = 'Question: The instructions say "{instruction_step}". After this procedure is complete, {question}? (yes/no) Answer:'

step = "Take the onions from the bowl"
# question = "Is the onion in the bowl?"
question = "Is the onion in someone's hand?"

prompt = VQG_REFLECTION_TEMPLATE_TEMPLATE.format(instruction_step=step, question=question[0].lower() + question[1:])

with torch.no_grad():
    inputs = lm.tokenizer(prompt, return_tensors="pt")
    logits = lm.model(**inputs).logits[0, -1, :]
    print(logits.shape)
    response_token_ids = get_vqa_response_token_ids(lm.tokenizer)
    this_probs = torch.stack([logits[response_token_ids[response_type]] for response_type in VQAResponse], dim=0)
    this_probs = torch.softmax(this_probs, dim=0)

predicted_answer = VQAResponse(torch.argmax(this_probs, dim=0).numpy())

this_probs = this_probs.numpy()
answer_probs = {response_type: this_probs[response_type.value] for response_type in VQAResponse}

print(answer_probs)

torch.Size([32000])
{<VQAResponse.No: 0>: 0.61878043, <VQAResponse.Yes: 1>: 0.38121957}
