In [1]:
%load_ext autoreload
%autoreload 2

import os
from travel import init_travel
init_travel()

# Test VQG

Load LM:

In [3]:
import torch
from transformers import pipeline, BitsAndBytesConfig
from travel.constants import HF_TOKEN

# Load LM(s)
print("Setting up LM(s)...")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
model_kwargs = {"quantization_config": bnb_config}
lm = pipeline("text-generation", 
            model="meta-llama/Llama-2-7b-hf", 
            token=HF_TOKEN,
            model_kwargs=model_kwargs)
lm.tokenizer.padding_side = "left"
lm.tokenizer.pad_token_id = lm.model.config.eos_token_id

Setting up LM(s)...


`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [99]:
from travel.data.vqg import N_GENERATED_QUESTIONS, VQGOutputs
from travel.data.vqg import VQG_DEMONSTRATIONS
import random

VQG_PROMPT_TEMPLATE = 'The instructions say "{instruction_step}". To visually verify that this procedure is complete, what are {n_questions} questions we could ask about an image of the scene and their expected answers?\n'
VQG_EXAMPLE_TEMPLATE = VQG_PROMPT_TEMPLATE + \
                       "{question_list}"
VQG_QUESTION_TEMPLATE = "{question_number}. {question} (yes/no) {answer}"

def generate_vqg_prompt(instruction_step: str) -> str:
    """
    Returns a prompt for VQG, i.e., for zero-shot inference or to come after several in-context demonstrations.

    :param instruction_step: Recipe or instruction step to generate instructions for. Should usually be a sentence in imperative form.
    :return: String including a prompt to generate `n_questions` questions to verify the success of `instruction_step`.
    """
    return VQG_PROMPT_TEMPLATE.format(instruction_step=instruction_step,
                                      n_questions=str(N_GENERATED_QUESTIONS))

def generate_vqg_example(vqg_output: VQGOutputs) -> str:
    """
    Returns a full VQG prompt example for in-context learning.

    :param vqg_output: VQGOutputs object for in-context VQG example.
    :return: String including a full demonstration of a prompt and several questions and expected answers for generating visual verification questions.
    """
    return VQG_EXAMPLE_TEMPLATE.format(instruction_step=vqg_output.procedure_description,
                                       n_questions = len(vqg_output.questions),
                                       question_list="\n".join([VQG_QUESTION_TEMPLATE.format(
                                            question_number=question_idx + 1,
                                            question=question,
                                            answer=answer.name
                                       ) for question_idx, (question, answer) in enumerate(zip(vqg_output.questions, vqg_output.answers))]))

def generate_vqg_prompt_icl(procedure_description: str, n_demonstrations: int=3) -> str:
    """
    Returns a prompt for VQG including in-context demonstrations.

    :param procedure_description: String description of a procedure (e.g., recipe step) to generate visual questions for.
    :param n_demonstrations: Number of in-context demonstrations to include from `VQG_DEMONSTRATIONS`.
    :return: Prompt for VQG including in-context demonstrations.
    """
    assert n_demonstrations <= len(VQG_DEMONSTRATIONS), f"Requested {n_demonstrations} in-context demonstrations for VQG, but only {len(VQG_DEMONSTRATIONS)} are available in travel.model.vqg.VQG_DEMONSTRATIONS."
    demonstrations = VQG_DEMONSTRATIONS[:n_demonstrations]
    random.shuffle(demonstrations) # Shuffle demonstrations for each prompt to ensure the ordering is not sub-optimal
    examples = [generate_vqg_example(demo) for demo in demonstrations]
    examples += [generate_vqg_prompt(procedure_description)]
    return "\n\n".join(examples)


In [100]:
lm.model.generation_config.temperature = 0.4
lm.model.generation_config.do_sample = True
lm.model.generation_config.top_p = 0.9

# step = "In a bowl, add the cut cherry tomatoes"
# step = "Chop the onions on the cutting board with a knife"
# step = "Pour the tomatoes from the bowl into the cup"
step = "Take the onions from the bowl"
# step = "Dump out the cup of water into the sink"

prompt = generate_vqg_prompt_icl(step, n_demonstrations=20)

vqg_output = lm(prompt, max_new_tokens=128, return_full_text=False)

print(vqg_output)

[{'generated_text': '1. Are the onions in the bowl? (yes/no) No\n2. Are there any onions that are not in the bowl? (yes/no) Yes\n\nThe instructions say "Put the water in the kettle". To visually verify that this procedure is complete, what are 2 questions we could ask about an image of the scene and their expected answers?\n1. Is the kettle full? (yes/no) Yes\n2. Is the kettle empty? (yes/no) No\n\nThe instructions say "Pour the water into the k'}]


# Test self-reflection step

In [98]:
from travel.data.vqa import get_vqa_response_token_ids, VQAOutputs, VQAResponse
import torch

lm.model.generation_config.temperature = None
lm.model.generation_config.do_sample = False
lm.model.generation_config.top_p = None

VQG_REFLECTION_TEMPLATE_TEMPLATE = 'Question: The instructions say "{instruction_step}". After this procedure is complete, {question}? (yes/no) Answer:'

step = "Take the onions from the bowl"
# question = "Is the onion in the bowl?"
question = "Is the onion in someone's hand?"

prompt = VQG_REFLECTION_TEMPLATE_TEMPLATE.format(instruction_step=step, question=question[0].lower() + question[1:])

with torch.no_grad():
    inputs = lm.tokenizer(prompt, return_tensors="pt")
    logits = lm.model(**inputs).logits[0, -1, :]
    print(logits.shape)
    response_token_ids = get_vqa_response_token_ids(lm.tokenizer)
    this_probs = torch.stack([logits[response_token_ids[response_type]] for response_type in VQAResponse], dim=0)
    this_probs = torch.softmax(this_probs, dim=0)

predicted_answer = VQAResponse(torch.argmax(this_probs, dim=0).numpy())

this_probs = this_probs.numpy()
answer_probs = {response_type: this_probs[response_type.value] for response_type in VQAResponse}

print(answer_probs)

torch.Size([32000])
{<VQAResponse.No: 0>: 0.61878043, <VQAResponse.Yes: 1>: 0.38121957}
