# Initial configuration

In [1]:
%load_ext autoreload
%autoreload 2

import os
os.chdir("/nfs/turbo/coe-chaijy/sstorks/simulation_informed_pcr4nlu/TRAVEl")
from travel import init_travel
init_travel()

In [55]:
LEARNING_RATE = 1e-2
BATCH_SIZE = 8
EPOCHS = 1
LORA_R=32
LORA_ALPHA=32
global_rank = 0

In [49]:
import datetime
import os

from travel.constants import RESULTS_DIR

# Set up output directory, training args, and wandb
timestamp = datetime.datetime.now()
this_results_dir = os.path.join(RESULTS_DIR, f"vqg_learning/PPO_VLM_prototyping/PPO_lr{LEARNING_RATE}_bs{BATCH_SIZE}_e{EPOCHS}_r{LORA_R}_alpha{LORA_ALPHA}_{timestamp.strftime('%Y%m%d%H%M%S')}")
wandb_run_name = f"PPO_lr{LEARNING_RATE}_bs{BATCH_SIZE}_e{EPOCHS}_r{LORA_R}_alpha{LORA_ALPHA}_{timestamp.strftime('%Y%m%d%H%M%S')}"
if not os.path.exists(this_results_dir):
    os.makedirs(this_results_dir)

# Set up models

In [4]:
from peft import LoraConfig

peft_config = LoraConfig(task_type="CAUSAL_LM",  # configured for causal LM
                        inference_mode=False,           # enable training - for inference, we can pre-compute the weight update matrix
                        r=LORA_R,                           # dimension of low-rank matrices
                        lora_alpha=LORA_ALPHA,                  # scaling coefficient of weight update
                        # target_modules="all-linear",
                        # lora_dropout=0.1,               # dropout regularization on LoRA weights
                        bias="none")                     # use LoRA to train "all" biases (alternatives: "none", "lora_only")

In [30]:
from peft import PeftModelForCausalLM
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
from trl import PPOConfig, AutoModelForCausalLMWithValueHead

VLM_NAME = "llava-hf/llava-1.5-7b-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

vlm = AutoModelForVision2Seq.from_pretrained(VLM_NAME, 
                                            quantization_config=bnb_config)
lm_ppo = PeftModelForCausalLM(vlm.language_model, peft_config)
lm_ppo = AutoModelForCausalLMWithValueHead.from_pretrained(lm_ppo)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [31]:
vlm.generation_config.temperature = None
vlm.generation_config.top_p = None
vlm.generation_config.do_sample = False
# vlm.generation_config.diversity_penalty = 1.0
vlm.generation_config.num_beams = 4
# vlm.generation_config.num_beam_groups = 1
vlm.generation_config.num_return_sequences=4

vlm.language_model.generation_config.temperature = None
vlm.language_model.generation_config.top_p = None
vlm.language_model.generation_config.do_sample = False
# vlm.language_model.generation_config.diversity_penalty = 1.0
vlm.language_model.generation_config.num_beams = 4
# vlm.language_model.generation_config.num_beam_groups = 1
vlm.language_model.generation_config.num_return_sequences=4

vlm_processor = AutoProcessor.from_pretrained(VLM_NAME)
vlm_processor.tokenizer.padding_side = "left"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [32]:
from pprint import pprint
from transformers import PhrasalConstraint, DisjunctiveConstraint
from travel.data.vqa import get_vqa_response_token_ids, VQAResponse

# kwargs to force question generations to have a "?" and start with "Is" or "Are"
question_generation_constraints = [    
    PhrasalConstraint(
        [vlm_processor.tokenizer("Is it blue?", add_special_tokens=False).input_ids[-1]]
    ),
]
yes_no_q_tokens = [
    vlm_processor.tokenizer("Is it blue?", add_special_tokens=False).input_ids[0], 
    vlm_processor.tokenizer("Are they blue?", add_special_tokens=False).input_ids[0],
    vlm_processor.tokenizer("Does it look blue?", add_special_tokens=False).input_ids[0],
    vlm_processor.tokenizer("Do they look blue?", add_special_tokens=False).input_ids[0],
]
begin_suppress_tokens = [t for t in list(range(vlm_processor.tokenizer.vocab_size)) if t not in yes_no_q_tokens]
question_generation_kwargs = {
    "constraints": question_generation_constraints,
    "begin_suppress_tokens": begin_suppress_tokens,
}

qmark_token_id = vlm_processor.tokenizer("Is it blue?", add_special_tokens=False).input_ids[-1]
newline_token_id = vlm_processor.tokenizer.encode("\n", add_special_tokens=False)[1] # this should be 13 for LLaMA 2

In [33]:
from travel.data.vqa import VQAResponse
from travel.model.nli import NLI_HYPOTHESIS_TEMPLATE, NLI_MODEL_PATH

from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Set up NLI model for online feedback
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_PATH, quantization_config=bnb_config)
nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_PATH)

def get_entailment_probability(premise, hypothesis):
    probs_expected = run_nli(nli_tokenizer, nli_model, list(zip(premise, hypothesis)))
    return probs_expected[:, 1].numpy()

def calculate_relevance_informativeness(procedure, question, vlm_answer, is_mistake):
    hypothesis = NLI_HYPOTHESIS_TEMPLATE.format(procedure=procedure)
    prob_e = get_entailment_probability([question + " " + vlm_answer.name], [hypothesis] * 2)
    prob_u = get_entailment_probability([question + " " + VQAResponse(1-vlm_answer.value).name], [hypothesis] * 2)
    
    negation_based_relevance = float(np.abs(prob_e - prob_u))
    informativeness = float(prob_e if not is_mistake else 1.0 - prob_e)

    return negation_based_relevance, informativeness

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Also add some helper prompting methods:

In [44]:
from pprint import pprint
from travel.data.vqa import get_vqa_response_token_ids, VQAResponse

def simple_prompt(vlm, vlm_processor, frame, prompt, max_new_tokens=20, avoid_str=[], generation_kwargs={}):
   
    inputs = vlm_processor(text=prompt, images=frame, padding=True, return_tensors="pt")
    inputs = inputs.to(vlm.device)

    outputs = vlm.generate(**inputs, max_new_tokens=max_new_tokens, return_dict_in_generate=True, output_scores=True, **generation_kwargs)
    outputs = vlm_processor.batch_decode(outputs.sequences, skip_special_tokens=True)
    
    outputs = [output.replace("USER:  ", "USER: <image>") for output in outputs]
    outputs = [output.replace(prompt, "") for output in outputs]
    
    for output in outputs:
        if avoid_str and any(s in output for s in avoid_str):
            continue
        return output
    
    if avoid_str:
        print("Warning: returning avoid_str!")
    return outputs[0]

def simple_prompt_textonly(lm, tokenizer, prompt, max_new_tokens=20, avoid_str=[], generation_kwargs={}):
   
    inputs = tokenizer(text=prompt, padding=True, return_tensors="pt")
    inputs = inputs.to(lm.device)

    outputs = lm.generate(**inputs, max_new_tokens=max_new_tokens, return_dict_in_generate=True, output_scores=True, **generation_kwargs)
    outputs = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
    
    outputs = [output.replace(prompt, "") for output in outputs]
    
    for output in outputs:
        if avoid_str and any(s in output for s in avoid_str):
            continue
        return output
    
    if avoid_str:
        print("Warning: returning avoid_str!")
    return outputs[0]

def yes_no_prompt(vlm, vlm_processor, frame, prompt):
    response_token_ids = get_vqa_response_token_ids(vlm_processor.tokenizer)
    inputs = vlm_processor(text=prompt, images=frame, padding=True, return_tensors="pt")
    inputs = inputs.to(vlm.device)

    logits = vlm(**inputs).logits
    logits = logits[0, -1].detach().cpu()
    
    this_probs = torch.stack([logits[response_token_ids[response_type]] for response_type in VQAResponse], dim=0)
    this_probs = torch.softmax(this_probs, dim=0)

    predicted_answer = VQAResponse(torch.argmax(this_probs, dim=0).numpy())

    this_probs = this_probs.numpy()
    answer_probs = {response_type: this_probs[response_type.value] for response_type in VQAResponse}
    return predicted_answer, answer_probs    

def cleanup_question(question):
    question = question.split("?")[0].strip() + "?"
    if "." in question:
        question = question.split(".")[1].strip()    
    return question

# Run PPO

Prepare training data:

In [35]:
from datasets import Dataset

from travel.constants import DATA_CACHE_DIR
from travel.data.ego4d import Ego4DMistakeDetectionDataset

dataset = Ego4DMistakeDetectionDataset(data_split="train",
                                        mismatch_augmentation=True,
                                        multi_frame=False,
                                        debug_n_examples_per_class=500)


# Prepare training examples from Ego4D mistake detection dataset
print(f"({global_rank}) Preparing training data...")
dataset_path = os.path.join(DATA_CACHE_DIR, f"ppo_training_dataset_icl8_debug500")
ppo_dataset = Dataset.load_from_disk(dataset_path=dataset_path)

# Balance mistake/success examples
positive_examples = [example for example in ppo_dataset if "pos" in example['example_id']]
negative_examples = [example for example in ppo_dataset if "pos" not in example['example_id']]
if len(positive_examples) < len(negative_examples):
    print(f"Upsampling {len(negative_examples) - len(positive_examples)} more positive examples.")
    ppo_dataset += random.sample(positive_examples, len(negative_examples) - len(positive_examples))
elif len(positive_examples) > len(negative_examples):
    print(f"Upsampling {len(positive_examples) - len(negative_examples)} more negative examples.")
    ppo_dataset += random.sample(negative_examples, len(positive_examples) - len(negative_examples))

print(f"train data partition: {len(ppo_dataset)} examples")


(0) Preparing training data...
train data partition: 1000 examples


Set up PPO trainer:

In [53]:
# Need this call at the beginning of every script to set random seeds and set the HF cache
from travel import init_travel
init_travel()

import numpy as np
from PIL import Image
from pprint import pprint
import random
import torch
from tqdm import tqdm
from trl import PPOConfig, AutoModelForCausalLMWithValueHead
import wandb

from travel.constants import HF_TOKEN, DATA_CACHE_DIR, RANDOM_SEED
from travel.data.mistake_detection import MistakeDetectionTasks
from travel.data.vqa import VQAResponse
from travel.data.vqg import generate_vqg_prompt_icl, VQGOutputs
from travel.model.nli import run_nli, NLI_HYPOTHESIS_TEMPLATE
from travel.model.ppo_trainer import PerTokenPPOTrainer as PPOTrainer
from travel.model.vqg import parse_vqg_outputs

# Set up PPO trainer
def collator(data):
    return {key: [d[key] for d in data] for key in data[0]}
ppo_config = PPOConfig(
    learning_rate=LEARNING_RATE,
    batch_size=4,
    mini_batch_size=4,
    gradient_accumulation_steps=1,
    remove_unused_columns=False,
    optimize_cuda_cache=True,
    early_stopping=True,
    is_peft_model=True,
    seed=RANDOM_SEED,
)
ppo_trainer = PPOTrainer(
    model=lm_ppo,
    ref_model=vlm.language_model,
    config=ppo_config,
    dataset=ppo_dataset,
    tokenizer=vlm_processor.tokenizer,
    data_collator=collator
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Training loop:

In [56]:
import numpy as np
import wandb

wandb.init(name=wandb_run_name)

for epoch in tqdm(range(EPOCHS), f"({global_rank}) epoch"):
    for batch_idx, batch in enumerate(tqdm(ppo_trainer.dataloader, desc=f"({global_rank}) batch")):
        # if batch_idx == 0:
        #     keep_batch = batch
        # else:
        #     batch = keep_batch
        
        this_batch_size = len(batch["procedure_description"])

        # Load examples
        examples = [dataset.load_example_from_file(example_dir, load_frames=False) for example_dir in batch['example_dir']]
        frames = [Image.open(example.frames[0]) for example in examples]

        prompts = [
            f'USER: <image>\nThis is a photo of someone working on the procedure "{procedure}". I will ask a series of different yes/no questions about the state of the scene to determine whether the person has successfully executed the procedure. The goal is to extract as much relevant information as possible from the scene, so I will not repeat questions.' 
            for procedure in batch['procedure_description']
        ]
        questions = [[] for _ in range(this_batch_size)]
        answers = [[] for _ in range(this_batch_size)]
        scores = [[] for _ in range(this_batch_size)]
        for question_generated in range(3):
            # Generate a question
            prompts_q = [prompt + " USER: Q: " for prompt in prompts]
            new_questions = [simple_prompt_textonly(vlm.language_model,
                                                    vlm_processor.tokenizer,
                                                    prompt.replace("<image>\n", ""),
                                                    max_new_tokens=20,
                                                    avoid_str=questions[prompt_idx],
                                                    generation_kwargs=question_generation_kwargs) for prompt_idx, prompt in enumerate(prompts_q)]
            new_questions = [cleanup_question(question) for question in new_questions]
            for qi in range(len(new_questions)):
                questions[qi].append(new_questions[qi])

            # Predict an answer (yes/no)
            prompts_a = [prompt + f'{question} ASSISTANT: A (yes/no): ' for prompt, question in zip(prompts_q, new_questions)]
            new_answers = [yes_no_prompt(vlm, vlm_processor, frame, prompt) for frame, prompt in zip(frames, prompts_a)]
            for ai in range(len(new_questions)):
                answers[ai].append(new_answers[ai])

            # Update prompts with answers
            prompts = [prompt + pred.name for prompt, (pred, _) in zip(prompts_a, new_answers)]

            # Score questions using NLI model
            new_scores = [
                calculate_relevance_informativeness(procedure, question, answer[0], example.mistake)
                for procedure, question, answer, example in zip(batch['procedure_description'], new_questions, new_answers, examples)
            ]
            new_scores = [rel * inf for rel, inf in new_scores]
            for si in range(len(new_scores)):
                scores[si].append(new_scores[si])

        # TODO: also incorporate model's final answer into scores
        # TODO: add a chance that generated question will come from LM + ICL

        print("Procedure:")
        print(batch["procedure_description"][0])        
        
        print("Generated questions, answers, and scores:")
        pprint(questions[0])
        pprint([(pred.name, probs) for pred, probs in answers[0]])
        pprint(scores[0])
        
        # Re-encode final prompts to find indices to apply reward
        query_tensors = [torch.zeros([0]).long() for _ in range(this_batch_size)] # use empty responses        
        response_tensors = [vlm_processor.tokenizer.encode(prompt, return_tensors="pt")[0] for prompt in prompts]
        reward_indices = [(response_tensor == qmark_token_id).long().cpu() for response_tensor in response_tensors]
        for i in range(len(reward_indices)):
            reward_indices[i][reward_indices[i] == 1] = torch.cumsum((reward_indices[i] == 1).int(), dim=0)[reward_indices[i] == 1]
            reward_indices[i][reward_indices[i] == 0] = -1
            reward_indices[i][reward_indices[i] != -1] -= 1
        reward = torch.tensor(scores).float()
            
        #### Run PPO step
        stats = ppo_trainer.step(query_tensors, response_tensors, reward, reward_indices)
        ppo_trainer.log_stats(stats, batch, reward, columns_to_log=("prompt", "response"))
        if global_rank == 0:
            try:
                wandb.log(stats | {"ppo/epoch": epoch, 
                                        "rewards/consistency": np.mean(reward.cpu().numpy()),})
            except Exception as e:
                print("Warning: failed to log to wandb!")
                pprint(e)

        #### Save model
        if epoch % 5 == 0 and global_rank == 0:
            if not os.path.exists(os.path.join(this_results_dir, f"epoch{epoch}")):
                os.makedirs(os.path.join(this_results_dir, f"epoch{epoch}"))
            ppo_trainer.save_pretrained(os.path.join(this_results_dir, f"epoch{epoch}"))    

print(f"({global_rank}) Done training!")

VBox(children=(Label(value='0.028 MB of 0.028 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
objective/entropy,▁▄█▅
objective/kl,█▃▁▂
objective/kl_coef,▅█▅▁
ppo/epoch,▁▁▁▁
ppo/learning_rate,▁▁▁▁
ppo/loss/policy,█▁▁▂
ppo/loss/total,▁▃█▃
ppo/loss/value,▁▄█▄
ppo/mean_non_score_reward,▁▆█▇
ppo/mean_scores,▄█▁▃

0,1
objective/entropy,931.00043
objective/kl,-48.2746
objective/kl_coef,0.19998
ppo/epoch,0.0
ppo/learning_rate,0.0001
ppo/loss/policy,-0.07328
ppo/loss/total,0.10311
ppo/loss/value,1.76387
ppo/mean_non_score_reward,0.08269
ppo/mean_scores,0.18112


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112092073178953, max=1.0…

(0) epoch:   0%|          | 0/1 [00:00<?, ?it/s]
(0) batch:   0%|          | 0/250 [00:00<?, ?it/s][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.25it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.07it/s]
  negation_based_relevance = float(np.abs(prob_e - prob_u))
  informativeness = float(prob_e if not is_mistake else 1.0 - prob_e)


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.29it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.36it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.32it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.13it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.41it/s]


runn

Procedure:
Press a button on the phone
Generated questions, answers, and scores:
["Is the phone in the person's hand?",
 "Is the phone in the person's right hand?",
 "Is the phone in the person's left hand?"]
[('Yes', {<VQAResponse.No: 0>: 0.026155619, <VQAResponse.Yes: 1>: 0.9738444}),
 ('Yes', {<VQAResponse.No: 0>: 0.23651622, <VQAResponse.Yes: 1>: 0.7634837}),
 ('No', {<VQAResponse.No: 0>: 0.9829547, <VQAResponse.Yes: 1>: 0.017045317})]
[0.19792819023132324, 0.17500615119934082, 0.7260103225708008]



(0) batch:   0%|          | 1/250 [00:39<2:45:55, 39.98s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.22it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.27it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.52it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.56it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.68it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.66it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.46it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.61it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token

Procedure:
Hold the leaves
Generated questions, answers, and scores:
['Is the person holding the leaves?',
 "Are the leaves in the person's hand?",
 'Are the leaves green?']
[('Yes', {<VQAResponse.No: 0>: 0.09670579, <VQAResponse.Yes: 1>: 0.9032942}),
 ('Yes', {<VQAResponse.No: 0>: 0.054600604, <VQAResponse.Yes: 1>: 0.94539934}),
 ('Yes', {<VQAResponse.No: 0>: 0.03410043, <VQAResponse.Yes: 1>: 0.9658996})]
[0.6898252964019775, 0.9153685569763184, 0.7953414916992188]



(0) batch:   1%|          | 2/250 [01:19<2:44:04, 39.69s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.53it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.52it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.64it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.05it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.80it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.83it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 16.06it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.06it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token

Procedure:
Pack flour from a plate with your hand
Generated questions, answers, and scores:
['Is the flour on the plate?',
 "Is the flour on the person's hand?",
 'Is the flour on the table?']
[('Yes', {<VQAResponse.No: 0>: 0.18713269, <VQAResponse.Yes: 1>: 0.81286734}),
 ('No', {<VQAResponse.No: 0>: 0.6095242, <VQAResponse.Yes: 1>: 0.39047584}),
 ('Yes', {<VQAResponse.No: 0>: 0.48633155, <VQAResponse.Yes: 1>: 0.5136685})]
[0.907508134841919, 0.014972209930419922, 0.5350005626678467]



(0) batch:   1%|          | 3/250 [02:11<3:05:53, 45.16s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.46it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.42it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.66it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.68it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.65it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.50it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.59it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.67it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token

Procedure:
Wash the avocado pear under running water
Generated questions, answers, and scores:
["Is the avocado pear in the person's hand?",
 'Is the avocado pear in the sink?',
 'Is the sink full of water?']
[('No', {<VQAResponse.No: 0>: 0.61323655, <VQAResponse.Yes: 1>: 0.38676345}),
 ('Yes', {<VQAResponse.No: 0>: 0.29746994, <VQAResponse.Yes: 1>: 0.7025301}),
 ('Yes', {<VQAResponse.No: 0>: 0.2991052, <VQAResponse.Yes: 1>: 0.70089483})]
[0.19784927368164062, 0.010212421417236328, 0.24869012832641602]



(0) batch:   2%|▏         | 4/250 [02:50<2:56:30, 43.05s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.54it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.45it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.71it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.80it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.67it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.79it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.74it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.58it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token

Procedure:
Shuffle a card
Generated questions, answers, and scores:
['Is the person holding a deck of cards?',
 "Is the deck of cards in the person's hand?",
 "Is the deck of cards in the person's hand upside down?"]
[('Yes', {<VQAResponse.No: 0>: 0.07316472, <VQAResponse.Yes: 1>: 0.92683524}),
 ('Yes', {<VQAResponse.No: 0>: 0.051082738, <VQAResponse.Yes: 1>: 0.94891727}),
 ('No', {<VQAResponse.No: 0>: 0.64779824, <VQAResponse.Yes: 1>: 0.3522018})]
[0.6431422233581543, 0.6291365623474121, 0.024912066757678986]



(0) batch:   2%|▏         | 5/250 [03:30<2:50:48, 41.83s/it][ASetting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.82it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 15.97it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 16.17it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 16.22it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 16.17it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 16.14it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 16.15it/s]


running NLI (cuda:0): 100%|██████████| 1/1 [00:00<00:00, 16.04it/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token

RuntimeError: Could not infer dtype of NoneType

Save model:

In [None]:
#### Save model
if global_rank == 0:
    print(f"({global_rank}) Saving model...")
    ppo_trainer.save_pretrained(this_results_dir)        
    wandb.finish()