In [1]:
# Enable IPython's autoreload extension in mode 2 so edited modules are re-imported
# before each cell runs.
%load_ext autoreload
%autoreload 2

import os
# Switch the working directory to the TRAVEl checkout before importing from `travel`
# (presumably so the package resolves from the working directory — TODO confirm).
os.chdir("/nfs/turbo/coe-chaijy/sstorks/simulation_informed_pcr4nlu/TRAVEl")
from travel import init_travel
# Project-specific initialization; its behavior is defined in `travel` (not shown here).
init_travel()

In [4]:
import torch
from transformers import BitsAndBytesConfig, AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoModelForVision2Seq, AutoProcessor
import spacy

from travel.model.nli import NLI_MODEL_PATH

# Quantization settings shared by both models loaded below: 4-bit NF4 weights with
# double quantization and bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # NOTE(review): the two llm_int8_* options look specific to 8-bit loading —
    # confirm whether they have any effect together with load_in_4bit.
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

# NLI sequence classifier and its tokenizer, both loaded from the project-configured
# NLI_MODEL_PATH; the classifier is quantized with bnb_config.
nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL_PATH, quantization_config=bnb_config)
nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL_PATH)
# spaCy pipeline "en_core_web_lg" (not referenced by the other cells visible here).
nlp = spacy.load("en_core_web_lg")

# Vision-language model and its processor, loaded by Hugging Face model id and
# quantized with the same bnb_config as the NLI model.
VLM_NAME = "llava-hf/llava-1.5-7b-hf"
vlm = AutoModelForVision2Seq.from_pretrained(VLM_NAME, 
                                            quantization_config=bnb_config)
vlm_processor = AutoProcessor.from_pretrained(VLM_NAME)
# Pad on the left and reuse the EOS token id as the padding token id.
vlm_processor.tokenizer.padding_side = "left"
vlm_processor.tokenizer.pad_token_id = vlm_processor.tokenizer.eos_token_id

`low_cpu_mem_usage` was None, now set to True since model is quantized.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Short handles for the text-only parts of the VLM; both are passed to the
# coherence-metric computation further down.
tokenizer = vlm_processor.tokenizer  # tokenizer owned by the VLM processor
lm = vlm.language_model  # language-model submodule of the VLM

# Load saved outputs from a previous iterative VQA run

In [8]:
import json
import os

# Directory of the saved run to analyze. Swap in the commented-out path below to
# analyze the "topdown" variant of the same run instead.
this_results_dir = "/home/sstorks/coe-chaijy/sstorks/simulation_informed_pcr4nlu/TRAVEl/saved_results_222/vqa_mistake_detection/ego4d_single_debug250/llava-1.5-7b-hf/IterativeVQA_q10_ego4d_single_debug250_llava-1.5-7b-hf_beam8-4_likelihood_nohistory_20240815204213"
# this_results_dir = "/home/sstorks/coe-chaijy/sstorks/simulation_informed_pcr4nlu/TRAVEl/saved_results_222/vqa_mistake_detection/ego4d_single_debug250/llava-1.5-7b-hf/IterativeVQA_topdown_q10_ego4d_single_debug250_llava-1.5-7b-hf_beam8-4_likelihood_nohistory_20240817105952"

# Per-example outputs of the run; the cells below treat this as a mapping whose
# values are per-example result dicts. The context manager closes the file handle
# as soon as parsing finishes.
with open(os.path.join(this_results_dir, "outputs_val.json"), "r") as outputs_file:
    all_results_dicts = json.load(outputs_file)

In [None]:
import numpy as np
from tqdm import tqdm

from travel.model.metrics import question_coherence_metrics_nli, compile_accuracy_and_coherence_metrics
from travel.model.mistake_detection import MISTAKE_DETECTION_THRESHOLDS

# Re-score the saved run as if every dialog had been cut off after turn `turn_idx`:
# for each cutoff, recompute accuracy and NLI-based coherence metrics over the first
# turn_idx + 1 question/answer turns and save them under
# <this_results_dir>/allturns<turn_idx>/.

# Files written for every cutoff, in write order. A cutoff counts as finished only
# when all of them exist, so a run that crashed midway is recomputed instead of being
# skipped because its (empty) directory already exists. Delete a cutoff's directory to
# force recomputation.
turn_output_filenames = ("metrics_accuracy_val.json", "metrics_coherence_nli_val.json", "outputs_val.json")

# Maps the argmax index of an answer-probability vector to the answer string.
label_answer_mapping = {0: "No", 1: "Yes"}

# Loop-invariant inputs, computed once.
results_dicts = list(all_results_dicts.values())
all_labels = [results_dict['mistake'] for results_dict in results_dicts]

# NOTE(review): 10 presumably matches the "q10" in the run name (max questions per
# example) — confirm if analyzing a run with a different question budget.
for turn_idx in tqdm(range(10)):

    allturns_results_dir = os.path.join(this_results_dir, f"allturns{turn_idx}")
    if all(os.path.exists(os.path.join(allturns_results_dir, filename)) for filename in turn_output_filenames):
        continue
    os.makedirs(allturns_results_dir, exist_ok=True)

    # Success probability of each example at the cutoff turn.
    all_probs = [results_dict['success_probs'][turn_idx] for results_dict in results_dicts]

    # One (example, turn) pair per question asked up to and including the cutoff.
    # Every per-question list below is derived from this single sequence so they stay
    # aligned; indexing (rather than slicing) raises if an example has fewer turns.
    turn_pairs = [(results_dict, question_idx) for results_dict in results_dicts for question_idx in range(turn_idx + 1)]

    # The question text asked at each turn (previously this collected the turn
    # indices themselves instead of the questions).
    all_chosen_questions = [results_dict['questions'][question_idx] for results_dict, question_idx in turn_pairs]
    # Earlier questions of the same example, dropping those answered "Unsure".
    all_previous_questions = [[q for qi, q in enumerate(results_dict['questions'][:question_idx]) if results_dict['answers'][qi] != "Unsure"] for results_dict, question_idx in turn_pairs]

    # Predicted answer per turn = argmax over that turn's answer probabilities
    # (previously argmax was taken over the turn index, which always yielded "No").
    # NOTE(review): assumes each result dict stores per-turn probabilities under
    # 'answer_probs' with index 0 = "No", 1 = "Yes" — confirm against outputs_val.json.
    all_predicted_answers = [label_answer_mapping[int(np.argmax(results_dict['answer_probs'][question_idx]))] for results_dict, question_idx in turn_pairs]
    # Earlier answers of the same example, dropping "Unsure".
    all_previous_answers = [[a for a in results_dict['answers'][:question_idx] if a != "Unsure"] for results_dict, question_idx in turn_pairs]

    all_coherence_metrics = question_coherence_metrics_nli(nli_tokenizer,
                                                           nli_model,
                                                           tokenizer,
                                                           lm,
                                                           [results_dict['procedure'] for results_dict, _ in turn_pairs],
                                                           all_chosen_questions,
                                                           answers=all_predicted_answers,
                                                           previous_questions=all_previous_questions,
                                                           previous_answers=all_previous_answers,
                                                           mistake_labels=[results_dict['mistake'] for results_dict, _ in turn_pairs],
                                                           rephrase_batch_size=20)

    # Copy of the run outputs with the cutoff recorded as each example's final turn.
    this_results_dicts = {k: v | {"final_turn": turn_idx} for k, v in all_results_dicts.items()}
    # The trailing 0.1 is passed positionally; its meaning is defined by
    # compile_accuracy_and_coherence_metrics (not shown here).
    accuracy_metrics_by_threshold, coherence_metrics = compile_accuracy_and_coherence_metrics(all_labels, all_probs, all_coherence_metrics, this_results_dicts, MISTAKE_DETECTION_THRESHOLDS, 0.1)

    # Write the outputs in the order listed in turn_output_filenames, closing each
    # file before the next is opened.
    turn_payloads = (accuracy_metrics_by_threshold, coherence_metrics, this_results_dicts)
    for filename, payload in zip(turn_output_filenames, turn_payloads):
        with open(os.path.join(allturns_results_dir, filename), "w") as out_file:
            json.dump(payload, out_file, indent=4)


  0%|          | 0/10 [00:00<?, ?it/s]
running generation (cuda:0):   0%|          | 0/25 [00:00<?, ?it/s][A
running generation (cuda:0):   4%|▍         | 1/25 [00:01<00:40,  1.70s/it][A
running generation (cuda:0):   8%|▊         | 2/25 [00:03<00:39,  1.70s/it][A
running generation (cuda:0):  12%|█▏        | 3/25 [00:05<00:37,  1.71s/it][A
running generation (cuda:0):  16%|█▌        | 4/25 [00:06<00:35,  1.71s/it][A
running generation (cuda:0):  20%|██        | 5/25 [00:08<00:34,  1.71s/it][A
running generation (cuda:0):  24%|██▍       | 6/25 [00:10<00:32,  1.71s/it][A
running generation (cuda:0):  28%|██▊       | 7/25 [00:11<00:30,  1.71s/it][A
running generation (cuda:0):  32%|███▏      | 8/25 [00:13<00:29,  1.71s/it][A
running generation (cuda:0):  36%|███▌      | 9/25 [00:15<00:27,  1.71s/it][A
running generation (cuda:0):  40%|████      | 10/25 [00:17<00:25,  1.71s/it][A
running generation (cuda:0):  44%|████▍     | 11/25 [00:18<00:23,  1.71s/it][A
running generation 

In [None]:
# Scan the per-turn metric files and record which cutoff turn gives the best accuracy
# and the best verifiability. Ties keep the earliest turn (strict `>` comparison).
max_accuracy = None
max_accuracy_turn = None

max_verifiability = None
max_verifiability_turn = None

for turn_idx in tqdm(range(10)):

    # Derive this turn's directory here. Reusing the variable left over from the
    # metrics-computation cell would read the same (last) turn's files on every
    # iteration and always report turn 0 as the best.
    allturns_results_dir = os.path.join(this_results_dir, f"allturns{turn_idx}")

    with open(os.path.join(allturns_results_dir, "metrics_accuracy_val.json"), "r") as accuracy_file:
        accuracy_metrics_by_threshold = json.load(accuracy_file)
    with open(os.path.join(allturns_results_dir, "metrics_coherence_nli_val.json"), "r") as coherence_file:
        coherence_metrics = json.load(coherence_file)

    this_accuracy = accuracy_metrics_by_threshold['best_metrics']['accuracy']
    # Best verifiability over all entries of 'metrics_by_threshold' for this turn.
    this_verifiability = max(threshold_metrics['verifiability'] for threshold_metrics in coherence_metrics['metrics_by_threshold'].values())

    if max_accuracy is None or this_accuracy > max_accuracy:
        max_accuracy = this_accuracy
        max_accuracy_turn = turn_idx

    if max_verifiability is None or this_verifiability > max_verifiability:
        max_verifiability = this_verifiability
        max_verifiability_turn = turn_idx

In [None]:
# Persist the best-turn summary next to the run's outputs as a two-line text file
# (no trailing newline).
lines = [
    f"Max accuracy: {max_accuracy} at turn {max_accuracy_turn}",
    f"Max verifiability: {max_verifiability} at turn {max_verifiability_turn}",
]

summary_path = os.path.join(this_results_dir, "allturns_results.txt")
with open(summary_path, "w") as f:
    f.write("\n".join(lines))