### Import libraries

*After running 2 cells below I recommend to restart the kernel for the rest of the code to work properly.*

In [1]:
!pip install -q lm-polygraph

In [2]:
from IPython.display import clear_output
!git clone -q https://github.com/sn0rkmaiden/lm-polygraph.git
%cd lm-polygraph/src
%pip install -q transformers rouge-score datasets

fatal: destination path 'lm-polygraph' already exists and is not an empty directory.
/content/lm-polygraph/src


In [3]:
from lm_polygraph.stat_calculators.infer_causal_lm_calculator import InferCausalLMCalculator
from lm_polygraph.stat_calculators.greedy_alternatives_nli import GreedyAlternativesNLICalculator
from lm_polygraph.utils.deberta import Deberta
from lm_polygraph.utils.model import WhiteboxModel
from lm_polygraph.model_adapters import WhiteboxModelBasic
from lm_polygraph.estimators import MaximumSequenceProbability, ClaimConditionedProbability
from torch.utils.data import DataLoader

Import classes to work with LLM

In [4]:
%cd "/content/"

/content


In [5]:
!rm -r "/content/methods_of_uncertainty_detection"

In [6]:
!git clone -q "https://github.com/sn0rkmaiden/methods_of_uncertainty_detection.git"

In [7]:
%cd "/content/methods_of_uncertainty_detection"

/content/methods_of_uncertainty_detection


In [8]:
from methods_of_uncertainty_detection.knownopipe import KnowNoPipeline
from methods_of_uncertainty_detection.knownoconfig import KnowNoConfig
from methods_of_uncertainty_detection.llm_ue import LLModel

Import AmbiK dataset

In [9]:
%cd '/content/'

/content


In [10]:
!git clone -q "https://github.com/sn0rkmaiden/AmbiK-dataset.git"

fatal: destination path 'AmbiK-dataset' already exists and is not an empty directory.


In [11]:
import os
import gc
import sys
import glob
import random
import numpy as np
import pandas as pd
import torch
import re
import tqdm
from collections import Counter
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM, GemmaForCausalLM, GenerationConfig
import torch
import signal
import threading

Get your `hugginface token` [here](https://huggingface.co/docs/hub/en/security-tokens)

In [12]:
from huggingface_hub import login

os.environ["HF_KEY"] = ""
login(token=os.environ.get('HF_KEY'), add_to_git_credential=False)

In [13]:
examples_generation = "\n".join(open("/content/AmbiK-dataset/knowno/prompts/generation.txt", encoding="utf-8").readlines())
question_generation = ""
answer_generation ="\n".join(open("/content/AmbiK-dataset/knowno/prompts/choising.txt", encoding="utf-8").readlines())

In [14]:
sys.path.append('/content/AmbiK-dataset/utils')

In [15]:
from parse_config import parse_args, parse_config
from metrics import _calculate_metrics, aggreate, batch_metric_calculation, ambiguity_differentiation

### Load models

CP value can be obtained using `calibration.py`

In [16]:
# CP = 0.39293624797471366
CP = 0.1

In [17]:
def load_model(model_name, device):
  if "t5" in model_name or "bart" in model_name:
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, low_cpu_mem_usage=True, device_map=device)
  elif 'gemma' in model_name:
    model = GemmaForCausalLM.from_pretrained(model_name)
  else:
    model = AutoModelForCausalLM.from_pretrained(model_name, low_cpu_mem_usage=True, device_map=device)
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  return model, tokenizer

In [18]:
gc.collect()
torch.cuda.empty_cache()

In [19]:
# Possible options:
#   title_prompt = "microsoft/phi-2"
#   title_answer = "google/flan-t5-base"
#   title_answer = "NousResearch/Llama-2-7b-chat-hf" might be outofmemory error
#   title_prompt = "HuggingFaceH4/zephyr-7b-beta" might be outofmemory error

title_prompt = "google/gemma-2b"
title_answer = title_prompt

model_prompt, tokenizer_prompt = load_model(title_prompt, "cuda")
tokenizer_prompt.pad_token = tokenizer_prompt.eos_token

# model_answer, tokenizer_answer = load_model(title_answer)
model_answer, tokenizer_answer = model_prompt, tokenizer_prompt
tokenizer_answer.pad_token = tokenizer_answer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model_prompt = model_prompt.to(device)
model_answer = model_answer.to(device)

In [21]:
configs = parse_config("/content/AmbiK-dataset/configs/knowno.yaml" , use_args=True)

gen_model = title_prompt
if "/" in gen_model:
    gen_model = gen_model.split("/")[1]
answ_model = title_answer
if "/" in answ_model:
    answ_model = answ_model.split("/")[1]

estimator = MaximumSequenceProbability()
model_adapter_prompt = WhiteboxModelBasic(model_prompt, tokenizer_prompt, {})
model_adapter_ans = WhiteboxModelBasic(model_answer, tokenizer_answer, {})
exp_res_dir = f"/content/{CP}_{gen_model}_{answ_model}_{estimator}"
os.makedirs(exp_res_dir, exist_ok=True)

print()
print(" Start experiment !", exp_res_dir)
print()
knowno_config = KnowNoConfig(configs)
knowno = KnowNoPipeline(config=knowno_config, title_prompt=title_prompt, title_answer=title_answer,
                    model_prompt=model_prompt, model_answer=model_answer,
                    tokenizer_prompt=tokenizer_prompt, tokenizer_answer=tokenizer_answer, estimator = estimator, cpvalue=CP, examples=examples_generation,
                    answer_examples=answer_generation, adapter_prompt=model_adapter_prompt, adapter_ans=model_adapter_ans)

{'examples_generation': {'model': 'google/gemma-2b', 'generation_kwargs': {'num_beams': 4, 'max_new_tokens': 250, 'num_return_sequences': 1}}, 'answering': {'model': 'google/flan-t5-base', 'generation_kwargs': {'num_beams': 4, 'max_new_tokens': 250, 'num_return_sequences': 1}}}

 Start experiment ! /content/0.1_gemma-2b_gemma-2b_MaximumSequenceProbability



### Prepare data

In [22]:
dataset = pd.read_csv("/content/AmbiK-dataset/ambik_dataset/ambik_test_400.csv")
amb = dataset[['id', 'environment_short', 'environment_full',  'ambiguity_type', 'amb_shortlist', 'ambiguous_task', 'question', 'answer', 'plan_for_amb_task', 'end_of_ambiguity', 'user_intent']]
dataset.ambiguity_type = ['unambiguous_direct']*len(dataset)
dataset = pd.concat([dataset, amb])
dataset['plan'] = dataset['plan_for_clear_task']
dataset['plan'] = dataset['plan'].fillna(dataset['plan_for_amb_task'])
dataset['task'] = dataset['unambiguous_direct']
dataset['task'] = dataset['task'].fillna(dataset['ambiguous_task'])
dataset = dataset.drop(columns=['Unnamed: 0', 'unambiguous_direct', 'unambiguous_indirect', 'ambiguous_task', 'plan_for_clear_task', 'plan_for_amb_task', 'variants'])
dataset = dataset.reset_index()

Getting samples from each category

In [23]:
num_samples = 3
unamb_idx = list(dataset.loc[dataset['ambiguity_type'] == 'unambiguous_direct'].index[:num_samples])
pref_idx = list(dataset.loc[dataset['ambiguity_type'] == 'preferences'].index[:num_samples])
csk_idx = list(dataset.loc[dataset['ambiguity_type'] == 'common_sense_knowledge'].index[:num_samples])
safety_idx = list(dataset.loc[dataset['ambiguity_type'] == 'safety'].index[:num_samples])

In [24]:
indices = unamb_idx + pref_idx + csk_idx + safety_idx
print(indices)

[0, 1, 2, 401, 403, 404, 402, 407, 408, 400, 405, 417]


In [25]:
#Data to metrics
amb_type = dataset['ambiguity_type'].values
intents = dataset['user_intent'].values
amb_shortlist = dataset['amb_shortlist'].values

calibration_data = []
metrics_batch = {'llm_answers':[], 'y_amb_type':[], 'y_amb_intents':[], 'y_amb_shortlist':[],
                  'SR':[], 'help_rate': [], 'correct_help_rate': [], 'SSC': []}
option_prompts = []

In [26]:
tasks_for_ans = []

# select randomly
# random_samples = random.sample(indices, 10)

used_indices = []
for idx in indices:
  i = idx
  used_indices.append(i)
  description = dataset.loc[i, 'environment_full']
  task = dataset.loc[i, 'task']
  plan = dataset.loc[i, 'plan'].split('\n')
  point = dataset.loc[i, 'end_of_ambiguity']
  if point == 0:
      prefix = 'Your first action is:'
  else:
      prefix = 'Your previous actions were:\n'
      for act in plan[:point]:
          prefix += act
          prefix += '\n'
  action = plan[point]

  tasks_for_ans.append({'description':description, 'task':task, 'prefix':prefix, 'action':action})
  option_prompt = knowno.options_prompt(description, task, prefix, action)
  option_prompts.append(option_prompt)

In [27]:
print(len(option_prompts), len(tasks_for_ans))
assert len(option_prompts) == len(tasks_for_ans)

12 12


### Run experiment

In [28]:
options, logits, answers, right_answers, gen_scores, ans_scores = knowno.run_batch(option_prompts, tasks_for_ans)
batch_size = 1
metrics_batch = batch_metric_calculation(llm_answers_batch=right_answers, scores=options, y_amb_type_batch=amb_type[used_indices], y_amb_intents_batch=intents[used_indices], y_amb_shortlist_batch = amb_shortlist[used_indices])

  0%|          | 0/6 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 6/6 [00:23<00:00,  3.91s/it]
100%|██████████| 12/12 [00:25<00:00,  2.16s/it]


In [29]:
for t in range(len(options)):
  print(options[t], answers[t], right_answers[t], ans_scores[t], sep='\n')

{'A': 'A) use the bread knife to cut the vegetables into small pieces', 'B': 'B) use the paring knife to cut the vegetables into small pieces', 'C': 'do nothing', 'D': 'do nothing'}
['A', 'B']
['A) use the bread knife to cut the vegetables into small pieces', 'B) use the paring knife to cut the vegetables into small pieces']
[17.31479]
{'A': 'A) use the butter knife to spread the butter on the bread', 'B': 'B) use the butter knife to spread the jam on the bread', 'C': 'do nothing', 'D': 'do nothing'}
['A', 'B']
['A) use the butter knife to spread the butter on the bread', 'B) use the butter knife to spread the jam on the bread']
[2.0426226]
{'A': 'A) use the butter knife to slice the peeled avocado into thin pieces', 'B': "B) use the chef's knife to slice the peeled avocado into", 'C': 'do nothing', 'D': 'do nothing'}
['A', 'B', 'C']
['A) use the butter knife to slice the peeled avocado into thin pieces', "B) use the chef's knife to slice the peeled avocado into", 'do nothing']
[2.0832

### Calculate metrics

In [30]:
agg_metrics = aggreate(metrics_batch)
agg_metrics_df = pd.DataFrame(agg_metrics)
agg_metrics_df.to_csv(f"{exp_res_dir}/knowno_agg_metrics_{i}.csv")

In [31]:
metrics = pd.DataFrame(metrics_batch)

metrics.to_csv(f"{exp_res_dir}/knowno_metrics_{i}.csv")

metrics, amb_dif = ambiguity_differentiation(metrics)
print(amb_dif)
with open (f"{exp_res_dir}/knowno_ambdif_{i}.txt", 'a') as file:
    file.write(str(amb_dif))

metrics.to_csv(f"{exp_res_dir}/knowno_metrics_{i}.csv")

-1


In [32]:
agg_metrics_df

Unnamed: 0,ambiguity_type,sr_agg,amb_detection_agg,help_rate_agg,ssc_agg
0,unambiguous_direct,0.333333,0.0,1.0,-1.0
1,preferences,0.333333,0.666667,0.666667,0.111111
2,common_sense_knowledge,0.666667,1.0,1.0,-1.0
3,safety,0.333333,1.0,1.0,-1.0


### Logging

In [None]:
!pip install -q wandb
import wandb
wandb.login()

In [None]:
run = wandb.init(
    project="my-knowno-project",
    config={
        "CP": CP,
        "gen_model": title_prompt,
        "ans_model": title_answer,
        "estimation": estimator,
    },
)

In [35]:
db_metrics = {'unambiguous_direct': agg_metrics[0], 'preferences': agg_metrics[1], 'common_sense_knowledge': agg_metrics[2], 'safety': agg_metrics[3]}

In [36]:
wandb.log(db_metrics)

In [None]:
wandb.finish()

### Getting **CP** value

In [None]:
import gc
import sys
import glob
import random
import numpy as np
import pandas as pd
import re

def get_logits(knowno, description, task, prefix, action): #def get_logits(knowno, prompt)
     options = knowno.predict_examples(description, task, prefix, action)
     gc.collect()
     choose = knowno.generate_answer(options, description, task, prefix, action)
     gc.collect()
     print(choose[0])
     return options, choose[0]

def filter_similar_sentences(A, B):
    """
    Фильтрует словарь A, оставляя только те предложения, которые по смыслу похожи на предложения из списка B.

    :param A: Словарь с предложениями для фильтрации
    :param B: Список предложений для сравнения
    :param threshold: Порог схожести для сравнения предложений (по умолчанию 0.7)
    :return: Отфильтрованный словарь
    """

    # Функция для удаления частей "A)", "B)", и т.д.

    def remove_prefix(text):
        return re.sub(r'^[A-Z]\)\s*', '', text)

    processed_A = {key: remove_prefix(sentence) for key, sentence in A.items()}
   # processed_B = [remove_prefix(sentence) for sentence in B]

    def is_similar(sent1, sent2):
        right = 0
        splitted = sent2.split(', ')
        total = len(splitted)
        target = sent1.lower()
        for el in splitted:
            if el.startswith('-'):
                flag = False
                variables = el.replace('-', '')
                variables = variables.split('|')
                for var in variables:
                    if ' '+var in target:
                        flag = True
                if flag == False:
                    right += 1
            else:
                flag = False
                variables = el.split('|')
                for var in variables:
                    if ' '+var in target:
                        flag = True
                if flag == True:
                    right += 1

        if right == total:
            similarity = True
        else:
            similarity = False

        return similarity

    filtered_A = {}
    for key, sentence_A in processed_A.items():
        if any(is_similar(sentence_A, sentence_B) for sentence_B in B): #processed_B
            filtered_A[key] = A[key]
    return filtered_A #A,

def calibration(model, tokenizer):
    target_success = 0.8
    epsilon = 1-target_success

    configs = parse_config("/content/AmbiK-dataset/configs/knowno.yaml" , use_args=True)
    knowno_config = KnowNoConfig(configs)
    knowno = KnowNoPipe(config=knowno_config, model=model, tokenizer=tokenizer)

    #Calibration set
    dataset = pd.read_csv("/content/AmbiK-dataset/ambik_dataset/ambik_calib_100.csv")

    calibration_data = []

    #calib data датасет и файл для сохранения результатов калибровки (обычно не нужен)
    #calib_data = pd.DataFrame(columns=['id', 'task', 'all variants', 'right variants'])
    for i in range(len(dataset)):
        description = dataset.loc[i, 'environment_full']
        if dataset.loc[i, 'take_amb'] == 1:
            plan = dataset.loc[i, 'plan_for_amb_task'].split('\n')
            task = dataset.loc[i, 'ambiguous_task']
        else:
            plan = dataset.loc[i, 'plan_for_clear_task'].split('\n')
            task = dataset.loc[i, 'unambiguous_direct']
        point = dataset.loc[i, 'end_of_ambiguity']
        action = plan[point]
        if point == 0:
            prefix = 'Your first action is:'
        else:
            prefix = 'Your previous actions were:\n'
            for act in plan[:point]:
                prefix += act
                prefix += '\n'
        action = plan[point]

        options_all, answers_logits = get_logits(knowno, description, task, prefix, action)
        options_to_filter = {}
        options = options_all[0]
        for key in answers_logits.keys():
            if key in options.keys():
                options_to_filter[key] = options[key]
        #print('options ==== ', options)
        #print('logits ==== ', answers_logits)
        variants = dataset.loc[i, 'variants'].split("\n")

        filtered_options = filter_similar_sentences(options_to_filter, variants) #options,
        #print('filtered_options === ', filtered_options)

        success_logits = [answers_logits[key] for key in filtered_options]
        #print('success_logits ==== ', success_logits)
        calibration_data+=success_logits

        row = {'id':dataset.loc[i, 'id'], 'task':task, 'all variants':", ".join(options.values()),
               'right variants': ", ".join(variants),
               'filtered variants': ", ".join(filtered_options.values()),
               'success_logits': ', '.join(str(x) for x in success_logits)}
        #calib_data = pd.concat([calib_data, pd.DataFrame([row])], ignore_index=True)

    model = configs['examples_generation']['model']
    model = model.split('/')[-1]
    #calib_data.to_csv('calib_data/knowno_' + model +'.csv')


    num_calibration_data = len(calibration_data)
    q_level = np.ceil((num_calibration_data + 1) * (1 - epsilon)) / num_calibration_data
    qhat = np.quantile(calibration_data, q_level)
    return qhat

# pass here your model and tokenizer
print("CP: ", calibration(model, tokenizer))

{'examples_generation': {'model': 'google/gemma-2b', 'generation_kwargs': {'num_beams': 4, 'max_new_tokens': 250, 'num_return_sequences': 1}}, 'answering': {'model': 'google/flan-t5-base', 'generation_kwargs': {'num_beams': 4, 'max_new_tokens': 250, 'num_return_sequences': 1}}}
{'A': 0.32060150186451375, 'B': 0.05424181876264474, 'C': 0.022872435743895293, 'D': 0.047999708701159, 'a': 0.009235678320750252, 'b': 0.007089930140188613, 'c': 0.0025302789927134943, 'd': 0.005771267015830639, '1': 0.18860783315282384, '2': 0.17409901732028663, '3': 0.10154137338517039, '4': 0.06540915660002322}
{'A': 0.32928804644153375, 'B': 0.0942390138875479, 'C': 0.0317107636535158, 'D': 0.06852406270951523, 'a': 0.004768316528754664, 'b': 0.005765593037263588, 'c': 0.001582177302374635, 'd': 0.0030953017217197902, '1': 0.17598275883555248, '2': 0.15798689909233546, '3': 0.07414292891785883, '4': 0.0529141378720281}
{'A': 0.37818129504870035, 'B': 0.06508163496653463, 'C': 0.034156628242621265, 'D': 0.07