# T4D

In [1]:
from datasets import Dataset
from pyprojroot import here

In [7]:
# Directory of the saved T4D evaluation dataset; `here` (pyprojroot) builds the
# path from the project root rather than the notebook's working directory.
chk_dir = here("struct_vs_unstruct/data/non_self_synthesis/t4d/t4d-/t4d_eval")

In [8]:
# Load the evaluation results stored on disk at `chk_dir`.
dataset = Dataset.load_from_disk(chk_dir)

In [9]:
# Display the dataset summary (column names and row count).
dataset

Dataset({
    features: ['story', 'question', 'answer', 'characters', 'distracting_characters', 'reasoning_formats', 'selected_modules', 'adapted_modules', 'reasoning_plan', 'reasoning', 'trajectory', 'answer_pred'],
    num_rows: 564
})

In [15]:
# Inspect the distinct raw predictions to see which formats must be handled
# before scoring (the output includes character names and free-form sentences).
set(dataset["answer_pred"])

{'A and B, but since the options do not allow for multiple answers, the most appropriate answer would be to choose one of the individuals who would benefit from receiving helpful information.',
 'A and B, however, the options do not provide a way to select both A and B. Given the instructions to select one answer, the best answer would be to acknowledge that the format does not allow for the correct answer to be provided as per the instructions. However, following the instructions to the letter as requested:',
 "A and C, but since the format requires a single answer, the most relevant answer is A (as Emily's needs are more specific and related to the banana and corn).",
 'A and C, however, the options do not allow for the selection of multiple answers. Given the instructions to select one answer, I will choose one of the correct answers.',
 'A and C.',
 'A,C.',
 'Abigail',
 'Aiden',
 'Alexander',
 'Amelia',
 'Aria',
 'Ava',
 'Avery',
 'B and A, however, the options provided do not allo

In [11]:
import re

def extract_choice_text(text, choice_letter):
    """Return the option text that follows ``<choice_letter>. `` in ``text``.

    Args:
        text: Text that lists the answer options, each written as a label,
            a dot, whitespace and the option text (e.g. ``A. Emily``).
        choice_letter: Label of the option to look up (e.g. ``"A"``).

    Returns:
        The rest of the line after the label, stripped of surrounding
        whitespace, or ``None`` when the label is not found.
    """
    # The label comes from model output, so escape it: a stray "?" or "("
    # would otherwise be read as regex syntax and raise re.error.
    # The lookbehind rejects a label that is merely the last letter of a
    # longer word (e.g. "USA. " must not count as option "A").
    pattern = rf"(?<![A-Za-z0-9]){re.escape(choice_letter)}\.\s(.*?)(?=\n|$)"
    match = re.search(pattern, text)
    if match:
        return match.group(1).strip()
    return None

def map_fn(instance):
    """Replace a single-letter prediction with the text of the chosen option.

    Args:
        instance: Mapping with at least the keys ``"answer_pred"`` (the raw
            prediction) and ``"question"`` (text searched for the option).

    Returns:
        ``{"answer_pred": value}`` where ``value`` is the option text when the
        prediction is a single option label that can be found in the question,
        and the unchanged raw prediction otherwise.
    """
    raw_pred = instance["answer_pred"]
    # Coerce before any string operation so a missing (None) prediction does
    # not raise; dots and surrounding whitespace are dropped so that "A.",
    # " A" and "A.\n" are all recognised as the single label "A".
    cleaned = "" if raw_pred is None else str(raw_pred).replace(".", "").strip()

    if len(cleaned) == 1:
        answer = extract_choice_text(instance["question"], cleaned)
        if answer is None:
            # The label is not among the options: keep the raw prediction
            # instead of storing None, which a later string comparison
            # could not handle.
            answer = raw_pred
    else:
        # Not a bare option label: print it for manual inspection and keep it.
        print(raw_pred)
        answer = raw_pred
        if not answer:
            # Marker that makes empty/missing predictions easy to spot.
            print("+++++++++++++++++++++++++++++++++++")

    return {"answer_pred": answer}

In [13]:
# Normalise every prediction with map_fn. `dataset` is rebound here, so
# re-running this cell passes already-normalised predictions through map_fn again.
dataset = dataset.map(map_fn)

In [14]:
# Save the normalised dataset under data/modified/..., a different directory
# from the one it was loaded from, so the original files are not overwritten.
dataset.save_to_disk(here("struct_vs_unstruct/data/modified/non_self_synthesis/t4d/t4d-/t4d_eval"))

Saving the dataset (0/1 shards):   0%|          | 0/564 [00:00<?, ? examples/s]

In [19]:
from tqdm import tqdm


def t4d(instance, y, y_pred):
    """Return True when the prediction exactly equals the gold answer.

    Args:
        instance: Mapping holding the gold answer and the prediction.
        y: Key of the gold answer.
        y_pred: Key of the prediction.

    Returns:
        ``True`` on an exact match, ``False`` otherwise (including when the
        prediction is ``None``).
    """
    gold, pred = instance[y], instance[y_pred]
    # For strings, equality already implies containment, so the former
    # `gold in pred` check only added a TypeError when pred was None.
    return pred is not None and gold == pred


def bbh(instance, y, y_pred):
    """Return True when gold answer and prediction match up to punctuation.

    Parentheses are removed from the gold answer, and dots and parentheses
    from the prediction, before comparing; e.g. ``"(A)"`` matches ``"A."``.

    Args:
        instance: Mapping holding the gold answer and the prediction.
        y: Key of the gold answer.
        y_pred: Key of the prediction.

    Returns:
        ``True`` on a match, ``False`` otherwise (including when the
        prediction is empty or ``None``).
    """
    gold, pred = instance[y], instance[y_pred]
    if not pred:
        return False
    gold_label = gold.translate(str.maketrans("", "", "()"))
    pred_label = pred.translate(str.maketrans("", "", ".()"))
    return gold_label == pred_label


def calculate_accuracy(full_dataset, benchmark, y: str, y_pred: str, log_file_path: str):
    """Compute the fraction of instances whose prediction is judged correct.

    Args:
        full_dataset: Sized iterable of instances (mappings) to score.
        benchmark: Name of the scoring rule, ``"t4d"`` or ``"bbh"``.
        y: Key of the gold answer in each instance.
        y_pred: Key of the prediction in each instance.
        log_file_path: File to which every mismatching pair is appended as a
            ``"<gold>, <prediction>"`` line.

    Returns:
        Number of correct predictions divided by the dataset size, or ``0.0``
        for an empty dataset.

    Raises:
        ValueError: If ``benchmark`` is not a known scoring rule.
    """
    # Resolve the scoring rule once, up front; an unknown name used to surface
    # as an UnboundLocalError inside the loop.
    eval_fns = {"t4d": t4d, "bbh": bbh}
    if benchmark not in eval_fns:
        raise ValueError(
            f"Unknown benchmark {benchmark!r}; expected one of {sorted(eval_fns)}"
        )
    eval_fn = eval_fns[benchmark]

    total = len(full_dataset)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct_preds = 0
    mismatches = []
    for instance in tqdm(full_dataset, desc="Calculating accuracy"):
        if eval_fn(instance, y, y_pred):
            correct_preds += 1
        else:
            mismatches.append(f"{instance[y]}, {instance[y_pred]}\n")

    # Append all mismatches with a single open() instead of reopening the file
    # per instance; the file is still only touched when something mismatched.
    if mismatches:
        with open(log_file_path, "a") as f:
            f.writelines(mismatches)

    return correct_preds / total

In [20]:
# Score the normalised predictions with the "t4d" rule; mismatching
# (answer, prediction) pairs are appended to the log file given here.
calculate_accuracy(dataset, "t4d", "answer", "answer_pred", here("struct_vs_unstruct/logs/modified/non_self_synthesis/evals/t4d/t4d-/t4d_different.txt"))

Calculating accuracy: 100%|██████████| 564/564 [00:00<00:00, 6976.80it/s]


0.7464539007092199