In [1]:
import transformers
from transformers import AutoTokenizer, AutoModelForMaskedLM
import datasets
from datasets import load_dataset

import sys
import src.evals.data as data_module

import torch

import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face model id (a plain string, not a tokenizer object). In this notebook it is
# passed as `tokenizer_name` to create_preference_to_flan_style_dataset and to
# AutoTokenizer.from_pretrained, which builds the actual tokenizer (`real_tokenizer`).
tokenizer = "answerdotai/ModernBERT-large"

In [3]:
# Instruction text handed to the dataset builder as `prefix`. The chat, chat_hard and
# safety subsets share one prompt; reasoning has its own. Trailing spaces are intentional
# and preserved exactly.
HELPFULNESS_PREFIX = "Which response is the most helpful, relevant, and correct? "
REASONING_PREFIX = "Determine which response is the best choice based on mathematical or programming accuracy. "

# Value passed as `max_seq_length` for every subset.
MAX_SEQ_LENGTH = 3600


def _load_rwb_subset(subset, prefix):
    """Build one `sarahpann/rwb_<subset>` dataset via the project data module.

    The four subsets are loaded with identical arguments except for the Hub id
    and the prompt prefix, so those are the only parameters.

    Args:
        subset: Suffix of the Hub dataset id, e.g. ``"chat_hard"`` for
            ``"sarahpann/rwb_chat_hard"``.
        prefix: String forwarded as ``prefix`` to the dataset builder.

    Returns:
        Whatever ``data_module.create_preference_to_flan_style_dataset`` returns
        for the ``train`` split of that dataset.
    """
    hub_id = f"sarahpann/rwb_{subset}"
    return data_module.create_preference_to_flan_style_dataset(
        task=hub_id,
        split="train",
        tokenizer_name=tokenizer,
        max_seq_length=MAX_SEQ_LENGTH,
        prefix=prefix,
        dataset_name=hub_id,
        dataset_subset="",
        # The same Hub id is used as the task key for the column-name lookup.
        task_column_names={hub_id: ('chosen', 'rejected', 'og_dataset')},
    )


# Same variables, same load order as before.
chat = _load_rwb_subset("chat", HELPFULNESS_PREFIX)
chat_hard = _load_rwb_subset("chat_hard", HELPFULNESS_PREFIX)
reasoning = _load_rwb_subset("reasoning", REASONING_PREFIX)
safety = _load_rwb_subset("safety", HELPFULNESS_PREFIX)

Downloading readme: 100%|██████████| 352/352 [00:00<00:00, 2.51kB/s]
Downloading data: 100%|██████████| 3.90M/3.90M [00:00<00:00, 5.36MB/s]
Generating train split: 100%|██████████| 2488/2488 [00:00<00:00, 134513.57 examples/s]
Map: 100%|██████████| 2488/2488 [00:00<00:00, 2703.52 examples/s]
Downloading readme: 100%|██████████| 348/348 [00:00<00:00, 2.52kB/s]
Downloading data: 100%|██████████| 403k/403k [00:00<00:00, 741kB/s]
Generating train split: 100%|██████████| 464/464 [00:00<00:00, 164357.49 examples/s]
Map: 100%|██████████| 464/464 [00:00<00:00, 4983.30 examples/s]
Downloading readme: 100%|██████████| 352/352 [00:00<00:00, 2.51kB/s]
Downloading data: 100%|██████████| 1.06M/1.06M [00:00<00:00, 2.81MB/s]
Generating train split: 100%|██████████| 1431/1431 [00:00<00:00, 223781.70 examples/s]
Map: 100%|██████████| 1431/1431 [00:00<00:00, 2578.10 examples/s]
Downloading readme: 100%|██████████| 350/350 [00:00<00:00, 2.58kB/s]
Downloading data: 100%|██████████| 665k/665k [00:00<00:00, 

In [4]:
# Load the masked-LM checkpoint to be evaluated. The model is not placed on a device
# here; eval_dataset moves it to the target device itself before running inference.
model = AutoModelForMaskedLM.from_pretrained("sarahpann/reasoning_model_large")

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


In [5]:
# Switch every subset to torch-tensor output restricted to the two columns that the
# evaluation loop reads. set_format is applied to each dataset object in place.
for _subset in (chat, chat_hard, reasoning, safety):
    _subset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [11]:
# One example per batch for every subset. All loaders are left unshuffled so that
# repeated runs visit examples in the same order; the accuracy itself is
# order-independent, but the periodic prediction/label printout in eval_dataset is not.
chat_loader = torch.utils.data.DataLoader(chat, batch_size=1, shuffle=False)
chat_hard_loader = torch.utils.data.DataLoader(chat_hard, batch_size=1, shuffle=False)
reasoning_loader = torch.utils.data.DataLoader(reasoning, batch_size=1, shuffle=False)
safety_loader = torch.utils.data.DataLoader(safety, batch_size=1, shuffle=False)

In [7]:
# Tokenizer object for the model id stored in `tokenizer`. eval_dataset reads
# real_tokenizer.mask_token_id to overwrite the token at position -2 of each sequence.
real_tokenizer = AutoTokenizer.from_pretrained(tokenizer)

In [8]:
import tqdm
from tqdm import tqdm

In [13]:
def eval_dataset(model, subject_dataloader, device=None, mask_token_id=None, log_every=100):
    """Measure how often the model recovers the token at position -2 once it is masked.

    For each batch, the second-to-last token of every sequence is replaced by
    the mask token, the model is run, and the arg-max prediction at that
    position is compared with the original token id.

    Args:
        model: Callable as ``model(input_ids=..., attention_mask=...)`` with an
            output exposing ``.logits`` indexed ``[batch, position, vocab]``.
            It is moved to ``device`` and put in eval mode (both in place).
        subject_dataloader: Iterable of batches; each batch is a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors indexed
            ``[batch, position]``.
        device: Device to run on. Defaults to ``"cuda"`` when available,
            otherwise ``"cpu"``.
        mask_token_id: Id written into the masked slot. Defaults to
            ``real_tokenizer.mask_token_id`` from the notebook namespace.
        log_every: Print the first prediction/label of every ``log_every``-th
            batch. ``0`` or ``None`` disables the printout.

    Returns:
        Fraction of sequences whose masked token was predicted exactly, or
        ``nan`` when the dataloader yields no examples.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    if mask_token_id is None:
        mask_token_id = real_tokenizer.mask_token_id

    total = 0
    correct = 0

    model.to(device)
    model.eval()

    with torch.no_grad():
        # Wrap the dataloader itself (not the enumerate object) so tqdm can read
        # its length and show a total / ETA.
        for i, example in enumerate(tqdm(subject_dataloader)):
            # Ground-truth ids at the masked slot, taken before masking.
            # NOTE(review): assumes the target token sits at position -2 of every
            # row (i.e. no right padding within a batch) — confirm if batch_size > 1.
            labels = example['input_ids'][:, -2]

            # Clone so the batch held by the caller is not modified.
            input_ids = example['input_ids'].clone()
            input_ids[:, -2] = mask_token_id
            input_ids = input_ids.to(device)
            attention_mask = example['attention_mask'].to(device)

            output = model(input_ids=input_ids, attention_mask=attention_mask)
            # Only the masked position is scored, so take the arg-max there only.
            pred = torch.argmax(output.logits[:, -2, :], dim=-1)

            if log_every and i % log_every == 0:
                print("prediction: ", pred[0])
                print("label: ", labels[0])

            # Score every row of the batch, comparing on the prediction's device.
            correct += int((pred == labels.to(pred.device)).sum())
            total += labels.shape[0]

    if total == 0:
        # Accuracy is undefined without examples; avoid a ZeroDivisionError.
        print("Accuracy: undefined (dataloader yielded no examples)")
        return float("nan")

    accuracy = correct / total
    print(f"Accuracy: {accuracy}")
    return accuracy

In [14]:
# Only the reasoning subset is evaluated with the currently loaded model; the other
# three evaluations are disabled.
# NOTE(review): the later `final_results` cell reads chat_acc, chat_hard_acc and
# safety_acc, which in the visible notebook are assigned only by the commented-out
# lines below. On a fresh kernel that cell will raise NameError unless these are
# re-enabled or the values are defined elsewhere.
# chat_acc = eval_dataset(model, chat_loader)
# chat_hard_acc = eval_dataset(model, chat_hard_loader)
reasoning_acc = eval_dataset(model, reasoning_loader)
# safety_acc = eval_dataset(model, safety_loader)

4it [00:00, 39.15it/s]

prediction:  tensor(17, device='cuda:0')
label:  tensor(18)


106it [00:02, 34.52it/s]

prediction:  tensor(17, device='cuda:0')
label:  tensor(18)


204it [00:05, 37.05it/s]

prediction:  tensor(17, device='cuda:0')
label:  tensor(18)


302it [00:08, 26.14it/s]

prediction:  tensor(18789, device='cuda:0')
label:  tensor(18)


403it [00:11, 32.85it/s]

prediction:  tensor(17, device='cuda:0')
label:  tensor(18)


505it [00:15, 34.69it/s]

prediction:  tensor(17, device='cuda:0')
label:  tensor(18)


603it [00:18, 28.89it/s]

prediction:  tensor(17, device='cuda:0')
label:  tensor(18)


707it [00:21, 38.70it/s]

prediction:  tensor(17, device='cuda:0')
label:  tensor(18)


805it [00:24, 28.06it/s]

prediction:  tensor(17, device='cuda:0')
label:  tensor(18)


903it [00:27, 30.56it/s]

prediction:  tensor(17, device='cuda:0')
label:  tensor(18)


1002it [00:31, 18.37it/s]

prediction:  tensor(17, device='cuda:0')
label:  tensor(17)


1103it [00:36, 20.63it/s]

prediction:  tensor(18, device='cuda:0')
label:  tensor(17)


1204it [00:40, 28.09it/s]

prediction:  tensor(17, device='cuda:0')
label:  tensor(17)


1306it [00:44, 28.95it/s]

prediction:  tensor(17, device='cuda:0')
label:  tensor(17)


1406it [00:47, 33.94it/s]

prediction:  tensor(18, device='cuda:0')
label:  tensor(17)


1431it [00:48, 29.42it/s]

Accuracy: 0.11390635918937805





In [67]:
# Gather the per-subset accuracies under their subset names and show them.
final_results = dict(
    chat=chat_acc,
    chat_hard=chat_hard_acc,
    reasoning=reasoning_acc,
    safety=safety_acc,
)

print(final_results)

{'chat': 0.7946141479099679, 'chat_hard': 0.2521551724137931, 'reasoning': 0.76659678546471, 'safety': 0.7918918918918919}


In [None]:
# for chat
# Weighted average of two accuracy values with weights (1 - 0.157) and 0.157.
# NOTE(review): 0.157 matches chat_hard's share of examples, 464 / (2488 + 464), using
# the train-split sizes logged when the datasets were loaded, so this looks like a
# combined chat + chat_hard accuracy — confirm. The two accuracy literals are not
# produced by any visible cell; their provenance should be recorded.
0.8657556270096463 * (1 - 0.157) + 0.36853448275862066 * (0.157)

0.7876919073622352

In [None]:
# for reasoning
# NOTE(review): bare literal that is only displayed when the cell runs. Presumably a
# reasoning accuracy copied from a separate run; no visible cell computes this value.
0.3305380852550664