In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import pandas as pd
import gc
from tqdm import tqdm
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# model loading
# Loads the causal-LM checkpoint named by model_name_or_path, requesting
# float16 weights and passing device_map="auto" to from_pretrained.
model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.1"
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, torch_dtype=torch.float16, device_map="auto") # token=access_token

# tokenizer loading
# The fast tokenizer is requested unless the checkpoint's config lists
# "LlamaForCausalLM" among its architectures.
use_fast_tokenizer = "LlamaForCausalLM" not in model.config.architectures
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=use_fast_tokenizer, padding_side="left", legacy=False) # token=access_token
# Use token id 0 as the padding id.
# NOTE(review): which vocabulary entry id 0 maps to is tokenizer-specific -- confirm it is safe as padding.
tokenizer.pad_token_id = 0 

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.19s/it]


In [4]:
# parameters
# Number of (question, answer) pairs handed to evaluation() per batch.
batch_size = 64 # 64 for real test
# CSV file of the test split; it is read with header=None by load_mmlu_sentences().
path_test_set = "../representation-engineering/data/MMLU/data/test/elementary_mathematics_test.csv" # change this to your test set

In [5]:
def batchify(lst, batch_size):
    """Lazily split ``lst`` into consecutive slices of at most ``batch_size`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of
    ``batch_size``; an empty ``lst`` produces no slices at all.
    """
    total = len(lst)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        yield lst[start:stop]

def decompose_df(df):
    """Flatten a header-less multiple-choice table into (question, answer) pairs.

    Column 0 is read as the question text, columns 1-4 as the four answer
    options and column 5 as the answer letter.

    Returns:
        questions: every question prefixed with "Question: " and listed four
            times in a row, once per option.
        choices: the matching options prefixed with "Answer: ", in column
            order, so entry ``i`` belongs to question ``i // 4``.
        labels: one integer per *original* row, computed as the offset of the
            answer letter from "A".

    The first five entries of each list are printed as a sanity check.
    """
    # Repeat each row four times so that row i pairs a question with option i % 4.
    expanded = df.loc[df.index.repeat(4)].reset_index(drop=True)

    question_column = expanded.iloc[:, 0]
    questions = ("Question: " + question_column).values.tolist()

    option_rows = expanded.iloc[:, 1:5].values.tolist()
    choices = ["Answer: " + row[pos % 4] for pos, row in enumerate(option_rows)]

    # Labels come from the un-repeated frame: one per question, not per pair.
    base = ord('A')
    labels = [ord(letter) - base for letter in df.iloc[:, 5].values.tolist()]

    previews = (
        ("Example question: {}", questions),
        ("Example choice: {}", choices),
        ("Example label: {}", labels),
    )
    for template, values in previews:
        print(template.format(values[0:5]))

    return questions, choices, labels

def load_mmlu_sentences(path_test_set):
    """Read the header-less CSV at ``path_test_set`` and split it with ``decompose_df``.

    Returns the ``(questions, choices, labels)`` triple produced by
    ``decompose_df``.
    """
    frame = pd.read_csv(path_test_set, header=None)
    prompts, options, gold = decompose_df(frame)
    return prompts, options, gold

# need further look into this
def get_logprobs(logits, input_ids, masks, **kwargs):
    """Return the masked log-probability of each token given the positions before it.

    The log-softmax over the last axis of ``logits`` is taken, the final
    position is dropped, and position ``t`` is scored against the id found at
    ``input_ids[:, t + 1]``. Each score is multiplied by ``masks[:, t + 1]``,
    so entries whose mask is 0 contribute 0. The result has one column fewer
    than ``input_ids``. Extra keyword arguments are accepted and ignored.
    """
    # Token at position t+1 is the target for the distribution emitted at position t.
    next_ids = input_ids[:, 1:].unsqueeze(2)
    next_mask = masks[:, 1:].unsqueeze(2)

    shifted = F.log_softmax(logits, dim=-1)[:, :-1]
    picked = shifted.gather(-1, next_ids)
    return (picked * next_mask).squeeze(-1)

def calc_acc(labels, output_logprobs, num_choices=4):
    """Pick the highest-scoring option per question and compare it with the labels.

    Args:
        labels: one correct option index per question.
        output_logprobs: flat sequence of scores laid out question by
            question, ``num_choices`` consecutive entries per question.
        num_choices: number of options per question (defaults to 4).

    Returns:
        ``(accuracy, model_answer)`` where ``model_answer`` is the array of
        argmax option indices (one per question) and ``accuracy`` is the mean
        of ``model_answer == labels``.

    Raises:
        ValueError: if ``num_choices`` is not positive, if the number of
            scores is not a multiple of ``num_choices``, or if the number of
            labels differs from the number of questions.
    """
    if num_choices < 1:
        raise ValueError("num_choices must be a positive integer, got {}".format(num_choices))

    scores = np.asarray(output_logprobs)
    if scores.size % num_choices != 0:
        raise ValueError(
            "got {} scores, which is not a multiple of num_choices={}".format(scores.size, num_choices)
        )
    scores = scores.reshape(-1, num_choices)

    # Without this check a length mismatch can broadcast silently (e.g. a
    # single label compared against every question) and yield a bogus accuracy.
    gold = np.asarray(labels)
    if gold.shape != (scores.shape[0],):
        raise ValueError(
            "expected {} labels (one per question), got shape {}".format(scores.shape[0], gold.shape)
        )

    model_answer = np.argmax(scores, axis=1)
    correct = model_answer == gold
    return correct.mean(), model_answer

def prepare_decoder_only_inputs(prompts, targets, tokenizer, device):
    """Tokenize prompt/target pairs and join them into one batch for a decoder-only model.

    Prompts are tokenized with left padding and targets with right padding
    (and without special tokens); the two encodings are concatenated along
    the sequence axis, so each row is ``[prompt padding, prompt, target,
    target padding]``.

    Args:
        prompts: list of prompt strings.
        targets: list of target strings, aligned with ``prompts``.
        tokenizer: tokenizer callable exposing a ``padding_side`` attribute.
        device: device the returned tensors are moved to.

    Returns:
        inputs: dict of concatenated tensors on ``device`` (``token_type_ids``
            is removed if the tokenizer produced it).
        mask: copy of the concatenated attention mask with every prompt
            column zeroed, i.e. 1 only on real target tokens.
        prompt_len: padded prompt width (number of prompt columns).

    The tokenizer's ``padding_side`` is restored to its incoming value before
    returning, so the shared tokenizer is not left in right-padding mode.
    """
    original_padding_side = tokenizer.padding_side
    try:
        tokenizer.padding_side = "left"
        prompt_inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=False)
        tokenizer.padding_side = "right"
        target_inputs = tokenizer(targets, return_tensors="pt", padding=True, truncation=False, add_special_tokens=False)
    finally:
        # Undo the temporary change even if tokenization fails.
        tokenizer.padding_side = original_padding_side

    prompt_len = prompt_inputs["input_ids"].shape[1]

    # concatenate prompt and target tokens and send to device
    inputs = {k: torch.cat([prompt_inputs[k], target_inputs[k]], dim=1).to(device) for k in prompt_inputs}

    # The attention mask is already 0 on padding; cloning it keeps the mask on
    # `device`, and zeroing the prompt columns leaves only target tokens set.
    mask = inputs["attention_mask"].clone()
    mask[:, :prompt_len] = 0

    # remove token_type_ids
    inputs.pop("token_type_ids", None)

    return inputs, mask, prompt_len

In [6]:
def evaluation(model, tokenizer, questions, answers, labels, batch_size=128):
    """Score every (question, answer) pair with the model and compute accuracy.

    For each batch the pairs are tokenized with ``prepare_decoder_only_inputs``,
    run through the model without gradients, and the masked per-token
    log-probabilities from ``get_logprobs`` are summed so that each pair gets
    a single score. The scores are then passed to ``calc_acc``.

    Args:
        model: model exposing ``eval()``, ``model.device`` and returning an
            object with ``.logits`` when called. If it also has a callable
            ``set_masks`` attribute, that is called with the target mask
            before each forward pass.
        tokenizer: tokenizer forwarded to ``prepare_decoder_only_inputs``.
        questions: list of prompt strings.
        answers: list of answer strings aligned with ``questions``.
        labels: correct option index per question, forwarded to ``calc_acc``.
        batch_size: number of pairs per forward pass.

    Returns:
        Whatever ``calc_acc`` returns: ``(accuracy, model_answer)``.

    Errors raised inside ``set_masks`` propagate instead of being silently
    discarded; only the absence of the method is tolerated.
    """
    gc.collect()

    model.eval()

    # Plain models have no set_masks hook; look it up once instead of hiding
    # every possible failure behind a bare `except`.
    set_masks = getattr(model, "set_masks", None)

    # Ceiling division so the progress bar total includes a trailing partial batch.
    num_batches = (len(questions) + batch_size - 1) // batch_size

    output_logprobs = []
    batches = zip(batchify(questions, batch_size), batchify(answers, batch_size))
    for q_batch, a_batch in tqdm(batches, total=num_batches):
        inputs, masks, _ = prepare_decoder_only_inputs(q_batch, a_batch, tokenizer, model.model.device)

        with torch.no_grad():
            if callable(set_masks):
                # set the masks so that we do not add to tokens of input sentences and padding tokens
                set_masks(masks.unsqueeze(-1))

            # calculate the logits for all tokens (all question answer pairs)
            logits = model(**inputs).logits
            # sum the log-probabilities over each pair so that each pair has one score;
            # mask is zero for question and padding tokens
            logprobs = get_logprobs(logits, inputs['input_ids'], masks).sum(-1).detach().cpu().numpy()
        output_logprobs.extend(logprobs)

    return calc_acc(labels, output_logprobs)
    
# Load the test set, score every (question, answer) pair and report accuracy.
questions, choices, labels = load_mmlu_sentences(path_test_set)
acc, model_answer = evaluation(model, tokenizer, questions, choices, labels, batch_size=batch_size)
print(f"Accuracy: {acc:.3f}")
# Show the predicted option indices for the first two questions.
print("model answer example: {}".format(model_answer[0:2]))

Example question: ['Question: What is the value of p in 24 = 2p?', 'Question: What is the value of p in 24 = 2p?', 'Question: What is the value of p in 24 = 2p?', 'Question: What is the value of p in 24 = 2p?', 'Question: Ms. Perez drove a total of 40 miles in 5 days. She drove the same number of miles each day. How many miles did Ms. Perez drive each day?']
Example choice: ['Answer: p = 4', 'Answer: p = 8', 'Answer: p = 12', 'Answer: p = 24', 'Answer: 5']
Example label: [2, 2, 3, 1, 1]


24it [00:13,  1.76it/s]                        

Accuracy: 0.585
model answer example: [2 2]



