In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to local .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import matplotlib.pyplot as plt
import torch
import torch.nn.functional as F
import pandas as pd
import gc
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --- Model ---------------------------------------------------------------
# Checkpoint identifier handed to both from_pretrained calls below.
model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.1"
# Weights are loaded in float16 and placed via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    device_map="auto",
    torch_dtype=torch.float16,
)  # optionally: token=access_token

# --- Tokenizer -----------------------------------------------------------
# The fast tokenizer is used unless the checkpoint reports the
# LlamaForCausalLM architecture.
use_fast_tokenizer = "LlamaForCausalLM" not in model.config.architectures
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    legacy=False,
    padding_side="left",
    use_fast=use_fast_tokenizer,
)  # optionally: token=access_token
# Batches are tokenized with padding=True further down, so a pad id is
# assigned explicitly here (token id 0).
tokenizer.pad_token_id = 0

Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.09s/it]


In [27]:
# Evaluation parameters (edit these before running the cells below).
batch_size = 64 # number of prompts sent through the model per forward pass; 64 for real test
path_test_set = "../../data/MMLU/data/test/elementary_mathematics_test.csv" # CSV read with header=None by load_mmlu_sentences; change this to your test set

In [30]:
def batchify(lst, batch_size):
    """Lazily split ``lst`` into consecutive slices of at most ``batch_size`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of
    ``batch_size``; an empty ``lst`` yields nothing.
    """
    offsets = range(0, len(lst), batch_size)
    yield from (lst[begin:begin + batch_size] for begin in offsets)

def decompose_df(df):
    """Build prompt strings and integer labels from a positional MMLU-style table.

    Columns are read by position: 0 is the question, 1-4 are the texts of
    choices A-D, and 5 is the answer letter.

    Args:
        df: pandas DataFrame with at least six columns laid out as above.

    Returns:
        A tuple ``(inputs, answers)`` where ``inputs`` is a list of prompt
        strings and ``answers`` is a list of ints obtained as
        ``ord(letter) - ord('A')`` (so 'A' -> 0, 'B' -> 1, 'C' -> 2, 'D' -> 3).

    Side effects:
        Prints the first two prompts and the first two answers (as letters
        and as numbers) for a quick visual check.
    """
    # Cast the text columns to str before concatenating: pandas infers a
    # numeric dtype for a column that holds only numbers (e.g. every choice
    # "A" is a plain integer), and str + int concatenation would raise.
    question, choice_a, choice_b, choice_c, choice_d = (
        df.iloc[:, col].astype(str) for col in range(5)
    )
    inputs = (
        "Question: " + question
        + " Choice: A: " + choice_a
        + ", B: " + choice_b
        + ", C: " + choice_c
        + ", D: " + choice_d
        + ". Please provide only the choice and answer as succinctly as possible. Answer:"
    ).tolist()
    print(inputs[0:2])

    answers = df.iloc[:, 5].tolist()
    print("Example answer: {}".format(answers[0:2]))
    # Map the answer letters "A, B, C, D" to the indices "0, 1, 2, 3".
    answers = [ord(letter) - ord('A') for letter in answers]
    print("Answer in number: {}".format(answers[0:2]))
    return inputs, answers

def load_mmlu_sentences(path_test_set):
    """Read a headerless MMLU CSV and turn it into prompts and integer labels.

    Args:
        path_test_set: path to a CSV file without a header row; the column
            layout is the one expected by ``decompose_df``.

    Returns:
        The ``(inputs, answers)`` tuple produced by ``decompose_df``.
    """
    # Read every cell verbatim as text:
    #  * dtype=str stops pandas from inferring numeric dtypes for columns
    #    whose values all look like numbers;
    #  * keep_default_na=False stops cell text such as "None", "NA", "N/A" or
    #    "null" from being silently replaced by NaN, which would corrupt the
    #    prompt built from that row.
    df = pd.read_csv(path_test_set, header=None, dtype=str, keep_default_na=False)
    return decompose_df(df)

def prepare_decoder_only_inputs(inputs, tokenizer):
    """Tokenize a batch of prompts into left-padded PyTorch tensors.

    Note that this sets ``tokenizer.padding_side`` to ``"left"`` on the
    tokenizer object that is passed in.

    Args:
        inputs: the batch of prompt strings to encode.
        tokenizer: callable tokenizer; invoked with ``padding=True``,
            ``truncation=False`` and ``return_tensors="pt"``.

    Returns:
        The tokenizer's output with any ``token_type_ids`` entry removed.
    """
    tokenizer.padding_side = "left"
    encoded = tokenizer(
        inputs,
        return_tensors="pt",
        truncation=False,
        padding=True,
    )  # return_attention_mask=True could be passed explicitly

    # token_type_ids is dropped to save space.
    encoded.pop("token_type_ids", None)
    return encoded

In [31]:
def evaluation(model, tokenizer, inputs, answers, batch_size=128):
    """Measure multiple-choice accuracy from the model's next-token scores.

    Each prompt is run through the model, the logits at the last sequence
    position are converted to log-probabilities, and the entries for the
    choice-letter tokens are compared. The letter with the highest
    log-probability is the prediction.

    Args:
        model: causal LM whose forward call returns an object with ``.logits``.
        tokenizer: tokenizer matching ``model``; used both to encode the
            prompts and to look up the ids of the choice-letter tokens.
        inputs: list of prompt strings.
        answers: list of integer labels (0-3), aligned with ``inputs``.
        batch_size: number of prompts per forward pass.

    Returns:
        The fraction of prompts whose predicted choice equals the label.

    Side effects:
        Switches ``model`` to eval mode, shows a progress bar, and prints the
        first two raw predictions and the first two labels.
    """
    gc.collect()

    model.eval()
    # For choice tokens, we want both " A" and "A".
    # The tokens to be extracted depend on the separator used by the tokenizer;
    # see https://huggingface.co/docs/transformers/v4.18.0/en/tokenizer_summary
    # ("SentencePiece") for more details.
    tokens_be_extracted = ['▁A', '▁B', '▁C', '▁D', 'A', 'B', 'C', 'D']
    tokens_index = [tokenizer.convert_tokens_to_ids(token) for token in tokens_be_extracted]
    # The list holds two spellings per choice, so half its length is the
    # number of distinct choices.
    num_choices = len(tokens_be_extracted) // 2

    # Ceiling division: a trailing partial batch is still one iteration, so the
    # progress bar total must count it.
    num_batches = (len(inputs) + batch_size - 1) // batch_size

    # Collect the choice-token log-probabilities of every sample.
    log_probs_extracted_all = []
    with torch.no_grad():
        for input_batch in tqdm(batchify(inputs, batch_size), total=num_batches):
            packed_inputs = prepare_decoder_only_inputs(input_batch, tokenizer)
            logits = model(**packed_inputs).logits
            # Use the last position's logits (the prediction for the next
            # token). Indexing with [:, -1, :] already removes the sequence
            # axis; no squeeze() here, because squeeze() would also remove the
            # batch axis for a single-sample batch and break the column
            # indexing below.
            next_token_logits = logits[:, -1, :]
            # Normalize in float32 so half-precision rounding cannot merge
            # nearly equal choice scores into ties.
            log_probs = F.log_softmax(next_token_logits.float(), dim=-1)
            # Keep only the columns for "A", "B", "C", "D" (both spellings).
            log_probs_extracted_all.append(log_probs[:, tokens_index])

    # Calculate the accuracy based on the log probability.
    log_probs_extracted_all = torch.cat(log_probs_extracted_all, dim=0)
    choice_all = log_probs_extracted_all.cpu().numpy().argmax(axis=1)
    # Printed before folding, so the value is an index into tokens_be_extracted.
    print("model answer example: {}".format(choice_all[0:2]))
    # Fold both spellings (" A" and "A") onto the same choice index.
    choice_all %= num_choices
    print("Ground truth answer example: {}".format(answers[:2]))
    acc = (choice_all == answers[:len(choice_all)]).mean()
    return acc

# Build the prompts and labels from the configured CSV, score them with the
# loaded model, and report the accuracy to three decimals.
inputs, answers = load_mmlu_sentences(path_test_set)
acc = evaluation(model, tokenizer, inputs, answers, batch_size=batch_size)
print(f"Accuracy: {acc:.3f}")

['Question: What is the value of p in 24 = 2p? Choice: A: p = 4, B: p = 8, C: p = 12, D: p = 24. Please provide only the choice and answer as succinctly as possible. Answer:', 'Question: Ms. Perez drove a total of 40 miles in 5 days. She drove the same number of miles each day. How many miles did Ms. Perez drive each day? Choice: A: 5, B: 7, C: 8, D: 9. Please provide only the choice and answer as succinctly as possible. Answer:']
Answer: ['C', 'C']
Answer in number: [2, 2]


6it [00:09,  1.54s/it]                       

model answer example: [1 2]
Ground truth answer example: [2, 2]
Accuracy: 0.352



