In [1]:
!pip install datasets
!pip install evaluate
!pip install -U bitsandbytes
!pip install transformers==4.46.1

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
import torch
import evaluate
import re
import json
import time

from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import notebook_login
from datasets import load_dataset
from tqdm import tqdm
from collections import defaultdict

In [3]:
# Display the Hugging Face Hub login widget (imported from huggingface_hub).
# NOTE(review): presumably required because the meta-llama checkpoint loaded
# in main() needs an authenticated Hub session — confirm.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
def load_model_and_tokenizer(model_name, load_in_float16=False):
    """Load a causal language model together with its tokenizer.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        load_in_float16: When true, the weights are requested in
            ``torch.float16``; otherwise the library default dtype is used.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is in eval mode and has been
        moved to CUDA when available, otherwise to the CPU. A tokenizer that
        has no pad token reuses its EOS token for padding.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Only pass torch_dtype when half precision was explicitly requested.
    dtype_kwargs = {'torch_dtype': torch.float16} if load_in_float16 else {}
    model = AutoModelForCausalLM.from_pretrained(model_name, **dtype_kwargs)

    model.eval()

    target_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(target_device)

    # Batched tokenization needs a pad token; fall back to EOS if none is set.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer


def parse_mmlu_example(example):
    """Convert one raw MMLU row into the record layout used by this notebook.

    Args:
        example: Mapping with the keys ``question``, ``choices``, ``answer``
            (an integer index into the choices) and ``subject``.

    Returns:
        A dict with ``question`` (whitespace-stripped), ``options``,
        ``answer`` (the letter A-D for indices 0-3, otherwise ``"N/A"``)
        and ``subject``.
    """
    question_text = example['question'].strip()
    choices = example['choices']
    gold_index = example['answer']
    subject_name = example['subject']

    letters = ('A', 'B', 'C', 'D')
    # Default to the sentinel and only overwrite it for an in-range index.
    gold_letter = "N/A"
    if 0 <= gold_index < len(letters):
        gold_letter = letters[gold_index]

    return dict(
        question=question_text,
        options=choices,
        answer=gold_letter,
        subject=subject_name,
    )


def load_and_parse_dataset(dataset_name, parse_function, split, subset=None, max_samples=None):
    """Load one split of a dataset and apply ``parse_function`` to every row.

    Args:
        dataset_name: Name passed to ``load_dataset``.
        parse_function: Per-example function handed to ``Dataset.map``.
        split: Key of the split to take from the loaded dataset.
        subset: Optional configuration name; omitted from the
            ``load_dataset`` call when ``None``.
        max_samples: Optional cap on the number of rows (taken from the start
            of the split). A falsy value keeps the whole split. A cap larger
            than the split is clamped to the split size instead of raising.

    Returns:
        The (possibly shortened) split after ``parse_function`` was mapped
        over it.
    """
    if subset is None:
        dataset = load_dataset(dataset_name)[split]
    else:
        dataset = load_dataset(dataset_name, subset)[split]

    # Cut the split down BEFORE mapping so the parse function only runs on the
    # rows that are kept, and clamp the cap so an oversized max_samples does
    # not index past the end of the split.
    if max_samples:
        row_limit = min(max_samples, len(dataset))
        dataset = dataset.select(range(row_limit))

    return dataset.map(parse_function)


def _format_user_turn(question, options):
    """Render one user turn (question, lettered options, instruction) and open the assistant turn."""
    option_lines = "".join(
        f"{label}. {option_text}\n"
        for label, option_text in zip(('A', 'B', 'C', 'D'), options)
    )
    return (
        "<|start_header_id|>user<|end_header_id|>\n\n"
        "Given the following question and four candidate answers (A, B, C, and D), choose the best answer.\n"
        f"Question: {question}\n"
        f"{option_lines}"
        'Your response should end with "The best answer is [the_answer_letter]" where [the_answer_letter] is one of A, B, C, or D.'
        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )


def _answer_letter(answer):
    """Return the option letter for an answer given either as a letter or as a 0-3 index."""
    letters = ('A', 'B', 'C', 'D')
    if isinstance(answer, int) and 0 <= answer < len(letters):
        return letters[answer]
    return answer


def create_prompt(question, options, few_shot=False, examples=None):
    """Build the Llama-3-style chat prompt for one multiple-choice question.

    Args:
        question: Question text.
        options: Answer texts; they are labelled A-D in order (extra entries
            beyond four are ignored by the zip with the four labels).
        few_shot: When true and ``examples`` is non-empty, each example is
            prepended as a solved user/assistant exchange.
        examples: Iterable of dicts with ``question``, ``options`` and
            ``answer``. ``answer`` may be a letter or an integer index 0-3.
            NOTE(review): rows read back from the parsed dataset appear to
            carry the answer as an integer (main() scores that column against
            0-3 indices), which is why indices are accepted here — confirm.

    Returns:
        The prompt string. It ends with the assistant header followed by
        ``"The best answer is"`` so the model only has to continue with the
        letter.

    Every example turn uses exactly the same layout as the final question and
    each solved assistant turn is closed with ``<|eot_id|>``, the end-of-turn
    marker of the Llama 3 chat format. The zero-shot prompt is unchanged.
    """
    parts = []
    if few_shot and examples:
        for ex in examples:
            parts.append(_format_user_turn(ex['question'], ex['options']))
            parts.append(f"The best answer is {_answer_letter(ex['answer'])}<|eot_id|>")

    parts.append(_format_user_turn(question, options))
    # Pre-fill the start of the assistant reply; generation continues from here.
    parts.append("The best answer is")
    return "".join(parts)


def extract_choice(output):
    """Extract the chosen option letter from generated text.

    Looks for ``The best answer is <letter>`` (case-insensitive) and returns
    the letter in upper case.

    * The LAST occurrence wins. The text handed in may contain earlier
      occurrences (few-shot examples, an echoed prompt); the answer the model
      settles on comes last.
    * The letter must stand alone (word boundary after it). Otherwise the
      first letter of an ordinary word such as "clearly" or "based" would be
      reported as choice C or B.

    Args:
        output: Decoded model output.

    Returns:
        ``'A'``, ``'B'``, ``'C'`` or ``'D'``, or ``"N/A"`` when no answer
        phrase is found.
    """
    letters = re.findall(r'The best answer is\s*([A-D])\b', output, re.IGNORECASE)
    if not letters:
        return "N/A"
    return letters[-1].upper()


def predict_single_example(model, tokenizer, question, options, few_shot=False, examples=None, max_new_tokens=5):
    """Greedily answer one multiple-choice question.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        question: Question text.
        options: Answer texts for the question.
        few_shot: Forwarded to ``create_prompt``.
        examples: Few-shot examples forwarded to ``create_prompt``.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        ``(answer, prompt)`` where ``answer`` is the letter returned by
        ``extract_choice`` (or ``"N/A"``) and ``prompt`` is the prompt string.
    """
    prompt = create_prompt(question, options, few_shot, examples)
    # truncation is off, so no max_length is passed (it would be unused).
    inputs = tokenizer(prompt, return_tensors='pt', truncation=False).to(model.device)

    with torch.no_grad():
        # Greedy decoding. early_stopping is a beam-search option and is
        # therefore not passed with num_beams=1.
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens,
            num_beams=1,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # generate() returns prompt tokens followed by the new tokens. Parse only
    # the completion: the prompt can itself contain "The best answer is X"
    # (few-shot examples, question text), which must not be read as the
    # model's answer.
    prompt_length = inputs['input_ids'].shape[1]
    completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # The prompt ends with this cue, so re-attach it for extract_choice,
    # which looks for the full phrase.
    answer = extract_choice("The best answer is" + completion)
    return answer, prompt


def predict_batch(model, tokenizer, questions, options_list, few_shot=False, examples=None, max_new_tokens=5):
    """Greedily answer a batch of multiple-choice questions.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; must have a pad token.
        questions: Question texts.
        options_list: One list of answer texts per question.
        few_shot: Forwarded to ``create_prompt``.
        examples: Few-shot examples forwarded to ``create_prompt``.
        max_new_tokens: Upper bound on generated tokens per question.

    Returns:
        ``(answers, prompts)``: one letter (or ``"N/A"``) and one prompt
        string per question, in input order.
    """
    prompts = [create_prompt(q, o, few_shot, examples) for q, o in zip(questions, options_list)]

    # Left padding keeps every prompt flush against the generated tokens.
    inputs = tokenizer(prompts, return_tensors='pt', padding=True, padding_side='left', truncation=False).to(model.device)

    with torch.no_grad():
        # Greedy decoding. early_stopping is a beam-search option and is
        # therefore not passed with num_beams=1.
        outputs = model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_new_tokens=max_new_tokens,
            num_beams=1,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # All rows share the same padded prompt width, so the new tokens start at
    # the same column in every row. Parse only those: the prompt can itself
    # contain "The best answer is X" (few-shot examples, question text), which
    # must not be read as the model's answer.
    prompt_width = inputs['input_ids'].shape[1]
    completions = tokenizer.batch_decode(outputs[:, prompt_width:], skip_special_tokens=True)

    # Each prompt ends with this cue, so re-attach it for extract_choice,
    # which looks for the full phrase.
    answers = [extract_choice("The best answer is" + text) for text in completions]
    return answers, prompts


def compute_accuracies(predictions, labels, subjects):
    """Score predictions overall and per subject with the ``accuracy`` metric.

    Args:
        predictions: Predicted option letters; anything other than A-D is
            scored as index ``-1``.
        labels: Reference values, passed to the metric unchanged.
        subjects: Subject name of each sample, aligned with the other two.

    Returns:
        ``(overall_accuracy, subject_accuracy)`` where ``subject_accuracy``
        maps each subject (in sorted order) to its accuracy.
    """
    letter_to_index = dict(zip('ABCD', range(4)))
    predicted_indices = [letter_to_index.get(choice, -1) for choice in predictions]

    # Bucket the samples by subject.
    grouped = {}
    for pred_idx, gold, subject_name in zip(predicted_indices, labels, subjects):
        bucket = grouped.setdefault(subject_name, {'predictions': [], 'labels': []})
        bucket['predictions'].append(pred_idx)
        bucket['labels'].append(gold)

    metric = evaluate.load('accuracy')

    def _score(preds, refs):
        return metric.compute(predictions=preds, references=refs)['accuracy']

    subject_accuracy = {
        name: _score(grouped[name]["predictions"], grouped[name]["labels"])
        for name in sorted(grouped)
    }
    overall_accuracy = _score(predicted_indices, labels)

    return overall_accuracy, subject_accuracy


def save_data(file_name, predictions, labels, subjects, overall_accuracy, subject_accuracy, time):
    """Write the evaluation results to ``<file_name>.json``.

    Args:
        file_name: Output path without the ``.json`` extension.
        predictions: Per-sample predictions.
        labels: Per-sample reference labels.
        subjects: Per-sample subject names.
        overall_accuracy: Accuracy over all samples.
        subject_accuracy: Mapping of subject name to accuracy.
        time: Total run time, stored under ``total_time`` (the parameter
            shadows the ``time`` module inside this function only).
    """
    sample_records = []
    for pred, label, subj in zip(predictions, labels, subjects):
        sample_records.append({'Subject': subj, 'Prediction': pred, 'Label': label})

    payload = dict(
        overall_accuracy=overall_accuracy,
        per_subject_accuracy=subject_accuracy,
        per_sample=sample_records,
        total_time=time,
    )

    output_path = file_name + '.json'
    with open(output_path, 'w') as handle:
        json.dump(payload, handle, indent=4)


def main(
    model_name="meta-llama/Llama-3.2-1B-Instruct",
    load_in_float16=True,
    results_file_name='Llama-3.2-1B-Instruct_mmlu_evaluation_results_batched_float16',
    split='dev',
    batch_size=4,
    few_shot=False,
    num_few_shot_examples=3,
    max_samples=None,
    max_new_tokens=10,
):
    """Evaluate a causal LM on the ``cais/mmlu`` ``all`` subset and save the results.

    Called without arguments this runs the same configuration that used to be
    hard-coded in the function body.

    Args:
        model_name: Checkpoint to evaluate.
        load_in_float16: Load the weights in float16.
        results_file_name: Output path without the ``.json`` extension.
        split: Dataset split to evaluate.
        batch_size: Questions per generation call; ``1`` uses the
            single-example code path.
        few_shot: Prepend solved examples to every prompt.
        num_few_shot_examples: Number of leading rows used as few-shot
            examples. These rows are removed from the evaluated set so the
            model is not scored on questions whose answers are in its prompt.
        max_samples: Optional cap on the number of loaded rows.
        max_new_tokens: Generation budget per question, used by both code
            paths so results do not depend on the batch size.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model, tokenizer = load_model_and_tokenizer(model_name, load_in_float16=load_in_float16)

    dataset = load_and_parse_dataset('cais/mmlu', parse_function=parse_mmlu_example, split=split, subset='all', max_samples=max_samples)

    if few_shot:
        shot_count = min(num_few_shot_examples, len(dataset))
        few_shot_examples = [dataset[i] for i in range(shot_count)]
        # Hold the few-shot rows out of the evaluation to avoid leakage.
        dataset = dataset.select(range(shot_count, len(dataset)))
    else:
        few_shot_examples = None

    predictions = []
    labels = []
    subjects = []

    start_time = time.time()

    if batch_size == 1:
        for i in tqdm(range(len(dataset)), desc="Evaluating"):
            # Direct row access; no need to build a one-row dataset per item.
            example = dataset[i]

            predicted_answer, _ = predict_single_example(
                model=model,
                tokenizer=tokenizer,
                question=example['question'],
                options=example['options'],
                few_shot=few_shot,
                examples=few_shot_examples,
                max_new_tokens=max_new_tokens
            )

            predictions.append(predicted_answer)
            labels.append(example['answer'])
            subjects.append(example['subject'])

            torch.cuda.empty_cache()

    else:
        print(f'Batch size {batch_size}')
        for i in tqdm(range(0, len(dataset), batch_size), desc="Evaluating"):
            end_idx = min(i + batch_size, len(dataset))
            batch = dataset.select(range(i, end_idx))

            predicted_answers, _ = predict_batch(
                model=model,
                tokenizer=tokenizer,
                questions=batch['question'],
                options_list=batch['options'],
                few_shot=few_shot,
                examples=few_shot_examples,
                max_new_tokens=max_new_tokens
            )

            predictions.extend(predicted_answers)
            labels.extend(batch['answer'])
            subjects.extend(batch['subject'])

            torch.cuda.empty_cache()

    end_time = time.time()
    total_time = end_time - start_time
    print(f"\nTotal time: {total_time:.2f} seconds")

    # Index form of the predictions for the saved file; -1 marks outputs from
    # which no answer letter could be extracted.
    option_label_to_index = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
    predictions_mapped = [option_label_to_index.get(pred, -1) for pred in predictions]

    unique_values = set(predictions_mapped)
    print("Unique values in predictions_mapped:", unique_values)

    overall_accuracy, subject_accuracy = compute_accuracies(predictions, labels, subjects)

    save_data(results_file_name, predictions_mapped, labels, subjects, overall_accuracy, subject_accuracy, total_time)

    print("\n\nAccuracy by subject:")
    for subject, accuracy in subject_accuracy.items():
        print(f"Subject: {subject}, Accuracy: {accuracy * 100:.2f}%")

    print(f"\n\nOverall Accuracy: {overall_accuracy * 100:.2f}%")

# Entry point: run the evaluation when this code is executed directly
# (__name__ is "__main__"); main() is not called when it is imported.
if __name__ == "__main__":
    main()

Batch size 4


Evaluating: 100%|██████████| 72/72 [00:10<00:00,  6.86it/s]



Total time: 10.50 seconds
Unique values in predictions_mapped: {0, 1, 2, 3}


Accuracy by subject:
Subject: abstract_algebra, Accuracy: 60.00%
Subject: anatomy, Accuracy: 40.00%
Subject: astronomy, Accuracy: 20.00%
Subject: business_ethics, Accuracy: 80.00%
Subject: clinical_knowledge, Accuracy: 80.00%
Subject: college_biology, Accuracy: 40.00%
Subject: college_chemistry, Accuracy: 20.00%
Subject: college_computer_science, Accuracy: 40.00%
Subject: college_mathematics, Accuracy: 60.00%
Subject: college_medicine, Accuracy: 80.00%
Subject: college_physics, Accuracy: 20.00%
Subject: computer_security, Accuracy: 60.00%
Subject: conceptual_physics, Accuracy: 40.00%
Subject: econometrics, Accuracy: 40.00%
Subject: electrical_engineering, Accuracy: 20.00%
Subject: elementary_mathematics, Accuracy: 80.00%
Subject: formal_logic, Accuracy: 40.00%
Subject: global_facts, Accuracy: 60.00%
Subject: high_school_biology, Accuracy: 60.00%
Subject: high_school_chemistry, Accuracy: 0.00%
Subject: high_s