# deepeval-mmlu

Code reproduced from https://www.datacamp.com/tutorial/deepeval; consult the source for extended comments and explanation.

This notebook demonstrates how to configure, load, and evaluate a custom large language model (LLM) using the DeepEval library on the MMLU (Massive Multitask Language Understanding) benchmark. The focus is on benchmarking the performance of a quantized version of the Qwen2.5 7B Instruct model, optimized for efficient inference, on specific academic tasks.

In [1]:
from deepeval.models.base_model import DeepEvalBaseLLM
import torch, logging 
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

class QwenModel(DeepEvalBaseLLM):
    """DeepEval adapter around an already-loaded causal LM and its tokenizer.

    The wrapper answers multiple-choice prompts by generating at most two new
    tokens and returning them as a stripped string with periods removed.
    """

    def __init__(self, model, tokenizer, device: str = "cuda"):
        """Keep references to the model and tokenizer used for generation.

        Args:
            model: Object exposing a ``generate(**inputs, ...)`` method.
            tokenizer: Tokenizer used both to encode prompts and to decode
                the generated ids.
            device: Device the encoded prompt tensors are moved to before
                generation. Defaults to ``"cuda"``.
        """
        # DeepEvalBaseLLM.__init__ is not invoked; attributes are set directly.
        self.model = model
        self.tokenizer = tokenizer
        self.device = device

    def load_model(self):
        """Return the wrapped model object."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Generate a short answer for ``prompt``.

        Args:
            prompt: Full text prompt to answer.

        Returns:
            The decoded newly generated tokens, stripped of surrounding
            whitespace and with every "." removed.
        """
        # Remove this fixed instruction sentence if present (presumably added
        # by the benchmark's prompt template -- verify against deepeval).
        prompt = prompt.replace("Output 'A', 'B', 'C', or 'D'. Full answer not needed.","")
        model_inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)
        # NOTE(review): sampling is enabled, so repeated runs may produce
        # different answers; consider greedy decoding or a fixed seed if
        # reproducible scores are required.
        generated_ids = self.model.generate(
            **model_inputs,
            max_new_tokens=2,
            do_sample=True,
            pad_token_id=self.tokenizer.eos_token_id
        )
        # Prompt length in tokens, read from the encoded tensor itself rather
        # than by integer-indexing the encoding object, which depends on the
        # tokenizer implementation.
        prompt_length = model_inputs["input_ids"].shape[1]
        # generate() returns prompt + continuation; keep only the continuation.
        generated_tokens = generated_ids[0][prompt_length:]
        # Decode with the tokenizer owned by this instance, not a notebook
        # global that may be missing or belong to a different model.
        clean_output = self.tokenizer.decode(
            generated_tokens, skip_special_tokens=True
        ).strip()
        return clean_output.replace(".","")

    async def a_generate(self, prompt: str) -> str:
        """Async entry point; delegates to the synchronous ``generate``."""
        return self.generate(prompt)

    def get_model_name(self):
        """Return the display name reported for this model."""
        return "Qwen2.5 7B"

In [2]:
def load_model(model_name: str, bits: int = 8):
    """Load a bitsandbytes-quantized causal language model.

    Args:
        model_name: Model identifier passed to
            ``AutoModelForCausalLM.from_pretrained``.
        bits: Quantization width. ``8`` (the default) loads in 8-bit;
            ``4`` loads in 4-bit using NF4, float16 compute and no double
            quantization.

    Returns:
        The loaded model, with ``config.use_cache`` set to ``False`` and
        ``config.pretraining_tp`` set to ``1``.

    Raises:
        ValueError: If ``bits`` is neither 4 nor 8.
    """
    if bits not in (4, 8):
        raise ValueError(f"bits must be 4 or 8, got {bits!r}")

    # Build one coherent config per mode: the bnb_4bit_* options are only
    # passed when 4-bit loading is actually requested.
    if bits == 8:
        quant_config = BitsAndBytesConfig(load_in_8bit=True)
    else:
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=False,
        )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_config,
        device_map="auto",
    )
    # NOTE(review): these two settings look inherited from a fine-tuning
    # recipe; confirm they are wanted for inference-only benchmarking.
    model.config.use_cache = False
    model.config.pretraining_tp = 1
    return model

def load_tokenizer(model_name):
    """Fetch the tokenizer for ``model_name`` and configure its padding.

    The end-of-sequence token doubles as the padding token, and padding is
    applied on the right-hand side.
    """
    tok = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    # Assigned left to right: pad token first, then padding side.
    tok.pad_token, tok.padding_side = tok.eos_token, "right"
    return tok

In [3]:
# Load the model and tokenizer for one checkpoint and wrap them for DeepEval.
# These names are notebook globals used by the cells that follow.
qwen_model_name = "Qwen/Qwen2.5-7B-Instruct"
# Both loaders receive the same identifier so the weights and tokenizer match.
model = load_model(qwen_model_name)
tokenizer = load_tokenizer(qwen_model_name)
# Adapter object handed to the benchmark later on.
custom_model = QwenModel(model, tokenizer)

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [4]:
# Test model generation
# Smoke test: send one hand-written multiple-choice question through the
# wrapper and print whatever short answer it returns.

# The prompt ends with "Answer:" so the model's continuation is the choice.
prompt = """
The following are multiple choice questions (with answers) about abstract algebra.

Find all c in Z_3 such that Z_3[x]/(x^2 + c) is a field.
A. 0
B. 1
C. 2
D. 3
Answer:"""
print(custom_model.generate(prompt))

B


In [5]:
from deepeval.benchmarks import MMLU
from deepeval.benchmarks.tasks import MMLUTask

# Define benchmark with specific tasks and shots
# Two MMLU subjects are evaluated, each with five in-context examples.
benchmark = MMLU(
    tasks=[MMLUTask.HIGH_SCHOOL_COMPUTER_SCIENCE, MMLUTask.ASTRONOMY],
    n_shots=5
)

# Run the evaluation with the wrapped model; the call's return value is shown
# as the cell output.
# NOTE(review): QwenModel defines no batch_generate method -- confirm how
# deepeval treats batch_size for a model without one.
benchmark.evaluate(model=custom_model, batch_size=5)

README.md:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

mmlu.py:   0%|          | 0.00/5.01k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/166M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/100 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/9 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Processing high_school_computer_science: 100%|██████████| 100/100 [00:36<00:00,  2.74it/s]


MMLU Task Accuracy (task=high_school_computer_science): 0.85


Generating test split:   0%|          | 0/152 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/16 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/5 [00:00<?, ? examples/s]

Processing astronomy: 100%|██████████| 152/152 [00:54<00:00,  2.79it/s]

MMLU Task Accuracy (task=astronomy): 0.8092105263157895
Overall MMLU Accuracy: 0.8253968253968254





0.8253968253968254

In [6]:
# Display the per-task score table stored on the benchmark object.
benchmark.task_scores

Unnamed: 0,Task,Score
0,high_school_computer_science,0.85
1,astronomy,0.809211


In [7]:
# Display the per-question table: task, input, prediction, expected output
# and a correctness flag.
benchmark.predictions

Unnamed: 0,Task,Input,Prediction,Expected Output,Correct
0,high_school_computer_science,Let x = 1. What is x << 3 in Python 3?\nA. 1\n...,C,C,1
1,high_school_computer_science,"In Python 3, which of the following function c...",A,A,1
2,high_school_computer_science,"A user enters a Web address in a browser, and ...",A,A,1
3,high_school_computer_science,Digital images are often represented by the re...,B,C,0
4,high_school_computer_science,A programmer is writing a program that is inte...,B,B,1
...,...,...,...,...,...
247,astronomy,Find the best approximation for the surface te...,A,A,1
248,astronomy,Previous IAAC rounds featured Proxima/Alpha Ce...,C,D,0
249,astronomy,How are planetary rings made?\nA. From the dis...,B,A,0
250,astronomy,The lunar maria are:\nA. ancient heavily crate...,C,C,1
