# MMLU evaluation with LightEval

Code reproduced from https://github.com/huggingface/smol-course/blob/main/4_evaluation/notebooks/lighteval_evaluate_and_analyse_your_LLM.ipynb ; refer to it for extended context and explanation.

⚠️ Models are stored in the local folder `./tmp`; you might want to get rid of it afterward.

This notebook walks through an evaluation of two language models, Qwen2-1.5B and SmolLM2-1.7B-Instruct, on a subset of tasks from the MMLU (Massive Multitask Language Understanding) benchmark. The evaluation uses LightEval, a lightweight library for assessing large language models (LLMs). By the end of the notebook, we compare the models' accuracies visually using a bar chart.

In [None]:
import lighteval
import os
from datetime import timedelta
from transformers import AutoModelForCausalLM

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.pipeline import EnvConfig, ParallelismManager, Pipeline, PipelineParameters

from dotenv import load_dotenv

# Let python-dotenv populate the process environment before the token is read.
load_dotenv()

# Hugging Face access token; None when HF_TOKEN is not set in the environment.
TOKEN = os.environ.get("HF_TOKEN")

In [2]:

# Shared LightEval environment: the access token plus the local ./tmp cache folder.
env_config = EnvConfig(cache_dir="./tmp", token=TOKEN)

# Tracker writing to ./tmp, with details saving and every push option disabled.
evaluation_tracker = EvaluationTracker(
    output_dir="./tmp",
    push_to_hub=False,
    push_to_tensorboard=False,
    public=False,
    save_details=False,
    # NOTE(review): this looks like it expects an organisation name (string);
    # False is kept unchanged here -- confirm against the LightEval API.
    hub_results_org=False,
)

# Run settings reused by every Pipeline built later in the notebook.
pipeline_params = PipelineParameters(
    env_config=env_config,
    launcher_type=ParallelismManager.ACCELERATE,
    job_id=1,
    max_samples=10,  # presumably caps the samples evaluated per task -- confirm
    num_fewshot_seeds=0,
    override_batch_size=1,
    use_chat_template=False,
)

In [None]:
def domain_tasks(n: int) -> str:
    """Build the task string for the four MMLU subsets used in this notebook.

    Every entry has the form ``leaderboard|mmlu:<subject>|<n>|0`` and the
    entries are joined with commas, in the order anatomy,
    professional_medicine, high_school_biology, high_school_chemistry.

    Args:
        n: Value written into the third ``|``-separated field of every entry
            (presumably the few-shot count in LightEval's task syntax --
            confirm against the LightEval docs).

    Returns:
        The comma-separated task specification string.
    """
    subjects = (
        "anatomy",
        "professional_medicine",
        "high_school_biology",
        "high_school_chemistry",
    )
    # The f-string performs all interpolation. The former trailing
    # ``.format(n)`` was a no-op (no replacement fields were left) and is gone.
    return ",".join(f"leaderboard|mmlu:{subject}|{n}|0" for subject in subjects)


domain_tasks(5)

In [None]:
# Value forwarded to domain_tasks(); the SmolLM2 cell below reuses the same n.
n = 10

qwen_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-1.5B")

# Evaluate the already-loaded Qwen model with the shared tracker and parameters.
pipeline = Pipeline(
    model=qwen_model,
    tasks=domain_tasks(n),
    evaluation_tracker=evaluation_tracker,
    pipeline_parameters=pipeline_params,
)

pipeline.evaluate()

# Keep the results around: the plotting cell reads qwen_results["results"].
qwen_results = pipeline.get_results()

pipeline.show_results()

In [None]:
smol_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")

# Same tasks, tracker and parameters as the Qwen run; only the model differs.
pipeline = Pipeline(
    model=smol_model,
    tasks=domain_tasks(n),
    evaluation_tracker=evaluation_tracker,
    pipeline_parameters=pipeline_params,
)

pipeline.evaluate()

# Keep the results around: the plotting cell reads smol_results["results"].
smol_results = pipeline.get_results()

pipeline.show_results()

In [None]:
import pandas as pd

# One "acc" Series per model, named after the model
# (presumably indexed by task name -- depends on the results dict layout).
df = (
    pd.DataFrame.from_records(smol_results["results"])
    .transpose()["acc"]
    .rename("SmolLM2-1.7B-Instruct")
)
_df = (
    pd.DataFrame.from_records(qwen_results["results"])
    .transpose()["acc"]
    .rename("Qwen2-1.5B")
)

# Put the two Series side by side and draw them as a horizontal bar chart.
df = pd.concat([df, _df], axis=1)
df.plot.barh()