In [5]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import numpy as np


In [7]:
# Load the MMLU dataset from Hugging Face
mmlu = load_dataset("cais/mmlu", "all")  # This loads all MMLU tasks

# Model and tokenizer setup
checkpoint = "facebook/layerskip-llama3.2-1B"
device = "cuda" if torch.cuda.is_available() else "cpu"

In [12]:
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, 
    device_map="auto", 
    torch_dtype=torch.bfloat16
).to(device)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Fix: Set pad_token to eos_token
tokenizer.pad_token = tokenizer.eos_token  # This sets eos_token as pad_token
model.config.pad_token_id = model.config.eos_token_id  # Set pad_token_id for the model config


In [13]:
# Settings
generation_max_tokens = 32  # Adjust based on needs
batch_size = 8  # Batch size of 8

In [14]:
# Function to generate answers in batches
def generate_answers(batch_prompts):
    inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to(device)
    
    with torch.no_grad(), torch.cuda.amp.autocast():  # Use mixed precision
        outputs = model.generate(**inputs, max_new_tokens=generation_max_tokens)

    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [15]:
# Run the model on each task
for task in mmlu:
    task_data = mmlu[task]
    print(f"Evaluating task: {task}")
    
    correct_predictions = 0
    total_predictions = len(task_data)

    # Use a DataLoader for efficient batching
    dataloader = DataLoader(task_data, batch_size=batch_size, shuffle=False)

    for batch in dataloader:
        batch_prompts = batch["question"]  # Assuming MMLU dataset format

        batch_answers = generate_answers(batch_prompts)

        # Check correctness
        correct_predictions += sum([1 for pred, gt in zip(batch_answers, batch["answer"]) if pred == gt])

    accuracy = correct_predictions / total_predictions
    print(f"Task {task} Accuracy: {accuracy * 100:.2f}%")

  with torch.no_grad(), torch.cuda.amp.autocast():  # Use mixed precision
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Evaluating task: test


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

KeyboardInterrupt: 