In [None]:
# Uncomment the install line below and run this cell to set up the required environment
# !pip install torch transformers datasets accelerate bitsandbytes

In [None]:
# set up packages and hyperparams 
import ast
import gc
import traceback

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Model ids passed to `from_pretrained`; add or remove entries as needed.
models = [
    "bigcode/starcoder2-15b",
    "codellama/CodeLlama-34b-Instruct-hf",
    "deepseek-ai/deepseek-coder-33b-instruct",
    "WizardLM/WizardCoder-Python-34B-V1.0",
    "replit/replit-code-v1.5-3b",
]

# Benchmark dataset id handed to `load_dataset`; change it or add more.
dataset_name = "openai_humaneval"

# Upper bound on the number of tokens generated per prompt.
max_new_tokens = 512

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# function to validate the results

def safe_exec(code_string, inputs=(), expected_output=None):
    """Execute ``code_string`` and check the result of the function it defines.

    The code is run in a fresh namespace, the first callable it defines is
    called with ``inputs``, and the return value is compared with
    ``expected_output``.

    Args:
        code_string: Python source that defines at least one callable.
        inputs: Positional arguments passed to the selected callable.
        expected_output: Value the callable's result is compared against.

    Returns:
        The result of ``output == expected_output`` on success; ``False`` if
        the code raises, fails to compile, or defines no callable.

    SECURITY: ``exec`` runs the code with the full privileges of this process
    and there is no timeout. Only use it on code you are willing to run, or
    move execution into an isolated sandbox.
    """
    module_name = "__safe_exec__"
    # A single dict is used as both globals and locals. With two separate
    # dicts the defined functions would get the (empty) globals dict and could
    # not see imports, helper functions, or even themselves for recursion.
    namespace = {"__name__": module_name}
    try:
        exec(code_string, namespace)

        callables = [
            value
            for name, value in namespace.items()
            if name not in ("__name__", "__builtins__") and callable(value)
        ]
        # Prefer callables created by the executed code itself, so an imported
        # name such as `typing.List` or `math.floor` is not mistaken for the
        # function under test. Fall back to any callable (e.g. `f = len`).
        defined_here = [
            value for value in callables
            if getattr(value, "__module__", None) == module_name
        ]
        candidates = defined_here or callables
        if not candidates:
            print("Execution error: code did not define a callable")
            return False

        func = candidates[0]
        output = func(*inputs)
        return output == expected_output
    except Exception as e:
        print("Execution error:", e)
        traceback.print_exc()
        return False

In [None]:
# function to run evaluation of LLM generated code - executes each sample's test harness

# Strings that mark the start of new top-level code after the function body
# being completed; everything from the earliest match onwards is discarded.
_STOP_SEQUENCES = ("\nclass", "\ndef", "\n#", "\nif", "\nprint")


def _truncate_completion(completion):
    """Cut ``completion`` at the earliest stop sequence, keeping only the function body."""
    cut = len(completion)
    for stop in _STOP_SEQUENCES:
        index = completion.find(stop)
        if index != -1:
            cut = min(cut, index)
    return completion[:cut]


def _run_test_program(program):
    """Execute a complete test program; return True if it finishes without raising."""
    # SECURITY: this runs model-generated code inside the notebook process with
    # no sandbox and no timeout. Run only in a disposable environment.
    # A single fresh dict is used as the namespace so the candidate function,
    # its imports and the test harness can all see each other.
    namespace = {}
    try:
        exec(program, namespace)
    except (Exception, SystemExit) as e:
        # SystemExit is included so generated code calling exit() counts as a
        # failure instead of aborting the whole evaluation.
        print("Execution error:", repr(e))
        return False
    return True


def evaluate_model(model, tokenizer, dataset, max_samples=20):
    """Compute greedy Pass@1 of ``model`` on HumanEval-style samples.

    For every sample the model completes ``sample["prompt"]``; the prompt, the
    completion, ``sample["test"]`` and a call ``check(<entry_point>)`` are
    joined into one program, and the sample passes if that program runs
    without raising.

    The previous implementation looked for top-level ``assert`` statements in
    the test code. HumanEval keeps its asserts inside ``def check(candidate)``,
    so no test was ever run and every sample was counted as correct.

    Args:
        model: Causal LM exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``.
        dataset: Iterable of dicts with ``"prompt"``, ``"test"`` and
            ``"entry_point"`` keys.
        max_samples: Evaluate at most this many samples; ``None`` evaluates
            the whole dataset. Defaults to 20.

    Returns:
        Fraction of evaluated samples that passed, or ``0.0`` if no sample
        was evaluated.

    Reads the notebook-level ``max_new_tokens`` setting.
    """
    n_total = 0
    n_correct = 0

    for sample in dataset:
        if max_samples is not None and n_total >= max_samples:
            break

        prompt = sample["prompt"]

        # Send inputs to wherever the model actually lives; with
        # device_map="auto" that need not match a globally chosen device.
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

        # Slice off the prompt by token count: decoding the prompt tokens is
        # not guaranteed to reproduce the prompt string character for character.
        prompt_length = inputs["input_ids"].shape[1]
        completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
        completion = _truncate_completion(completion)

        # prompt (signature + docstring) + completion (body) form the candidate
        # function; the test code defines check(candidate), which is then called.
        program = "\n\n".join([
            prompt + completion,
            sample["test"],
            f"check({sample['entry_point']})",
        ]) + "\n"

        if _run_test_program(program):
            n_correct += 1

        n_total += 1

        if n_total % 5 == 0:
            print(f"Samples evaluated: {n_total} - Current Pass@1: {n_correct}/{n_total} = {n_correct/n_total:.2f}")

    # Guard against an empty dataset (or max_samples=0).
    if n_total == 0:
        return 0.0
    return n_correct / n_total

In [None]:
# Load the benchmark once; every model is scored on the same split.
dataset = load_dataset(dataset_name, split="test")

# Quantization for low GPU resources. Remove to run full precision inference
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

# for each llm in the list defined above
for model_id in models:
    # trust_remote_code=True executes code shipped with the model repository;
    # only list models from sources you trust.
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id, 
        trust_remote_code=True,
        quantization_config=bnb_config,
        device_map="auto"
    )
    model.eval()

    # Evaluate
    score = evaluate_model(model, tokenizer, dataset)
    print(f"\n✅ {model_id} achieved Pass@1 score: {score:.4f} on first 20 samples.")

    # Clean up memory before the next (large) model is loaded.
    # `del` only drops this name; if the model is part of a reference cycle
    # its weights stay allocated until the garbage collector runs, so collect
    # explicitly and only then release PyTorch's cached CUDA blocks.
    del model
    gc.collect()
    torch.cuda.empty_cache()
