# HumanEval Evaluation: Qwen2.5-Coder-7B-Instruct (Error-Aware Prompt)

This notebook evaluates `Qwen/Qwen2.5-Coder-7B-Instruct` on HumanEval using pass@1 metric.

**This version uses an ERROR-AWARE PROMPT** designed to prevent syntax errors.

**Features:**
- Calculate pass@1 on all 164 HumanEval problems
- Log prompts and generated outputs to a separate txt file
- Calculate error percentages: syntax errors, runtime errors, incomplete code errors
- Track which problem numbers execute successfully

## 1. Setup & Dependencies

In [1]:
# Install required packages
!pip install -q transformers accelerate bitsandbytes torch
!pip install -q human-eval

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m111.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m91.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import json
import os
import re
import ast
import traceback
from typing import Dict, List, Tuple, Any
from collections import defaultdict
import signal
from contextlib import contextmanager

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from human_eval.data import read_problems, write_jsonl

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.6.0+cu124
CUDA available: True
GPU: Tesla T4


## 2. Load Qwen2.5-Coder-7B-Instruct Model

In [3]:
MODEL_NAME = "Qwen/Qwen2.5-Coder-7B-Instruct"

# 4-bit quantization config for memory efficiency
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

print(f"Loading model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True
)
model.eval()
print("Model loaded successfully!")

Loading model: Qwen/Qwen2.5-Coder-7B-Instruct


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

2026-01-02 06:25:07.516177: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767335107.735961      20 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767335107.841063      20 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.33G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

Model loaded successfully!


## 3. Load HumanEval Dataset

In [4]:
# Load HumanEval problems
problems = read_problems()
print(f"Loaded {len(problems)} HumanEval problems")

# Display sample problem
sample_id = "HumanEval/0"
print(f"\nSample Problem ({sample_id}):")
print("="*50)
print(problems[sample_id]["prompt"][:500])

Loaded 164 HumanEval problems

Sample Problem (HumanEval/0):
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """



## 4. Error-Aware Prompt Template & Generation Functions

Using Qwen2.5-Coder chat template format for code completion with ERROR-AWARE prompting.

In [5]:
def create_prompt(problem_prompt: str) -> str:
    """
    Create the ERROR-AWARE prompt for Qwen2.5-Coder-7B-Instruct.
    Uses the chat template format appropriate for this model.
    """
    instruction = f"""Complete the following Python function. Only output the function body code that should be added after the function signature. Do not include the function signature or any explanations.

CRITICAL REQUIREMENTS:
1. Output ONLY valid Python code wrapped in ```python and ``` markers
2. Use proper indentation (4 spaces per level) for all nested structures
3. Do NOT include the function signature - only the body
4. Do NOT include any explanations or comments outside the code block
5. Ensure all control structures (if/for/while) have properly indented bodies

{problem_prompt}"""
    
    # Qwen2.5-Coder uses ChatML format
    messages = [
        {"role": "system", "content": "You are a helpful coding assistant. Complete the given Python function accurately. Only provide the function body code wrapped in ```python code blocks, no explanations. Ensure proper indentation for all nested structures."},
        {"role": "user", "content": instruction}
    ]
    
    formatted_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    
    return formatted_prompt


def generate_completion(prompt: str, max_new_tokens: int = 512) -> str:
    """
    Generate code completion using Qwen2.5-Coder-7B-Instruct.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # Greedy decoding for pass@1
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            temperature=None,
            top_p=None,
        )
    
    # Decode only the generated part
    generated_tokens = outputs[0][inputs['input_ids'].shape[1]:]
    completion = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    
    return completion


def extract_function_body(completion: str, prompt: str) -> str:
    """
    Extract and clean the function body from the completion.
    CRITICAL: Adds proper indentation so the code becomes part of the function body.
    """
    # Remove markdown code blocks if present
    completion = re.sub(r'```python\n?', '', completion)
    completion = re.sub(r'```\n?', '', completion)
    
    # Stop at common end markers
    stop_sequences = ["\n\n\n", "\n###", "\nif __name__", "\n# Test", "\n# Example", "\n\n#"]
    for stop in stop_sequences:
        if stop in completion:
            completion = completion[:completion.index(stop)]
    
    completion = completion.strip()
    
    # CRITICAL FIX: Add proper indentation to each line
    # The model outputs code without indentation, but it needs to be inside the function
    lines = completion.split('\n')
    indented_lines = []
    for line in lines:
        # If line is not empty and doesn't already have proper indentation, add 4 spaces
        if line.strip():  # Non-empty line
            # Check if line already starts with spaces (model sometimes includes indentation)
            if line.startswith('    '):  # Already indented
                indented_lines.append(line)
            else:
                indented_lines.append('    ' + line)
        else:  # Empty line
            indented_lines.append('')
    
    return '\n'.join(indented_lines)


print("Prompt generation functions defined.")

Prompt generation functions defined.


## 5. Execution & Error Classification

In [6]:
class TimeoutException(Exception):
    pass


@contextmanager
def time_limit(seconds: int):
    """
    Context manager to limit execution time.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Execution timed out!")
    
    old_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    
    try:
        yield
    finally:
        signal.alarm(0)
        signal.signal(signal.SIGALRM, old_handler)


def classify_error(code: str, test_code: str, entry_point: str) -> Tuple[str, str]:
    """
    Execute code and classify any errors.
    Returns: (status, error_message)
    Status can be: 'pass', 'syntax_error', 'runtime_error', 'incomplete_code', 'timeout'
    """
    # First check for syntax errors
    try:
        ast.parse(code)
    except SyntaxError as e:
        return 'syntax_error', str(e)
    
    # Check if the entry point function exists in code
    if f"def {entry_point}" not in code:
        return 'incomplete_code', f"Function '{entry_point}' not found in generated code"
    
    # Try to execute with tests
    full_code = code + "\n" + test_code
    
    try:
        ast.parse(full_code)
    except SyntaxError as e:
        return 'syntax_error', str(e)
    
    # Execute with timeout
    try:
        exec_globals = {}
        with time_limit(5):
            exec(full_code, exec_globals)
        return 'pass', ''
    except TimeoutException:
        return 'timeout', 'Execution timed out (>5 seconds)'
    except AssertionError as e:
        return 'runtime_error', f"Assertion failed: {str(e)}"
    except Exception as e:
        error_type = type(e).__name__
        return 'runtime_error', f"{error_type}: {str(e)}"


print("Execution and error classification functions defined.")

Execution and error classification functions defined.


## 6. Run Evaluation on All 164 Problems

In [7]:
# Initialize tracking variables
results = []
error_counts = defaultdict(int)
successful_problems = []
failed_problems = defaultdict(list)

# Output log file for prompts and generated outputs
output_log_path = "output_log_qwen25_coder_7b_error_aware_all.txt"

print(f"Starting Qwen2.5-Coder-7B-Instruct evaluation on {len(problems)} problems...")
print("="*60)

with open(output_log_path, 'w', encoding='utf-8') as log_file:
    log_file.write("HumanEval Evaluation Log - Qwen2.5-Coder-7B-Instruct (Error-Aware Prompt)\n")
    log_file.write(f"Model: {MODEL_NAME}\n")
    log_file.write("="*60 + "\n\n")
    
    for i, (task_id, problem) in enumerate(problems.items()):
        problem_num = int(task_id.split("/")[1])
        prompt = problem["prompt"]
        test_code = problem["test"]
        entry_point = problem["entry_point"]
        
        # Create prompt and generate completion
        formatted_prompt = create_prompt(prompt)
        completion = generate_completion(formatted_prompt)
        cleaned_completion = extract_function_body(completion, prompt)
        
        # Combine prompt with completion for full code
        # The prompt ends with the function signature, cleaned_completion is now properly indented
        full_code = prompt + "\n" + cleaned_completion
        
        # Classify result
        status, error_msg = classify_error(full_code, test_code, entry_point)
        
        # Track results
        error_counts[status] += 1
        
        if status == 'pass':
            successful_problems.append(problem_num)
        else:
            failed_problems[status].append(problem_num)
        
        results.append({
            "task_id": task_id,
            "problem_num": problem_num,
            "status": status,
            "error": error_msg,
            "completion": cleaned_completion,
            "prompt_type": "error_aware"
        })
        
        # Write to log file
        log_file.write(f"{'='*60}\n")
        log_file.write(f"PROBLEM {problem_num}: {task_id}\n")
        log_file.write(f"STATUS: {status.upper()}\n")
        log_file.write(f"{'='*60}\n\n")
        log_file.write(f"--- ORIGINAL HUMANEVAL PROMPT ---\n")
        log_file.write(f"{prompt}\n\n")
        log_file.write(f"--- FORMATTED PROMPT TO MODEL ---\n")
        log_file.write(f"{formatted_prompt}\n\n")
        log_file.write(f"--- RAW MODEL OUTPUT ---\n")
        log_file.write(f"{completion}\n\n")
        log_file.write(f"--- CLEANED COMPLETION (WITH INDENTATION) ---\n")
        log_file.write(f"{cleaned_completion}\n\n")
        log_file.write(f"--- FULL GENERATED CODE ---\n")
        log_file.write(f"{full_code}\n\n")
        if error_msg:
            log_file.write(f"--- ERROR MESSAGE ---\n")
            log_file.write(f"{error_msg}\n\n")
        log_file.write("\n")
        
        # Progress update
        if (i + 1) % 10 == 0 or (i + 1) == len(problems):
            print(f"Progress: {i+1}/{len(problems)} | Current pass rate: {error_counts['pass']/(i+1)*100:.1f}%")

print("\nEvaluation complete!")
print(f"Output log saved to: {output_log_path}")

The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Starting Qwen2.5-Coder-7B-Instruct evaluation on 164 problems...


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 10/164 | Current pass rate: 90.0%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 20/164 | Current pass rate: 90.0%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 30/164 | Current pass rate: 90.0%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 40/164 | Current pass rate: 90.0%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 50/164 | Current pass rate: 92.0%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 60/164 | Current pass rate: 93.3%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 70/164 | Current pass rate: 94.3%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 80/164 | Current pass rate: 95.0%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 90/164 | Current pass rate: 95.6%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 100/164 | Current pass rate: 96.0%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 110/164 | Current pass rate: 96.4%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 120/164 | Current pass rate: 96.7%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 130/164 | Current pass rate: 96.9%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 140/164 | Current pass rate: 97.1%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 150/164 | Current pass rate: 97.3%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=

Progress: 160/164 | Current pass rate: 97.5%


The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Progress: 164/164 | Current pass rate: 97.0%

Evaluation complete!
Output log saved to: output_log_qwen25_coder_7b_error_aware_all.txt


## 7. Calculate & Display Results

In [8]:
# Calculate metrics
total_problems = len(problems)
passed = error_counts['pass']
syntax_errors = error_counts['syntax_error']
runtime_errors = error_counts['runtime_error']
incomplete_errors = error_counts['incomplete_code']
timeout_errors = error_counts['timeout']

pass_at_1 = (passed / total_problems) * 100
syntax_error_pct = (syntax_errors / total_problems) * 100
runtime_error_pct = (runtime_errors / total_problems) * 100
incomplete_error_pct = (incomplete_errors / total_problems) * 100
timeout_error_pct = (timeout_errors / total_problems) * 100

print("\n" + "="*60)
print("HUMANEVAL EVALUATION RESULTS - Qwen2.5-Coder-7B-Instruct")
print(f"Model: {MODEL_NAME}")
print("="*60)

print(f"\n PASS@1 METRIC:")
print(f"   Pass@1: {pass_at_1:.2f}% ({passed}/{total_problems})")

print(f"\n ERROR BREAKDOWN (on {total_problems} problems):")
print(f"    Passed:           {passed:3d} ({pass_at_1:.2f}%)")
print(f"    Syntax Errors:    {syntax_errors:3d} ({syntax_error_pct:.2f}%)")
print(f"    Runtime Errors:   {runtime_errors:3d} ({runtime_error_pct:.2f}%)")
print(f"    Incomplete Code:  {incomplete_errors:3d} ({incomplete_error_pct:.2f}%)")
print(f"    Timeouts:         {timeout_errors:3d} ({timeout_error_pct:.2f}%)")


HUMANEVAL EVALUATION RESULTS - Qwen2.5-Coder-7B-Instruct
Model: Qwen/Qwen2.5-Coder-7B-Instruct

 PASS@1 METRIC:
   Pass@1: 96.95% (159/164)

 ERROR BREAKDOWN (on 164 problems):
    Passed:           159 (96.95%)
    Syntax Errors:      5 (3.05%)
    Runtime Errors:     0 (0.00%)
    Incomplete Code:    0 (0.00%)
    Timeouts:           0 (0.00%)


In [9]:
# Display successful problem numbers
print("\n" + "="*60)
print("SUCCESSFULLY EXECUTED PROBLEMS")
print("="*60)

successful_problems_sorted = sorted(successful_problems)
print(f"\nTotal: {len(successful_problems_sorted)} problems passed")
print(f"\nProblem Numbers: {successful_problems_sorted}")


SUCCESSFULLY EXECUTED PROBLEMS

Total: 159 problems passed

Problem Numbers: [0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 29, 30, 31, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 163]


In [10]:
# Display failed problems by category
print("\n" + "="*60)
print("FAILED PROBLEMS BY CATEGORY")
print("="*60)

if failed_problems['syntax_error']:
    print(f"\n Syntax Errors ({len(failed_problems['syntax_error'])}):") 
    print(f"   {sorted(failed_problems['syntax_error'])}")

if failed_problems['runtime_error']:
    print(f"\n Runtime Errors ({len(failed_problems['runtime_error'])}):") 
    print(f"   {sorted(failed_problems['runtime_error'])}")

if failed_problems['incomplete_code']:
    print(f"\n Incomplete Code ({len(failed_problems['incomplete_code'])}):") 
    print(f"   {sorted(failed_problems['incomplete_code'])}")

if failed_problems['timeout']:
    print(f"\n  Timeouts ({len(failed_problems['timeout'])}):") 
    print(f"   {sorted(failed_problems['timeout'])}")


FAILED PROBLEMS BY CATEGORY

 Syntax Errors (5):
   [3, 11, 28, 32, 162]


In [11]:
# Save summary
summary_path = "evaluation_summary_qwen25_coder_7b_error_aware_all.txt"

with open(summary_path, 'w') as f:
    f.write("HUMANEVAL EVALUATION SUMMARY - Qwen2.5-Coder-7B-Instruct\n")
    f.write(f"Model: {MODEL_NAME}\n")
    f.write("="*60 + "\n\n")
    
    f.write(f"PASS@1: {pass_at_1:.2f}% ({passed}/{total_problems})\n\n")
    
    f.write("ERROR BREAKDOWN:\n")
    f.write(f"  - Passed:          {passed} ({pass_at_1:.2f}%)\n")
    f.write(f"  - Syntax Errors:   {syntax_errors} ({syntax_error_pct:.2f}%)\n")
    f.write(f"  - Runtime Errors:  {runtime_errors} ({runtime_error_pct:.2f}%)\n")
    f.write(f"  - Incomplete Code: {incomplete_errors} ({incomplete_error_pct:.2f}%)\n")
    f.write(f"  - Timeouts:        {timeout_errors} ({timeout_error_pct:.2f}%)\n\n")
    
    f.write("SUCCESSFUL PROBLEM NUMBERS:\n")
    f.write(f"{successful_problems_sorted}\n\n")
    
    f.write("FAILED PROBLEMS BY CATEGORY:\n")
    if failed_problems['syntax_error']:
        f.write(f"  Syntax Errors: {sorted(failed_problems['syntax_error'])}\n")
    if failed_problems['runtime_error']:
        f.write(f"  Runtime Errors: {sorted(failed_problems['runtime_error'])}\n")
    if failed_problems['incomplete_code']:
        f.write(f"  Incomplete Code: {sorted(failed_problems['incomplete_code'])}\n")
    if failed_problems['timeout']:
        f.write(f"  Timeouts: {sorted(failed_problems['timeout'])}\n")

print(f"\n Summary saved to: {summary_path}")


 Summary saved to: evaluation_summary_qwen25_coder_7b_error_aware_all.txt


In [12]:
# Save detailed results as JSON
results_path = "evaluation_results_qwen25_coder_7b_error_aware_all.json"

detailed_results = {
    "model": MODEL_NAME,
    "prompt_type": "error_aware",
    "total_problems": total_problems,
    "pass_at_1": pass_at_1,
    "error_counts": dict(error_counts),
    "error_percentages": {
        "pass": pass_at_1,
        "syntax_error": syntax_error_pct,
        "runtime_error": runtime_error_pct,
        "incomplete_code": incomplete_error_pct,
        "timeout": timeout_error_pct
    },
    "successful_problems": successful_problems_sorted,
    "failed_problems": {k: sorted(v) for k, v in failed_problems.items()},
    "detailed_results": results
}

with open(results_path, 'w') as f:
    json.dump(detailed_results, f, indent=2)

print(f" Detailed results saved to: {results_path}")

 Detailed results saved to: evaluation_results_qwen25_coder_7b_error_aware_all.json


## 8. Final Summary

In [13]:
# Define variables derived from the previous output structure
MODEL_NAME = "Qwen/Qwen2.5-Coder-7B-Instruct"
output_log_path = "output_log_qwen25_coder_7b.txt"
summary_path = "evaluation_summary_qwen25_coder_7b.txt"
results_path = "evaluation_results_qwen25_coder_7b.json"

print("\n" + "="*60)
print("EVALUATION COMPLETE")
print("="*60)
print(f"\nModel: {MODEL_NAME}")
print(f"Prompt Type: Error-Aware")
print(f"\nFinal Pass@1: {pass_at_1:.2f}%")

print(f"\nError Breakdown:")
print(f"  - Passed:          {passed:>3d} ({pass_at_1:>5.2f}%)")
print(f"  - Syntax Errors:   {syntax_errors:>3d} ({syntax_error_pct:>5.2f}%)")
print(f"  - Runtime Errors:  {runtime_errors:>3d} ({runtime_error_pct:>5.2f}%)")
print(f"  - Incomplete Code: {incomplete_errors:>3d} ({incomplete_error_pct:>5.2f}%)")
print(f"  - Timeouts:        {timeout_errors:>3d} ({timeout_error_pct:>5.2f}%)")

print(f"\nOutput files:")
print(f"  - {output_log_path}")
print(f"  - {summary_path}")
print(f"  - {results_path}")


EVALUATION COMPLETE

Model: Qwen/Qwen2.5-Coder-7B-Instruct
Prompt Type: Error-Aware

Final Pass@1: 96.95%

Error Breakdown:
  - Passed:          159 (96.95%)
  - Syntax Errors:     5 ( 3.05%)
  - Runtime Errors:    0 ( 0.00%)
  - Incomplete Code:   0 ( 0.00%)
  - Timeouts:          0 ( 0.00%)

Output files:
  - output_log_qwen25_coder_7b.txt
  - evaluation_summary_qwen25_coder_7b.txt
  - evaluation_results_qwen25_coder_7b.json
