### Model Evaluation Benchmark


##### This benchmark tool evaluates the performance of language models in generating Python unit tests. It measures the models' ability to produce valid, executable test cases with high code coverage for given Python functions.

---

This benchmark assumes that an OpenAI-API-compatible server is running locally; it is used to generate the test cases that are then evaluated.

#### Recommended Setup

It is highly recommended to use vLLM for continuous batching.

To initialize a vLLM instance:

```bash
python3 -m vllm.entrypoints.openai.api_server \
    --model <model_card> \
    --gpu-memory-utilization 0.98 \
    --max-model-len 4096 \
    --max-num-seqs 110 \
    --disable-log-requests
```


For custom LoRA models, add the following flags:

```bash
--lora-modules <module_alias>=<module_path> --chat-template ""
```


#### Example Commands

1. For OpenCodeInterpreter-CL-7B:
```bash
python3 -m vllm.entrypoints.openai.api_server \
    --model m-a-p/OpenCodeInterpreter-CL-7B \
    --gpu-memory-utilization 0.98 \
    --max-model-len 4096
```

2. For Phi-3-mini-4k-instruct with LoRA layers:

```bash
python3 -m vllm.entrypoints.openai.api_server \
    --model unsloth/Phi-3-mini-4k-instruct \
    --gpu-memory-utilization 0.98 \
    --max-model-len 4096 \
    --max-num-seqs 110 \
    --disable-log-requests \
    --lora-modules pytest_module=~/disertatie/outputs/checkpoint-7000/ \
    --chat-template ""
```


Pass `input_prompt_template` to the benchmark constructor if the model was fine-tuned on a dataset whose prompt format differs from the tokenizer's prompt template.

In [31]:
import math
import multiprocess

import pandas as pd
from tqdm.notebook import tqdm

# Prompt layout with two positional slots: the first receives the code under
# test, the second the response text (filled with "" when building prompts).
input_prompt_template = (
    "\n"
    "### Input:\n"
    "{}\n"
    "\n"
    "### Response:\n"
    "{}"
)

In [2]:
# Load the benchmark samples, then preview only the columns of interest.
dataset = pd.read_csv("benchmark_data.csv")
print("Dataset overview:")
dataset.loc[:, ["original_code", "pytest_code", "coverage"]]

Dataset overview:


Unnamed: 0,original_code,pytest_code,coverage
0,"def quaternion_multiply(r, q):\r\n \r\n ...",# test_source.py\r\nimport pytest\r\nimport so...,100.0
1,"def sing_three(mu, c, i0=1.0):\r\n \r\n ...",import pytest\r\nfrom source import sing_three...,100.0
2,def get_rgb_from_int(rgb_int):\r\n \r\n ...,import pytest\r\nimport sys\r\nsys.path.insert...,100.0
3,"def inflate(tensor, times, dim):\r\n \r\n ...",# test_source.py\r\nimport pytest\r\nfrom sour...,100.0
4,def radii(mag):\r\n \r\n # ADM mask all ...,# test_source.py\r\nimport pytest\r\nimport sy...,100.0
...,...,...,...
195,"def scale_intrinsics(K, x_scale, y_scale):\r\n...",import sys\r\nimport os\r\nsys.path.append(os....,100.0
196,"def filter_noise_lab(X, labels):\r\n \r\n ...",# import the function to test from source.py\r...,100.0
197,"def heat_flux_to_temperature(heat_flux: float,...",import pytest\r\nimport sys\r\nsys.path.append...,100.0
198,def strip_balanced_edge_parens(s):\r\n \r\n...,import pytest\r\nimport source\r\n\r\ndef test...,100.0


In [3]:
class TestGenerationBenchmark:
    def __init__(
        self,
        dataset,
        input_prompt_template=None,
        tmp_folder="tmp",
        max_retries=8,
        max_feedback=1,
    ):
        self.dataset = dataset["original_code"][:10]
        ##DELETE THIS WHEN NOT BENCHMARKING CUSTOM MODELS

        if input_prompt_template:
            self.dataset = [
                input_prompt_template.format(input_code, "")
                for input_code in self.dataset
            ]

        self.tmp_folder = tmp_folder
        self.max_retries = max_retries
        self.max_feedback = max_feedback

    def _run_test(self, source_code, best_of=8):
        try:
            from agent import CodeAgent
            from pytest_runner import run_pytest
            import pandas as pd

            result_passes = lambda result: (
                result["coverage"] >= 90
                and (not result["failed_assertions"])
                and (not result["stderr"])
            )

            has_errors = lambda result: not (
                result["failed_assertions"] or result["stderr"]
            )

            model = CodeAgent()
            initial_code = model.generate_response(source_code, n=best_of)
            results = []
            for pytest_code in initial_code:
                if pytest_code in ["os.", "sys.", "open(", "subprocess."]:
                    continue

                result = run_pytest(source_code, pytest_code, random_subdir=True)
                result["passed"] = result_passes(result)
                result["has_errors"] = has_errors(result)
                result["generated_test"] = pytest_code
                result["source_code"] = source_code
                results.append(result)

                if result["passed"]:
                    return pd.DataFrame(results)

            return pd.DataFrame(results)
        except Exception as e:
            return str(e)

    def run_evaluation(self):
        runs = []

        with multiprocess.Pool(processes=40) as pool:
            runs = list(
                tqdm(pool.imap(self._run_test, self.dataset), total=len(self.dataset))
            )
        return runs


# NOTE: omit the input_prompt_template argument when benchmarking non-LoRA
# models.
evaluator = TestGenerationBenchmark(
    dataset=dataset,
    input_prompt_template=input_prompt_template,
)
generation_results = evaluator.run_evaluation()

  0%|          | 0/10 [00:00<?, ?it/s]

In [16]:
generation_results

[   coverage                                             stdout  \
 
                                               stderr  failed_assertions  \
 0  C:\Users\lucap\AppData\Roaming\Python\Python31...                  0   
 1  C:\Users\lucap\AppData\Roaming\Python\Python31...                  0   
 2  C:\Users\lucap\AppData\Roaming\Python\Python31...                  0   
 3  C:\Users\lucap\AppData\Roaming\Python\Python31...                  0   
 4  C:\Users\lucap\AppData\Roaming\Python\Python31...                  0   
 5  C:\Users\lucap\AppData\Roaming\Python\Python31...                  0   
 6                                                                     1   
 7  C:\Users\lucap\AppData\Roaming\Python\Python31...                  0   
 
                                                fails  passed  has_errors  \
 0                                                      False       False   
 1                                                      False       False   
 2            

In [34]:
def compute_model_score(generation_results):
    """Average the best passing coverage over all evaluated samples.

    Args:
        generation_results: Iterable of per-sample results. Entries that are
            ``pandas.DataFrame`` objects are scored; anything else (e.g. an
            error message string) is ignored and does not count towards the
            average.

    Returns:
        The mean, over the scored samples, of the highest ``coverage`` among
        rows whose ``passed`` flag is true (0 for a sample with no passing
        row). Returns ``0.0`` when no sample could be scored.
    """
    frames = [
        result for result in generation_results if isinstance(result, pd.DataFrame)
    ]
    if not frames:
        # Nothing to average; avoid dividing by zero.
        return 0.0

    per_sample = []
    for frame in frames:
        # A sample whose candidates were all skipped yields a frame without
        # any rows/columns; it simply scores zero.
        if frame.empty or "passed" not in frame.columns:
            per_sample.append(0.0)
            continue

        best = frame.loc[frame["passed"].astype(bool), "coverage"].max()
        # max() over an empty selection is NaN, i.e. no candidate passed.
        per_sample.append(0.0 if pd.isna(best) else float(best))

    # True division: floor division would silently truncate the average.
    return sum(per_sample) / len(per_sample)


# Reduce the per-sample results to a single score and report it.
score = compute_model_score(generation_results)

print(f"Model achieved a score of {score}%")

Model achieved a score of 10.0%


10.0