In [192]:
%load_ext autoreload
%autoreload 2
%pip install datasets

[autoreload of dspy.signatures.signature failed: Traceback (most recent call last):
  File "/opt/homebrew/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/opt/homebrew/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 500, in superreload
    update_generic(old_obj, new_obj)
  File "/opt/homebrew/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
    update(a, b)
  File "/opt/homebrew/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 365, in update_class
    update_instances(old, new)
  File "/opt/homebrew/lib/python3.11/site-packages/IPython/extensions/autoreload.py", line 323, in update_instances
    object.__setattr__(ref, "__class__", new)
TypeError: can't apply this __setattr__ to SignatureMeta object
]


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


Let's get the dataset and see what it looks like.

In [193]:
import datasets
ds = datasets.load_dataset("openai_humaneval")
ds['test'][0]


{'task_id': 'HumanEval/0',
 'prompt': 'from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """ Check if in given list of numbers, are any two numbers closer to each other than\n    given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    """\n',
 'canonical_solution': '    for idx, elem in enumerate(numbers):\n        for idx2, elem2 in enumerate(numbers):\n            if idx != idx2:\n                distance = abs(elem - elem2)\n                if distance < threshold:\n                    return True\n\n    return False\n',
 'test': "\n\nMETADATA = {\n    'author': 'jt',\n    'dataset': 'test'\n}\n\n\ndef check(candidate):\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.3) == True\n    assert candidate([1.0, 2.0, 3.9, 4.0, 5.0, 2.2], 0.05) == False\n    assert candidate([1.0, 2.0, 5.9, 4.0, 5.0], 0.95) == True\n    assert 

Before we try to solve the problem, let's just load a language model and make sure everything works.

In [194]:
import dspy, dotenv, os
dotenv.load_dotenv(os.path.expanduser("~/.env"))  # load OpenAI API key from .env file
lm = dspy.OpenAI(model="gpt-3.5-turbo", max_tokens=4000)
dspy.settings.configure(lm=lm)

predictor = dspy.Predict("question -> answer")
print(predictor(question="What is the capital of France?"))

Prediction(
    answer='Paris'
)


Next let's write a program that actually outputs code.

In [195]:
from dspy import InputField, OutputField, Signature
from dspy.functional import TypedPredictor
import pydantic

# We define a pydantic type that automatically checks if it's argument is valid python code.
class PythonCode(pydantic.BaseModel):
    code: str

    @pydantic.field_validator('code')
    def check_syntax(cls, v):
        try:
            # Attempt to compile the code snippet
            compile(v, "<string>", "exec")
        except SyntaxError as e:
            # If a SyntaxError is raised, the code is not syntactically valid
            raise ValueError(f"Code is not syntactically valid: {e}")
            
        return v

# The signature is the main DSpy object. Note that we have types for the input and output fields,
# which was not possible beofore.
class CodeSignature(Signature):
    prompt: str = InputField()
    test: PythonCode = InputField()
    entry_point: str = InputField()
    solution: PythonCode = OutputField()

predictor = TypedPredictor(CodeSignature, chain_of_thought=True, make_example=True)
prediction = predictor(
    prompt=PythonCode(code=ds['test'][0]['prompt']),
    test=PythonCode(code=ds['test'][0]['test']),
    entry_point=ds['test'][0]['entry_point']
)

print(prediction)

Prediction(
    reasoning='produce the solution. We need to ensure that the JSON object we provide is valid and follows the specified schema. The error message indicates that there are trailing characters in the JSON object, which is causing it to be invalid. To avoid this error in the future, we need to make sure that the JSON object is correctly formatted and does not contain any additional characters outside of the specified structure.',
    solution=PythonCode(code='def has_close_elements(numbers: List[float], threshold: float) -> bool:\n    for i in range(len(numbers)):\n        for j in range(i+1, len(numbers)):\n            if abs(numbers[i] - numbers[j]) < threshold:\n                return True\n    return False')
)


Let's see what's happening under the hood

In [196]:
lm.inspect_history(n=3)





Make a very succinct json object that validates with the following schema

---

Follow the following format.

Json Schema: ${json_schema}
Json Object: ${json_object}

---

Json Schema: {"properties": {"code": {"title": "Code", "type": "string"}}, "required": ["code"], "title": "PythonCode", "type": "object"}
Json Object:[32m {"code": "print('Hello, World!')"}[0m







Given the fields `prompt`, `test`, `entry_point`, produce the fields `solution`.

---

Follow the following format.

Prompt: ${prompt}

Test: ${test}

Entry Point: ${entry_point}

Reasoning: Let's think step by step in order to ${produce the solution}. We ...

Solution: ${solution}. Respond with a single JSON object using the schema {"properties": {"code": {"title": "Code", "type": "string"}}, "required": ["code"], "title": "PythonCode", "type": "object"}. For example: {"code": "print('Hello, World!')"}

---

Prompt: code='from typing import List\n\n\ndef has_close_elements(numbers: List[float], threshold: float) -

We can see `functional` first created an example value {"code": "print('Hello, World!')"}, which can be useful to boostrap the json generation.
After that it still failed to generate valid json.
It apparently decided to first repeat the schema, and then give the actual code "as an example"
The validator caught the error, and gave it as a "Past Error", which made the model finally output a valid output.

We need a way to run code. This is actually super tricky to do right in python (see https://stackoverflow.com/questions/3068139/how-can-i-sandbox-python-in-pure-python), so we'll just YOLO and call "exec" with globals={}.

In [197]:
def execute_code(code, globals={}, locals=None):
    try:
        output = exec(code, globals, locals)
        return output, None
    except Exception as e:
        return None, e

Let's run the evaluator on all the "canonical solutions" from HumanEval to check that everything is working.

In [198]:
from dspy import Example

devset = [Example(
    prompt=PythonCode(code=test['prompt']),
    test=PythonCode(code=test['test']),
    entry_point=test['entry_point'],
    solution=PythonCode(code=test['prompt']+test['canonical_solution']),
).with_inputs('prompt', 'test', 'entry_point') for test in ds['test']]

trainset = devset[:40]
testset = devset[40:]

def metric(example, pred, trace=None):
    if pred.solution.code is None:
        return 0
    _output, error = execute_code(
        "from typing import List\n"
        + f"{pred.solution.code}\n"
        + f"{example.test.code}\n"
        + f"check({example.entry_point})"
    )
    return int(error is None)

print("Score with the original model:")
print(100 * sum(metric(example, example) for example in testset) / len(testset))

for example in devset:
    if not metric(example, example):
        print("Bad example:")
        code = (
            "from typing import List\n"
            + f"{example.solution.code}\n"
            + f"{example.test.code}\n"
            + f"check({example.entry_point})"
        )
        print(code)
        output, error = execute_code(code)
        print(f"{output=}")
        print(f"{error=}")
        break


Score with the original model:
100.0


Now test our program.

In [199]:
from dspy.evaluate.evaluate import Evaluate
evaluator = Evaluate(
    devset=testset, num_threads=30, display_progress=True, display_table=5, max_errors=100,
)
res = evaluator(predictor, metric)
print(res)

Error for example in dev set: 		 Too many retries
Error for example in dev set: 		 Too many retries
Error for example in dev set: 		 Too many retries
Error for example in dev set: 		 Too many retries
Error for example in dev set: 		 Too many retries
Error for example in dev set: 		 Too many retries
Error for example in dev set: 		 Too many retries
Error for example in dev set: 		 Too many retries
Error for example in dev set: 		 Too many retries
Error for example in dev set: 		 Too many retries
Error for example in dev set: 		 Too many retries
Error for example in dev set: 		 Too many retries


Average Metric: 70.0 / 124  (56.5): 100%|██████████| 124/124 [00:00<00:00, 1781.00it/s]

Average Metric: 70.0 / 124  (56.5%)



  


Unnamed: 0,prompt,test,entry_point,example_solution,reasoning,pred_solution,metric,solution
0,"code='\n\ndef triples_sum_to_zero(l: list):\n """"""\n triples_sum_to_zero takes a list of integers as an input.\n it returns True if there are three distinct elements in the list...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate([1, 3, 5, 0]) == False\n assert candidate([1, 3, 5, -1]) == False\n assert candidate([1, 3, -2, 1]) == True\n...",triples_sum_to_zero,"code='\n\ndef triples_sum_to_zero(l: list):\n """"""\n triples_sum_to_zero takes a list of integers as an input.\n it returns True if there are three distinct elements in the list...",produce the solution. We need to ensure that the JSON object we provide is valid and follows the specified schema. The error message indicates that...,"code='def triples_sum_to_zero(l: list):\n for i in range(len(l)):\n for j in range(i+1, len(l)):\n for k in range(j+1, len(l)):\n if l[i] + l[j] + l[k] == 0:\n...",1.0,
1,"code='\n\ndef car_race_collision(n: int):\n """"""\n Imagine a road that\'s a perfectly straight infinitely long line.\n n cars are driving left to right; simultaneously, a different set...",code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate(2) == 4\n assert candidate(3) == 9\n assert candidate(4) == 16\n assert candidate(8) == 64\n assert candidate(10) == 100\n\n',car_race_collision,"code='\n\ndef car_race_collision(n: int):\n """"""\n Imagine a road that\'s a perfectly straight infinitely long line.\n n cars are driving left to right; simultaneously, a different set...",produce the solution. We need to ensure that the code provided is valid JSON format. The error message indicates that there are trailing characters at...,code='def car_race_collision(n: int):\n return n ** 2',1.0,
2,"code='\n\ndef incr_list(l: list):\n """"""Return list with elements incremented by 1.\n >>> incr_list([1, 2, 3])\n [2, 3, 4]\n >>> incr_list([5, 3, 5, 2, 3, 3, 9,...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate([]) == []\n assert candidate([3, 2, 1]) == [4, 3, 2]\n assert candidate([5, 2, 5, 2, 3, 3, 9, 0,...",incr_list,"code='\n\ndef incr_list(l: list):\n """"""Return list with elements incremented by 1.\n >>> incr_list([1, 2, 3])\n [2, 3, 4]\n >>> incr_list([5, 3, 5, 2, 3, 3, 9,...",produce the solution. We need to ensure that the JSON object we provide follows the correct format and does not contain any trailing characters that...,code='def incr_list(l: list):\n return [x + 1 for x in l]',1.0,
3,"code='\n\ndef pairs_sum_to_zero(l):\n """"""\n pairs_sum_to_zero takes a list of integers as an input.\n it returns True if there are two distinct elements in the list that\n...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate([1, 3, 5, 0]) == False\n assert candidate([1, 3, -2, 1]) == False\n assert candidate([1, 2, 3, 7]) == False\n...",pairs_sum_to_zero,"code='\n\ndef pairs_sum_to_zero(l):\n """"""\n pairs_sum_to_zero takes a list of integers as an input.\n it returns True if there are two distinct elements in the list that\n...","solve this issue. The error message indicates that there are trailing characters in the JSON object provided. To fix this, we need to ensure that...","code='def pairs_sum_to_zero(l):\n for i in range(len(l)):\n for j in range(i+1, len(l)):\n if l[i] + l[j] == 0:\n return True\n return False'",1.0,
4,"code='\n\ndef change_base(x: int, base: int):\n """"""Change numerical base of input number x to base.\n return string representation after the conversion.\n base numbers are less than...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate(8, 3) == ""22""\n assert candidate(9, 3) == ""100""\n assert candidate(234, 2) == ""11101010""\n assert candidate(16, 2) == ""10000""\n assert...",change_base,"code='\n\ndef change_base(x: int, base: int):\n """"""Change numerical base of input number x to base.\n return string representation after the conversion.\n base numbers are less than...",produce the solution. We need to ensure that the JSON object provided follows the correct format specified in the schema. The error message indicates that...,"code=""def change_base(x: int, base: int):\n result = ''\n while x > 0:\n result = str(x % base) + result\n x //= base\n return result""",1.0,


56.45


Let's try to optimize it a bit

In [200]:
from dspy.teleprompt.bootstrap import BootstrapFewShot

print("Compiling...")
compiled = BootstrapFewShot(
    metric=metric
).compile(
    predictor,
    trainset=trainset,
)

Compiling...


 12%|█▎        | 5/40 [00:00<00:00, 237.80it/s]

Bootstrapped 4 full traces after 6 examples in round 0.





Finally evaluate the trained model

In [201]:
print("Evaluating...")
print(
    "Compiled HumanEval score:",
    evaluator(compiled, metric=metric),
)

Evaluating...


Average Metric: 84 / 124  (67.7): 100%|██████████| 124/124 [00:00<00:00, 2265.09it/s]

Average Metric: 84 / 124  (67.7%)





Unnamed: 0,prompt,test,entry_point,example_solution,reasoning,pred_solution,metric
0,"code='\n\ndef triples_sum_to_zero(l: list):\n """"""\n triples_sum_to_zero takes a list of integers as an input.\n it returns True if there are three distinct elements in the list...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate([1, 3, 5, 0]) == False\n assert candidate([1, 3, 5, -1]) == False\n assert candidate([1, 3, -2, 1]) == True\n...",triples_sum_to_zero,"code='\n\ndef triples_sum_to_zero(l: list):\n """"""\n triples_sum_to_zero takes a list of integers as an input.\n it returns True if there are three distinct elements in the list...",produce the solution. We need to find three distinct elements in the list that sum to zero. One way to approach this is to use...,"code='\n\ndef triples_sum_to_zero(l: list):\n """"""\n triples_sum_to_zero takes a list of integers as an input.\n it returns True if there are three distinct elements in the list...",1
1,"code='\n\ndef car_race_collision(n: int):\n """"""\n Imagine a road that\'s a perfectly straight infinitely long line.\n n cars are driving left to right; simultaneously, a different set...",code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate(2) == 4\n assert candidate(3) == 9\n assert candidate(4) == 16\n assert candidate(8) == 64\n assert candidate(10) == 100\n\n',car_race_collision,"code='\n\ndef car_race_collision(n: int):\n """"""\n Imagine a road that\'s a perfectly straight infinitely long line.\n n cars are driving left to right; simultaneously, a different set...",produce the solution. We can visualize the cars moving towards each other on the road and calculate the number of collisions that would occur when...,"code='\n\ndef car_race_collision(n: int):\n """"""\n Imagine a road that\'s a perfectly straight infinitely long line.\n n cars are driving left to right; simultaneously, a different set...",1
2,"code='\n\ndef incr_list(l: list):\n """"""Return list with elements incremented by 1.\n >>> incr_list([1, 2, 3])\n [2, 3, 4]\n >>> incr_list([5, 3, 5, 2, 3, 3, 9,...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate([]) == []\n assert candidate([3, 2, 1]) == [4, 3, 2]\n assert candidate([5, 2, 5, 2, 3, 3, 9, 0,...",incr_list,"code='\n\ndef incr_list(l: list):\n """"""Return list with elements incremented by 1.\n >>> incr_list([1, 2, 3])\n [2, 3, 4]\n >>> incr_list([5, 3, 5, 2, 3, 3, 9,...",produce the solution. We need to iterate through the list and increment each element by 1.,"code='\n\ndef incr_list(l: list):\n """"""Return list with elements incremented by 1.\n >>> incr_list([1, 2, 3])\n [2, 3, 4]\n >>> incr_list([5, 3, 5, 2, 3, 3, 9,...",1
3,"code='\n\ndef pairs_sum_to_zero(l):\n """"""\n pairs_sum_to_zero takes a list of integers as an input.\n it returns True if there are two distinct elements in the list that\n...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate([1, 3, 5, 0]) == False\n assert candidate([1, 3, -2, 1]) == False\n assert candidate([1, 2, 3, 7]) == False\n...",pairs_sum_to_zero,"code='\n\ndef pairs_sum_to_zero(l):\n """"""\n pairs_sum_to_zero takes a list of integers as an input.\n it returns True if there are two distinct elements in the list that\n...",produce the solution. We need to iterate through the list and check if there are any two distinct elements that sum up to zero.,"code='\n\ndef pairs_sum_to_zero(l):\n """"""\n pairs_sum_to_zero takes a list of integers as an input.\n it returns True if there are two distinct elements in the list that\n...",1
4,"code='\n\ndef change_base(x: int, base: int):\n """"""Change numerical base of input number x to base.\n return string representation after the conversion.\n base numbers are less than...","code='\n\nMETADATA = {}\n\n\ndef check(candidate):\n assert candidate(8, 3) == ""22""\n assert candidate(9, 3) == ""100""\n assert candidate(234, 2) == ""11101010""\n assert candidate(16, 2) == ""10000""\n assert...",change_base,"code='\n\ndef change_base(x: int, base: int):\n """"""Change numerical base of input number x to base.\n return string representation after the conversion.\n base numbers are less than...","produce the solution. We need to convert the input number `x` to the desired base `base`. To do this, we can repeatedly divide `x` by...","code='\n\ndef change_base(x: int, base: int):\n """"""Change numerical base of input number x to base.\n return string representation after the conversion.\n base numbers are less than...",1


Compiled HumanEval score: 67.74
