In [17]:
import models.lm as lm
import models.prompt as prompt
import models.gpt_gen as gpt_gen
import data
import defs

import os
import json

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
### SYSTEM

# System-role text: a persona description followed by a code-only output
# constraint, one sentence per line.
# NOTE(review): query_task in this notebook reads its system prompt from
# models/templates/system.txt, so this string is not what that function sends.
system_prompt = "\n".join([
    # Persona.
    "You are an assistant chatbot with human-like perception, reasoning and learning capabilities.",
    "You can solve tasks concisely, efficiently, and moreover, correctly.",
    "Let's engage in perception- and logic-based tasks.",
    # Output-format constraint.
    "You only output source code.",
    "No explanations or any other text.",
    "Only code.",
    "",  # yields the trailing newline
])





### PREAMBLE

# Sentence announcing the DSL usage examples. It is the only difference
# between the two "simple" preambles built below.
_examples_sentence = "I will then provide you with some examples of how to use the DSL."

# Sentences of the full "simple" preamble, in the order they are sent.
_simple_sentences = [
    "You are an efficient assistant for logical reasoning and code generation.",
    "You will help me solve a visual perception and reasoning task.",
    "I will first provide you with the definition of a Domain Specific Language you will use for writing a solution for the task.",
    _examples_sentence,
    "I will then present you with the description of the task that you will be tested in.",
    "You will then respond the queries I make regarding the solution of the task.",
]

# One sentence per line, every line newline-terminated.
preamble_simple = "".join(f"{sentence}\n" for sentence in _simple_sentences)

# Same text minus the sentence that announces DSL usage examples.
preamble_simple_no_ex = "".join(
    f"{sentence}\n"
    for sentence in _simple_sentences
    if sentence != _examples_sentence
)

# Alternative preamble that frames the interaction as code generation.
# The text starts with an empty line and ends with a newline.
# NOTE(review): "a efficient" is a typo inside the prompt text; it is kept
# verbatim here because the string is runtime content.
preamble_nl_ex = "\n".join([
    "",
    "You are a efficient assistant for code generation.",
    "The setup for this interaction is as follows:",
    "I will first show you the EBNF grammar definition for a domain specific language (DSL) that you will use to write source code in.",
    "Then, I will show you a small number of example programs in this DSL.",
    "Then, I will present to you a visual perception and reasoning task, and you will be asked to write a program in the DSL that solves this task.",
    "",
])






### DSL

# Grammar file for the DSL; the path is relative, so it is resolved against
# the kernel's current working directory.
dsl_path = "dsl/v0_3/dsl.lark"

# Introductory text placed in front of the grammar (newline-terminated).
dsl_preamble = "\n".join([
    "This is the definition of the DSL you will use to solve the task.",
    "It is given as a context-free grammar in the EBNF format used by the Lark parser generator, with some informative comments about the semantics.",
    "You will return a string that is parseable by the `program` non-terminal of the grammar.",
    "",
])

with open(dsl_path, "r") as f:
    dsl_grammar = f.read()

# Preamble, then the raw grammar wrapped in a ``` fence, then a final newline.
dsl_prompt = "\n".join([dsl_preamble, "```", dsl_grammar, "```", ""])





# ### EXAMPLES

# nl_examples = """I will now show you some example programs in the DSL, along with natural language description of their behavior.
# """

# # ic_examples = """I will now show you some example tasks along with their solutions in the DSL.
# # They might be simpler than your test task, but they should allow you to understand the behavior of the DSL.
# # """

### IC EXAMPLES

# Lead-in sentence shown before the demonstrations (newline-terminated).
ic_examples_preamble = (
    "Now I will show you some demonstration tasks along with the output "
    "you would be expected to produce for each of them.\n"
)

# Pre-rendered demonstration text; the path is relative to the kernel's
# current working directory.
ic_examples_path = "models/templates/v0_3/ic_examples.txt"
with open(ic_examples_path, "r") as f:
    ic_examples_prompt = f.read()

# Lead-in and demonstrations, separated by one extra newline.
ic_examples = "\n".join((ic_examples_preamble, ic_examples_prompt))

### TASK

# General framing of the task family: grids of colour characters, input/output
# pairs produced by an unknown program P. Every line is newline-terminated.
task_preamble = "".join(
    f"{sentence}\n"
    for sentence in (
        "Now we continue with the visual perception and reasoning task.",
        "The input for the task is a small number of pairs of grids of characters.",
        "The value of each of the cells of the grids are the colors defined in the DSL, so we can think of grids as images.",
        "Each pair of images correspond to an input-output example for an unknown program P.",
        "For each pair, the program P is evaluated on the image grid and operates on the objects that appear in it.",
        "The output of the program is then the output image.",
        "The objects in the images are easy and natural to identify for humans, so there is no need to define them explicitly.",
        "However you are able to abstract them correctly, and the DSL is interpreted with the same correct abstraction.",
    )
)

# Skeleton of the JSON answer the model is asked to fill in. Rendering it with
# json.dumps(indent=4) gives the 4-space-indented layout shown in the prompt.
_response_skeleton = json.dumps(
    {"nl_description": "TO_BE_FILLED", "code": "TO_BE_FILLED"},
    indent=4,
)

# Header for the evaluated task: answer-format instructions, the JSON skeleton,
# and the "## TEST TASK" heading. No trailing newline; make_task_prompt appends
# the task description after a blank line.
test_task_preamble = "\n".join([
    "Now follows task you will be evaluated on.",
    "Output the solution as a JSON object, which should contain both a natural language description of the solution and the solution written in the DSL.",
    "The code should be parseable by the DSL grammar.",
    "The JSON must have the following structure:",
    "",
    _response_skeleton,
    "",
    "## TEST TASK",
])

### QUERY

# Code-only query text: starts with an empty line and ends with a newline.
# It is not referenced by make_task_prompt in this notebook.
query_code_only = (
    "\n"
    "Write a program in the DSL that will solve this task.\n"
    "The program should be enclosed in backticks as per Markdown syntax.\n"
)


In [19]:
def make_task_prompt(task_id):
    """Assemble the complete user prompt for one task.

    The prompt consists of five sections separated by blank lines, in this
    order: the no-examples preamble, the DSL grammar prompt, the general task
    framing, the in-context demonstrations, and finally the test-task header
    followed by the description of `task_id` rendered with the "char" colour
    map (test pair not printed).

    Returns the prompt as a single string.
    """
    description = data.task_description(task_id=task_id, print_test=False, color_map="char")

    sections = (
        preamble_simple_no_ex,
        dsl_prompt,
        task_preamble,
        ic_examples,
        # Header and task description are glued with one blank line.
        f"{test_task_preamble}\n\n{description}",
    )
    return "\n\n".join(sections)

# Build and print the prompt for one sample task as a visual sanity check.
sample_task_id = "ae3edfdc"
# NOTE(review): this rebinds the notebook-level name `prompt`, which the import
# cell bound to the `models.prompt` module; after this line the module is no
# longer reachable under that name.
prompt = make_task_prompt(sample_task_id)
print(prompt)

You are an efficient assistant for logical reasoning and code generation.
You will help me solve a visual perception and reasoning task.
I will first provide you with the definition of a Domain Specific Language you will use for writing a solution for the task.
I will then present you with the description of the task that you will be tested in.
You will then respond the queries I make regarding the solution of the task.


This is the definition of the DSL you will use to solve the task.
It is given as a context-free grammar in the EBNF format used by the Lark parser generator, with some informative comments about the semantics.
You will return a string that is parseable by the `program` non-terminal of the grammar.

```
library: "(" program* ")"

// Rules are executed one after another, in the order they appear.
// There could be no rules, in which case the program does nothing.
program: "(" "do" rule* ")"

// First, it defines a filter expression that an object must satisfy in order t

In [13]:
# Render one in-context demonstration: print its grid pairs and the JSON answer
# (natural-language description + DSL source) expected for it.
# NOTE(review): this cell carries execution count 13 while its neighbours are
# 19 and 20, so the output shown for it comes from an earlier run.
#
# Exactly one (ic_example_name, nl_description) pair is active; the others are
# kept commented out as alternatives.

# ic_example_name = "recolor_y"
# nl_description = "Recolor all objects to color Y"

# ic_example_name = "extend_to_max"
# nl_description = "Extend all the objects of size 1 toward the object of maximum size, without overlapping."

# ic_example_name = "three_move_recolor"
# nl_description = "Move all objects of size 3 one step up, and then recolor them to color B."

ic_example_name = "move_to_grey"
nl_description = "Move all objects of color different than X, toward the object of color X, if they are neighbors of it."

# Textual rendering of the example's grid pairs, using the "char" colour map.
desc = data.task_description(f"ic_examples/{ic_example_name}.json", print_test=False, color_map="char")
print(desc)

# DSL solution stored next to the example (path relative to the working directory).
dsl_src_path = f"dsl/v0_3/ic_examples/{ic_example_name}.dsl"
with open(dsl_src_path, "r") as f:
    dsl_src = f.read()
# print(dsl_src)


# Same two-key JSON structure that test_task_preamble asks the model to return.
example_response = json.dumps({
    "nl_description": nl_description,
    "code": dsl_src
}, indent=4)

print(example_response)

PAIR 1
INPUT GRID:
O O O O R O O O A O
O B O O O O B O O O
O O O O O O O O O O
O O O O X X X O O O
O O O O X X X O O F
W O O O O O O O O O
O O F O O O Y O O O
O O O O O O O O O O
O O O O G O O O W O
O O O O O O O O O O
OUTPUT GRID:
O O O O O O O O A O
O B O O O O O O O O
O O O O R O B O O O
O O O O X X X O O O
O O O O X X X F O O
W O O O G O Y O O O
O O F O O O O O O O
O O O O O O O O O O
O O O O O O O O W O
O O O O O O O O O O

PAIR 2
INPUT GRID:
O O O F O O O R O O O O
O O O O O O O O O Y O O
O O O O O B O O O O O O
O O R O O O O O G O O O
B O O O O G O O O O O A
O O O O O O R O O O W O
O O O O O O O O C O O O
G O X X O O O O O O O R
O O X X O O O O O O Y O
O O X X O O A O O O O O
O F X X O O O O O O O O
O O O O O Y O O O O O O
OUTPUT GRID:
O O O O O O O R O O O O
O O O O O O O O O Y O O
O O O O O B O O O O O O
O O O O O O O O G O O O
B O O O O G O O O O O A
O O O O O O R O O O W O
O O R F O O O O C O O O
O G X X R O O O O O O O
O O X X Y O O O O O O O
O O X X A O O O O O O O
O F X X

In [20]:
def query_task(task_id, n_responses, output_dir, model):
    """
    Query the language model for one task and save the sorted completions.

    The prompt comes from make_task_prompt(task_id); the system prompt is read
    from models/templates/system.txt under defs.PROJECT_ROOT. The completions
    are split by gpt_gen.process_responses_json into valid programs, valid
    completions and invalid completions, whose counts are printed. Each group
    is written to its own file in output_dir, entries separated by a blank
    line:

        <task_id>_valid_programs.txt
        <task_id>_valid.txt
        <task_id>_invalid.txt

    Args:
        task_id: identifier of the task to query.
        n_responses: number of completions to request.
        output_dir: directory for the three output files; created if missing.
        model: model name passed to lm.LanguageModel.

    Returns:
        The completions exactly as returned by the language-model gateway
        (presumably a list of n_responses strings; verify against lm.query).
    """
    # Create the output directory before querying so that a bad path fails
    # before any completions are requested. exist_ok avoids the
    # check-then-create race of a separate os.path.exists test.
    os.makedirs(output_dir, exist_ok=True)

    pmpt = make_task_prompt(task_id=task_id)
    lm_gateway = lm.LanguageModel(model=model)
    system_prompt_path = os.path.join(defs.PROJECT_ROOT, "models/templates/system.txt")
    # Explicit UTF-8 keeps reading and writing independent of the platform's
    # default encoding.
    with open(system_prompt_path, "r", encoding="utf-8") as f:
        system_prompt = f.read()
    completions = lm_gateway.query(pmpt, n_responses, system_prompt=system_prompt, log=True)
    valid_programs, valid_completions, invalid_completions = gpt_gen.process_responses_json(completions)

    # Report how many completions were valid and invalid.
    print(f"Task {task_id}")
    print(f"Valid: {len(valid_completions)}")
    print(f"Invalid: {len(invalid_completions)}")

    # Save each group to its own file. Model output may contain non-ASCII
    # characters, so the files are written as UTF-8.
    groups = (
        ("valid_programs", valid_programs),
        ("valid", valid_completions),
        ("invalid", invalid_completions),
    )
    for suffix, entries in groups:
        out_path = os.path.join(output_dir, f"{task_id}_{suffix}.txt")
        with open(out_path, "w", encoding="utf-8") as f:
            f.write("\n\n".join(entries))
    return completions

def query_task_list(task_ids, n_responses, model):
    """
    Run query_task for every task id, collecting all outputs in one fresh directory.

    The directory is models/logs/gens_<timestamp> under defs.PROJECT_ROOT,
    where the timestamp comes from defs.get_timestamp(micros=False).

    Args:
        task_ids: iterable of task identifiers, queried in order.
        n_responses: number of completions requested per task.
        model: model name forwarded to query_task.

    Returns:
        None. The per-task completions returned by query_task are not kept;
        the results are the files written by query_task.
    """
    # One output directory per run.
    timestamp = defs.get_timestamp(micros=False)
    output_dir = os.path.join(defs.PROJECT_ROOT, "models/logs", f"gens_{timestamp}")
    # exist_ok replaces the separate existence check, which could race with
    # another process creating the same directory.
    os.makedirs(output_dir, exist_ok=True)
    # Query each task in turn; an exception in one task aborts the rest.
    for task_id in task_ids:
        query_task(task_id, n_responses, output_dir, model)

In [23]:
# Run configuration: which model to query and how many completions per task.
# The commented-out lines are the alternative settings.
model = "gpt-3.5-turbo-0125"
# model = "gpt-4-0125-preview"
n_responses = 30
# n_responses = 3

# Single-task variant, kept for quick checks.
# sample_task_id = "ae3edfdc"
# task_list = [sample_task_id]

# One task id per line; empty lines are skipped.
task_list_path = "dsl/v0_3/example_candidates.txt"
with open(task_list_path, "r") as f:
    task_list = [t_id for t_id in f.read().splitlines() if t_id]

# NOTE(review): there is no cache guard here; every run of this cell queries
# the model again for all tasks and writes a new gens_<timestamp> directory.
query_task_list(task_list, n_responses, model)

Task 05f2a901
Valid: 25
Invalid: 5
Task 4093f84a
Valid: 25
Invalid: 5
Task 6855a6e4
Valid: 26
Invalid: 4
Task 88a10436
Valid: 27
Invalid: 3
Task a48eeaf7
Valid: 25
Invalid: 5
Task ae3edfdc
Valid: 24
Invalid: 6
Task d43fd935
Valid: 24
Invalid: 6
Task f8a8fe49
Valid: 23
Invalid: 7
Task 1e0a9b12
Valid: 26
Invalid: 4
Task 91714a58
Valid: 24
Invalid: 6
Task 9edfc990
Valid: 23
Invalid: 7
Task 42a50994
Valid: 27
Invalid: 3
