In [1]:
import numpy as np
import torch
import os
import json
import random
from transformers import AutoTokenizer

In [2]:
class NumpyArrayEncoder(json.JSONEncoder):
    """JSON encoder that can serialize NumPy arrays and NumPy scalars.

    Pass it as ``cls=NumpyArrayEncoder`` to ``json.dump`` / ``json.dumps``.
    """

    def default(self, obj):
        """Convert NumPy values to plain Python objects that ``json`` understands.

        Args:
            obj: The object the standard encoder could not serialize.

        Returns:
            A nested list for ``np.ndarray`` input, or the equivalent Python
            scalar for a NumPy scalar (e.g. ``np.int64`` -> ``int``).

        Raises:
            TypeError: If ``obj`` is neither a NumPy array nor a NumPy scalar
                (raised by the base class).
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            # Indexing an array yields NumPy scalars (np.int64, np.bool_, ...),
            # which the stock encoder rejects; .item() gives the Python value.
            return obj.item()
        return super().default(obj)

In [3]:
# Generation settings.
# With exactly one prompt, the sample is printed; with more than one, all
# samples are written to a JSON file.
num_prompts = 100
# Inclusive bounds applied to both the row count and the column count.
min_num_rows_and_columns, max_num_rows_and_columns = 1, 8

In [4]:
# Load the tokenizer; it is used below only to measure each prompt's length in tokens.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")

# Candidate grid names, grouped as [input, output] pairs.
train_grid_names = [
    ["Train_Input_1", "Train_Output_1"],
    ["Train_Input_2", "Train_Output_2"],
    ["Train_Input_3", "Train_Output_3"],
    ["Train_Input_4", "Train_Output_4"],
    ["Train_Input_5", "Train_Output_5"],
]
test_grid_names = [
    ["Test_Input_1", "Test_Output_1"],
    ["Test_Input_2", "Test_Output_2"],
]

# Color names indexed by digit: colors[d] is the color that digit d stands for.
colors = ["black", "blue", "red", "green", "yellow", "gray", "magenta", "orange", "cyan", "brown"]


def _format_row(row):
    """Render one grid row as a compact list literal, e.g. "[1,2,3]"."""
    return "[" + ",".join(str(cell) for cell in row) + "]"


def _format_grid(grid):
    """Render a 2D grid as a compact nested-list literal, e.g. "[[1,2],[3,4]]"."""
    return "[" + ",".join(_format_row(row) for row in grid) + "]"


def _count_phrase(count, noun):
    """Return "<count> <noun>", adding a plural "s" unless count is 1."""
    return f"{count} {noun}" if count == 1 else f"{count} {noun}s"


# Text shared by every instruction. The digit/color legend is derived from
# `colors` so the two can never drift apart.
instruction_preamble = (
    "An ARCSolver grid is a rectangular 2D array of digits from 0 to 9. "
    "An ARCSolver grid's digits represent the colors of each grid square rather than scalar values. "
    + ", ".join(f"{digit} = {color}" for digit, color in enumerate(colors))
    + ". "
)

json_file = []
max_token_length = 0
for _ in range(num_prompts):
    # Random grid of digits 0-9 with independently chosen height and width.
    row_size = random.randint(min_num_rows_and_columns, max_num_rows_and_columns)
    column_size = random.randint(min_num_rows_and_columns, max_num_rows_and_columns)
    random_array = np.random.randint(0, 10, size=(row_size, column_size))
    num_rows, num_columns = random_array.shape

    # Pick train vs. test with equal probability, then a pair, then input vs. output.
    name_pairs = random.choice([test_grid_names, train_grid_names])
    grid_name = random.choice(random.choice(name_pairs))

    # Cell (and row) that the row/color/digit questions refer to.
    random_row = random.randint(0, num_rows - 1)
    random_column = random.randint(0, num_columns - 1)
    cell_ref = f"{grid_name}[{random_row}][{random_column}]"
    # Plain int so it can index `colors` and format without NumPy scalar types.
    cell_digit = int(random_array[random_row][random_column])

    instruction = (
        instruction_preamble
        + "If ARCSolver grid "
        + grid_name
        + " = "
        + _format_grid(random_array)
        + ", "
    )

    random_question = random.randint(0, 4)
    if random_question == 0:
        # Row question
        instruction += f"what is {grid_name}[{random_row}]?"
        output = (
            f"{grid_name}[{random_row}] is Row {random_row + 1}, which is the following array: "
            + _format_row(random_array[random_row])
        )
    elif random_question == 1:
        # Num rows question
        instruction += f"how many rows does {grid_name} have?"
        output = f"{grid_name} has {_count_phrase(num_rows, 'row')}."
    elif random_question == 2:
        # Num columns question
        instruction += f"how many columns does {grid_name} have?"
        output = f"{grid_name} has {_count_phrase(num_columns, 'column')}."
    elif random_question == 3:
        # Color question
        instruction += f"what color is the ARCSolver grid square at {cell_ref}?"
        output = (
            f"{cell_ref} is digit {cell_digit}, and digit {cell_digit} = {colors[cell_digit]}"
            f" in an ARCSolver grid, therefore the color is {colors[cell_digit]}."
        )
    else:
        # Square question
        instruction += f"what digit is {cell_ref}?"
        # The answer names the indexed cell, not the whole grid.
        output = f"{cell_ref} is the digit {cell_digit}"

    prompt = instruction + " " + output

    # Track the longest prompt, measured in tokenizer tokens.
    tokenized_request = tokenizer.tokenize(prompt)
    token_length = len(tokenized_request)
    max_token_length = max(max_token_length, token_length)

    json_file.append({"instruction": instruction, "output": output})

    if num_prompts == 1:
        print("\nInstruction:")
        print(instruction)
        print("\nOutput:")
        print(output)

if num_prompts > 1:
    # Persist all generated samples as a single JSON array in the working directory.
    json_string = json.dumps(json_file, cls=NumpyArrayEncoder)
    base_file_name = "ARCSolver_core_knowledge_on_basic_arrays_" + str(num_prompts)
    filename = base_file_name + ".json"
    filepath = filename
    with open(filepath, "w", encoding="utf-8") as outfile:
        outfile.write(json_string)

print(f"\nMax Token Length: {max_token_length}")


Max Token Length: 298
