In [1]:
import numpy as np
import json
import os
from transformers import AutoTokenizer


class NumpyArrayEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)


############################################################


def format_instruction_and_output(puzzle_string):
    puzzle_dict = json.loads(puzzle_string)
    train_pairs = puzzle_dict["train"]
    test_pairs = puzzle_dict["test"]

    instruction = ""
    output = ""

    for i, train_pair in enumerate(train_pairs):
        if i > 0:
            instruction += " "
        train_input_string = str(train_pair["input"])
        train_input_string = train_input_string.replace(" ", "")
        instruction += "Train_" + str(i + 1) + "_Input=" + train_input_string
        
        train_output_string = str(train_pair["output"])
        train_output_string = train_output_string.replace(" ", "")
        instruction += " Train_" + str(i + 1) + "_Output=" + train_output_string

    for i, test_pair in enumerate(test_pairs):
        test_input_string = str(test_pair["input"])
        test_input_string = test_input_string.replace(" ", "")
        instruction += " Test_" + str(i + 1) + "_Input=" + test_input_string
        
        test_output_string = str(test_pair["output"])
        test_output_string = test_output_string.replace(" ", "")
        output += "Test_" + str(i + 1) + "_Output=" + test_output_string

    return instruction, output

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("microsoft/Orca-2-13b")
    puzzles = []
    root_dir = "../data/step_2/training_augs/"
    count = 0
    too_long_count = 0
    for root, dirs, files in os.walk(root_dir):
        for name in files:
            if ".json" not in name:
                continue

            file_path = os.path.join(root, name)
            print(count, "file_path", file_path)

            base_file_name = name[:-6]

            with open(file_path, "r") as f:
                data = json.load(f)

            train_tasks = data["train"]
            test_tasks = data["test"]

            if len(train_tasks) > 3:
                train_tasks = train_tasks[:3]

            for i, test_task in enumerate(test_tasks):
                
                if i > 0:
                    base_file_name = base_file_name + "_" + str(i + 1)
                test_task = [test_task]
                
                data = {"train": train_tasks, "test": test_tasks}
                json_string = json.dumps(data, cls=NumpyArrayEncoder)
                instruction, output = format_instruction_and_output(json_string)
                
                prompt = instruction + " " + output
                tokenized_request = tokenizer.tokenize(prompt)
                token_length = len(tokenized_request)
                if token_length <= 4096:
                    puzzles.append({"instruction": instruction, "output": output})
                    count += 1
                else:
                    print("token_length too long:", token_length)
                    too_long_count += 1

    json_string = json.dumps(puzzles, cls=NumpyArrayEncoder)
    base_file_name = "../data/ARC_augmented_training_puzzles"
    filename = base_file_name + ".json"
    filepath = filename
    with open(filepath, "w") as outfile:
        outfile.write(json_string)

print("done:", count, too_long_count)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: cf451474-cb53-403b-9d85-adcc7b92c0ad)')' thrown while requesting HEAD https://huggingface.co/microsoft/Orca-2-13b/resolve/main/tokenizer_config.json
Token indices sequence length is longer than the specified maximum sequence length for this model (4304 > 4096). Running this sequence through the model will result in indexing errors


0 file_path ../data/step_2/training_augs/77fdfe6_7.json
1 file_path ../data/step_2/training_augs/b8cdaf2_19.json
2 file_path ../data/step_2/training_augs/36fdfd6_21.json
token_length too long: 4304
2 file_path ../data/step_2/training_augs/0a938d7_13.json
3 file_path ../data/step_2/training_augs/8403a5d_19.json
4 file_path ../data/step_2/training_augs/1f642eb_9.json
5 file_path ../data/step_2/training_augs/4522001_17.json
6 file_path ../data/step_2/training_augs/272f95f_6.json
7 file_path ../data/step_2/training_augs/3af2c5a_22.json
8 file_path ../data/step_2/training_augs/a416b8f_1.json
9 file_path ../data/step_2/training_augs/1caeab9_24.json
10 file_path ../data/step_2/training_augs/a79310a_25.json
11 file_path ../data/step_2/training_augs/36d6757_2.json
12 file_path ../data/step_2/training_augs/06df4c8_13.json
token_length too long: 8592
12 file_path ../data/step_2/training_augs/80af300_6.json
13 file_path ../data/step_2/training_augs/8731374_27.json
14 file_path ../data/step_2/train