In [1]:
# %pip install -q -U bitsandbytes
# %pip install -q -U git+https://github.com/huggingface/transformers.git
# %pip install -q -U git+https://github.com/huggingface/peft.git
# %pip install -q -U git+https://github.com/huggingface/accelerate.git

In [2]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, PeftModel, PeftConfig
from random import randint
import numpy as np

In [3]:
# Selects which base checkpoint the loading cell uses:
#   True  -> the locally merged model directory
#   False -> the base model named in the adapter's PeftConfig
base_model_merged = True

In [4]:
# LoRA adapter settings for a causal-LM task.
# NOTE(review): no line visible in this notebook reads `peft_config`; the
# adapter is loaded from `peft_model_id` via PeftModel.from_pretrained.
# Confirm whether this cell is still needed.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [5]:
# Quantization settings handed to AutoModelForCausalLM.from_pretrained when the
# base model is loaded: 4-bit weights with the "nf4" quant type, bfloat16 as
# the compute dtype, and double quantization switched off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

In [6]:
# Load the entire model on GPU 0: the single empty-string key stands for the
# whole model and the value is the target device index.
device_map = {"": 0}

In [7]:
# peft_model_id = "../adapters/Mistral-7B-Instruct_finetuned_on_10000_array_basics"
peft_model_id = "../results/Finetuned_merge_002_v001/checkpoint-3200" # "../adapters/Finetuned_merge_001_v001"

# Resolve which base checkpoint the adapter is stacked on.
if base_model_merged:
    base_model_name = "../merged_models/merged_model_step_2"
else:
    config = PeftConfig.from_pretrained(peft_model_id)
    base_model_name = config.base_model_name_or_path

# `bnb_config` already requests 4-bit loading. Passing `load_in_4bit=True` as
# well is redundant, and recent transformers releases raise a ValueError when
# both are supplied, so only `quantization_config` is passed.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    return_dict=True,
    device_map=device_map,
)

if base_model_merged:
    tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
    # Reuse EOS as the padding token and pad on the right.
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
else:
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

# Load the Lora model
model = PeftModel.from_pretrained(model, peft_model_id)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [8]:
def check_zero_rows_cols(permuted_array, sub_grid_x_dim, sub_grid_y_dim):
    """Report whether the reshaped sub-grid has an all-zero row or column.

    Args:
        permuted_array: Flat numpy array holding
            ``sub_grid_x_dim * sub_grid_y_dim`` values.
        sub_grid_x_dim: Number of rows to reshape into.
        sub_grid_y_dim: Number of columns to reshape into.

    Returns:
        A truthy numpy boolean when at least one column or one row of the
        reshaped array consists only of zeros, otherwise a falsy one.
    """
    is_zero = permuted_array.reshape((sub_grid_x_dim, sub_grid_y_dim)) == 0

    # Columns are inspected first; rows are only looked at when no column is empty.
    has_empty_column = is_zero.all(axis=0).any()
    if has_empty_column:
        return has_empty_column
    return is_zero.all(axis=1).any()

In [9]:
def make_move_obj_grids(min_grid_dim, max_grid_dim):
    """Generate four input/output grid pairs that share one random translation.

    Every input grid is a zero-filled integer array containing one random
    sub-grid of values 0-9 (with no all-zero row or column) at a random
    position. The matching output grid has the same shape and holds the same
    sub-grid shifted by a single ``(x, y)`` offset that is drawn once and
    reused for all pairs. ``x`` indexes axis 0 and ``y`` indexes axis 1 of the
    arrays. The offset is never ``(0, 0)``.

    Candidates whose shifted sub-grid would start at a negative index are
    discarded and redrawn. A sub-grid shifted past the far edge is clipped,
    because the output is cropped back to the input grid's shape.

    Args:
        min_grid_dim: Smallest grid side length (inclusive); must be >= 2.
        max_grid_dim: Largest grid side length (inclusive).

    Returns:
        Tuple ``(random_move_x, random_move_y, input_grids, output_grids)``;
        each list holds four 2-D integer numpy arrays (intended as 3 train
        pairs followed by 1 test pair).

    Raises:
        ValueError: If ``min_grid_dim > max_grid_dim`` or ``min_grid_dim`` is
            smaller than the minimum sub-grid side (2).
    """
    num_pairs = 4  # 3 train pairs + 1 test pair
    min_move, max_move = -3, 3
    min_sub_grid_dim, max_sub_grid_dim = 2, 5

    if min_grid_dim > max_grid_dim:
        raise ValueError(
            f"min_grid_dim ({min_grid_dim}) must not exceed max_grid_dim ({max_grid_dim})"
        )
    if min_grid_dim < min_sub_grid_dim:
        raise ValueError(
            f"min_grid_dim must be at least {min_sub_grid_dim}, got {min_grid_dim}"
        )

    # A shift of -k needs k free cells in front of the sub-grid. Limit negative
    # shifts to what the largest grid with the smallest sub-grid can offer, so
    # the rejection loop below can always succeed. For max_grid_dim >= 5 this
    # is the full [-3, 3] range.
    lowest_move = max(min_move, min_sub_grid_dim - max_grid_dim)
    random_move_x = randint(lowest_move, max_move)
    random_move_y = randint(lowest_move, max_move)
    if random_move_x == 0:
        # Re-draw until the offset is not (0, 0).
        while random_move_y == 0:
            random_move_y = randint(lowest_move, max_move)

    input_grids = []
    output_grids = []
    while len(input_grids) < num_pairs:
        grid_x_dim = randint(min_grid_dim, max_grid_dim)
        grid_y_dim = randint(min_grid_dim, max_grid_dim)

        # Never draw a sub-grid larger than the grid that has to contain it;
        # otherwise the placement randint below gets an empty range.
        sub_grid_x_dim = randint(min_sub_grid_dim, min(max_sub_grid_dim, grid_x_dim))
        sub_grid_y_dim = randint(min_sub_grid_dim, min(max_sub_grid_dim, grid_y_dim))

        permuted_array = np.random.randint(10, size=sub_grid_x_dim * sub_grid_y_dim)
        if check_zero_rows_cols(permuted_array, sub_grid_x_dim, sub_grid_y_dim):
            continue
        sub_grid = permuted_array.reshape((sub_grid_x_dim, sub_grid_y_dim))

        # Random placement of the sub-grid inside the input grid.
        rand_start_x_index = randint(0, grid_x_dim - sub_grid_x_dim)
        rand_start_y_index = randint(0, grid_y_dim - sub_grid_y_dim)

        start_x = rand_start_x_index + random_move_x
        start_y = rand_start_y_index + random_move_y
        if start_x < 0 or start_y < 0:
            # The shift would push the object before index 0; draw a new candidate.
            continue

        input_grid = np.zeros((grid_x_dim, grid_y_dim), dtype=int)
        input_grid[
            rand_start_x_index : rand_start_x_index + sub_grid_x_dim,
            rand_start_y_index : rand_start_y_index + sub_grid_y_dim,
        ] = sub_grid

        # Paint onto a canvas large enough for the furthest positive shift,
        # then crop back to the grid shape (this clips overflow at the far edge).
        canvas = np.zeros((grid_x_dim + max_move, grid_y_dim + max_move), dtype=int)
        canvas[
            start_x : start_x + sub_grid_x_dim,
            start_y : start_y + sub_grid_y_dim,
        ] = sub_grid
        output_grid = np.copy(canvas[:grid_x_dim, :grid_y_dim])

        input_grids.append(input_grid)
        output_grids.append(output_grid)

    return random_move_x, random_move_y, input_grids, output_grids

In [10]:
def extract_answer(input_string):
    """Return the text between the first "[/INST]" marker and the next "]]".

    The closing "]]" itself is not included in the result, and surrounding
    whitespace is stripped.

    Args:
        input_string: Decoded model output, normally containing the prompt
            terminated by "[/INST]" followed by the generated answer.

    Returns:
        The stripped answer text. If "[/INST]" is absent the search starts at
        the beginning of the string; if no "]]" follows, everything up to the
        end of the string is returned.
    """
    marker = "[/INST]"
    marker_index = input_string.find(marker)
    # str.find returns -1 when the marker is missing; without this guard the
    # slice would start at an arbitrary offset (len(marker) - 1).
    start_index = 0 if marker_index == -1 else marker_index + len(marker)

    end_index = input_string.find("]]", start_index)
    if end_index == -1:
        # No closing "]]": keep the whole remainder instead of slicing with
        # -1, which would silently drop the final character.
        end_index = len(input_string)

    return input_string[start_index:end_index].strip()

In [13]:
min_num_rows_and_columns = 6
max_num_rows_and_columns = 8


def grid_to_string(grid):
    """Serialize a 2-D grid as nested bracket lists without spaces, e.g. "[[0,1],[2,3]]"."""
    rows = ("[" + ",".join(str(cell) for cell in row) + "]" for row in grid)
    return "[" + ",".join(rows) + "]"


x_move, y_move, input_grids, output_grids = make_move_obj_grids(
    min_num_rows_and_columns, max_num_rows_and_columns
)

# The first three pairs are shown as worked examples; the remainder is the test.
train_input_grids = input_grids[:3]
train_output_grids = output_grids[:3]
test_input_grids = input_grids[3:]
test_output_grids = output_grids[3:]

instruction = "Given the following input/output train pairs of ARCSolver grids: "
instruction += ", ".join(
    f"Train_Input_{i+1}={grid_to_string(train_input_grid)}"
    f" and Train_Output_{i+1}={grid_to_string(train_output_grid)}"
    for i, (train_input_grid, train_output_grid) in enumerate(
        zip(train_input_grids, train_output_grids)
    )
)

instruction += ". Find the transformation from each input grid to output grid that is common to all 3 train pairs."
instruction += " Then apply this transformation to the following test input grid to get the test output grid: "

# Test inputs are concatenated with no separator between them.
instruction += "".join(
    f"Test_Input_{i+1}={grid_to_string(test_input_grid)}"
    for i, test_input_grid in enumerate(test_input_grids)
)

# NOTE(review): make_move_obj_grids applies x_move along axis 0 (rows), yet the
# sentence below calls it "horizontally". The wording is left untouched here;
# confirm it matches the text the adapter was fine-tuned on before changing it.
output = (
    "The common transformation is that the non-zero element sub-grid in each train input grid is moved "
    + str(x_move)
    + " units horizontally and "
    + str(y_move)
    + " units vertically to get the corresponding train output grid. Therefore, "
)
output += ", ".join(
    f"Test_Output_{i+1}={grid_to_string(test_output_grid)}"
    for i, test_output_grid in enumerate(test_output_grids)
)

ground_truth = output

query = instruction
# The BOS token is written into the prompt text, so the tokenizer is told not
# to add special tokens of its own.
prompt = '<s>[INST] ' + query + ' [/INST] '
print("Query:\n", prompt)
encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
model_inputs = encodeds.to("cuda")

# tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# model.resize_token_embeddings(len(tokenizer))
# tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.eos_token

generated_ids = model.generate(**model_inputs, max_new_tokens=1250, do_sample=False, pad_token_id=tokenizer.eos_token_id)
decoded = tokenizer.batch_decode(generated_ids)
result = decoded[0]
# extract_answer stops just before the first "]]", so the brackets are re-appended.
print("\nAnswer:\n", extract_answer(result) + "]]")

print("\nGround truth:\n", ground_truth)

# Re-tokenize the decoded text (prompt + completion) to report its length.
tokenized_request = tokenizer.tokenize(result)
token_length = len(tokenized_request)

print("\nNumber of tokens in full result: ", token_length)

Query:
 <s>[INST] Given the following input/output train pairs of ARCSolver grids: Train_Input_1=[[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,8,5,0,7,0,0,0],[0,7,2,4,5,0,0,0],[0,0,0,0,0,0,0,0]] and Train_Output_1=[[0,0,0,0,0,0,0,0],[8,5,0,7,0,0,0,0],[7,2,4,5,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0]], Train_Input_2=[[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,9,1,0,5,7,0,0],[0,0,6,3,6,9,0,0],[0,0,5,6,5,5,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0]] and Train_Output_2=[[9,1,0,5,7,0,0,0],[0,6,3,6,9,0,0,0],[0,5,6,5,5,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0]], Train_Input_3=[[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,2,2,0,0,0,0],[0,0,6,6,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0]] and Train_Output_3=[[0,2,2,0,0,0,0,0],[0,6,6,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0],[0,0,0,0,0,0,0,0]]. Find the tr