In [25]:
import torch
from PIL import Image
from utils import *
import json


In [26]:
def load_structure(structure_id):
    # Where the rendered structures are
    gold_processed_path = "../data/structures/gold-processed"

    # Construct the path to the structure
    structure_path = os.path.join(gold_processed_path, structure_id)

    # Load the structure JSON
    try:
        json_path = os.path.join(structure_path, f"{structure_id}.json")
        s_json = json.load(open(json_path, "r"))
    except FileNotFoundError:
        raise FileNotFoundError(f"Structure {structure_id} not found.")
    
    s_images_list = []

    # Load the images
    for filename in os.listdir(structure_path):
        # Check if the file has a JPG or JPEG extension (case insensitive)
        if filename.lower().endswith(('.jpg', '.jpeg')):
            img_path = os.path.join(structure_path, filename)
            try:
                # Open the image file using PIL
                img = Image.open(img_path)
                s_images_list.append(img)
            except IOError:
                raise IOError(f"Warning: Could not open image {img_path}")
    
    return s_json, s_images_list

In [27]:
def generate_response(model, processor, conversation, target_name, images=None, max_new_tokens=128):
    """
    Generates a response for the given model using the (filtered) conversation history.
    
    - Filters out system messages that are not intended for the current model.
    - role_name should be the current model's identifier (e.g., "Architect" or "Builder").
    """

    # Filter conversation for the current model.
    filtered_conversation = filter_conversation(conversation, target_model=target_name)
    
    # Build the prompt using the processor's chat template.
    prompt = processor.apply_chat_template(filtered_conversation, add_generation_prompt=True)

    inputs = processor(
        images=images,
        text=prompt,
        padding=True,
        return_tensors="pt"
    ).to(model.device)
    
    generate_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    

    # Minimal cleanup to remove special tokens (adjust as needed)
    #response_text = output.replace("[INST]", "").replace("[/INST]", "").strip()
    return output[0].split("[/INST]")[-1]

In [28]:
model_id = "llava-hf/llava-v1.6-mistral-7b-hf"
device = "cuda" if torch.cuda.is_available() else "cpu"
structure_id = "C1_bell"
max_rounds = 10
max_new_tokens = 256

In [29]:
model_A, processor_A = initialize_model(model_id, device, 4)
model_B, processor_B = initialize_model(model_id, device, 4)

Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.02s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.00it/s]


In [30]:
s_json, s_images_list = load_structure(structure_id)

In [31]:
current_round = 0
conversation_history = setup_roles()

In [None]:
from pprint import pprint

while current_round < 10:
        print(f"===== Round {current_round + 1} =====")

        # ----- Architect's Turn -----
        # For the first turn, pass the images; later turns might not require images.
        modelA_response = generate_response(
            model=model_A,
            processor=processor_A,
            conversation=conversation_history,
            target_name="Architect",
            images=s_images_list if current_round == 0 else None,
            max_new_tokens=256
        )
        modelA_response = f"[ARCHITECT] {modelA_response}"
        print(modelA_response)

        # Append Architect's response to the conversation history.
        conversation_history.append({
            "role": "user",
            "content": [
                {"type": "text", "text": modelA_response}
            ]
        })

        # Check if Architect signaled to finish.
        if "[FINISH]" in modelA_response:
            pprint("Finishing conversation as indicated by Architect.")
            break

        # ----- Builder's Turn -----
        modelB_response = generate_response(
            model=model_B,
            processor=processor_B,
            conversation=conversation_history,
            target_name="Builder",
            images=None,
            max_new_tokens=256
        )
        modelB_response = f"[BUILDER] {modelB_response}"
        print(modelB_response)

        # Append Builder's response to the conversation history.
        conversation_history.append({
            "role": "user",
            "content": [
                {"type": "text", "text": modelB_response}
            ]
        })

        current_round += 1

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


===== Round 1 =====


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 Alright, let's start building the target structure. From the top-down view, it looks like we have a base that is a 3x3 grid. We'll need to build up from there.

For the base, we'll need to place blocks in the following pattern:
- The top left corner should be purple.
- The top right corner should be green.
- The bottom left corner should be gray.
- The bottom right corner should be red.

Once you have the base set up, we can start building the structure. Let me know when you have the base set up, and we'll move on to the next step. 


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 [BUILDER]  Understood. I will set up the base according to your instructions.

add.(purple, 0, 0, 0)
add.(green, 0, 0, 0)
add.(gray, 0, 0, 0)
add.(red, 0, 0, 0) 
===== Round 2 =====


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 [ARCHITECT]  Great! Now that we have the base set up, let's move on to the next step.

From the side view, we can see that there are two columns of blocks. The left column should be purple, and the right column should be green. The top of each column should be gray, and the bottom of each column should be red.

Please let me know when you have completed this step, and we'll move on to the next one. 


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 [BUILDER]  [BUILDER]  Understood. I will add the blocks according to your instructions.

add.(purple, 0, 0, 1)
add.(green, 0, 0, 1)
add.(gray, 0, 0, 1)
add.(red, 0, 0, 1)

add.(purple, 0, 1, 1)
add.(green, 0, 1, 1)
add.(gray, 0, 1, 1)
add.(red, 0, 1, 1)

add.(purple, 0, 2, 1)
add.(green, 0, 2, 1)
add.(gray, 0, 2, 1)
add.(red, 0, 2, 1)

add.(purple, 0, 3, 1)
add.(green, 0, 3, 1)
add.(gray, 0, 3, 1)
add.(red, 0, 3, 1) 
===== Round 3 =====


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 [ARCHITECT]  [ARCHITECT]  Great job! Now that we have the base and columns set up, let's move on to the top of the structure.

From the front view, we can see that there are two rows of blocks. The top row should be purple, and the bottom row should be green. The leftmost block in each row should be gray, and the rightmost block in each row should be red.

Please let me know when you have completed this step, and we'll move on to the final step. 


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 [BUILDER]  [BUILDER]  Understood. I will add the blocks according to your instructions.

add.(purple, 0, 0, 2)
add.(green, 0, 0, 2)
add.(gray, 0, 0, 2)
add.(red, 0, 0, 2)

add.(purple, 0, 1, 2)
add.(green, 0, 1, 2)
add.(gray, 0, 1, 2)
add.(red, 0, 1, 2)

add.(purple, 0, 2, 2)
add.(green, 0, 2, 2)
add.(gray, 0, 2, 2)
add.(red, 0, 2, 2)

add.(purple, 0, 3, 2)
add.(green, 0, 3, 2)
add.(gray, 0, 3, 2)
add.(red, 0, 3, 2) 
===== Round 4 =====
 [ARCHITECT]  [ARCHITECT]  [ARCHITECT]  Great job! You have successfully replicated the target structure.

[FINISH] 
'Finishing conversation as indicated by Architect.'
