In [1]:
import argparse
import json
import os

import torch
from PIL import Image

from utils import *


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_structure(structure_id, gold_processed_path="../data/structures/gold-processed"):
    """Load the JSON description and the rendered JPEG views of one structure.

    Args:
        structure_id: Name of the structure. It is used both as the
            sub-directory name under ``gold_processed_path`` and as the stem
            of the JSON file inside it (``<structure_id>/<structure_id>.json``).
        gold_processed_path: Directory that holds one sub-directory per
            structure. Defaults to the path this notebook originally
            hard-coded, so existing one-argument calls behave as before.

    Returns:
        A ``(s_json, s_images_list)`` tuple: the parsed JSON document and a
        list of PIL images, one per ``.jpg``/``.jpeg`` file (case-insensitive)
        found in the structure directory, ordered by filename.

    Raises:
        FileNotFoundError: If the structure's JSON file (or its directory)
            does not exist.
        IOError: If an image file cannot be opened or decoded.
    """
    structure_path = os.path.join(gold_processed_path, structure_id)
    json_path = os.path.join(structure_path, f"{structure_id}.json")

    # Read the JSON through a context manager so the handle is closed
    # deterministically instead of whenever the garbage collector runs.
    try:
        with open(json_path, "r") as json_file:
            s_json = json.load(json_file)
    except FileNotFoundError as exc:
        raise FileNotFoundError(f"Structure {structure_id} not found.") from exc

    s_images_list = []

    # os.listdir returns entries in arbitrary order; sort so the sequence of
    # views is the same on every run and every machine.
    for filename in sorted(os.listdir(structure_path)):
        if not filename.lower().endswith(('.jpg', '.jpeg')):
            continue

        img_path = os.path.join(structure_path, filename)
        try:
            # Image.open is lazy: force the pixel data to be decoded now so
            # the file descriptor can be released and a corrupt file is
            # reported here rather than deep inside the processor.
            with Image.open(img_path) as img:
                img.load()
        except IOError as exc:
            raise IOError(f"Warning: Could not open image {img_path}") from exc
        s_images_list.append(img)

    return s_json, s_images_list

In [3]:
def generate_response(model, processor, conversation, role_name, images=None, max_new_tokens=128, strip_prompt=False):
    """
    Generate one turn for ``role_name`` from the (filtered) conversation history.

    The history is first passed through ``filter_conversation`` with
    ``target_model=role_name`` (that helper is not defined in this notebook;
    presumably it comes from ``utils`` and drops messages addressed to the
    other agent). The filtered messages are rendered with the processor's chat
    template, tokenised together with ``images``, moved to ``model.device`` and
    fed to ``model.generate``.

    Args:
        model: Object exposing ``device`` and ``generate(**inputs, max_new_tokens=...)``.
        processor: Object exposing ``apply_chat_template``, ``__call__`` and ``batch_decode``.
        conversation: Full message list shared by both agents.
        role_name: Identifier of the agent being queried (e.g. "Architect" or "Builder").
        images: Optional images forwarded to the processor alongside the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
        strip_prompt: When True, the prompt tokens at the start of every
            generated sequence are removed before decoding, so only the newly
            generated text is returned. Defaults to False, which keeps the
            original behaviour (decoded text contains the prompt as well).

    Returns:
        The value returned by ``processor.batch_decode``: a list with one
        decoded string per generated sequence.
    """

    # Keep only the messages relevant to the current agent.
    filtered_conversation = filter_conversation(conversation, target_model=role_name)

    # Build the prompt using the processor's chat template.
    prompt = processor.apply_chat_template(filtered_conversation, add_generation_prompt=True)

    inputs = processor(
        images=images,
        text=prompt,
        padding=True,
        return_tensors="pt"
    ).to(model.device)

    generate_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    if strip_prompt:
        # generate() returns the input ids followed by the new tokens; every
        # row of input_ids has the same (padded) length, so one offset works
        # for the whole batch.
        prompt_len = len(inputs["input_ids"][0])
        generate_ids = [sequence[prompt_len:] for sequence in generate_ids]

    return processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

In [4]:
# Run configuration read by the cells below.
model_id = "llava-hf/llava-v1.6-mistral-7b-hf"  # checkpoint identifier passed to initialize_model for both agents
device = "cuda" if torch.cuda.is_available() else "cpu"  # use the GPU when PyTorch can see one
structure_id = "C1_bell"  # structure passed to load_structure
max_rounds = 10  # NOTE(review): not used in the visible cells; presumably caps the dialogue loop, confirm
max_new_tokens = 256  # generation budget forwarded to generate_response

In [5]:
# Create one model/processor pair per agent (A and B) from the same checkpoint.
# NOTE(review): the meaning of the third argument (4) is set by initialize_model,
# which is not defined in this notebook; presumably a quantization bit-width, confirm.
# NOTE(review): both calls use the same model_id and the checkpoint shards are
# loaded twice; confirm that two separate copies are intended.
model_A, processor_A = initialize_model(model_id, device, 4)
model_B, processor_B = initialize_model(model_id, device, 4)

Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.08s/it]
Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.07it/s]


In [6]:
# Load the target structure's JSON and its JPEG images from disk.
s_json, s_images_list = load_structure(structure_id)

In [7]:
# Round counter; the generation cell below attaches the images only while it is 0.
current_round = 0
# Initial message list produced by setup_roles (not defined in this notebook).
conversation_history = setup_roles()

In [8]:
# Display the initial conversation for inspection.
conversation_history

[{'role': 'user',
  'target': 'Architect',
  'content': [{'type': 'image'},
   {'type': 'image'},
   {'type': 'image'},
   {'type': 'image'},
   {'type': 'text',
    'text': "You are an agent playing a collaborative building task along with a partner. Your role is that of the Architect, while your partner is the Builder. You will be shown images of a target structure built in a voxel world, and your job is to guide the Builder in order to replicate it. Give clear and easy to follow instructions. Proceed step by step and avoid providing too many instructions all at once. The Builder will reply with the actions it took and, possibly, clarification questions. Acknowledge the Builder's actions and feedback in order to understand whether they are on the right track or not and to help them. When you think that the Builder correctly completed the structure, output '[FINISH]' to trigger the end of the game. Here are the images of the target structure from four points of view: "}]},
 {'role': '

In [9]:
# Query model_A in the "Architect" role; the structure images are passed only
# when current_round == 0, otherwise images is None.
modelA_response = generate_response(
            model=model_A,
            processor=processor_A,
            conversation=conversation_history,
            role_name="Architect",
            images=s_images_list if current_round == 0 else None,
            max_new_tokens=max_new_tokens
        )


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


In [10]:
# Display the decoded output returned by generate_response.
modelA_response

["[INST]  \n \n \n \nYou are an agent playing a collaborative building task along with a partner. Your role is that of the Architect, while your partner is the Builder. You will be shown images of a target structure built in a voxel world, and your job is to guide the Builder in order to replicate it. Give clear and easy to follow instructions. Proceed step by step and avoid providing too many instructions all at once. The Builder will reply with the actions it took and, possibly, clarification questions. Acknowledge the Builder's actions and feedback in order to understand whether they are on the right track or not and to help them. When you think that the Builder correctly completed the structure, output '[FINISH]' to trigger the end of the game. Here are the images of the target structure from four points of view:  [/INST][INST] [START] [/INST] Alright, let's start building the target structure. From the top-down view, it looks like we have a base that is a 3x3 grid. We'll need to b