In [None]:
import torch
import torchvision
from PIL import Image
from utils import *
import json


In [None]:
def generate_response(model, processor, conversation, target_name, images=None, max_new_tokens=128):
    """
    Generate one reply from `model` for the participant called `target_name`.

    The conversation is first passed through `filter_conversation`, so only the
    messages intended for `target_name` (e.g. "Architect" or "Builder") are
    rendered into the prompt.

    Args:
        model: Generative model exposing `.device` and `.generate(...)`.
        processor: Processor exposing `apply_chat_template`, `__call__` and
            `batch_decode`.
        conversation: Full conversation history (list of message dicts).
        target_name: Identifier of the participant that is about to speak.
        images: Optional image(s) forwarded to the processor; `None` for a
            text-only turn.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A tuple `(output, parsed)`. `output` is the list of fully decoded
        sequences (prompt + continuation) returned by `batch_decode`;
        `parsed` is the text of the newly generated reply only, truncated at
        the first "[/" marker.
    """
    # Keep only the messages that the current participant is allowed to see.
    filtered_conversation = filter_conversation(conversation, target_model=target_name)

    # Render the visible history with the processor's chat template.
    prompt = processor.apply_chat_template(filtered_conversation, add_generation_prompt=True)

    inputs = processor(
        images=images,
        text=prompt,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    generate_ids = model.generate(
        **inputs,
        do_sample=False,  # deterministic (greedy) generation
        max_new_tokens=max_new_tokens,
    )

    # Full decoded text (prompt + continuation), returned for inspection.
    output = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )

    # Decode only the tokens produced after the prompt. Splitting the full text
    # on "[/INST] " is fragile: an empty reply, or a reply in which the model
    # itself writes "[/INST]", makes the split return an earlier prompt message
    # or a hallucinated turn instead of the actual reply.
    prompt_length = inputs["input_ids"].shape[1]
    reply = processor.batch_decode(
        generate_ids[:, prompt_length:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    # [INST] and [/INST] are not single tokens (they are tokenized piecewise,
    # e.g. '[' + '/' + 'INST' + ']'), so the model can reproduce "[/" to signal
    # a change of speaker and then keep writing the partner's turn on its own.
    # Cut the reply at the first such marker; `split` is a no-op when absent.
    parsed = reply.split("[/")[0]

    return output, parsed

In [None]:
def load_structure(structure_id, gold_processed_path="../data/structures/gold-processed"):
    """
    Load the JSON description and the rendered images of one structure.

    The structure is read from `<gold_processed_path>/<structure_id>/`: the
    JSON file is `<structure_id>.json`, and every `.jpg` / `.jpeg` file
    (case-insensitive) in that directory is opened as an image.

    Args:
        structure_id: Name of the structure directory and of its JSON file.
        gold_processed_path: Directory that contains one sub-directory per
            structure. Defaults to the location used by this notebook.

    Returns:
        A tuple `(s_json, s_images_list)`: the parsed JSON content and the list
        of opened PIL images, ordered by file name.

    Raises:
        FileNotFoundError: If the structure's JSON file does not exist.
        IOError: If an image file cannot be opened.
    """
    structure_path = os.path.join(gold_processed_path, structure_id)
    json_path = os.path.join(structure_path, f"{structure_id}.json")

    # Use a context manager so the file handle is closed deterministically.
    try:
        with open(json_path, "r") as json_file:
            s_json = json.load(json_file)
    except FileNotFoundError as err:
        raise FileNotFoundError(f"Structure {structure_id} not found.") from err

    s_images_list = []

    # os.listdir returns entries in arbitrary order; sort so that the images
    # are always handed to the model in the same, reproducible order.
    for filename in sorted(os.listdir(structure_path)):
        if not filename.lower().endswith((".jpg", ".jpeg")):
            continue
        img_path = os.path.join(structure_path, filename)
        try:
            s_images_list.append(Image.open(img_path))
        except IOError as err:
            # Chain the original error so the root cause stays visible.
            raise IOError(f"Warning: Could not open image {img_path}") from err

    return s_json, s_images_list

In [None]:
# Run configuration: checkpoint, target structure and dialogue limits.
model_id = "llava-hf/llava-v1.6-mistral-7b-hf"  # checkpoint id passed to from_pretrained
structure_id = "C1_bell"                        # structure handed to load_structure

# Dialogue limits: number of rounds and per-turn generation budget.
max_rounds, max_new_tokens = 10, 512

# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# 4-bit bitsandbytes quantization settings shared by both model instances.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # compute dtype for 4-bit operations
    bnb_4bit_use_double_quant=True,        # enable double quantization
    bnb_4bit_quant_type="nf4",             # quantization type, e.g. "nf4" or "fp4"
)

# Processor (chat template, tokenizer and image preprocessing) for the
# checkpoint; inputs are padded on the left.
processor = LlavaNextProcessor.from_pretrained(model_id, padding_side="left")


def _load_quantized_llava(checkpoint):
    """Load one 4-bit quantized LLaVA-NeXT instance of `checkpoint` onto `device`."""
    # Placement goes through `device_map` instead of a trailing `.to(device)`:
    # bitsandbytes-quantized models are placed on their device while loading,
    # and calling `.to()` on them afterwards is rejected by some
    # transformers / bitsandbytes versions.
    return LlavaNextForConditionalGeneration.from_pretrained(
        checkpoint,
        quantization_config=quantization_config,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        device_map=device,
    )


# NOTE(review): both roles load the same checkpoint with the same settings;
# sharing a single instance would halve memory use — confirm whether two
# independent instances are actually required.
model_A = _load_quantized_llava(model_id)
model_B = _load_quantized_llava(model_id)

In [None]:
# Load the target structure's JSON description and its rendered images from disk.
s_json, s_images_list = load_structure(structure_id)

In [None]:
# Round counter consumed (and incremented) by the dialogue loop.
current_round = 0
# Initial conversation history. `setup_roles` is not defined in this notebook
# (presumably provided by `utils` — TODO confirm what messages it creates).
conversation_history = setup_roles()

In [None]:
# Retained for any other cell that relies on this import.
from pprint import pprint

# Raw decoded outputs (prompt + continuation) of every turn, in order.
full_outputs = []

# Alternate Architect and Builder turns until the Architect emits "[FINISH]"
# or the configured number of rounds (`max_rounds`) is reached.
while current_round < max_rounds:
    print(f"===== Round {current_round + 1} =====")

    # ----- Architect's Turn -----
    # Images are passed on the first round only.
    # NOTE(review): if the filtered history still contains the
    # {"type": "image"} placeholders on later rounds, the prompt and the
    # supplied images are out of sync — confirm how `filter_conversation`
    # and `setup_roles` handle this.
    full_output_A, modelA_response = generate_response(
        model=model_A,
        processor=processor,
        conversation=conversation_history,
        target_name="Architect",
        images=s_images_list if current_round == 0 else None,
        max_new_tokens=max_new_tokens,
    )
    full_outputs.append(full_output_A)
    print(f"[ARCHITECT]\n {modelA_response}")

    # Append the Architect's response to the shared conversation history.
    conversation_history.append({
        "role": "user",
        "content": [
            {"type": "text", "text": modelA_response}
        ]
    })

    # Stop as soon as the Architect signals that the structure is complete.
    if "[FINISH]" in modelA_response:
        # print, not pprint: pprint would show the string's repr (with quotes).
        print("Finishing conversation as indicated by Architect.")
        break

    # ----- Builder's Turn -----
    full_output_B, modelB_response = generate_response(
        model=model_B,
        processor=processor,
        conversation=conversation_history,
        target_name="Builder",
        images=None,
        max_new_tokens=max_new_tokens,
    )
    full_outputs.append(full_output_B)
    print(f"[BUILDER]\n {modelB_response}")

    # Append the Builder's response to the shared conversation history.
    conversation_history.append({
        "role": "user",
        "content": [
            {"type": "text", "text": modelB_response}
        ]
    })

    current_round += 1

In [None]:
# Inspect the message history accumulated during the dialogue.
conversation_history

In [None]:
# Inspect the raw decoded outputs collected by the dialogue loop (one entry per turn).
full_outputs

In [None]:
from pprint import pprint

# Which representations of the target structure the Architect receives.
use_img = True
use_json = False

img_text = "images of a target structure built in a voxel world, "
json_text = "a JSON text file representing the target structure, "

img_dict = {"type": "image"}
json_raw = None  # placeholder: content entry carrying the structure's JSON

# Fail fast instead of silently appending `None` to the message content.
if use_json and json_raw is None:
    raise ValueError("use_json is True but json_raw is None; provide the JSON content entry first.")

# Describe what the Architect is shown. The pieces are joined outside the
# f-string so the cell also parses on Python versions older than 3.12, which
# reject reusing the enclosing quote type inside a replacement field.
shown_parts = []
if use_img:
    shown_parts.append(img_text)
if use_json:
    shown_parts.append(json_text)
shown_text = "and ".join(shown_parts)

arch_prompt = (
    "You are an agent playing a collaborative building task along with a partner. "
    "Your role is that of the Architect, while your partner is the Builder. You will be shown "
    # The description of the shown material is inserted exactly once here.
    f"{shown_text}"
    "and your job is to guide the Builder "
    "in order to replicate it. Give clear and easy to follow instructions. Proceed step by step "
    "and avoid providing too many instructions all at once. The Builder will reply with the "
    "actions it took and, possibly, clarification questions. Acknowledge the Builder's actions "
    "and feedback in order to understand whether they are on the right track or not and to help "
    "them. When you think that the Builder correctly completed the structure, output '[FINISH]' "
    "to trigger the end of the game. "
)
if use_img:
    # Only announce the images when they are actually part of the message.
    # NOTE(review): the text mentions four points of view but a single
    # {"type": "image"} placeholder is appended below — confirm the intended
    # number of image entries.
    arch_prompt += "Here are the images of the target structure from four points of view: "

# Message content: optional image / JSON entries followed by the instructions.
arch_content = []
if use_img:
    arch_content.append(img_dict)
if use_json:
    arch_content.append(json_raw)
arch_content.append({"type": "text", "text": arch_prompt})

# The "target" field restricts the visibility of a message to one participant.
# Instruction message for the Architect (visible only to the Architect).
conversation_history = [{
    "role": "user",
    "target": "Architect",
    "content": arch_content
}]

pprint(conversation_history)

[{'content': [({'type': 'image'},
               {'type': 'image'},
               {'type': 'image'},
               {'type': 'image'}),
              {'text': 'You are an agent playing a collaborative building task '
                       'along with a partner. Your role is that of the '
                       'Architect, while your partner is the Builder. You will '
                       'be shown images of a target structure built in a voxel '
                       'world, images of a target structure built in a voxel '
                       'world, and your job is to guide the Builder in order '
                       'to replicate it. Give clear and easy to follow '
                       'instructions. Proceed step by step and avoid providing '
                       'too many instructions all at once. The Builder will '
                       'reply with the actions it took and, possibly, '
                       "clarification questions. Acknowledge the Builder's "
        