In [1]:
dataset_location = "/tmp/clevr-act-5"
print("Train set:")
!head -n 5 {dataset_location}/dataset/_annotations.train.jsonl
print("\nValidation set:")
!head -n 5 {dataset_location}/dataset/_annotations.valid.jsonl

Train set:
{"image": "CLEVR_4285005254.jpg", "prefix": "move large green cube onto large gray sphere <loc0440><loc0534><loc0049><loc0102><loc0048><loc0079>", "suffix": "<loc0533><loc0428><loc0071><loc0109><loc0009><loc0015><loc0414><loc0335><loc0069><loc0109><loc0009><loc0015>"}
{"image": "CLEVR_3958594555.jpg", "prefix": "move large purple sphere onto large blue cube <loc0411><loc0477><loc0052><loc0101><loc0048><loc0083>", "suffix": "<loc0618><loc0655><loc0060><loc0101><loc0048><loc0083><loc0404><loc0376><loc0074><loc0101><loc0048><loc0083>"}
{"image": "CLEVR_4134492330.jpg", "prefix": "move small gray sphere onto small cyan sphere <loc0412><loc0505><loc0050><loc0103><loc0050><loc0083>", "suffix": "<loc0540><loc0600><loc0070><loc0103><loc0050><loc0083><loc0624><loc0577><loc0050><loc0103><loc0050><loc0083>"}
{"image": "CLEVR_1134966472.jpg", "prefix": "move small green sphere onto small cyan cube <loc0401><loc0501><loc0049><loc0102><loc0048><loc0078>", "suffix": "<loc0971><loc0939><loc

In [9]:
import cv2
import json
import supervision as sv
from typing import List
import re

def read_n_lines(file_path: str, n: int) -> List[str]:
    with open(file_path, 'r') as file:
        lines = [next(file).strip() for _ in range(n)]
    return lines

def parse_trajectory_tokens(caption):
  caption = caption.strip("\n")
  if ";" in caption:
    raise ValueError
  pattern = r"(?:<loc\d{4}>)+ ([\w\s\-]+)$"
  match = re.search(pattern, caption)
  #print(f"Matched words: {match.group(1)}")
  if match:
    return match
  else:
    return ""

train_file = f"{dataset_location}/dataset/_annotations.train.jsonl"
images = []
lines = read_n_lines(train_file, 25)
first = json.loads(lines[0])

#CLASSES = first.get('prefix').replace("detect ", "").split(" ; ")
for line in lines:
    data = json.loads(line)
    image = cv2.imread(f"{dataset_location}/dataset/{data.get('image')}")
    (h, w, _) = image.shape

    suffix = data.get('suffix')
    prefix = data.get('prefix')
    #match = parse_trajectory_tokens(suffix)
    print(prefix)
    

move large green cube onto large gray sphere <loc0440><loc0534><loc0049><loc0102><loc0048><loc0079>
move large purple sphere onto large blue cube <loc0411><loc0477><loc0052><loc0101><loc0048><loc0083>
move small gray sphere onto small cyan sphere <loc0412><loc0505><loc0050><loc0103><loc0050><loc0083>
move small green sphere onto small cyan cube <loc0401><loc0501><loc0049><loc0102><loc0048><loc0078>
move small blue cube onto small blue sphere <loc0445><loc0528><loc0051><loc0099><loc0047><loc0078>
move small gray sphere onto large blue sphere <loc0411><loc0521><loc0048><loc0104><loc0046><loc0077>
move large yellow cube onto small purple cube <loc0461><loc0541><loc0050><loc0099><loc0044><loc0071>
move small purple cube onto small cyan cube <loc0434><loc0528><loc0050><loc0102><loc0045><loc0076>
move large blue sphere onto small cyan sphere <loc0418><loc0513><loc0049><loc0106><loc0041><loc0074>
move large yellow sphere onto small blue sphere <loc0403><loc0518><loc0046><loc0108><loc0044><loc

In [7]:
!ls



tensor([[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 0., 1.]])

In [10]:

import torch
class DummyCamera:
    def __init__(self):
        self.height = 448
        self.width = 448

camera = DummyCamera()

def parse_trajectory_xyz(caption, camera, num_tokens=3):
    DEPTH_SCALE = 100
    # Pattern to extract numbers inside <loc####> tags
    loc_strings = re.findall(r"<loc(\d{4})>", caption)
    num_position_tokens = len(loc_strings)
    loc_strings_pairs = loc_strings[:(num_position_tokens//num_tokens)*num_tokens]
    loc_numbers = [int(x) for x in loc_strings_pairs]
    loc_h = [x/(1024-1)*camera.height for x in loc_numbers[::num_tokens]]
    loc_w = [x/(1024-1)*camera.width for x in loc_numbers[1::num_tokens]]
    loc_d = [x/DEPTH_SCALE for x in loc_numbers[2::num_tokens]]  # depth
    curve_2d = torch.tensor((loc_w, loc_h)).T
    return curve_2d, torch.tensor(loc_d)

print(suffix)
print(parse_trajectory_xyz(suffix, camera))


ModuleNotFoundError: No module named 'torch'

In [5]:
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import io
import html
import base64
from IPython.display import display, HTML


text = "<loc0625><loc0762><loc0468><loc0724><loc0384><loc0679><loc0375><loc0632><loc0430><loc0587><loc0587> 1"
action_text = data.get('prefix')


def render_example_trajectory(image, caption):
    # Pattern to extract numbers inside <loc####> tags
    print(parse_trajectory_tokens(caption))
    
    env_id = 0
    x, y = curve_2d[env_id, :, 0].tolist(), curve_2d[env_id, :, 1].tolist()

    pixel_width, pixel_height = 448, 448
    dpi = 100
    figsize = (pixel_width / dpi, pixel_height / dpi)
    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
    ax.imshow(image)
    ax.axis('off')
    ax.plot(curve_2d_short[env_id,:,0], curve_2d_short[env_id, :,1],'.-', color='lime')
    with io.BytesIO() as buffer:
        fig.savefig(buffer, format='jpeg',bbox_inches='tight', dpi=dpi)
        image_b64 = str(base64.b64encode(buffer.getvalue()), "utf-8")
        res_str =  f"data:image/jpeg;base64,{image_b64}"
    plt.close(fig)
    return f"""
<div style="display: inline-flex; align-items: center; justify-content: center;">
    <img style="width:224px; height:224px;" src="{res_str}" />
    <p style="width:256px; margin:10px; font-size:small;">{html.escape(caption)}</br>{html.escape(caption)}</p>

</div>
"""

html_out = ""
#for image, _, caption in make_predictions(validation_data_iterator(), num_examples=1, batch_size=1):
#  html_out += render_example(image, caption)

caption = data.get('prefix')
image = Image.open(f"{dataset_location}/dataset/{data.get('image')}")
html_out += render_example_trajectory(image, caption)
display(HTML(html_out))
print("done!")




NameError: name 'curve_2d' is not defined

In [None]:
matches[:, 0]

In [28]:
all_file = f"{dataset_location}/dataset/_annotations.all.jsonl"

with open(all_file) as f_obj:
    lines_str = f_obj.readlines()

lines = []
prefix_short = []
for line_str in lines_str:
    line_json = json.loads(line_str)
    #lines.append(line_json)
    prefix_short.append(line_json["prefix"].split("<")[0].strip())

In [32]:
# for i in range(50):
#     print(prefix_short[i])
import numpy as np
unique_str, counts = np.unique(prefix_short, return_counts=True)
print(counts.mean(), counts.max(), counts.min())

126.00806451612904 170 87


In [1]:
import mani_skill

ModuleNotFoundError: No module named 'mani_skill'

In [10]:
import sys
from platform import python_version
print(python_version())
!python --version
!pip list |grep mani_skill
#!pip show mani_skill
print(sys.executable)
!python -m "import mani_skill"

3.12.7
Python 3.12.7
mani_skill                3.0.0b10       /home/argusm/lang/ManiSkill
/home/argusm/local/miniconda3/envs/paligemma/bin/python
/home/argusm/local/miniconda3/envs/paligemma/bin/python: No module named import mani_skill
