- camera coordinate의 검증
- dataset의 skeleton에 대한 검증
    - 몇개의 class로 구성이 되어있는지
    - class 당 몇개의 image로 구성이 되어 있는 지
    - 필요한 것: LLM을 통해 vision-conditioned caption을 다양하게 생성할 필요가 있다. 다만, 어느 정도 성능이 좋은 모델로 pipeline을 구성해서 진행해야 한다.
    - 이후에 dataloader 구조에 대해서 dataframe으로 composite하기


In [41]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


import plotly.graph_objects as go
import numpy as np
import pandas as pd
import os

In [None]:
def _camera_center_and_axes(RT):
    """Return the camera centre and its x/y/z axes expressed in world coordinates.

    The matrix is interpreted as a world-to-camera extrinsic ``[R|t]``
    (``x_cam = R @ x_world + t``), so the centre is ``-R.T @ t`` and the
    camera's k-th axis in world coordinates is ``R.T @ e_k``.

    NOTE(review): ``-R.T @ t`` equals the camera position only when ``R`` is
    orthonormal. If the matrix is camera-to-world, or contains a scale
    factor, the result is not the camera position - confirm with the caller.
    """
    # Accept both 3x4 and 4x4 (homogeneous) matrices by keeping the top 3x4.
    RT = np.asarray(RT, dtype=float)[:3, :4]
    R = RT[:, :3]
    t = RT[:, 3]
    center = -np.dot(R.T, t)
    # R.T @ e_k is simply the k-th row of R, so the rows of R are the axes.
    return center, R


def plot_cameras_and_object(object_pos, cameras, axis_lim: float = 1.0,
                            arrow_length: float = 0.5):
    """Plot an object and a set of cameras (with their local axes) in 3D.

    The object is drawn as a red marker at ``object_pos``. Each camera is
    drawn as a blue marker at its centre together with three line segments
    showing its local x (red), y (green) and z (blue) axes, labelled
    'X', 'Y' and 'Z'. The figure is displayed with ``fig.show()``.

    Parameters:
    - object_pos: length-3 sequence, position of the object in world
      coordinates.
    - cameras: iterable of ``[R|t]`` matrices (3x4, or 4x4 whose top 3x4 is
      used), interpreted as world-to-camera extrinsics.
    - axis_lim: minimum half-extent of every axis. All three axes share the
      symmetric range ``[-L, L]`` where ``L`` is ``axis_lim``, widened when
      necessary so that no plotted point is clipped. Equal ranges are what
      make ``aspectmode='cube'`` render the scene without distortion.
    - arrow_length: length multiplier applied to each camera axis segment.

    Returns:
    - None. The figure is shown as a side effect.
    """
    object_pos = np.asarray(object_pos, dtype=float)

    # (label, colour) for the camera's local x, y and z axes, in that order.
    axis_styles = (('X', 'red'), ('Y', 'green'), ('Z', 'blue'))

    # Plot traces; the object marker comes first.
    traces = [go.Scatter3d(
        x=[object_pos[0]], y=[object_pos[1]], z=[object_pos[2]],
        mode='markers',
        marker=dict(size=5, color='red'),
        name='객체'
    )]

    # Every plotted point is recorded so the axis range can enclose all of them.
    plotted_points = [object_pos]

    for i, RT in enumerate(cameras):
        center, axes = _camera_center_and_axes(RT)

        # Camera centre marker; the legend entry carries its coordinates.
        traces.append(go.Scatter3d(
            x=[center[0]], y=[center[1]], z=[center[2]],
            mode='markers',
            marker=dict(size=3, color='blue'),
            name=f'카메라 {i+1} (위치: {center[0]:.2f}, {center[1]:.2f}, {center[2]:.2f})'
        ))

        # Row k of `tips` is the end point of the camera's k-th axis segment.
        tips = center + arrow_length * axes

        for tip, (_, color) in zip(tips, axis_styles):
            traces.append(go.Scatter3d(
                x=[center[0], tip[0]],
                y=[center[1], tip[1]],
                z=[center[2], tip[2]],
                mode='lines',
                line=dict(color=color, width=2),
                showlegend=False
            ))

        # Axis labels placed at the segment end points.
        traces.append(go.Scatter3d(
            x=tips[:, 0].tolist(),
            y=tips[:, 1].tolist(),
            z=tips[:, 2].tolist(),
            mode='text',
            text=[label for label, _ in axis_styles],
            textposition='top center',
            showlegend=False
        ))

        plotted_points.append(center)
        plotted_points.extend(tips)

    # One symmetric range for all axes: at least `axis_lim`, but grown (with a
    # 5% margin) when some point lies further out, so nothing is cut off.
    extent = float(np.max(np.abs(np.vstack(plotted_points))))
    half_range = max(float(axis_lim), 1.05 * extent)
    axis_range = [-half_range, half_range]

    fig = go.Figure(data=traces)

    fig.update_layout(
        scene=dict(
            xaxis=dict(title='X', range=axis_range),
            yaxis=dict(title='Y', range=axis_range),
            zaxis=dict(title='Z', range=axis_range),
            aspectmode='cube',  # equal ranges + cube => equal scale per axis
            camera=dict(projection=dict(type='orthographic'))  # orthographic view
        ),
        title='세계 좌표계에서 객체와 카메라 (카메라 xyz 좌표계 포함)'
    )

    fig.show()

In [43]:
# CSV index of the dataset; loaded once into a DataFrame for later filtering.
df_path = "/workspace/code/3DAnything/debug/data/3ddst.csv"
df = pd.read_csv(df_path)

# Root directory under which the per-class training files are stored.
data_root = "/workspace/data/3ddst/train"

In [119]:
# Which sample to inspect: synset id, object id and the sub-folder to read.
meta_class_name = "n02100735"
class_name = "78c2ceb1e51149698c6a8e6242d5dbb2"
dataset_type = "annotation"

# Rows of the index that belong to the selected object's annotation files.
camera_results = df[(df["meta_class"] == meta_class_name) & (df["class"] == class_name) & (df["dataset_type"] == dataset_type)]
camera_results_path = []  # full path of every annotation file, in index order
matrix_world = []         # top 3 rows (3x4) of each file's "matrix_world"
matrix_raw = []           # the complete annotation dict of each file
object_pos: np.ndarray = np.zeros(3)  # stays at the origin if no file is found

for idx, each_base_name in enumerate(camera_results["base_name"].values):
    each_path = os.path.join(
        data_root, meta_class_name, class_name, dataset_type, each_base_name
    )
    camera_results_path.append(each_path)
    # Each .npy file stores a pickled dict, hence allow_pickle + .item().
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    data = np.load(each_path, allow_pickle=True).item()
    if idx == 0:
        # The offset of the first view is used as the object position.
        object_pos = data["rendering_offset"]

    # "modelview_matrix" is the alternative candidate stored in the same dict.
    # NOTE(review): in the printed sample the 3x3 part of "matrix_world" has
    # row norm ~2.17 (== rendering_scale), i.e. it is not a pure rotation,
    # while plot_cameras_and_object computes -R.T @ t - confirm which matrix
    # and convention is intended.
    _mat = data["matrix_world"]
    matrix_world.append(_mat[:3])
    # Reuse the dict loaded above instead of reading the file a second time.
    matrix_raw.append(data)
    

In [92]:
# Display the annotation file paths collected for the selected object.
camera_results_path

['/workspace/data/3ddst/train/n02100735/78c2ceb1e51149698c6a8e6242d5dbb2/annotation/000.npy',
 '/workspace/data/3ddst/train/n02100735/78c2ceb1e51149698c6a8e6242d5dbb2/annotation/001.npy',
 '/workspace/data/3ddst/train/n02100735/78c2ceb1e51149698c6a8e6242d5dbb2/annotation/002.npy',
 '/workspace/data/3ddst/train/n02100735/78c2ceb1e51149698c6a8e6242d5dbb2/annotation/003.npy',
 '/workspace/data/3ddst/train/n02100735/78c2ceb1e51149698c6a8e6242d5dbb2/annotation/004.npy',
 '/workspace/data/3ddst/train/n02100735/78c2ceb1e51149698c6a8e6242d5dbb2/annotation/005.npy',
 '/workspace/data/3ddst/train/n02100735/78c2ceb1e51149698c6a8e6242d5dbb2/annotation/006.npy']

In [117]:
# List the keys stored in the first loaded annotation dict.
matrix_raw[0].keys()

dict_keys(['theta', 'phi', 'dist', 'camera_rotation', 'principal', 'source', 'shape_id', 'rendering_scale', 'rendering_offset', 'azimuth', 'elevation', 'pose_sampling', 'strength', 'matrix_world', 'modelview_matrix', 'projection_matrix', 'focal_length', 'sensor_width', 'sensor_height'])

In [114]:
# Display the full annotation dict of the fourth loaded file.
matrix_raw[3]

{'theta': 1.8055418083264432,
 'phi': 1.632716827196733,
 'dist': 1.5,
 'camera_rotation': 0.0,
 'principal': array([256, 256]),
 'source': 'objaverse',
 'shape_id': '78c2ceb1e51149698c6a8e6242d5dbb2',
 'rendering_scale': 2.1698285072504087,
 'rendering_offset': array([ 1.26503408e-04,  3.67745757e-04, -4.72724557e-01]),
 'azimuth': 0.0,
 'elevation': 0.0,
 'pose_sampling': 1,
 'strength': 1,
 'matrix_world': array([[-2.11031818e+00, -3.12308297e-02, -5.03725410e-01,
         -3.48224550e-01],
        [-5.04692674e-01,  1.30588591e-01,  2.10627341e+00,
          1.45606446e+00],
        [ 5.81780206e-08,  2.16566992e+00, -1.34271160e-01,
         -9.28214118e-02],
        [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
          1.00000000e+00]]),
 'modelview_matrix': array([[-4.48225915e-01, -1.07195385e-01,  1.16690053e-08,
          1.14169815e-07],
        [-6.63334690e-03,  2.77366806e-02,  4.59982783e-01,
         -4.05682563e-08],
        [-1.06989965e-01,  4.47366983e-01, -

In [120]:
# Visualise only the first and third loaded views against the object position.
cameras = [matrix_world[view_idx] for view_idx in (0, 2)]
plot_cameras_and_object(object_pos, cameras, axis_lim=2)