- dataset을 분석하기 위한 코드 부분
    - LLM caption들을 여기에 저장할 것이다.

In [17]:
import os
import pandas as pd
import numpy as np

In [59]:
# dataset main path
data_root = "/workspace/data/3ddst/train"
dataset_types = ["annotation", "cannyedge_render", "depth", "image_render", "normal_render", "caption"]

In [60]:
meta_class_list = [
    d for d in os.listdir(data_root)
    if os.path.isdir(os.path.join(data_root, d)) 
]
print(f"Found {len(meta_class_list)} meta classes in the dataset.")

Found 498 meta classes in the dataset.


In [45]:
class_dict = {
}  # {key: meta_class, vaue: class_list}
class_list = []  # concatenated list of all classes

for each_meta_class in meta_class_list:
    class_path = os.path.join(data_root, each_meta_class)
    valid_sub_classes = []
    for each_class in os.listdir(class_path):
        if not os.path.isdir(class_path):
            continue
        # currently to save pose as .npy in annotation folder
        annot_path = os.path.join(class_path, each_class, "annotation")
        if os.path.exists(annot_path):
            annotation_files = [f for f in os.listdir(
                annot_path) if f.endswith(".npy")]
            if len(annotation_files) > 1:  # Requirement 4: Exclude if 1 or fewer files
                valid_sub_classes.append(each_class)
        
        class_dict[each_meta_class] = valid_sub_classes
        class_list.extend(valid_sub_classes)
    
    if len(valid_sub_classes) == 0:
        print(f"Warning: No valid classes found in meta class '{each_meta_class}'.")
        del class_dict[each_meta_class]
        meta_class_list.remove(each_meta_class)



In [46]:
# statistics per meta class
class_numbers = {
    k: len(v) for k, v in class_dict.items()    
}
class_num_list = np.array([v for v in class_numbers.values()])
print(
    f"Total number of classes: {len(class_list)} "
    f"Average number of classes per meta class: {np.mean(class_num_list):.2f} "
    f"standard deviation: {np.std(class_num_list):.2f} "
    f"median: {np.median(class_num_list):.2f} "
    f"min: {np.min(class_num_list):.2f} "
    f"max: {np.max(class_num_list):.2f} "
    ,sep="\n"
)
# 특수한 경우 발견, n02971356, validd sub class가 하나도 없는 경우

Total number of classes: 28511 Average number of classes per meta class: 9.24 standard deviation: 3.59 median: 9.00 min: 1.00 max: 21.00 


In [77]:
# dataframe construction
data = []
for each_meta_class, each_class_list in class_dict.items():
    for each_class in each_class_list:
        sub_class_path = os.path.join(data_root, each_meta_class, each_class)
        for dataset_type in dataset_types:
            dataset_path = os.path.join(sub_class_path, dataset_type)
            if os.path.exists(dataset_path):
                base_names = sorted([f for f in os.listdir(dataset_path)])
            else:
                base_names = []
            data.append({"class": each_class, "dataset_type": dataset_type, "value": base_names})

df = pd.DataFrame(data)
# df.set_index(["class", "dataset_type"], inplace=True)
df.to_csv(os.path.join("./data", "3ddst.csv"), index=False)

In [78]:
df[df["class"] == "dcb921b4517a45db8853bfdd4d0cafdc"]

Unnamed: 0,class,dataset_type,value
0,dcb921b4517a45db8853bfdd4d0cafdc,annotation,"[000.npy, 001.npy, 002.npy, 003.npy, 004.npy, ..."
1,dcb921b4517a45db8853bfdd4d0cafdc,cannyedge_render,"[000.png, 001.png, 002.png, 003.png, 004.png, ..."
2,dcb921b4517a45db8853bfdd4d0cafdc,depth,"[000.exr, 001.exr, 002.exr, 003.exr, 004.exr, ..."
3,dcb921b4517a45db8853bfdd4d0cafdc,image_render,"[000.png, 001.png, 002.png, 003.png, 004.png, ..."
4,dcb921b4517a45db8853bfdd4d0cafdc,normal_render,"[000.png, 001.png, 002.png, 003.png, 004.png, ..."
5,dcb921b4517a45db8853bfdd4d0cafdc,caption,[]
