### parquet -> 이미지, 라벨

In [1]:
from PIL import Image
from io import BytesIO
import json
import os
import pandas as pd

# Load one parquet shard. Each row is read below as an encoded image
# (row['image']['bytes']) plus its annotation as a JSON string
# (row['ground_truth']).
df = pd.read_parquet('./data/cord_receipt/train-00000-of-00004-b4aaeceff1d90ecb.parquet')

image_dir = './data/cord_receipt/img/train'
json_dir = './data/cord_receipt/cord_json'
os.makedirs(image_dir, exist_ok=True)
os.makedirs(json_dir, exist_ok=True)

# Export the image and its label in a single pass over the frame. Both files
# share the same 1-based stem derived from the DataFrame index, and both are
# written under the directories created above instead of re-typed paths.
# NOTE(review): the shard name says "00000-of-00004"; other shards presumably
# restart at index 0 and would overwrite these files - confirm before
# exporting them into the same directories.
for index, row in df.iterrows():
    file_stem = f'image_{index+1}'

    # Save the image
    image = Image.open(BytesIO(row['image']['bytes']))
    # JPEG has no alpha channel or palette, and Pillow refuses to write such
    # modes as JPEG; flatten them to RGB so one odd image cannot abort the
    # whole export.
    if image.mode in ('RGBA', 'LA', 'P', 'PA'):
        image = image.convert('RGB')
    image.save(f'{image_dir}/{file_stem}.jpg')

    # Save the JSON label (kept as UTF-8 with non-ASCII text unescaped)
    ground_truth_dict = json.loads(row['ground_truth'])
    with open(f'{json_dir}/{file_stem}.json', 'w', encoding='utf-8') as json_file:
        json.dump(ground_truth_dict, json_file, ensure_ascii=False, indent=4)

### json -> ufo

In [2]:
import os
import json
import datetime
from typing import Dict, List

# Path configuration: folder holding the per-image JSON files, and the
# single UFO file to produce.
json_folder = './data/cord_receipt/cord_json/'
output_path = './data/cord_receipt/ufo/train.json'

# Create the output directory if it does not exist yet
output_dir = os.path.split(output_path)[0]
os.makedirs(output_dir, exist_ok=True)

# Current time, formatted once and reused as the annotation timestamp
now = f'{datetime.datetime.now():%Y-%m-%d %H:%M:%S}'

# Initialise the UFO container; images are added under the 'images' key
ufo = dict(images={})

# Bounding-box conversion helper
def custom_bbox_to_ufo(quad_info: Dict[str, float]) -> List[List[float]]:
    """Convert a quad dict into a list of four ``[x, y]`` points.

    Each corner ``n`` (1 to 4) is emitted as ``[x<n>, y<n>]``, in that order,
    so the output follows the corner order of the input quad. The previous
    version paired ``x2`` with ``y3`` and ``x1`` with ``y4``, which distorted
    any quad that was not an axis-aligned rectangle.

    Args:
        quad_info: Mapping with the keys ``x1``..``x4`` and ``y1``..``y4``.

    Returns:
        ``[[x1, y1], [x2, y2], [x3, y3], [x4, y4]]``.

    Raises:
        KeyError: If any of the eight coordinate keys is missing.
    """
    return [
        [quad_info[f'x{corner}'], quad_info[f'y{corner}']]
        for corner in range(1, 5)
    ]

# Convert the per-image JSON files into one UFO-format file
def _json_file_number(file_name: str) -> str:
    """Return the text between the first '_' and the following '.' of a file name."""
    return file_name.split('_')[1].split('.')[0]


def custom_to_ufo(json_folder: str, output_path: str) -> None:
    """Merge the ``.json`` files in *json_folder* into a single UFO JSON file.

    Files are processed in ascending order of the number embedded in their
    name (``image_<N>.json``). For each file an entry keyed
    ``image_<N>.jpg`` is stored in the module-level ``ufo['images']`` dict,
    stamped with the module-level ``now`` timestamp, and every word quad
    found under ``valid_line[*].words[*].quad`` becomes one entry of
    ``words``. Word ids restart at ``0001`` for every image and the
    transcription is left empty. Finally the whole ``ufo`` dict is written
    to *output_path*.

    Args:
        json_folder: Directory containing the per-image JSON files.
        output_path: Destination of the combined UFO JSON file.

    Raises:
        IndexError, ValueError: If a ``.json`` file name does not contain a
            number in the ``<prefix>_<N>.json`` position.
        KeyError: If a JSON file lacks ``meta.image_size`` or ``valid_line``.
    """
    # Filter BEFORE sorting: the sort key parses a number out of the name, so
    # unrelated directory entries (e.g. '.ipynb_checkpoints', '.DS_Store')
    # used to crash the whole conversion before the '.json' check ran.
    json_files = [name for name in os.listdir(json_folder) if name.endswith('.json')]
    json_files.sort(key=lambda name: int(_json_file_number(name)))

    for file_name in json_files:
        # The label files are written as UTF-8 with non-ASCII text unescaped,
        # so read them back with the same encoding rather than the platform
        # default.
        with open(os.path.join(json_folder, file_name), 'r', encoding='utf-8') as f:
            data = json.load(f)

        image_size = data['meta']['image_size']

        # Flatten line -> word nesting; ids are 1-based and zero-padded to 4.
        quads = (
            word_info['quad']
            for ann_info in data['valid_line']
            for word_info in ann_info['words']
        )
        words = {
            str(anno_id).zfill(4): {
                "transcription": "",
                "points": custom_bbox_to_ufo(quad_info),
            }
            for anno_id, quad_info in enumerate(quads, start=1)
        }

        image_name = f"image_{_json_file_number(file_name)}.jpg"
        ufo['images'][image_name] = {
            "paragraphs": {},
            "words": words,
            "chars": {},
            "img_w": image_size['width'],
            "img_h": image_size['height'],
            'num_patches': None,
            "tags": [],
            "relations": {},
            "annotation_log": {
                "worker": "",
                "timestamp": now,
                "tool_version": "",
                "source": None
            },
            "license_tag": {
                "usability": True,
                "public": False,
                "commercial": True,
                "type": None,
                "holder": "Upstage"
            }
        }

    # Save the result as a JSON file
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(ufo, f, indent=4)

# Run the conversion with the paths configured above
custom_to_ufo(json_folder=json_folder, output_path=output_path)
