In [None]:
import os
import json

def merge_json_files(source_dir, target_json_path):
    '''
    合并文件夹下所有JSON文件到指定的JSON文件中，结果以列表形式存储
    
    参数：
    source_dir: 源文件夹路径，包含要合并的JSON文件
    target_json_path: 目标JSON文件路径，用于存储合并后的结果
    '''
    # 初始化合并后的结果列表
    merged_data = []
    
    # 遍历源文件夹中的所有文件
    for filename in os.listdir(source_dir):
        # 只处理JSON文件
        if filename.endswith('.json'):
            file_path = os.path.join(source_dir, filename)
            try:
                # 读取JSON文件内容
                with open(file_path, 'r', encoding='utf-8') as f:
                    content = json.load(f)
            
                # 如果文件内容是字典，直接添加到列表中
                merged_data.append(content)

                print(f"已成功读取并合并 {filename}")
                
            except Exception as e:
                print(f"处理文件 {filename} 时出错：{str(e)}")
    
    # 将合并后的数据写入目标JSON文件
    with open(target_json_path, 'w', encoding='utf-8') as f:
        json.dump(merged_data, f, ensure_ascii=False, indent=2)
    

In [None]:
# 使用示例
source_directory = '/Users/bytedance/Project/OmniDocBench/OursDataset/jsons'  # 替换为你的源文件夹路径
target_file = '/Users/bytedance/Project/OmniDocBench/OursDataset/OmniDocBench.json'  # 替换为你的目标文件路径

# 调用函数进行合并
merge_json_files(source_directory, target_file)

In [None]:
import os

# 读取 passed.txt 中的内容到集合
passed_files = set()
with open('/Users/bytedance/Project/OmniDocBench/OursDataset/passed.txt', 'r', encoding='utf-8') as f:
    for line in f:
        # 去除每行的换行符等空白字符
        passed_files.add(line.strip())

# 目标文件夹路径，根据实际情况修改
folder_path = '/Users/bytedance/Project/OmniDocBench/OursDataset/jsons'  

# 遍历文件夹中的文件
for file_name in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file_name)
    
    # 确保是文件且文件名不在 passed.txt 中
    if os.path.isfile(file_path) and file_name[:-5] not in passed_files:
        os.remove(file_path)
        print(f"已删除文件：{file_name}")


In [None]:
import json
with open("/Users/bytedance/Project/OmniDocBench/OursDataset/OmniDocBench.json", "r") as f:
    data = json.load(f)

for item in data:
    i = item["page_info"]["image_path"]

    parts = i.split("_")
    # 给数字部分补零到三位
    new_name = f"{int(parts[0]):03d}_{int(parts[1]):02d}.jpg"   
    # 拼接新的完整路径
    # 更新回 item
    item["page_info"]["image_path"] = new_name
    print(item["page_info"]["image_path"])

with open("/Users/bytedance/Project/OmniDocBench/OursDataset/OmniDocBench.json", "w") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

In [None]:
import re

s = r'其他地方 \[不动] [\[15, Theorem 1\]](#page-15-0) 还有 [\[4\]](#page-9-1)'

def repl(m):
    inner = m.group(1)
    # 仅对捕获到的 inner 去掉 \[ \]
    inner = inner.replace(r'\[', '[').replace(r'\]', ']')
    return inner  # 不保留外层括号

res = re.sub(r'\[(.*?)\]\(#page-\d+-\d+\)', repl, s)
print(res)
# 输出: 其他地方 \[不动] [15, Theorem 1] 还有 [4]



In [6]:

import json
with open("/Users/bytedance/Project/OmniDocBench/OursDataset/OmniDocBench.json",'r') as f:
    data = json.load(f)


for item in data:
    for ele in item['layout_dets']:
        print(ele)
        if ele['category_type'] == 'equation_isolated':
            ele['latex'] = ele['text']
            del ele['text']

with open("/Users/bytedance/Project/OmniDocBench/OursDataset/OmniDocBench.json",'w') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)


{'category_type': 'header', 'poly': [722.5, 192.50000000000003, 1802.5, 192.50000000000003, 1802.5, 240.00000000000003, 722.5, 240.00000000000003], 'ignore': False, 'order': 1, 'anno_id': 1, 'text': 'NON-UNIVALENT FUNCTIONS AND A PARABOLIC REGION', 'line_with_spans': [{'category_type': 'text_span', 'poly': [722.5, 192.50000000000003, 1802.5, 192.50000000000003, 1802.5, 240.00000000000003, 722.5, 240.00000000000003], 'text': ''}], 'attribute': {'text_language': '', 'text_background': '', 'text_rotate': ''}}
{'category_type': 'header', 'poly': [2190, 192.5, 2230, 192.5, 2230, 235, 2190, 235], 'ignore': False, 'order': 2, 'anno_id': 1, 'text': '11', 'line_with_spans': [{'category_type': 'text_span', 'poly': [2190, 192.5, 2230, 192.5, 2230, 235, 2190, 235], 'text': ''}], 'attribute': {'text_language': '', 'text_background': '', 'text_rotate': ''}}
{'category_type': 'formula_caption', 'poly': [2132.5, 397.5, 2230, 397.5, 2230, 452.5, 2132.5, 452.5], 'ignore': False, 'order': 5, 'anno_id': 1