In [None]:
import os
import time
import requests
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

API_BASE = "http://wisdoc-rs-dev.atominnolab.com/api/v2"

def upload_file(file_path, timeout=60):
    """Upload a file to the parsing service and return its job id.

    Args:
        file_path: Path of the local file to upload.
        timeout: Value passed to ``requests.post`` as ``timeout``. Without
            one, a stalled connection would block the calling thread forever.

    Returns:
        The ``job_id`` (or ``JOB_ID``) value from the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        RuntimeError: If the response contains no job id.
    """
    print(f"提交文件: {file_path}")
    url = f"{API_BASE}/documents"
    with open(file_path, "rb") as f:
        resp = requests.post(url, files={"file": f}, timeout=timeout)
    resp.raise_for_status()
    res = resp.json()
    job_id = res.get("job_id") or res.get("JOB_ID")
    if not job_id:
        # Fail here instead of letting the caller poll ".../documents/None".
        raise RuntimeError(f"Upload of {file_path} returned no job id: {res}")
    return job_id

def check_status(job_id, interval=1, max_wait=None, timeout=30):
    """Poll a job until it finishes and return its result file path.

    Args:
        job_id: Identifier of the job to poll.
        interval: Seconds to sleep between two polls.
        max_wait: Optional upper bound, in seconds, on the total polling
            time. ``None`` (the default) polls without limit.
        timeout: Value passed to ``requests.get`` as ``timeout`` so a single
            stalled request cannot hang the loop.

    Returns:
        The ``result_file_path`` value of the response whose status is
        ``"completed"``.

    Raises:
        requests.HTTPError: If a poll returns an error status.
        RuntimeError: If the job reports status ``"failed"``.
        TimeoutError: If ``max_wait`` elapses before the job finishes.
    """
    url = f"{API_BASE}/documents/{job_id}"
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = None if max_wait is None else time.monotonic() + max_wait
    while True:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        res = resp.json()
        status = res.get("status")
        if status == "completed":
            return res.get("result_file_path")
        if status == "failed":
            raise RuntimeError(f"Job {job_id} failed")
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Job {job_id} not finished after {max_wait}s (last status: {status})"
            )
        time.sleep(interval)

def fetch_result(result_file_path, timeout=60):
    """Download the parse result stored at ``result_file_path``.

    Args:
        result_file_path: Path of the result file, appended to
            ``{API_BASE}/files/``.
        timeout: Value passed to ``requests.get`` as ``timeout`` so a stalled
            connection cannot block the calling thread forever.

    Returns:
        The response body as text.

    Raises:
        ValueError: If ``result_file_path`` is empty or ``None``.
        requests.HTTPError: If the server answers with an error status.
    """
    if not result_file_path:
        # Avoid requesting ".../files/None" when a job reports no result path.
        raise ValueError("result_file_path is empty; nothing to fetch")
    url = f"{API_BASE}/files/{result_file_path}"
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    return resp.text

def process_file(file_path, output_folder):
    """Process one file: upload, wait for completion, fetch, save as JSON.

    Args:
        file_path: Path of the local file to upload.
        output_folder: Directory that receives ``<input stem>.json``; it is
            created if it does not exist.

    Returns:
        Path of the JSON file that was written.

    Raises:
        ValueError: If the fetched result is not valid JSON
            (``json.JSONDecodeError``). No output file is created then.
        Exception: Whatever ``upload_file``, ``check_status`` or
            ``fetch_result`` raise.
    """
    job_id = upload_file(file_path)
    result_file_path = check_status(job_id)
    result = fetch_result(result_file_path)

    # Parse before opening the output file, so an invalid response does not
    # leave an empty .json file behind.
    json_data = json.loads(result)

    base_name = os.path.splitext(os.path.basename(file_path))[0]
    os.makedirs(output_folder, exist_ok=True)
    output_path = os.path.join(output_folder, f"{base_name}.json")
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(json_data, f, ensure_ascii=False, indent=2)

    print(f"完成: {file_path} → {output_path}")
    return output_path

def _list_numbered_pdfs(folder_path, start_index):
    """Return paths of ``<number>.pdf`` files with number >= start_index, sorted by number."""
    selected = []
    for name in os.listdir(folder_path):
        stem, ext = os.path.splitext(name)
        if ext.lower() != ".pdf":
            continue
        try:
            number = int(stem)
        except ValueError:
            # Entries without a numeric name (e.g. ".DS_Store") are skipped
            # instead of aborting the whole batch.
            continue
        if start_index is None or number >= start_index:
            selected.append((number, os.path.join(folder_path, name)))
    return [path for _, path in sorted(selected)]


def get_results(folder_path, output_folder, max_workers=5, start_index=57):
    """Process the numbered PDF files of a folder concurrently.

    Args:
        folder_path: Directory containing files named ``<number>.pdf``.
        output_folder: Directory that receives the results; it is created if
            it does not exist.
        max_workers: Size of the thread pool.
        start_index: Only files whose number is >= this value are processed.
            Pass ``None`` to process every numbered PDF.

    Returns:
        Output paths of the files that were processed successfully, in
        completion order. Failures are printed and left out of the list.
    """
    os.makedirs(output_folder, exist_ok=True)
    files = _list_numbered_pdfs(folder_path, start_index)

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_file, f, output_folder): f for f in files}
        # tqdm renders a progress bar as the futures complete.
        for future in tqdm(as_completed(futures), total=len(futures), desc="处理进度"):
            try:
                results.append(future.result())
            except Exception as e:
                print(f"文件 {futures[future]} 处理失败: {e}")
    return results


In [None]:
if __name__ == "__main__":
    # Folder that is scanned for input files, and folder that receives one
    # JSON result per processed file.
    input_folder = "/Users/bytedance/Project/OmniDocBench/OursDataset/pdfs"
    output_folder = "/Users/bytedance/Project/OmniDocBench/Models_ouput/wisdoc_output"

    # Process the folder with a pool of 5 worker threads. get_results prints
    # per-file failures and returns only the paths that succeeded, so the
    # final message below is printed even if some files failed.
    results = get_results(input_folder, output_folder, max_workers=5)
    print("所有文件处理完成，结果存储在：", output_folder)


In [None]:
# Path of a wisdoc output directory under the MinerU project.
# NOTE(review): `data_dir` is not referenced by any other visible cell;
# confirm it is still needed.
data_dir = "/Users/bytedance/Project/MinerU/wisdoc_output"





In [1]:
import collections
import json
import os
# Default directory that receives the per-page Markdown files.
output_path = "/Users/bytedance/Project/OmniDocBench/Models_ouput/wisdoc_results_md/"
def process_json(json_path, output_dir=None):
    """Split one JSON result into per-page Markdown files.

    The file name (up to its first dot) must be an integer document number.
    The JSON must hold a ``tree`` list whose items carry ``type``,
    ``page_no`` and ``text``. ``IMAGE`` items are skipped (before any other
    key is read), ``FORMULA`` text is wrapped in ``$$`` fences, and the
    texts of each page are joined with blank lines and written to
    ``{doc_num:03d}_{page_no + 1:02d}.md``.

    Args:
        json_path: Path of the JSON file to convert.
        output_dir: Directory for the Markdown files. Defaults to the
            module-level ``output_path``. Created if missing.

    Raises:
        ValueError: If the file name does not start with an integer.
        KeyError: If ``tree`` or a required item key is missing.
    """
    if output_dir is None:
        output_dir = output_path
    os.makedirs(output_dir, exist_ok=True)

    # The JSON files are written as UTF-8, so read them back the same way
    # instead of relying on the platform default encoding.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # basename() instead of split('/') so Windows-style paths work too.
    doc_num = int(os.path.basename(json_path).split('.')[0])

    # page_no -> list of text blocks, in document order.
    paged_text = {}
    for item in data['tree']:
        if item['type'] == 'IMAGE':
            continue
        text = item['text']
        if item['type'] == 'FORMULA':
            text = f"$$\n{text}\n$$"
        paged_text.setdefault(item['page_no'], []).append(text)

    for page_no, texts in paged_text.items():
        page_file = os.path.join(output_dir, f"{doc_num:03d}_{page_no+1:02d}.md")
        with open(page_file, "w", encoding="utf-8") as f:
            f.write('\n\n'.join(texts))


In [2]:
# d = process_json('/Users/bytedance/Project/OmniDocBench/Models_ouput/wisdoc_output/pdf_json/57.json')

# Convert every JSON result in `path` to per-page Markdown files.
path = '/Users/bytedance/Project/OmniDocBench/Models_ouput/wisdoc_output/pdf_json'
for f in sorted(os.listdir(path)):
    # Skip non-JSON entries (e.g. macOS ".DS_Store"): process_json parses the
    # file name as an integer and would raise ValueError on them.
    if not f.endswith('.json'):
        continue
    process_json(os.path.join(path, f))



In [None]:
# NOTE(review): no live code in the visible cells assigns `d` (the only
# assignment is commented out), so this raises NameError on a fresh kernel.
# Confirm whether this debug cell should be removed.
print(d[0])

In [None]:
# Load demo2.json into `json_data`.
# NOTE(review): `json_data` is not used by any later visible cell; confirm
# this cell is still needed.
with open("/Users/bytedance/Project/MinerU/wisdoc_output/demo2.json", "r", encoding="utf-8") as f:
    json_data = json.load(f)

In [3]:
from pylatexenc.latex2text import LatexNodes2Text
from bs4 import BeautifulSoup

def latex_table_to_html(latex_table):
    r"""Convert a simple LaTeX ``tabular`` into a pretty-printed HTML table.

    ``LatexNodes2Text`` only renders LaTeX as plain text, so feeding its
    output to BeautifulSoup never yields table markup. This function splits
    the tabular body into rows (``\\``) and cells (unescaped ``&``), converts
    each cell's LaTeX to text, HTML-escapes it, and wraps the cells in
    ``<table>/<tr>/<td>`` before prettifying.

    Rule commands (``\hline``, ``\cline{..}``, booktabs rules) are dropped.
    Spanning cells (``\multicolumn``, ``\multirow``) are not interpreted.
    If no ``tabular``/``tabular*``/``array`` environment is found, the whole
    input is treated as the table body.

    Args:
        latex_table: LaTeX source containing the table.

    Returns:
        The HTML markup as returned by ``BeautifulSoup.prettify()``.
    """
    import html
    import re

    def skip_ws(i):
        while i < len(latex_table) and latex_table[i].isspace():
            i += 1
        return i

    def skip_bracket(i):
        # Optional [pos] argument.
        j = skip_ws(i)
        if j < len(latex_table) and latex_table[j] == '[':
            close = latex_table.find(']', j)
            if close != -1:
                return close + 1
        return i

    def skip_braces(i):
        # Mandatory {..} argument; may nest, e.g. {|p{3cm}|c|}.
        j = skip_ws(i)
        if j >= len(latex_table) or latex_table[j] != '{':
            return i
        depth = 0
        for k in range(j, len(latex_table)):
            if latex_table[k] == '{':
                depth += 1
            elif latex_table[k] == '}':
                depth -= 1
                if depth == 0:
                    return k + 1
        return i  # unbalanced braces: leave the text untouched

    # Isolate the rows between \begin{env}<args> and \end{env}.
    body = latex_table
    begin = re.search(r'\\begin\{(tabular\*?|array)\}', latex_table)
    if begin is not None:
        env = begin.group(1)
        pos = begin.end()
        if env.endswith('*'):
            pos = skip_braces(pos)  # tabular* takes a width before the spec
        pos = skip_bracket(pos)
        pos = skip_braces(pos)      # column specification
        end = latex_table.find('\\end{' + env + '}', pos)
        body = latex_table[pos:] if end == -1 else latex_table[pos:end]

    # Horizontal rules carry no cell content.
    body = re.sub(r'\\(?:hline|toprule|midrule|bottomrule)\b|\\cline\{[^}]*\}', '', body)

    text_converter = LatexNodes2Text()
    html_rows = []
    # Rows end with "\\", optionally followed by "*" and a "[length]".
    for row in re.split(r'\\\\\*?(?:\[[^\]]*\])?', body):
        if not row.strip():
            continue
        # "\&" is a literal ampersand, not a column separator.
        cells = re.split(r'(?<!\\)&', row)
        html_cells = ''.join(
            '<td>' + html.escape(text_converter.latex_to_text(cell).strip()) + '</td>'
            for cell in cells
        )
        html_rows.append('<tr>' + html_cells + '</tr>')

    markup = '<table>' + ''.join(html_rows) + '</table>'

    # Pretty-print the markup.
    soup = BeautifulSoup(markup, 'html.parser')
    return soup.prettify()

# Example LaTeX table
latex_table = r"""
\begin{tabular}{|c|c|c|}
    \hline
    姓名 & 年龄 & 城市 \\
    \hline
    Alice & 25 & 北京 \\
    \hline
    Bob & 30 & 上海 \\
    \hline
\end{tabular}
"""

# Convert the example and print the result
html_result = latex_table_to_html(latex_table)
print(html_result)
    

ModuleNotFoundError: No module named 'pylatexenc'