# In [None]:  (notebook cell marker, kept as a comment so the file parses as Python)
import os
import fitz  # PyMuPDF
import json
import pandas as pd
from tqdm import tqdm
import openai  # 如果你用本地Qwen，这里替换成你自己的调用方式

# ========== Configuration ==========
# Directory whose *.pdf files are read as input.
PDF_FOLDER = r"E:\finetuning\knowlegde graph\pdf"
# Output paths for the generated question-passage (Q-P) pairs; relative
# paths, so they resolve against the current working directory.
OUTPUT_QP_JSON = "qp_pairs.json"
OUTPUT_QP_CSV = "qp_pairs.csv"
MODEL_NAME = "gpt-3.5-turbo"  # replace with your own model name when using a local model

# ========== Helper functions ==========

def extract_paragraphs_from_pdf(pdf_path, min_len=100):
    """Return the paragraph-like text blocks of a PDF.

    Every page is split into PyMuPDF "blocks"; a block is kept when its
    stripped text is longer than ``min_len`` characters and does not start
    with "figure" (case-insensitive), which drops most figure captions.

    Args:
        pdf_path: Path of the PDF file to read.
        min_len: A block is kept only if its stripped text is strictly
            longer than this many characters.

    Returns:
        List of stripped block texts, in page order.

    Raises:
        Whatever ``fitz.open`` / ``page.get_text`` raise for a missing or
        unreadable file; nothing is caught here.
    """
    paragraphs = []
    # The context manager closes the document even when a page fails to
    # parse; previously the handle was never released, which leaks one file
    # descriptor per PDF and keeps the file locked on Windows.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for block in page.get_text("blocks"):
                # Index 4 of a block tuple holds the block's text.
                text = block[4].strip()
                if len(text) > min_len and not text.lower().startswith("figure"):
                    paragraphs.append(text)
    return paragraphs

def _parse_question_list(raw):
    """Turn a raw model reply into a clean list of question strings.

    The reply is parsed as JSON directly; if that fails (for example the
    model wrapped the array in a Markdown code fence or added a preamble),
    the span from the first "[" to the last "]" is parsed instead.

    Raises:
        json.JSONDecodeError: No parseable JSON could be found.
        ValueError: The parsed JSON is not an array.
    """
    text = raw.strip()
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find("["), text.rfind("]")
        if start == -1 or end < start:
            raise
        parsed = json.loads(text[start:end + 1])
    # A JSON string or object would otherwise be iterated by the caller
    # character-by-character / key-by-key and stored as bogus questions.
    if not isinstance(parsed, list):
        raise ValueError(f"expected a JSON array, got {type(parsed).__name__}")
    return [q.strip() for q in parsed if isinstance(q, str) and q.strip()]


def generate_questions(passage, model="gpt-3.5-turbo", temperature=0.7):
    """Ask a chat model for 1-2 questions that ``passage`` directly answers.

    Args:
        passage: Source paragraph the questions must be answerable from.
        model: Model name passed to the chat-completion call.
        temperature: Sampling temperature passed to the chat-completion call.

    Returns:
        List of non-empty question strings. An empty list is returned when
        the passage is empty/blank or when the request or the parsing of the
        reply fails; failures are printed, never raised, so one bad passage
        does not abort a batch run.

    NOTE(review): ``openai.ChatCompletion`` is the legacy (pre-1.0) SDK entry
    point; confirm it matches the installed ``openai`` version.
    """
    # Nothing to ask about; skip the (billable) request entirely.
    if not passage or not passage.strip():
        return []

    prompt = f"""你是一个用于训练问答系统的数据标注助手。请基于以下段落提出1-2个有代表性的问题，这些问题应当是该段内容可以直接回答的。每个问题用列表列出：

段落如下：
\"\"\"{passage}\"\"\"

输出格式要求如下（JSON数组）：
["问题1", "问题2"]
    """
    try:
        response = openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature
        )
        return _parse_question_list(response['choices'][0]['message']['content'])
    except Exception as e:
        # Deliberate best-effort boundary: log and carry on with no questions.
        print(f"[!] 问题生成失败: {e}")
        return []

# ========== Main pipeline ==========

def generate_qp_pairs():
    """Build question-passage pairs for every PDF in ``PDF_FOLDER`` and save them.

    For each ``*.pdf`` file (matched case-insensitively, in sorted name
    order) the paragraphs are extracted, questions are generated per
    paragraph with ``MODEL_NAME``, and one record
    ``{"question", "passage", "source_file"}`` is collected per question.
    The records are written to ``OUTPUT_QP_JSON`` (UTF-8, indented) and
    ``OUTPUT_QP_CSV`` (UTF-8 with BOM).

    A PDF that cannot be read is reported and skipped so that one bad file
    does not discard the pairs already generated from the others.

    Returns:
        None. Results are written to disk and a summary line is printed.
    """
    all_qp_pairs = []

    # os.listdir order is arbitrary; sort so the output order is reproducible.
    pdf_files = sorted(f for f in os.listdir(PDF_FOLDER) if f.lower().endswith('.pdf'))
    for fname in tqdm(pdf_files, desc="Processing PDFs"):
        pdf_path = os.path.join(PDF_FOLDER, fname)
        try:
            paras = extract_paragraphs_from_pdf(pdf_path)
        except Exception as e:
            # Best-effort, same policy as question generation: report and move on.
            print(f"[!] PDF解析失败，已跳过 {fname}: {e}")
            continue

        for para in paras:
            # Pass the configured model explicitly; otherwise MODEL_NAME is
            # ignored and the function's own default is always used.
            for q in generate_questions(para, model=MODEL_NAME):
                all_qp_pairs.append({
                    "question": q,
                    "passage": para,
                    "source_file": fname
                })

    # Save as JSON
    with open(OUTPUT_QP_JSON, "w", encoding="utf-8") as f:
        json.dump(all_qp_pairs, f, ensure_ascii=False, indent=2)

    # Save as CSV; explicit columns keep the header row even with zero pairs.
    pd.DataFrame(
        all_qp_pairs, columns=["question", "passage", "source_file"]
    ).to_csv(OUTPUT_QP_CSV, index=False, encoding="utf-8-sig")
    print(f"\n✅ 成功保存 {len(all_qp_pairs)} 个 Q-P 对到 {OUTPUT_QP_JSON} 和 {OUTPUT_QP_CSV}")

# ========== Entry point ==========
# Run the full pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    generate_qp_pairs()
