In [6]:
# Install the OpenAI SDK into the running kernel's environment. %pip (unlike
# !pip3) targets the interpreter backing this kernel, and the pin matches the
# version this notebook was last run against, so a fresh kernel reproduces it.
%pip install -q openai==1.84.0

Defaulting to user installation because normal site-packages is not writeable
Collecting openai
  Downloading openai-1.84.0-py3-none-any.whl (725 kB)
[K     |████████████████████████████████| 725 kB 682 kB/s eta 0:00:01
[?25hCollecting httpx<1,>=0.23.0
  Downloading httpx-0.28.1-py3-none-any.whl (73 kB)
[K     |████████████████████████████████| 73 kB 139 kB/s eta 0:00:01
[?25hCollecting pydantic<3,>=1.9.0
  Downloading pydantic-2.11.5-py3-none-any.whl (444 kB)
[K     |████████████████████████████████| 444 kB 155 kB/s eta 0:00:01
Collecting distro<2,>=1.7.0
  Downloading distro-1.9.0-py3-none-any.whl (20 kB)
Collecting tqdm>4
  Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 503 kB/s eta 0:00:01
[?25hCollecting anyio<5,>=3.5.0
  Downloading anyio-4.9.0-py3-none-any.whl (100 kB)
[K     |████████████████████████████████| 100 kB 540 kB/s ta 0:00:01
[?25hCollecting sniffio
  Downloading sniffio-1.3.1-py3-none-any.whl (10 kB)
Collecti

In [28]:
import json
import unicodedata
import time
from tqdm import tqdm
from openai import OpenAI
import os

# The API key is read from the environment rather than embedded in the
# notebook, so sharing or committing the file cannot leak it.
# NOTE(review): a literal key was previously stored here; treat it as
# compromised and revoke it with the provider.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

# Shared chat-completions client used by generate_100_triplets().
client = OpenAI(
    api_key=_api_key,
    base_url="https://lonlie.plus7.plus/v1"
)

# Scene identifier -> Chinese display label for that scene.
scene_name_map = dict(
    software_PSI="软件PSI",
    software_MPC="软件MPC",
    software_PIR="软件PIR",
    hardware_PSI="硬件PSI",
    hardware_MPC="硬件MPC",
    hardware_PIR="硬件PIR",
    hardware_PIRMPC="硬件PIRMPC",
    Federated_learning="联邦学习",
)


def build_prompt(rule, num_samples=100, previous_first_sample=None):
    """Assemble the user prompt that requests a batch of question/PQL triples.

    The prompt is made of up to three sections, concatenated in this order:
    the fixed instructions (output format and generation requirements), an
    optional "Constraint" section, and the closing task statement that embeds
    the rule text.

    Args:
        rule: Rule text inserted verbatim after the "Rule:" heading.
        num_samples: Number of triples the prompt asks for; appears both in
            the instructions and in the closing task statement.
        previous_first_sample: Optional earlier sample. When truthy it is
            serialised as indented JSON (non-ASCII kept as-is) and shown to
            the model as an example to steer away from.

    Returns:
        The complete prompt as a single string.
    """
    instructions = f"""
You are a PQL generation assistant. Your task is to generate exactly {num_samples} high-quality question-PQL triples, formatted as a **raw JSON array** (no markdown or explanatory text), like this:

[
  {{
    "question": "...",
    "Chinese_question": "...",
    "PQL_query": "..."
  }},
  ...
]

## Generation Requirements:
1. Each triple must describe a meaningful secure computation task.
2. All table names, field names, and tenant/platform names used in the PQL must appear **explicitly** in both the English and Chinese questions.
3. English and Chinese questions must be **natural and fluent**, with **non-templated** phrasing. The Chinese question should NOT be a direct translation.
4. Ensure **diversity** in question intent and structure — avoid duplicate or near-duplicate questions.
5. If the scenario is a hardware environment (such as TEE), please specify it in the question.
6. **Only return the JSON array.** Do not include any explanations, headings, markdown (such as ```), or extra text.
"""

    # The constraint section is present only when an earlier sample is supplied.
    constraint = ""
    if previous_first_sample:
        constraint = f"""

## Constraint:
Avoid generating samples that are similar in content, structure, or intent to the following existing one:

{json.dumps(previous_first_sample, ensure_ascii=False, indent=2)}
"""

    task = f"""

Based on the rule below, generate {num_samples} diverse and correct samples.

Rule:
{rule}
"""
    return instructions + constraint + task


def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if present."""
    body = text.strip()
    if not body.startswith("```"):
        return body
    # Drop the opening fence line, which may carry a language tag (```json).
    _, _, body = body.partition("\n")
    body = body.rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


def generate_100_triplets(rule, scene, num_samples=100, previous_first_sample=None):
    """Request one batch of question/PQL triples for ``scene`` from the chat model.

    Despite the name, the batch size is whatever ``num_samples`` asks for.

    Args:
        rule: Rule text forwarded to ``build_prompt``.
        scene: Scene name; used in log messages and in the fallback file name.
        num_samples: Number of triples to request from the model.
        previous_first_sample: Optional earlier sample forwarded to
            ``build_prompt`` so the model is told to avoid similar output.

    Returns:
        The parsed JSON array (a list) on success. ``None`` when the request
        fails, when the reply is valid JSON but not an array, or when the
        reply cannot be decoded as JSON; in the last case the reply text is
        written to ``fallback/<scene>_raw.txt`` for manual inspection.
    """
    prompt_system = "You are a database expert proficient in SQL and PQL for privacy-preserving applications."
    user_prompt = build_prompt(rule, num_samples=num_samples, previous_first_sample=previous_first_sample)
    try:
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": prompt_system},
                {"role": "user", "content": user_prompt}
            ],
            temperature=1.0,
            max_tokens=8192
        )
        content = completion.choices[0].message.content
        # NFKC folds compatibility characters (e.g. full-width forms) into
        # their canonical equivalents before parsing.
        # NOTE(review): this also rewrites full-width punctuation inside the
        # Chinese questions — confirm that is intended.
        content = unicodedata.normalize('NFKC', content)

        try:
            # Models sometimes wrap the array in a ```json fence despite the
            # prompt forbidding it; unwrap it instead of discarding the batch.
            parsed = json.loads(_strip_code_fence(content))
            if isinstance(parsed, list):
                print(f"{scene} 场景成功生成 {len(parsed)} 条。")
                return parsed
            else:
                # Handled by the outer handler below: logged, then None.
                raise ValueError("返回内容不是 JSON 数组")
        except json.JSONDecodeError:
            print(f"[警告] {scene} 场景 JSON 解析失败，已保存原始文本供手动处理。")
            os.makedirs("fallback", exist_ok=True)
            # Keep the full (normalised) reply, fence included, for manual repair.
            with open(f"fallback/{scene}_raw.txt", "w", encoding="utf-8") as f:
                f.write(content)
            return None
    except Exception as e:
        # Best effort: any API or validation failure is reported and the
        # caller simply skips this batch.
        print(f"[错误] {scene} 场景生成失败: {e}")
        return None


def main():
    """Build a de-duplicated question/PQL dataset for every scene in ``rules.json``.

    For each scene, several batches are requested from
    ``generate_100_triplets``; the first sample of the latest successful batch
    is passed to the next request as an example to avoid. The collected
    samples are de-duplicated on their ``question`` text (first occurrence
    wins), capped per scene, and the whole result is written to
    ``pql_dataset.json`` as ``{scene: [sample, ...]}``.

    Samples that are not JSON objects, or that have no string ``question``
    field, are skipped rather than aborting the run.
    """
    rule_file = "rules.json"            # input rules file: {"<scene name>": "<rule text>", ...}
    output_file = "pql_dataset.json"    # final output file
    rounds_per_scene = 5                # generation requests issued per scene
    samples_per_round = 20              # samples requested in each round
    max_per_scene = 100                 # cap on samples kept per scene

    with open(rule_file, "r", encoding="utf-8") as f:
        rules_dict = json.load(f)

    final_data = {}

    for scene, rule in rules_dict.items():
        print(f"\n🎯 正在生成场景：{scene}")
        all_triplets = []
        prev_first_sample = None

        for i in range(rounds_per_scene):
            print(f"  ⏳ 第 {i + 1}/{rounds_per_scene} 次生成中...")
            triplets = generate_100_triplets(
                rule, scene, num_samples=samples_per_round, previous_first_sample=prev_first_sample
            )
            if triplets:
                all_triplets.extend(triplets)
                prev_first_sample = triplets[0]
            else:
                print(f"  ⚠️ 第 {i + 1} 次生成失败，跳过。")
            time.sleep(1)

        # De-duplicate, keyed on the English question; first occurrence wins.
        unique_triplets = {}
        skipped = 0
        for item in all_triplets:
            question = item.get("question") if isinstance(item, dict) else None
            if not isinstance(question, str):
                # Model output is untrusted: one malformed element must not
                # crash the run and lose every scene generated so far.
                skipped += 1
                continue
            unique_triplets.setdefault(question, item)
        if skipped:
            print(f"  ⚠️ 跳过 {skipped} 条缺少 question 字段的样本。")
        deduped = list(unique_triplets.values())[:max_per_scene]

        final_data[scene] = deduped
        print(f"✅ 场景 {scene} 共保留 {len(deduped)} 条去重后样本。")

    # Save the final dataset.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(final_data, f, ensure_ascii=False, indent=2)

    print(f"\n✅ 全部生成完成，已保存至 {output_file}")


# Run the full generation pipeline when this cell (or the exported script)
# is executed directly.
if __name__ == "__main__":
    main()



🎯 正在生成场景：software_PSI
  ⏳ 第 1/5 次生成中...
software_PSI 场景成功生成 11 条。
  ⏳ 第 2/5 次生成中...
software_PSI 场景成功生成 20 条。
  ⏳ 第 3/5 次生成中...
software_PSI 场景成功生成 20 条。
  ⏳ 第 4/5 次生成中...
software_PSI 场景成功生成 20 条。
  ⏳ 第 5/5 次生成中...
software_PSI 场景成功生成 20 条。
✅ 场景 software_PSI 共保留 91 条去重后样本。

🎯 正在生成场景：software_MPC
  ⏳ 第 1/5 次生成中...
software_MPC 场景成功生成 11 条。
  ⏳ 第 2/5 次生成中...
software_MPC 场景成功生成 20 条。
  ⏳ 第 3/5 次生成中...
software_MPC 场景成功生成 20 条。
  ⏳ 第 4/5 次生成中...
software_MPC 场景成功生成 20 条。
  ⏳ 第 5/5 次生成中...
software_MPC 场景成功生成 20 条。
✅ 场景 software_MPC 共保留 91 条去重后样本。

🎯 正在生成场景：software_PIR
  ⏳ 第 1/5 次生成中...
software_PIR 场景成功生成 20 条。
  ⏳ 第 2/5 次生成中...
software_PIR 场景成功生成 20 条。
  ⏳ 第 3/5 次生成中...
software_PIR 场景成功生成 20 条。
  ⏳ 第 4/5 次生成中...
software_PIR 场景成功生成 20 条。
  ⏳ 第 5/5 次生成中...
software_PIR 场景成功生成 20 条。
✅ 场景 software_PIR 共保留 100 条去重后样本。

🎯 正在生成场景：hardware_PSI
  ⏳ 第 1/5 次生成中...
[警告] hardware_PSI 场景 JSON 解析失败，已保存原始文本供手动处理。
  ⚠️ 第 1 次生成失败，跳过。
  ⏳ 第 2/5 次生成中...
hardware_PSI 场景成功生成 20 条。
  ⏳ 第 3/5 次生成中...
[警告]