In [10]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
!ls "/content/drive/MyDrive/finalproject"

checkpoint  pubmed_article.json


In [14]:
import json
with open("/content/drive/MyDrive/finalproject/pubmed_article.json") as f:
    articles = json.load(f)
examples = articles[:10]
print(examples)

[{'article_title': 'CORRIGENDUM.', 'article_abstract': '[This corrects the article DOI: 10.48101/ujms.v129.10741.].', 'pub_date': {'year': '2025', 'month': '04', 'day': '03'}}, {'article_title': 'Hepatolithiasis pathogenesis update.', 'article_abstract': "Hepatolithiasis is prevalent in East Asian countries and not common in Western countries. In recent years, because of the increased number of immigrants from East Asia in Western countries, hepatolithiasis has gradually become a global problem. Although current surgical interventions for hepatolithiasis boast a high rate of stone clearance, the persistent challenges of the disease's refractory nature and high recurrence rate continue to complicate its treatment. Therefore, understanding its underlying pathogenesis is meaningful for effective treatment. In this review, we discuss the common risk factors: infection, cholangitis, environmental factors and diet habits, abnormal bile components, anatomical abnormalities, and bile stasis, a

In [19]:
import json
import re
from tqdm import tqdm
from openai import OpenAI


# Prompt 模板
prompt_template = """
Given the following PubMed abstract, write one medical question that can be answered using the content of the abstract.
Then provide the answer using only the information from the abstract.

Abstract:
{abstract}

Question:
"""

# 强健的解析函数
def parse_output(output):
    match = re.search(r"Question[:：]?\s*(.+?)\s*Answer[:：]?\s*(.+)", output, re.DOTALL | re.IGNORECASE)
    if match:
        return match.group(1).strip(), match.group(2).strip()
    else:
        parts = re.split(r"\n\s*\n", output.strip())
        if len(parts) >= 2:
            return parts[0].strip(), parts[1].strip()
        elif len(parts) == 1:
            return parts[0].strip(), ""
        return None, None

# 调用 LLM
def generate_qa(abstract):
    prompt = prompt_template.format(abstract=abstract)
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        max_tokens=256
    )
    return response.choices[0].message.content

# 加载 PubMed 数据
with open("/content/drive/MyDrive/finalproject/pubmed_article.json") as f:
    examples = json.load(f)

qa_pairs = []

# 生成并解析问答对
for article in tqdm(examples[:10]):
    abstract = article['article_abstract']
    try:
        output = generate_qa(abstract)
        question, answer = parse_output(output)
        if question and answer:
            qa_pairs.append({
                "abstract": abstract,
                "question": question,
                "answer": answer
            })
        else:
            print("⚠️ 无法解析格式：", output)
    except Exception as e:
        print("❌ Error:", e)

# 保存为 JSON 文件
output_path = "/content/drive/MyDrive/finalproject/generated_qa_pairs.json"
with open(output_path, "w") as f:
    json.dump(qa_pairs, f, indent=2, ensure_ascii=False)

print(f"✅ Saved {len(qa_pairs)} QA pairs to {output_path}")


100%|██████████| 10/10 [00:09<00:00,  1.01it/s]

✅ Saved 10 QA pairs to /content/drive/MyDrive/finalproject/generated_qa_pairs.json



