In [14]:
import getpass
import json
import os

from langchain_community.chat_models import ChatTongyi
from langchain_core.documents import Document
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_neo4j import Neo4jGraph

# --- Credentials and connection settings ---
# Secrets must not be stored in the notebook. The DashScope key is taken from
# the environment; if it is absent it is requested interactively (no echo).
# NOTE(review): a real key was previously committed in this cell — treat it as
# leaked and rotate it.
if not os.environ.get("DASHSCOPE_API_KEY"):
    _dashscope_key = getpass.getpass("DASHSCOPE_API_KEY: ").strip()
    if not _dashscope_key:
        raise RuntimeError("DASHSCOPE_API_KEY is required to call the Tongyi model")
    os.environ["DASHSCOPE_API_KEY"] = _dashscope_key
    del _dashscope_key

# Neo4j settings: values already exported in the environment take precedence;
# the literals below are only fallbacks for a local development instance.
os.environ.setdefault("NEO4J_URI", "neo4j://localhost:7687")
os.environ.setdefault("NEO4J_USERNAME", "neo4j")
os.environ.setdefault("NEO4J_PASSWORD", "password")

In [15]:
# Open the Neo4j connection used to store the extracted graph. No arguments are
# passed, so the connection details are presumably resolved from the
# NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD environment variables configured
# earlier in this notebook — confirm against the langchain_neo4j documentation.
graph = Neo4jGraph()

In [16]:
# Chat model used for graph extraction: Tongyi "qwen-max" with temperature 0.
# No max_tokens limit is passed; add e.g. max_tokens=2048 to the call if the
# response length needs to be capped.
llm = ChatTongyi(model="qwen-max", temperature=0)

In [17]:
def load_json_records(path, encoding="utf-8"):
    """Return every top-level JSON value stored back-to-back in ``path``.

    The file may contain one compact object per line (JSON Lines) or
    pretty-printed objects spread over several lines; consecutive values only
    need to be separated by whitespace. A missing newline after the final
    value is accepted.

    Args:
        path: Location of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform locale.

    Returns:
        A list of the decoded values in file order (empty for a blank file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If any non-whitespace content is not valid JSON.
    """
    with open(path, "r", encoding=encoding) as handle:
        text = handle.read()

    decoder = json.JSONDecoder()
    records = []
    pos, end = 0, len(text)
    while True:
        # raw_decode() does not skip leading whitespace, so do it here.
        while pos < end and text[pos].isspace():
            pos += 1
        if pos == end:
            break
        value, pos = decoder.raw_decode(text, pos)
        records.append(value)
    return records


FILE_PATH = "disease.jsonl"
disease = load_json_records(FILE_PATH)

In [18]:
# Node labels the extractor may emit: skin disease, syndrome type, symptom,
# formula.
GRAPH_NODE_TYPES = ["皮肤病", "证型", "症状", "方剂"]

# Relationship types the extractor may emit; the intended direction is given
# next to each entry.
# NOTE(review): "治法为" and "方剂包含" target 治法 (treatment method) and 中药 (herb)
# nodes, neither of which is listed in GRAPH_NODE_TYPES. With strict_mode=True
# such edges are presumably filtered out — confirm whether those two node
# types should be added.
GRAPH_RELATIONSHIP_TYPES = [
    "辨证为",      # skin disease -> syndrome type
    "主症包括",    # syndrome type -> symptom
    "治法为",      # syndrome type -> treatment method (extensible)
    "方剂包含",    # formula -> herb (if herb nodes are included)
    "用于治疗",    # formula -> skin disease
]

# Optional properties the extractor may attach to nodes and relationships.
GRAPH_NODE_PROPERTIES = ["别名", "病因", "病机"]
GRAPH_RELATIONSHIP_PROPERTIES = ["依据", "强度"]

# Extra prompt text passed to the transformer. It is sent verbatim, so the
# wording and whitespace are kept exactly as authored.
GRAPH_EXTRACTION_INSTRUCTIONS = """
    json格式中，name属性对应皮肤病和别名，注意两个是同一个皮肤病，请不要放在不同节点里面，症状在key_point中，用于治疗和方剂在solution里面
    皮肤病带括号的括号内是别名,各个节点的关系如下：皮肤病 → 证型，证型 → 症状，证型 → 治法（可扩展），方剂 → 中药（若包含中药节点），方剂 → 皮肤病
    """

llm_transformer = LLMGraphTransformer(
    llm=llm,
    allowed_nodes=GRAPH_NODE_TYPES,
    allowed_relationships=GRAPH_RELATIONSHIP_TYPES,
    strict_mode=True,
    node_properties=GRAPH_NODE_PROPERTIES,
    relationship_properties=GRAPH_RELATIONSHIP_PROPERTIES,
    additional_instructions=GRAPH_EXTRACTION_INSTRUCTIONS,
)

In [19]:
# Serialise every parsed disease record back to JSON text (ensure_ascii=False
# keeps the Chinese characters unescaped) and wrap it in a Document so it can
# be fed to the splitter and the graph transformer.
documents = [
    Document(page_content=json.dumps(record, ensure_ascii=False))
    for record in disease
]
print(len(documents))

241


In [20]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
import tiktoken

# === Configuration ===
BATCH_SIZE = 10  # number of chunks passed to the graph transformer per call
TOKENIZER_MODEL = "gpt-4"  # only used to estimate token counts; the author's note says Qwen's counts are close to tiktoken's — TODO confirm
MAX_CHUNK_TOKENS = 20000    # safety cap per chunk (author's note: Qwen's maximum context is ~32k, so this leaves ample headroom)

# Initialise the tokenizer used for the token-count estimate
tokenizer = tiktoken.encoding_for_model(TOKENIZER_MODEL)

def tiktoken_len(text, encoder=None):
    """Return the number of tokens ``text`` encodes to.

    Used as the ``length_function`` of the text splitter, so it must accept
    arbitrary document text. Special-token checking is disabled
    (``disallowed_special=()``) because tiktoken otherwise raises
    ``ValueError`` when the text happens to contain a special-token literal
    such as ``<|endoftext|>``; here such text is simply counted as ordinary
    characters.

    Args:
        text: The string to measure.
        encoder: Optional object exposing ``encode(text, disallowed_special=...)``.
            Defaults to the module-level ``tokenizer``.

    Returns:
        The token count as an ``int`` (0 for an empty string).
    """
    active = tokenizer if encoder is None else encoder
    return len(active.encode(text, disallowed_special=()))

# Build the text splitter. Chunk sizes are measured in tokens (via
# tiktoken_len) rather than in characters.
_SPLIT_SEPARATORS = ["\n\n", "\n", "。", "；", "？", "！", " ", ""]
text_splitter = RecursiveCharacterTextSplitter(
    separators=_SPLIT_SEPARATORS,
    length_function=tiktoken_len,
    chunk_size=MAX_CHUNK_TOKENS,
    chunk_overlap=200,
)

# === Checkpoint handling ===
CHECKPOINT_PATH = "bp.txt"


def _load_checkpoint(path, total):
    """Return the index of the first document that still needs processing.

    A missing, empty, or non-numeric checkpoint file means "start from 0".
    The stored value is clamped to ``[0, total]`` so a stale or corrupted
    file can never yield a negative or out-of-range start index.
    """
    try:
        with open(path, "r") as f:
            saved = int(f.read().strip())
    except (FileNotFoundError, ValueError):
        return 0
    return max(0, min(saved, total))


def _save_checkpoint(path, next_index):
    """Persist the index of the next document to process."""
    with open(path, "w") as f:
        f.write(str(next_index))


begin = _load_checkpoint(CHECKPOINT_PATH, len(documents))

# === Main loop: resume by original document index ===
failed_docs = []  # indices of documents with at least one failed batch
for doc_idx in range(begin, len(documents)):
    print(f"Processing document {doc_idx + 1}/{len(documents)}...")
    original_doc = documents[doc_idx]

    # Split over-long documents into token-bounded chunks.
    try:
        split_docs = text_splitter.split_documents([original_doc])
    except Exception as e:
        print(f"  ⚠️ Failed to split document {doc_idx}: {e}")
        split_docs = [original_doc]  # fall back to the unsplit document (may fail later)

    # Process the chunks in batches; a failed batch does not abort the run,
    # but it marks the whole document as failed.
    doc_ok = True
    for i in range(0, len(split_docs), BATCH_SIZE):
        batch = split_docs[i:i + BATCH_SIZE]
        try:
            graph_docs = llm_transformer.convert_to_graph_documents(batch)
            graph.add_graph_documents(graph_docs, baseEntityLabel=True)
        except Exception as e:
            # Include the exception type: str(KeyError) alone is just the key.
            print(f"  ❌ Batch failed in doc {doc_idx}, chunk {i//BATCH_SIZE}: {type(e).__name__}: {e}")
            doc_ok = False
            continue

    if not doc_ok:
        failed_docs.append(doc_idx)

    # The checkpoint means "every document before this index is done", so it
    # only advances while no document has failed. After the first failure it
    # stays put and the next run resumes from that document (documents after
    # it are processed again on that run).
    if not failed_docs:
        _save_checkpoint(CHECKPOINT_PATH, doc_idx + 1)

if failed_docs:
    print(f"{len(failed_docs)} document(s) had failed batches; the next run resumes at index {failed_docs[0]}: {failed_docs}")

Processing document 240/241...
  ❌ Batch failed in doc 239, chunk 0: 'request'
Processing document 241/241...
  ❌ Batch failed in doc 240, chunk 0: 'request'
