In [136]:
import os
import re

import spacy
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_neo4j import Neo4jGraph
from langchain_ollama import ChatOllama, OllamaEmbeddings
from neo4j import GraphDatabase
from langchain_community.vectorstores import Neo4jVector
from dotenv import load_dotenv

# Load environment variables (the NEO4J_* settings read below via os.environ)
# from a .env file, if one is present.
load_dotenv()

True

In [137]:
# Load the spaCy English pipeline used for entity extraction
nlp = spacy.load("en_core_web_sm")

# Neo4j connection settings, read once from the environment
NEO4J_URI = os.environ["NEO4J_URI"]
NEO4J_AUTH = (os.environ["NEO4J_USERNAME"], os.environ["NEO4J_PASSWORD"])

# LangChain graph wrapper (schema introspection and Cypher queries)
graph = Neo4jGraph(url=NEO4J_URI, username=NEO4J_AUTH[0], password=NEO4J_AUTH[1])

# Raw Neo4j driver (index creation and embedding writes)
driver = GraphDatabase.driver(uri=NEO4J_URI, auth=NEO4J_AUTH)

In [138]:
# Discover the schema automatically from Neo4j
schema = graph.get_structured_schema
node_props = schema["node_props"]
node_labels = list(node_props)

# Map each label to its string-typed properties; labels with none are left out.
index_properties = {}
for label, props in node_props.items():
    string_props = [
        prop["property"] for prop in props if prop.get("type") in ("STRING", "string")
    ]
    if not string_props:
        continue
    index_properties[label] = string_props

print(f"Automatically detected nodes and properties: {index_properties}")

Automatically detected nodes and properties: {'Patient': ['name', 'gender', 'blood_type', 'admission_type', 'treated_by'], 'Disease': ['name'], 'Doctor': ['name'], 'Hospital': ['name'], 'InsuranceProvider': ['name'], 'Medication': ['name'], 'TestResults': ['test_outcome']}


In [139]:
def create_global_fulltext_index(tx):
    """Create the `entity_search_index` fulltext index if it does not exist.

    The index spans every label in the module-level ``node_labels`` and every
    string property listed in the module-level ``index_properties``.

    Args:
        tx: An object exposing ``run(query)``, e.g. the transaction handed in
            by ``session.execute_write``.

    Raises:
        ValueError: If there are no labels or no string properties to index
            (the resulting Cypher statement would be invalid).
    """
    def quote(identifier):
        # Backtick-quote so labels/properties with spaces or symbols stay valid Cypher.
        return "`" + str(identifier).replace("`", "``") + "`"

    # Several labels share property names (e.g. `name`); list each property once,
    # keeping first-seen order.
    unique_props = list(dict.fromkeys(
        prop for label_props in index_properties.values() for prop in label_props
    ))
    if not node_labels or not unique_props:
        raise ValueError("No labels or string properties available for the fulltext index.")

    labels = "|".join(quote(label) for label in node_labels)
    props = ", ".join(f"n.{quote(prop)}" for prop in unique_props)
    query = f'''
    CREATE FULLTEXT INDEX `entity_search_index`
    IF NOT EXISTS
    FOR (n:{labels})
    ON EACH [{props}];
    '''
    tx.run(query)

def create_vector_indices(tx, dimensions=1024, similarity_function="cosine"):
    """Create one vector index per label on the `embedding` property.

    For each label in the module-level ``node_labels`` an index named
    ``<label lower-cased>_vector`` is created if it does not exist yet.

    Args:
        tx: An object exposing ``run(query)``, e.g. the transaction handed in
            by ``session.execute_write``.
        dimensions: Vector size stored in the index configuration; it must
            match the length of the embeddings written to ``n.embedding``.
        similarity_function: ``"cosine"`` or ``"euclidean"``.

    Raises:
        ValueError: If ``similarity_function`` is not one of the supported values.
    """
    def quote(identifier):
        # Backtick-quote so labels with spaces or symbols stay valid Cypher.
        return "`" + str(identifier).replace("`", "``") + "`"

    similarity = str(similarity_function).lower()
    if similarity not in ("cosine", "euclidean"):
        raise ValueError(f"Unsupported similarity function: {similarity_function!r}")
    vector_size = int(dimensions)

    for label in node_labels:
        index_name = quote(f"{label.lower()}_vector")
        query = f'''
        CREATE VECTOR INDEX {index_name} IF NOT EXISTS
        FOR (n:{quote(label)}) ON (n.embedding)
        OPTIONS {{indexConfig: {{`vector.dimensions`: {vector_size}, `vector.similarity_function`: '{similarity}'}}}}
        '''
        tx.run(query)

# Run each index-creation step in its own write transaction, reporting after each.
index_steps = (
    (create_global_fulltext_index, "Global fulltext index created or already exists."),
    (create_vector_indices, "Vector indices created or already exist."),
)
with driver.session() as session:
    for create_index, done_message in index_steps:
        session.execute_write(create_index)
        print(done_message)


Global fulltext index created or already exists.
Vector indices created or already exist.


In [140]:
# Embedding model configuration
embeddings = OllamaEmbeddings(model="mxbai-embed-large")

# Embed a probe string to report the vector size the model produces.
probe_vector = embeddings.embed_query("test")
print(f"Embedding size from Ollama: {len(probe_vector)}")

# Create an embedding for every node that does not have one yet.
with driver.session() as session:
    for label in node_labels:
        text_properties = index_properties.get(label, [])
        if not text_properties:
            continue  # nothing textual to embed for this label

        # Read all pending nodes up front so the write queries below are not
        # issued on this session while its read result is still being iterated.
        pending_nodes = [
            record["n"]
            for record in session.run(
                f"MATCH (n:{label}) WHERE n.embedding IS NULL RETURN n"
            )
        ]

        for node in pending_nodes:
            # The embedded text is the node's string properties joined by spaces.
            text = " ".join(str(node.get(prop, "")) for prop in text_properties)
            embedding = embeddings.embed_query(text)
            # `node.element_id` is a string identifier, so it must be matched with
            # elementId(n); comparing it with the integer ID(n) never matches and
            # the embedding would silently not be stored.
            session.run(
                f"MATCH (n:{label}) WHERE elementId(n) = $id SET n.embedding = $embedding",
                {"id": node.element_id, "embedding": embedding},
            ).consume()
        print(f"Embeddings created for {label} nodes.")

# Wrap each manually created vector index in a Neo4jVector store.
vector_indices = {}
for label in node_labels:
    candidate_props = index_properties.get(label, [])
    if not candidate_props:
        continue
    try:
        # The first string property of the label is used as the document text.
        store = Neo4jVector.from_existing_index(
            embeddings,
            index_name=f"{label.lower()}_vector",
            node_label=label,
            embedding_node_property="embedding",
            text_node_property=candidate_props[0],
        )
    except Exception as e:
        # Best effort: report the failure and continue with the remaining labels.
        print(f"Failed to load vector index for {label}: {e}")
    else:
        vector_indices[label] = store
        print(f"Vector index loaded for {label}")

# One retriever per successfully loaded index, returning the 10 nearest nodes.
vector_retrievers = {}
for label, index in vector_indices.items():
    vector_retrievers[label] = index.as_retriever(search_kwargs={"k": 10})

Embedding size from Ollama: 1024
Embeddings created for Patient nodes.
Embeddings created for Disease nodes.
Embeddings created for Doctor nodes.
Embeddings created for Hospital nodes.
Embeddings created for InsuranceProvider nodes.
Embeddings created for Medication nodes.
Embeddings created for TestResults nodes.
Vector index loaded for Patient
Vector index loaded for Disease
Vector index loaded for Doctor
Vector index loaded for Hospital
Vector index loaded for InsuranceProvider
Vector index loaded for Medication
Vector index loaded for TestResults


In [141]:
# Entity extraction with spaCy
def extract_entities(question):
    """Parse ``question`` with the module-level spaCy pipeline and bucket the results.

    Returns a dict with five lists of strings:
      - "names":   text of PERSON / ORG entities
      - "dates":   text of DATE entities
      - "numbers": text of CARDINAL entities
      - "nouns":   text of tokens tagged NOUN
      - "verbs":   text of tokens tagged VERB
    """
    parsed = nlp(question)

    names, dates, numbers = [], [], []
    for entity in parsed.ents:
        kind = entity.label_
        if kind in ("PERSON", "ORG"):
            names.append(entity.text)
        elif kind == "DATE":
            dates.append(entity.text)
        elif kind == "CARDINAL":
            numbers.append(entity.text)

    nouns, verbs = [], []
    for token in parsed:
        if token.pos_ == "NOUN":
            nouns.append(token.text)
        elif token.pos_ == "VERB":
            verbs.append(token.text)

    return {
        "names": names,
        "dates": dates,
        "numbers": numbers,
        "nouns": nouns,
        "verbs": verbs,
    }

In [142]:
# General-purpose graph retriever

# Patients whose admission date, rendered as a string, contains the given text.
_DATE_QUERY = """
MATCH (p:Patient)
WHERE toString(p.date_of_admission) CONTAINS $query
OPTIONAL MATCH (p)-[:HAS_DISEASE]->(d:Disease)
OPTIONAL MATCH (p)-[:TREATED_BY]->(doc:Doctor)
RETURN 'Patient: ' + coalesce(p.name, 'Unknown') +
       ' - Admitted on: ' + toString(p.date_of_admission) +
       ' - Disease: ' + coalesce(d.name, 'Unknown') +
       ' - Treated by: ' + coalesce(doc.name, 'Unknown') AS output
LIMIT 50
"""

# TREATED_BY / WORKS_AT relationships for an exact person name ($query), gated by
# a fulltext lookup that receives the Lucene-escaped form of the name ($ft_query).
_NAME_QUERY = """
CALL db.index.fulltext.queryNodes('entity_search_index', $ft_query, {limit: 5})
YIELD node, score
CALL (node) {
    WITH node
    MATCH (p:Patient)-[r:TREATED_BY]->(d:Doctor)
    WHERE d.name = $query OR p.name = $query
    RETURN 'Patient: ' + coalesce(p.name, 'Unknown') + ' - TREATED_BY -> Doctor: ' + coalesce(d.name, 'Unknown') AS output
    UNION ALL
    WITH node
    MATCH (d:Doctor)-[r:WORKS_AT]->(h:Hospital)
    WHERE d.name = $query
    RETURN 'Doctor: ' + coalesce(d.name, 'Unknown') + ' - WORKS_AT -> Hospital: ' + coalesce(h.name, 'Unknown') AS output
}
RETURN DISTINCT output LIMIT 50
"""

# Incoming and outgoing relationships of the nodes returned by a fulltext lookup.
_NEIGHBOR_QUERY = """
CALL db.index.fulltext.queryNodes('entity_search_index', $query, {limit: 5})
YIELD node, score
CALL (node) {
    WITH node
    MATCH (node)-[r]->(neighbor)
    RETURN labels(node)[0] + ': ' + coalesce(node.name, toString(node.room_number), toString(node.amount), toString(node.date_of_admission), 'Unknown') +
           ' - ' + type(r) + ' -> ' + labels(neighbor)[0] + ': ' + coalesce(neighbor.name, toString(neighbor.room_number), toString(neighbor.amount), toString(neighbor.date_of_admission), 'Unknown') AS output
    UNION ALL
    WITH node
    MATCH (node)<-[r]-(neighbor)
    RETURN labels(neighbor)[0] + ': ' + coalesce(neighbor.name, toString(neighbor.room_number), toString(neighbor.amount), toString(neighbor.date_of_admission), 'Unknown') +
           ' - ' + type(r) + ' -> ' + labels(node)[0] + ': ' + coalesce(node.name, toString(node.room_number), toString(node.amount), toString(node.date_of_admission), 'Unknown') AS output
}
RETURN DISTINCT output LIMIT 50
"""


def _escape_lucene(term: str) -> str:
    """Backslash-escape Lucene query-syntax characters in ``term``."""
    return re.sub(r'([+\-!(){}\[\]^"~*?:\\/&|])', r"\\\1", term)


def _query_outputs(cypher: str, params: dict) -> list:
    """Run ``cypher`` on the module-level graph and return its non-empty `output` values."""
    rows = graph.query(cypher, params)
    return [row["output"] for row in rows if row["output"]]


def graph_retriever(question: str) -> str:
    """Collect relationship facts from Neo4j that relate to ``question``.

    Entities are extracted with ``extract_entities`` and queried in three passes:
      1. dates  -> patients whose admission date string contains the date text
                   (substring match, so only text that appears verbatim in the
                   stored date's string form will hit),
      2. names  -> TREATED_BY / WORKS_AT relationships for that exact name,
      3. nouns and verbs -> neighbours of the top fulltext hits for each term.

    Terms sent to the fulltext index are Lucene-escaped so characters such as
    ``+`` or ``/`` in an entity (e.g. ``AB+``) cannot break query parsing.
    Repeated entities are queried once.

    Args:
        question: The user's natural-language question.

    Returns:
        The distinct result lines sorted and joined with newlines, or a fixed
        message when extraction fails, nothing was extracted, or nothing matched.

    Raises:
        Whatever ``graph.query`` raises; database errors are not swallowed.
    """
    try:
        entities = extract_entities(question)
        print(f"Extracted entities: {entities}")
        names = entities.get("names", [])
        dates = entities.get("dates", [])
        nouns = entities.get("nouns", [])
        verbs = entities.get("verbs", [])
    except Exception as e:
        print(f"Error extracting entities: {e}")
        return "No entities extracted due to an error."

    if not (names or dates or nouns or verbs):
        return "No entities extracted to query."

    result = set()

    # dict.fromkeys de-duplicates while keeping the original order, so each
    # distinct entity costs one round trip.
    for date in dict.fromkeys(dates):
        result.update(_query_outputs(_DATE_QUERY, {"query": date}))

    for name in dict.fromkeys(names):
        result.update(_query_outputs(
            _NAME_QUERY,
            {"query": name, "ft_query": _escape_lucene(name)},
        ))

    for term in dict.fromkeys(nouns + verbs):
        if not term.strip():
            continue  # an empty fulltext query is not parseable
        result.update(_query_outputs(_NEIGHBOR_QUERY, {"query": _escape_lucene(term)}))

    result_str = "\n".join(sorted(result))
    print(f"Graph data retrieved: {result_str}")
    return result_str or "No relevant relationships found."

In [143]:
def full_retriever(question: str):
    """Build the context string passed to the LLM for ``question``.

    Combines the output of ``graph_retriever`` with title-cased text from
    vector-similarity lookups. Every extracted name and date is searched
    against the 'Patient' and 'Doctor' retrievers, when those are loaded.
    Any error during vector retrieval is printed and the results gathered
    so far are kept.

    Returns:
        A string with a "Graph data:" section followed by a "Vector data:" section.
    """
    graph_data = graph_retriever(question)

    vector_data = set()
    try:
        extracted = extract_entities(question)
        search_terms = extracted.get("names", []) + extracted.get("dates", [])
        print(f"Entities for vector retrieval: {search_terms}")
        for term in search_terms:
            for label in ('Patient', 'Doctor'):
                retriever = vector_retrievers.get(label)
                if not retriever:
                    continue
                retrieved_docs = retriever.invoke(term)
                print(f"Raw vector results for {label} with '{term}': {retrieved_docs}")
                # Keep only non-blank string contents, normalised to title case.
                vector_data.update(
                    doc.page_content.title()
                    for doc in retrieved_docs
                    if isinstance(doc.page_content, str) and doc.page_content.strip()
                )
    except Exception as e:
        print(f"Error in vector retrieval: {e}")

    if vector_data:
        vector_data_str = "\n".join(sorted(vector_data))
    else:
        vector_data_str = "No vector data available"
    print(f"Vector data retrieved: {vector_data_str}")

    final_data = f"Graph data:\n{graph_data}\nVector data:\n{vector_data_str}\n    "
    print(f"Final context passed to LLM: {final_data}")
    return final_data

In [144]:
# Optimized prompt template
# Chat model that produces the final answer (llama3.2 via Ollama, temperature 0.1).
llm_text = ChatOllama(model="llama3.2", temperature=0.1)
# The prompt has two placeholders: {context} (filled with the retriever output)
# and {question} (the user's question).
template = """Answer the question based only on the following context:
{context}

The context is divided into 'Graph data' (relationships between entities) and 'Vector data' (similar entities). 
Use 'Graph data' to identify relationships and attributes relevant to the question. 
- For questions about patients admitted on a specific date and their medical condition, extract patient names, admission dates, and diseases from 'Graph data'. Format the answer naturally as: '[Patient name] was admitted on [date] with [disease].'
- For questions about who is treated by a doctor, extract from 'TREATED_BY' relationships and answer: '[Patient name] is treated by Doctor [doctor name].'
- For questions about doctors and hospitals, extract from 'WORKS_AT' relationships and answer: 'Doctor [doctor name] works at [hospital name].'
- For other questions, analyze 'Graph data' to infer relationships and attributes, and provide a concise, natural language answer based on the question's intent. Do not list raw data unless explicitly asked.
Capitalize names properly (e.g., 'Eric Wells' instead of 'eric weLls'). 
If multiple results are found, list them in separate sentences. 
If no relevant data is found or the data does not match the question, say "No information is available."

Question: {question}
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

def _echo_raw_llm_output(message):
    """Print the model's raw message for debugging and pass it on unchanged."""
    print(f"Raw LLM output: {message}")
    return message


# Pipeline: build the context and pass the question through, fill the prompt,
# call the model, log its raw output, then parse it to a plain string.
chain_inputs = {"context": full_retriever, "question": RunnablePassthrough()}
chain = chain_inputs | prompt | llm_text | _echo_raw_llm_output | StrOutputParser()

In [145]:
# Smoke test: run each sample question through the chain.
questions = ["Which patient is treated by doctor Kevin Wells?"]
separator = "-" * 50
for q in questions:
    print(f"Question: {q}")
    try:
        answer = chain.invoke(q)
    except Exception as e:
        # Report the failure and continue with the next question.
        print(f"Error in chain execution: {e}")
    else:
        print(f"Answer from LLM: {answer}")
    print(separator)

Question: Which patient is treated by doctor Kevin Wells?
Extracted entities: {'names': ['Kevin Wells'], 'dates': [], 'numbers': [], 'nouns': ['patient', 'doctor'], 'verbs': ['treated']}
Graph data retrieved: Doctor: Kevin Wells - WORKS_AT -> Hospital: Hernandez Rogers and Vang,
Patient: andrEw waTtS - TREATED_BY -> Doctor: Kevin Wells
Entities for vector retrieval: ['Kevin Wells']
Raw vector results for Patient with 'Kevin Wells': [Document(metadata={'treated_by': 'Tonya Fuller', 'admission_type': 'Emergency', 'age': 74, 'gender': 'Male', 'discharge_date': neo4j.time.Date(2022, 6, 22), 'blood_type': 'AB+', 'date_of_admission': neo4j.time.Date(2022, 6, 21)}, page_content='joNaTHAn wells'), Document(metadata={'treated_by': 'Crystal Cain', 'admission_type': 'Elective', 'age': 76, 'gender': 'Male', 'discharge_date': neo4j.time.Date(2019, 12, 19), 'blood_type': 'B-', 'date_of_admission': neo4j.time.Date(2019, 12, 14)}, page_content='MiCHeAl wEllS'), Document(metadata={'treated_by': 'Victor

In [147]:
# Ad-hoc query; as the cell's last expression, the returned answer string is shown as the cell output.
chain.invoke(input="Cancer patient at Sons and Miller?")

Extracted entities: {'names': ['Sons', 'Miller'], 'dates': [], 'numbers': [], 'nouns': ['Cancer', 'patient'], 'verbs': []}
Graph data retrieved: Patient: Bobby JacksOn - HAS_DISEASE -> Disease: Cancer
Patient: CHrisTInA MARtinez - HAS_DISEASE -> Disease: Cancer
Patient: CLaYTON PEterSon - HAS_DISEASE -> Disease: Cancer
Patient: ChRISTopher BerG - HAS_DISEASE -> Disease: Cancer
Patient: DR. LaUreN ClaRk DDs - HAS_DISEASE -> Disease: Cancer
Patient: ERic riveRa - HAS_DISEASE -> Disease: Cancer
Patient: Erin oRTEga - HAS_DISEASE -> Disease: Cancer
Patient: GRanT GeoRge - HAS_DISEASE -> Disease: Cancer
Patient: JessE BaNks - HAS_DISEASE -> Disease: Cancer
Patient: JessIca king - HAS_DISEASE -> Disease: Cancer
Patient: KaREn pricE dDs - HAS_DISEASE -> Disease: Cancer
Patient: LaurA pEtErS - HAS_DISEASE -> Disease: Cancer
Patient: MALIk MARtINeZ - HAS_DISEASE -> Disease: Cancer
Patient: MELIndA richARds - HAS_DISEASE -> Disease: Cancer
Patient: PAUL Hahn - HAS_DISEASE -> Disease: Cancer
Pati

'From the Graph data, we can see that there are two patients with cancer who have a last name of either "Sons" or "Miller".\n\nBobby JacksOn was admitted with Cancer.\nDr. Laurence Clark DDS was admitted with Cancer.\n\nNo information is available about other patients with cancer who have these last names.'