In [1]:
import sys
import os
sys.path.append(r'/home/pjtl2w01admin/csm/graphDB_pjt')
import config
import logging
import json
import pandas as pd
from io import BytesIO
from typing import List, Dict, Any
from uuid import uuid4
from fastapi import UploadFile
from elasticsearch.helpers import BulkIndexError
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from langchain_elasticsearch import ElasticsearchRetriever
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma, ElasticsearchStore
from langchain_core.documents import Document
from collections import defaultdict
from db_to_graphDB.extract_preprocessed_data import list_preprocessed_columns, preprocess_data

In [2]:
# Logging 설정
logging.basicConfig(level='INFO', format='%(asctime)s %(name)s: %(message)s', datefmt='%Y-%m-%dT%H:%M:%S')
logger = logging.getLogger(__name__)

# TMP 디렉토리 설정
TMP_DIR = os.path.join(os.getcwd(), "tmp")
os.makedirs(TMP_DIR, exist_ok=True)  # 디렉토리가 없으면 생성

# VectorDB 저장 경로
PERSIST_DIRECTORY = "inforactive_vdb"
os.makedirs(PERSIST_DIRECTORY, exist_ok=True) 

In [4]:
ELASTICSEARCH_URL = os.getenv("ELASTICSEARCH_URL", "http://localhost:9200")
ELASTICSEARCH_INDEX_NAME = os.getenv("ELASTICSEARCH_INDEX_NAME", "inforactive_vdb")
OPEN_API_KEY_SERVER = os.getenv("OPEN_API_KEY_SERVER")

In [5]:
TEXT_FIELD = "text"
CONTENT_FIELD = "text"
VECTOR_FIELD = "vector"

기타 필요 함수

In [6]:
# 파일 저장 함수
def store_file(file_name, df, version_name):
    xlsx_path = os.path.join(TMP_DIR, f"{file_name}_{version_name}.xlsx")
    df.to_excel(xlsx_path, index=False)
    logger.info(f"{file_name}_{version_name} 버전 저장 완료 (.xlsx) : {xlsx_path}")

In [6]:
import psutil
print(psutil.virtual_memory())


svmem(total=545013977088, available=489936875520, percent=10.1, used=50730840064, free=380560707584, active=44154126336, inactive=111897530368, buffers=4707917824, cached=109014511616, shared=55058432, slab=7331848192)


# Generate Schema

## RAG 구성 for schema generation

In [7]:
def get_embedding_model():
    try:
        embeddings = OpenAIEmbeddings (
            openai_api_key = config.OPEN_API_KEY_SERVER,
            model = "text-embedding-3-small"
        )
        logger.info("OpenAI Embedding 모델이 성공적으로 초기화되었습니다.")
        return embeddings
    except Exception as e:
        logger.error(f"Embedding 모델 초기화 중 오류 발생 : {e}")
        raise e    

In [8]:
embeddings = get_embedding_model()

2025-07-28T13:29:54 __main__: OpenAI Embedding 모델이 성공적으로 초기화되었습니다.


In [9]:
elastic_vector_search = ElasticsearchStore(
    index_name = ELASTICSEARCH_INDEX_NAME,
    es_url = ELASTICSEARCH_URL,
    embedding=embeddings
)

  elastic_vector_search = ElasticsearchStore(
2025-07-28T13:30:02 elastic_transport.transport: GET http://localhost:9200/ [status:200 duration:0.003s]


In [10]:
#initialization
vector_store = ElasticsearchStore(
    "inforactive_vdb",
    embedding = embeddings,
    es_url = ELASTICSEARCH_URL,
    vector_query_field="vector"
)

vector_store

2025-07-28T13:30:04 elastic_transport.transport: GET http://localhost:9200/ [status:200 duration:0.004s]


<langchain_community.vectorstores.elasticsearch.ElasticsearchStore at 0x7fa1c057b310>

In [11]:
#add items to vector store
def add_schema_data_to_vdb(schema_json_path: str) -> ElasticsearchStore:
    with open(schema_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    
    documents: List[Document] = []

    for table in data["schemas"]:
        table_name = table["table_name"]
        chunks = []

        # summary 저장
        documents.append(Document(
            page_content=f"[Table: {table_name}]\n{table['schema_explanation']}",
            metadata={
                "table_name": table_name,
                "chunk_type": "table_summary"
            }
        ))

        # node 저장
        for node in table.get("nodes", []):
            documents.append(Document(
                page_content = (
                    f"[Node: {node['label']}]\n"
                    f"Description: {node['description']}\n"
                    f"Properties: {', '.join(node.get('properties', []))}"
                ),
                metadata={
                    "table_name": table_name,
                    "chunk_type": "node",
                    "label": node["label"],
                    "properties_list": ', '.join(node.get("properties", []))
                }
            ))


        # relation 저장
        for rel in table.get("relationships", []):
            documents.append(Document(
                page_content=(
                    f"[Relationship: {rel['from']} - [{rel['type']}] -> {rel['to']} ] \n"
                    f"Description: {rel['description']}\n"
                    f"Properties: {', '.join(rel.get('properties', []))}"
                ),
                metadata={
                    "table_name": table_name,
                    "chunk_type": "relationship",
                    "from": rel["from"],
                    "to": rel["to"],
                    "relationship_type": rel["type"],
                    "properties_list": ', '.join(rel.get("properties", []))
                }
            ))
    
    print(f"총 {len(documents)}개 문서를 처리중입니다")

    uuids = [str(uuid4()) for _ in range(len(documents))]

    try:
        vector_store.add_documents(documents=documents, ids=uuids)
        return vector_store

    except BulkIndexError as e:
        for i, err in enumerate(e.errors):
            reason = err.get('index', {}).get('error', {}).get('reason', 'No reason')
            doc_id = err.get('index', {}).get('_id', 'No ID')
            print(f"[{i}] Failed to index document ID {doc_id} → Reason: {reason}")
            # 추가 정보 출력해도 됨
            print("Full error:", err)

In [1]:
### 삭제용
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

mapping = {
    "mappings": {
        "properties": {
            "vector": {
                "type": "dense_vector",
                "dims": 1536,
                "index": True,
                "similarity": "cosine"
            },
            "metadata": {
                "type": "object"
            },
            "text": {
                "type": "text"
            }
        }
    }
}

if es.indices.exists(index="inforactive_vdb"):
    es.indices.delete(index="inforactive_vdb")

es.indices.create(index="inforactive_vdb", body=mapping)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'inforactive_vdb'})

In [None]:
add_schema_data_to_vdb(r"/home/pjtl2w01admin/csm/graphDB_pjt/rag/graphdb_schema_data.json")

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
embedding = model.encode("임베딩 테스트용 문장")

Dense search - 의미기반

In [15]:
# Dense vector Search
def vector_query(search_query: str) -> Dict:
    query_vector = embeddings.embed_query(search_query)  # same embeddings as for indexing
    return {
        "knn": {
            "field": VECTOR_FIELD,
            "query_vector": query_vector,
            "k": 8,
            "num_candidates": 15,
        }
    }

embeddings = get_embedding_model()

vector_retriever = ElasticsearchRetriever.from_es_params(
    index_name = ELASTICSEARCH_INDEX_NAME,
    body_func=lambda search_query: vector_query(search_query),
    content_field=CONTENT_FIELD,
    url=ELASTICSEARCH_URL
)
#retrieved_documents = vector_retriever.invoke("울산광역시에 있는 회사 고객 정보가 있는 스키마를 추출해줘")

2025-07-21T11:26:05 __main__: OpenAI Embedding 모델이 성공적으로 초기화되었습니다.
2025-07-21T11:26:05 elastic_transport.transport: GET http://localhost:9200/ [status:200 duration:0.003s]


Sparse Search - 키워드 기반 검색

In [16]:
# BM25 -> keyword matching => 실제 데이터 예시가 schema에는 없기 때문에 추출하기가 어려움!
def bm25_query(search_query: str) -> Dict:
    return {
        "query": {
            "match": {
                TEXT_FIELD : search_query,
            }
        }
    }

bm25_retriever = ElasticsearchRetriever.from_es_params(
    index_name = ELASTICSEARCH_INDEX_NAME,
    body_func=lambda search_query: bm25_query(search_query),
    content_field=TEXT_FIELD,
    url=ELASTICSEARCH_URL
)

bm25_retriever.invoke("울산광역시에 있는 회사 고객 정보가 있는 스키마를 추출해줘")

2025-07-21T11:26:05 elastic_transport.transport: GET http://localhost:9200/ [status:200 duration:0.002s]
2025-07-21T11:26:05 elastic_transport.transport: POST http://localhost:9200/inforactive_vdb/_search [status:200 duration:0.007s]


[Document(metadata={'_index': 'inforactive_vdb', '_id': '70a19d39-c257-40a7-8638-0eb3e5ea6404', '_score': 5.392723, '_source': {'metadata': {'table_name': 'MAST', 'chunk_type': 'table_summary'}, 'vector': [0.020358940586447716, 0.06678103655576706, 0.021170515567064285, -0.03271806985139847, -0.0007644747383892536, -0.03675275668501854, -0.025135640054941177, -0.005283933598548174, -0.020463285967707634, -0.012695353478193283, 0.030538411810994148, -0.0074433027766644955, -0.005530304741114378, -0.0338078998029232, 0.06112319976091385, 0.04574964568018913, 0.004179611802101135, -0.011727260425686836, -0.02543708123266697, 0.00043404774623923004, 0.06645640730857849, -0.006741870194673538, 0.005910005886107683, -0.004208596423268318, -0.02018503099679947, 0.034109342843294144, 0.0031709398608654737, -0.028381939977407455, -0.04067150503396988, -0.018654633313417435, -0.015767743811011314, -0.010637430474162102, 0.01889810524880886, 0.025089263916015625, 0.003281082259491086, -0.00339991

Hybrid Search

In [17]:
# def hybrid_search(search_query: str) -> Dict:
#     query_vector = embeddings.embed_query(search_query)
#     return {
#         "query": {
#             "bool": {
#                 "should": [
#                     {
#                         "match": {
#                             TEXT_FIELD: search_query,
#                         }
#                     },
#                     {
#                         "knn": {
#                             "field": VECTOR_FIELD,
#                             "query_vector": query_vector,
#                             "k": 2,
#                             "num_candidates": 5
#                         }
#                     }
#                 ]
#             }
#         }
#     }

# def hybrid_search(search_query: str) -> Dict:
#     query_vector = embeddings.embed_query(search_query)
#     return {
#         "query": {
#             "knn": {
#                 "field": VECTOR_FIELD,
#                 "query_vector": query_vector,
#                 "k": 2,
#                 "num_candidates": 5,
#                 "filter": {
#                     "match": {
#                         TEXT_FIELD: search_query
#                     }
#                 }
#             }
#         }
#     }


# hybrid_retriever = ElasticsearchRetriever.from_es_params(
#     index_name = ELASTICSEARCH_INDEX_NAME,
#     body_func=lambda search_query: hybrid_search(search_query),
#     content_field=TEXT_FIELD,
#     url=ELASTICSEARCH_URL
# )

# hybrid_retriever.invoke("울산광역시에 있는 회사 고객 정보가 있는 스키마를 추출해줘")

## LLM - generate schema

In [18]:
def get_model():
    try:
        model = ChatOpenAI(
            openai_api_key="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",
            model="gpt-4o",
            temperature=0,
            max_tokens=4048
        )
        logger.info("OpenAI 모델이 성공적으로 초기화되었습니다.")
        return model
    except Exception as e:
        logger.error(f"모델 초기화 중 오류 발생: {e}")
        raise e


In [19]:
def extract_fields_description(table_name: str, fields: list) -> dict:
    fields_and_description = {}
    try:
        with open("/home/pjtl2w01admin/csm/graphDB_pjt/rag/column_description.json", "r", encoding="utf-8") as f:
            column_descriptions = json.load(f)
        
        table_specific_columns= {}
        for entry in column_descriptions:
            if entry.get("table_name").upper() == table_name.upper():
                table_specific_columns = {k.upper(): v for k, v in entry.get("columns", {}).items()}
                break
        
        for field in fields:
            description = table_specific_columns.get(field.upper(), field)
            fields_and_description[field] = description
        print(fields_and_description)
        return fields_and_description
    
    except Exception as e:
        logger.error(f"컬럼 설명을 추출하는 중 오류가 발생했습니다: {str(e)}")

In [20]:
def get_data_info(file_name, file: UploadFile):
    table_name = file_name

    try:
        #file.file.seek(0)
        #df = pd.read_excel(BytesIO(file.file.read()), dtype=str)
        df = pd.read_excel(file, dtype=str)
        df.columns = [col.strip().upper() for col in df.columns]
        df = df.fillna("").applymap(str.strip)
    except Exception as e:
        logger.error(f"엑셀 파일 로딩 실패: {str(e)}")
        raise e
    
    cleaned_df = preprocess_data(df)

    store_file(table_name, cleaned_df, version_name="cleaned")

    fields = list_preprocessed_columns(cleaned_df)
    fields_and_description = extract_fields_description(table_name, fields)
    sample_rows = cleaned_df.head(10).to_dict(orient='records')

    data_info = {
        "table_name": table_name,
        "fields_and_description" : fields_and_description,
        "sample_rows": sample_rows
    }
    logger.info("Data information successfully extracted")

    return data_info

In [21]:
def create_graphdb_schema(model, data_info, retrieved_context: str):
    required_keys = ['table_name', 'fields_and_description', 'sample_rows']
    
    for key in required_keys:
        if key not in data_info:
            raise ValueError(f"Missing required key in data_info: {key}")
    
    try:
        # sample_rows 문자열화
        sample_str = json.dumps(data_info['sample_rows'], indent=2)
        logger.info(f"retrieve된 문서들 : {retrieved_context}")

        schema_prompt = f"""
You are a data modeling expert with deep understanding of SAP systems and business processes. Your task is to analyze the following SAP table and design a comprehensive and semantically meaningful GraphDB schema.

# Data Information : 
- Table Name: {data_info['table_name']}
- Sample Rows:
{sample_str}
- Extra Important information:
{retrieved_context}

Your task consists of the following steps:

---
**Step 1: Understand Inherent Data Information & Column Semantics**

First, meticulously analyze each column (field) in the provided table.
- **Define Column Meaning:** For each column, explicitly describe its business meaning and  and what real-world concept or attribute it represents.
- **Identify Key Entities & Attributes:** Determine what primary business objects and their essential attributes are implied by these columns.
- **Infer Relationships & Structure:** Based on the column values, enterprise data patterns and SAP context, infer the inherent relationships between these entities. Recognize if there's a hierarchical or many-to-many structure implied.

---
**Step 2: Identify All Business Purposes / Usage Scenarios of the Data**

Based on your comprehensive understanding from Step 1, identify **all distinct and valuable business usage contexts or scenarios** for this data. Think broadly about how this data could empower various organizational functions. Consider:
- **Core Business Processes:** How does this data support fundamental operations (e.g., order fulfillment, financial reporting, material management, human resources, sales analytics)?
- **Analytical Insights:** What types of insights can be derived (e.g., performance metrics, trend analysis, root cause analysis, predictive modeling)?
- **Decision Support:** How can this data inform strategic or operational decisions (e.g., resource allocation, risk assessment, process optimization, market segmentation)?
- **Master Data Management:** How does it define, maintain, or relate master data records (e.g., customer, vendor, product master)?
- **Transactional Context:** How does it capture and describe specific business events or transactions?
- **Reporting & Dashboards:** What key performance indicators (KPIs) or reports could be generated?
- **Integration & Data Flow:** How might this data interact with or be enriched by other systems or datasets (e.g., external market data, IoT sensor data, CRM systems, HR systems)?

You are not limited to one usage scenario; enumerate all relevant and valuable ones.

---

**Step 3: Design the GraphDB Schema**

Translate your comprehensive understanding from Step 1 and Step 2 into a GraphDB schema, following the specified JSON output format.

- **schema description**: write a summarized explanation of the schema. include what entities and relationships (with direction) are included in natural language.

- **Nodes (Entities)**: Identify all significant business entities that should be represented as nodes.
  - For each node:
    - `label`: A clear, semantically meaningful name.
    - `primary_key`: The column(s) that uniquely identify this node.
    - `description`: A rich, business-contextual explanation of what the entity represents, its key identifying properties, how it typically relates to other entities, and any overall/specific usage.
    - `properties`: A list of all relevant original column names that describe this node.

- **Relationships (Edges)**: Define explicit connections between your nodes based on the inherent relationships and identified business usages.
  - For each relationship:
    - `from`: The label of the source node.
    - `to`: The label of the target node.
    - `type`: A clear, semantically meaningful relationship type (e.g., `HAS_ITEM`, `PERFORMS_TRANSACTION`, `BELONGS_TO`).
    - `properties`: A list of original column names that describe the relationship itself -> ONLY include the core ones if needed.
    - `description`: A detailed explanation of the business logic or connection it represents, the direction's implication, and how it enables specific data navigation, classification, or aggregation.

- **'Extra' Node:** Create an 'Extra' node to capture any original columns that are not used in your primary graph schema. This node ensures no data is lost.
  - Connect this 'Extra' node to the most relevant primary entity (e.g., `Material`, `Order`, `Document`) via a generic relationship (e.g., `HAS_EXTRA_INFO`). Clearly state which primary node it connects to.

---

4. **Output Format**
Return the result as **valid JSON only**, following this structure:

{{
  "table_name": "{data_info['table_name']}",
  "schema_explanation" : "The following is an ontology consisting of the entities Person and Food. There exists a FRIEND_OF relationship between persons, and a LIKES relationship between persons and food. The friendship relationship is bidirectional."
  "nodes": [
    {{
      "label": "EntityA",
      "primary_key": "field2",
      "description": "EntityA represents the main business object in this dataset. It is uniquely identified by a combination of field2, field4, field11, and field12 (stored as a hash in concatenatedFields). The node typically serves as the starting point in relationships with other entities such as EntityB. The 'name' property is used for human-readable display.",
      "properties": ["name", "field2", "field4", "field11", "field12"]
    }},
    {{
      "label": "EntityB",
      "primary_key": "concatenatedFields",
      "description": "EntityB contains field3, which is used as an external lookup key to find matching weather or cost data",
      "properties": ["name", "field1", "field8", "field3"]
    }},
    {{
      "label": "EntityC",
      "primary_key": "field9",
      "description": "EntityC stores auxiliary classification or category data. It is identified by 'field9' and used primarily for lookup, filtering, or enrichment operations in queries.",
      "properties": ["name", "field9"]
    }},
    {{
      "label": "Extra",
      "primary_key": "field2",
      "description": "This node stores all unused or supplementary fields not captured in the main schema. It connects to the primary entity (EntityA) via a general-purpose relationship. It helps preserve full row fidelity.",
      "properties": ["name", "UnusedField1", "UnusedField2"]
    }}
  ],
  "relationships": [
    {{
      "from": "EntityA",
      "to": "EntityB",
      "type": "RELATION_TYPE",
      "properties": ["field3"],
      "description": "This relationship indicates that EntityA is associated with EntityB through a linkage defined by 'field3'. It reflects a core business connection such as assignment, ownership, or dependency."
    }},
    {{
      "from": "EntityB",
      "to": "EntityC",
      "type": "CLASSIFIED_AS",
      "properties": [],
      "description": "This relationship indicates that each EntityB instance can be categorized under a corresponding EntityC class. EntityB instances are classified under EntityC via CLASSIFIED_AS."
    }}
    ],
  "used_fields" : ["field2", "field4", "field11", "field12", "field1", "field8", "field3", "field9"]
}}

---

Guidelines:
- **Domain Agnostic:** Design the schema to be adaptable to various SAP modules (FI, CO, SD, MM, PP, HR, etc.) without hardcoding domain-specific terms unless explicitly derived from the *sample data provided*.
- **Exhaustive:** Uncover **all meaningful entities, relationships, and their properties** embedded in the dataset.
- **Business Context Focus:** Every element in the schema should clearly map back to a real-world business concept or process, regardless of the specific SAP module.
- **Logical Flow:** The schema should be intuitive and logically support all identified business usages.
- **Respond ONLY with the JSON object, without any explanation or formatting (no markdown, no code blocks)**
"""
        
        response = model.invoke(schema_prompt)

        try:
            parsed = json.loads(response.content)
            logger.info("GraphDB schema 생성 성공")
            return parsed
        except json.JSONDecodeError:
            logger.warning("LLM 응답이 JSON 형식이 아닙니다")
            return response
        
    except Exception as e:
        logger.error(f"GraphDB schema 생성 중 오류 발생: {str(e)}")
        return None

In [22]:
# def run_graphDB_schema_generation_llm(file_name, file):
#     try:
#         model = get_model()
#         data_info = get_data_info(file_name, file)
#         graphdb_schema = create_graphdb_schema(model, data_info, retrieved_documents)
#         print(graphdb_schema)
#         return graphdb_schema
#     except Exception as e:
#         logger.error(f"GraphDB schema generation failed: {str(e)}")
#         return {"error": str(e)}

In [23]:
# with open(r"/home/pjtl2w01admin/csm/graphDB_pjt/Table_KNA1.XLSX", "rb") as f:
#     result = run_graphDB_schema_generation_llm("KNA1", f)

# Storing Schema

Execution

In [24]:
# vectordb = load_schema_vector_db()  # 기존 DB 로드
# query = "고객의 세금 정보와 주소 연결 스키마"
# results = hybrid_search_rrf(vectordb, query, k=8, alpha=0.6)

# for i, doc in enumerate(results):
#     print(f"\n--- Document {i+1} ---")
#     print(f"Chunk Type: {doc.metadata.get('chunk_type')}")
#     print(f"Table Name: {doc.metadata.get('table_name')}")
#     print(doc.page_content[:300])


----------------------------------------------------------------------

# Digital Brain

## 온톨로지 예시 및 적용 예시

In [25]:
ontology_llm_instruction = """
You are given an ontology (knowledge graph schema) describing nodes (entities) and relationships.

Your goal is to reason step-by-step in order to determine the appropriate actions for answering a user question.

Your process should:
1. Identify the main entity and key fields using schema descriptions.
2. Determine if the question is about properties or related entities.
3. Trace the reasoning path (multi-hop if needed).
4. Choose actions: generate Cypher query, call tools, or combine both.

Return:
a. Natural language explanation of reasoning in **KOREAN**.
b. Ordered list of required actions.
"""

In [26]:
ontology_schema = {
  "table_name": "Ontology_Sample",
  "schema_explanation": "A unified graph of materials, people, places, and organizations. It enables reasoning across production, geography, and social networks.",
  "nodes": [
    {"label": "Material", "primary_key": "id", "description": "Raw to finished goods", "properties": ["id", "name"]},
    {"label": "Plant", "primary_key": "id", "description": "Manufacturing facility", "properties": ["id", "name"]},
    {"label": "Person", "primary_key": "id", "description": "Individual with name/job", "properties": ["id", "name", "job"]},
    {"label": "Hobby", "primary_key": "id", "description": "Leisure activity", "properties": ["id", "name"]},
    {"label": "Place", "primary_key": "id", "description": "Named location", "properties": ["id", "name", "type"]},
    {"label": "City", "primary_key": "id", "description": "Urban region", "properties": ["id", "name"]},
    {"label": "Country", "primary_key": "id", "description": "Nation or region", "properties": ["id", "name"]}
  ],
  "relationships": [
    {"from": "Material", "to": "Material", "type": "MADE_OF", "description": "Material hierarchy"},
    {"from": "Material", "to": "Plant", "type": "PRODUCED_IN", "description": "Production location"},
    {"from": "Person", "to": "Person", "type": "FRIEND_OF"},
    {"from": "Person", "to": "Person", "type": "COWORKER_OF"},
    {"from": "Person", "to": "Person", "type": "RELATIVE_OF"},
    {"from": "Person", "to": "Person", "type": "FAMILY_OF"},
    {"from": "Person", "to": "Hobby", "type": "ENJOYS"},
    {"from": "Plant", "to": "City", "type": "LOCATED_IN"},
    {"from": "City", "to": "Country", "type": "PART_OF"},
    {"from": "Person", "to": "City", "type": "LIVES_IN"}
  ]
}


In [27]:
ontology_example = """
# **Nodes (Entities)**
## Material
- {{id: "m1", name: "Cake"}}
- {{id: "m2", name: "Flour"}}
- {{id: "m3", name: "Egg"}}
- {{id: "m4", name: "Whipped Cream"}}
- {{id: "m5", name: "Milk"}}

## Person
- {{id: "p1", name: "Amy", job: "Engineer"}}
- {{id: "p2", name: "Bob", job: "Designer"}}
- {{id: "p3", name: "Charlie", job: "Chef"}}
- {{id: "p4", name: "Diana", job: "Manager"}}

## Hobby
- {{id: "h1", name: "Climbing"}}
- {{id: "h2", name: "Cooking"}}

## Place
- {{id: "a1", name: "Bukhansan", type: "Mountain"}}
- {{id: "a2", name: "Kitchen Studio", type: "Indoor"}}

## City
- {{id: "c1", name: "Seoul"}}
- {{id: "c2", name: "Busan"}}
- {{id: "c3", name: "Ulsan"}}
- {{id: "c4", name: "Gumi"}}
- {{id: "c5", name: "Osaka"}}

## Country
- {{id: "k1", name: "Korea"}}
- {{id: "k1", name: "Japan"}}

## Plant
- {{id: "plnt1", name: "CJ"}}
- {{id: "plnt2", name: "Otoki"}}
- {{id: "plnt3", name: "Lotte",}}


# **Relationships**
- MADE_OF: {{ from: "m1", to: "m2" }}
- MADE_OF: {{ from: "m1", to: "m3" }}
- MADE_OF: {{ from: "m1", to: "m4" }}
- MADE_OF: {{ from: "m4", to: "m5" }}
-> can infer that "Cake" is a finished product, "Whipped Cream" is a semi-finished product, and "Flour", "Egg", "Milk" are raw materials

- PRODUCED_IN: {{ from: "m2", to: "plnt1" }}
- PRODUCED_IN: {{ from: "m3", to: "plnt2" }}
- PRODUCED_IN: {{ from: "m4", to: "plnt3" }}
- PRODUCED_IN: {{ from: "m5", to: "plnt3" }}
-> allows tracing the origin/plant of each ingredient

- FRIEND_OF: {{ from: "p1", to: "p2" }}
- FRIEND_OF : {{ from: "p2", to: "p4"}}
- COWORKER_OF: {{ from: "p2", to: "p3" }}
- RELATIVE_OF: {{ from: "p1", to: "p3" }}
- FAMILY_OF: {{ from: "p3", to: "p4" }}
-> enables reasoning over social networks and extended chains of connection between people

- ENJOYS: {{ from: "p1", to: "h1" }}
- ENJOYS: {{ from: "p3", to: "h2" }}
-> Links people to their hobbies, which can be extended to locations of activity

- LOCATED_IN: {{ from: "plnt1", to: "c4" }}
- LOCATED_IN: {{ from: "plnt2", to: "c3" }}
- LOCATED_IN: {{ from: "plnt3", to: "c1" }}
- LOCATED_IN: {{ from: "plnt3", to: "c2" }}
- LOCATED_IN: {{ from: "c2", to: "k1" }}
- LOCATED_IN: {{ from: "c3", to: "k1" }}
-> Enables inference of national or city-level production and residence

- LIVES_IN: {{ from: "p1", to: "c1" }}
- LIVES_IN: {{ from: "p2", to: "c1" }}
- LIVES_IN: {{ from: "p3", to: "c3" }}
- LIVES_IN: {{ from: "p4", to: "c5" }}
-> Links person to location, useful for geo-based aggregation or reasoning

- PART_OF: {{ from: "c1", to: "k1" }}
- PART_OF: {{ from: "c2", to: "k1" }}
- PART_OF: {{ from: "c3", to: "k1" }}
- PART_OF: {{ from: "c4", to: "k1" }}
- PART_OF: {{ from: "c5", to: "k2" }}
-> Defines geographic hierarchy: City ∈ Country
"""

In [28]:
applied_example = """
Q1. 무슨 지역에 홍수가 나면 Cake라는 완성품을 위한 공급에 차질이 생길 수 있나요?
## Reasoning

### 1. Schema Structure Analysis
- 'Cake' is a Material node.
- It connects recursively to sub-materials through MADE_OF, which indicates that 'Cake' functions as a **finished product** composed of lower-level materials.
- Each sub-material is connected to a Plant via PRODUCED_IN.
- Each Plant connects to a City through LOCATED_IN.

### 2. Inference Flow
- Trace: Cake → Sub-materials → Plant → City
- Identify cities involved in the production of Cake components.
- Query weather data for flood risks in those cities.
- Simulate cost impact if supply disruption occurs.

## Actions
### 1. Tool Call: Connect to Neo4j GraphDB and run cypher query
MATCH (cake:Material {name: "Cake"})-[:MADE_OF*]->(m:Material)-[:PRODUCED_IN]->(p:Plant)-[:LOCATED_IN]->(c:City)
RETURN DISTINCT c.name AS vulnerable_cities
→ Example result: ["Ulsan", "Seoul"]

### 2. Tool Call: Check Flood Risk via Weather API
### 3. Tool call: Cost Calculation Engine
### 4. Final LLM answer (Example) : 울산에 홍수가 발생하면 케이크의 핵심 재료인 크림의 공급이 차질을 빚을 수 있습니다.  
이는 전체적인 케이크 생산에 영향을 미치고 비용 상승으로 이어질 수 있습니다.  
시뮬레이션 결과, 이러한 공급 차질로 인해 케이크 가격이 약 12% 상승할 수 있는 것으로 나타났습니다.

-------------

Q2. 서울에 홍수가 나면 어떤 피해가 있을 수 있나요?
## Reasoning

### 1. Schema Structure Analysis
- "Seoul" is a `City` node.
- A `City` may be connected to:
  - `Plant` via `LOCATED_IN`
  - `Person` via `LIVES_IN`
- From those connections, we can explore:
  - `Plant` → `Material` via `PRODUCED_IN`
  - `Material` → `Material` via `MADE_OF`
  - `Person` → social links via `FRIEND_OF`, `COWORKER_OF`, `RELATIVE_OF`
  - `Person` → `Hobby` → `Place`

### 2. Inference Flow
- Check which materials are produced in Seoul via local plants
- Identify finished products that depend on those materials
- List people who live in Seoul and may be impacted
- Trace how their social connections and hobbies may also be affected

---
## Actions

### 1. Tool Call: Connect to Neo4j GraphDB and run cypher queries.
#### 1) Explore all entities linked to Seoul
MATCH path = (c:City {name: "Seoul"})<-[*1..3]-(n)
RETURN DISTINCT labels(n) AS entity_type, n.name AS name
→ Example result:
| entity_type | name     |
|-------------|----------|
| ["Plant"]   | Lotte    |
| ["Material"]| Cream    |
| ["Material"]| Milk     |
| ["Person"]  | Amy      |
| ["Person"]  | Bob      |

#### 2) Find materials produced in Seoul
MATCH (p:Plant {name: "Lotte"})<-[:PRODUCED_IN]-(m:Material)
RETURN DISTINCT m.name AS material

#### 3) Find finished products that depend on those materials
MATCH (product:Material)-[:MADE_OF*]->(:Material {name: "Milk"})
RETURN DISTINCT product.name AS affected_product

#### 4) Trace their social/hobby relationships
MATCH (p:Person {name: "Amy"})-[:FRIEND_OF|COWORKER_OF|RELATIVE_OF|ENJOYS*1..2]-(related)
RETURN type(r) AS relation, labels(related) AS related_type, related.name AS related_name

MATCH (p:Person {name: "Bob"})-[:FRIEND_OF|COWORKER_OF|RELATIVE_OF|ENJOYS*1..2]-(related)
RETURN type(r) AS relation, labels(related) AS related_type, related.name AS related_name

### 2. Final LLM answer (example) : 서울에 홍수가 발생하면 롯데 공장이 영향을 받아 크림과 우유 같은 재료의 생산이 중단될 수 있습니다.  
이는 케이크의 생산과 공급에도 차질을 일으킬 수 있습니다.  
서울에 거주하는 Amy와 Bob과 같은 인물들도 일상생활에 불편을 겪을 수 있으며, 이들과 연결된 사회적 관계나 취미 활동 네트워크 역시 영향을 받을 수 있습니다.

"""

### 적절한 스키마 가져오기

In [29]:
#KNA1 관련
k_query = "울산에 홍수가 났을 때 어떤 고객이 영향을 받는지 알려줘"


#BOM 관련 (mast, stpo)
ms_query = "MATNR이 ZHALB13인 제품은 어떤 자재들로 구성되어있나요?"
ms_query2 = "ZHALB13이 포함된 전체 BOM을 보여주세요"
ms_query3 = "ZFERT101이라는 완제품에 포함된 원재료는 무엇이 있나요?"
ms_query4 = "ZFERT101에 대해 최하위 단계까지의 구성 자재 목록을 추출해주세요"
ms_query5 = "원재료 ZROH33는 어떤 완성품에 포함되어 있나요?"
#BOM 관련 (stpo, eina)
se_query = "ZROH34를 공급하는 회사와 위치는 어디인가요?"
se_query2 = "자재 ZROTH34의 공급자 목록과 도시 정보를 알려주세요"
se_query3 = "완성품 ZFERT101의 전체 부품과 그 공급자를 함께 보여주세요"
se_query4 = "ZFERT101의 구성품을 공급하는 회사는 어떤 지역에 분포되어 있나요?"
#BOM 관련 (mast, stpo, EINA)
mse_query = "특정 공급자의 생산 지역에 문제가 생기면, 어떤 완성품의 생산이 영향을 받나요?"
mse_query2 = "울산광역시에 문제가 생기면 ZFERT101 생산에 어떤 영향이 있을 수 있나요?"
mse_query3 = "서울의 공급망 장애가 어떤 제품에 영향을 주는지 알려주세요."
mse_query4 = "미래에셋생명보험 회사에 장애가 생기면 어떤 완성품의 공급이 영향을 받을까요?"
mse_query5 = "Hyundai Motor 회사가 공급하는 모든 부품과, 관련된 완성품을 알려주세요"


#박사님 데이터 관련

In [30]:
retrieved_documents = vector_retriever.invoke(ms_query)
retrieved_documents

2025-07-21T11:26:06 httpx: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-07-21T11:26:06 elastic_transport.transport: POST http://localhost:9200/inforactive_vdb/_search [status:200 duration:0.007s]


[Document(metadata={'_index': 'inforactive_vdb', '_id': '374f89df-a59c-4bbf-8e66-dc8192b5c41e', '_score': 0.7289561, '_source': {'metadata': {'table_name': 'MAST_STPO_EINA', 'chunk_type': 'node', 'label': 'Material', 'properties_list': 'MATNR, STLNR'}, 'vector': [0.01418999396264553, 0.06051945686340332, -0.002284548245370388, -0.029605906456708908, 0.014874464832246304, -0.04458253085613251, -0.022311696782708168, -0.012310254387557507, -0.05549319460988045, -0.01730586774647236, 0.01870545744895935, 0.04331574961543083, -0.014496473595499992, -0.05733207240700722, 0.049077559262514114, 0.03622585907578468, -0.005690299905836582, -0.03692054748535156, 0.010614401660859585, 0.016386428847908974, 0.0722065344452858, -0.016652045771479607, 0.004099161364138126, -0.0028272720519453287, -0.009235244244337082, 0.021269667893648148, 0.03526555746793747, -0.019175391644239426, -0.028992947190999985, 0.0049930596724152565, -0.0007304424652829766, -0.024436620995402336, 0.03297717869281769, 0.0

In [31]:
keyword_search = bm25_retriever.invoke(ms_query)
keyword_search

2025-07-21T11:26:06 elastic_transport.transport: POST http://localhost:9200/inforactive_vdb/_search [status:200 duration:0.005s]


[Document(metadata={'_index': 'inforactive_vdb', '_id': '70a19d39-c257-40a7-8638-0eb3e5ea6404', '_score': 4.409972, '_source': {'metadata': {'table_name': 'MAST', 'chunk_type': 'table_summary'}, 'vector': [0.020358940586447716, 0.06678103655576706, 0.021170515567064285, -0.03271806985139847, -0.0007644747383892536, -0.03675275668501854, -0.025135640054941177, -0.005283933598548174, -0.020463285967707634, -0.012695353478193283, 0.030538411810994148, -0.0074433027766644955, -0.005530304741114378, -0.0338078998029232, 0.06112319976091385, 0.04574964568018913, 0.004179611802101135, -0.011727260425686836, -0.02543708123266697, 0.00043404774623923004, 0.06645640730857849, -0.006741870194673538, 0.005910005886107683, -0.004208596423268318, -0.02018503099679947, 0.034109342843294144, 0.0031709398608654737, -0.028381939977407455, -0.04067150503396988, -0.018654633313417435, -0.015767743811011314, -0.010637430474162102, 0.01889810524880886, 0.025089263916015625, 0.003281082259491086, -0.00339991

In [32]:
# 원하는 개수만큼 table_name이 나올 때까지 retrieve하는 방법
from typing import List, Set
from langchain_core.documents import Document  # 혹은 사용하는 Document 타입에 맞게 조정

def retrieve_until_diverse_tables(
    query: str,
    retriever,
    min_unique_tables: int = 3,
    max_attempts: int = 5,
    verbose: bool = True
) -> List[Document]:
    """
    Dense vector retriever에서 서로 다른 table_name이 일정 수 이상 포함된 문서를 확보할 때까지 반복 검색합니다.

    Args:
        query (str): 검색 쿼리
        retriever: ElasticsearchRetriever 또는 LangChain 호환 retriever
        min_unique_tables (int): 확보해야 할 서로 다른 테이블 수
        max_attempts (int): 반복 최대 횟수
        verbose (bool): 진행 로그 출력 여부

    Returns:
        List[Document]: 수집된 문서 리스트
    """

    unique_tables: Set[str] = set()
    retrieved_docs: List[Document] = []
    i = 0

    while len(unique_tables) < min_unique_tables and i < max_attempts:
        if verbose:
            print(f"[Try {i+1}] Retrieving documents...")

        docs = retriever.invoke(query)
        for doc in docs:
            table = doc.metadata.get("_source", {}).get("metadata", {}).get("table_name")
            if table:
                unique_tables.add(table)

        retrieved_docs.extend(docs)
        i += 1

        if verbose:
            print(f"  ▶ Unique tables so far: {unique_tables}")

    if verbose:
        print(f"[Done] Total documents retrieved: {len(retrieved_docs)} | Unique tables: {len(unique_tables)}")

    return retrieved_docs


In [None]:
# 유사도 높은 스키마 가져오기
def get_schemas_from_retrieved_docs(dense_docs: List[Document], sparse_docs: List[Document], schema_json_path: str) -> List[dict]:
    all_docs = dense_docs + sparse_docs
    # 1. table_name 전부 수집 (중복 없이)
    table_names = {
        doc.metadata.get("_source", {}).get("metadata", {}).get("table_name")
        for doc in all_docs
        if doc.metadata.get("_source", {}).get("metadata", {}).get("table_name")
    }

    print(f"[DEBUG] table_names found in retrieved docs: {table_names}")

    # 2. 전체 스키마 로드
    with open(schema_json_path, "r", encoding="utf-8") as f:
        full_schema = json.load(f)

    # 3. 필요한 table_name만 필터링
    matched_schemas = [
        schema for schema in full_schema["schemas"]
        if schema.get("table_name") in table_names
    ]

    return matched_schemas


In [34]:
global matched_schemas
matched_schemas = get_schemas_from_retrieved_docs(dense_docs=retrieved_documents, sparse_docs=keyword_search, schema_json_path="/home/pjtl2w01admin/csm/graphDB_pjt/rag/graphdb_schema_data.json")

[DEBUG] table_names found in retrieved docs: {'MAST_STPO_EINA', 'MAST', 'MAST_STPO', 'STPO'}


In [35]:
matched_schemas

[{'table_name': 'MAST',
  'schema_explanation': '이 스키마는 완성품(또는 반제품)을 구성하는 BOM(자재 명세서) 상의 최상위 구조를 정의합니다. 각 제품(MATNR)은 고유한 BOM 번호(STLNR)를 가지며, 이를 통해 제품이 어떤 하위 자재들로 구성되는지를 탐색할 수 있는 진입점이 됩니다. 이 구조는 완성품 기준 역추적, 제품 계층 분석, 구조 비교 등 조립 기반 reasoning에 필수적입니다. — This schema defines the top-level BOM structure for finished or semi-finished products. Each material number (MATNR) is linked to a unique BOM ID (STLNR), which serves as the entry point for traversing its component breakdown. This enables reasoning over product decomposition, variant comparison, and dependency tracing.',
  'nodes': [{'label': 'Material',
    'primary_key': 'STLNR',
    'description': "이 노드는 BOM에서 완성품 또는 반제품으로 정의되는 상위 자재를 나타냅니다. 'MATNR'은 자재 번호이며, 'STLNR'은 해당 자재 구조를 참조하는 BOM 번호입니다. 이 노드는 STPO와의 연결을 통해 구성 요소를 탐색하는 출발점이 됩니다. — This node represents a top-level product or semi-finished material as defined in the BOM. 'MATNR' is the material number, and 'STLNR' is the BOM ID used to link to component structures in STPO."}],
  'r

### Reasoning 테스트

In [36]:
from openai import OpenAI
from typing import Optional

def run_ontology_reasoning_prompt(
        ontology_llm_instruction,
        ontology_example,
        applied_example,
        retrieved_documents,
        user_query
) -> str :
    
    client = OpenAI(api_key="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")
    messages = [
        # Step 1: instruction 넣어주기 in system prompt
        {
            "role": "system",
            "content": ontology_llm_instruction
        },

        # Step 2 : Ontology example와 applied example 넣어주기
        {
            "role": "user",
            "content": f"""
Here is an example ontology and how you should reason with it:
## Ontology Example : {ontology_example}

## Applied Reasoning Examples : {applied_example}

IMPORTANT:
- The examples you receive are for **reasoning pattern and format only**.
- When answering a real question, you must rely **only on the schema and context** that follows the question, not the examples.
"""
        },
        {
            "role": "assistant",
            "content": "Understood. I will follow this structure in future reasoning."
        },


        # Step 3 : 실제 사용자 query와 관련 schema 정보 넣어주기
        {
            "role": "user",
            "content": f""" Here are the schemas for you to understand the relationships between entities, and to construct actions.
Schema : {retrieved_documents}
Now answer this question using the same step-by-step reasoning and action structure.
Q: {user_query}

- Use the schema information to generate cypher queries. you must generate correct cypher queries for all the actions related to running queries.
"""
        }
    ]

    response = client.chat.completions.create(
        model = 'gpt-4o',
        messages = messages,
        temperature = 0
    )

    answer = response.choices[0].message.content
    print(answer)
    return answer

In [37]:
reasoning_context = run_ontology_reasoning_prompt(
    ontology_llm_instruction=ontology_llm_instruction,
    ontology_example=ontology_example,
    applied_example=applied_example,
    retrieved_documents=matched_schemas,
    user_query=ms_query
)

2025-07-21T11:26:15 httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


## Reasoning

### 1. Schema Structure Analysis
- The question asks about the components of a product identified by `MATNR` "ZHALB13".
- According to the schema, the `MAST` table provides the top-level BOM structure, where each `MATNR` is linked to a unique `STLNR`.
- The `STPO` table defines the hierarchical structure linking parent materials (`STLNR`) to their components (`IDNRK`).
- The `MAST_STPO` schema integrates these tables, allowing us to trace from a finished or semi-finished product to its components.

### 2. Inference Flow
- First, identify the `STLNR` associated with `MATNR` "ZHALB13" from the `MAST` table.
- Use the `STLNR` to find all components (`IDNRK`) in the `STPO` table.
- This will provide a list of materials that compose the product "ZHALB13".

## Actions

### 1. Tool Call: Connect to Neo4j GraphDB and run Cypher query to find the `STLNR` for `MATNR` "ZHALB13".
```cypher
MATCH (m:Material {MATNR: "ZHALB13"})
RETURN m.STLNR AS stlnr
```

### 2. Tool Call: Use the `S

# Agent 연결

## LangChain Agent

In [None]:
import os
print(os.getcwd())  # 현재 워킹 디렉토리 확인

import sys
for path in sys.path:
    print(path)  # import 가능한 경로 확인

/home/pjtl2w01admin/csm/graphDB_pjt/rag
/home/pjtl2w01admin/anaconda3/envs/neo4j/lib/python311.zip
/home/pjtl2w01admin/anaconda3/envs/neo4j/lib/python3.11
/home/pjtl2w01admin/anaconda3/envs/neo4j/lib/python3.11/lib-dynload

/home/pjtl2w01admin2/.local/lib/python3.11/site-packages
/home/pjtl2w01admin/anaconda3/envs/neo4j/lib/python3.11/site-packages
/home/pjtl2w01admin/csm/graphDB_pjt


In [None]:
from langchain_neo4j import GraphCypherQAChain, Neo4jGraph
from neo4j import GraphDatabase
from langchain.tools import tool
from langchain.agents import create_react_agent, AgentExecutor
from langchain_core.prompts import ChatPromptTemplate
from langchain.agents.openai_functions_agent.base import create_openai_functions_agent
from langchain_google_genai import ChatGoogleGenerativeAI
from neo4j_graphrag.retrievers import Text2CypherRetriever
import requests
import importlib
import csm.graphDB_pjt.rag.config as config
importlib.reload(config)
print(dir(config))

from csm.graphDB_pjt.rag.config import MATCHED_SCHEMA

['AURA_INSTANCEID', 'AURA_INSTANCENAME', 'CYPHER_TEMPLATES', 'GEMINI_API_KEY', 'MATCHED_SCHEMA', 'NEO4J_DATABASE', 'NEO4J_EXAMPLES', 'NEO4J_PASSWORD', 'NEO4J_SCHEMA', 'NEO4J_URI', 'NEO4J_USERNAME', 'OPEN_API_KEY_SERVER', 'OPEN_API_KEY_TEST', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__']


In [40]:
# 개인 neo4j aura gdb
uri = "neo4j+s://4a1461b3.databases.neo4j.io"
username = "neo4j"
password = "4a3eaaII9Gde1VIFSvECkPtER_Jzsks0NOtK6cBs0Sw"
driver = GraphDatabase.driver(uri, auth=(username, password))

In [41]:
llm = get_model()

2025-07-21T11:26:16 __main__: OpenAI 모델이 성공적으로 초기화되었습니다.


In [42]:
global matched_schemas_str

In [43]:
def replace_curly_braces_with_square_braces(json_like_string) -> str:
    # 문자열 내의 '{'을 '['로, '}'을 ']'로 대체
    modified_string = json_like_string.replace('{', '[').replace('}', ']')
    return modified_string

# matched_schemas를 먼저 문자열로 변환
matched_schemas_str = str(matched_schemas)
matched_schemas_str = replace_curly_braces_with_square_braces(matched_schemas_str)
matched_schemas_str

'[[\'table_name\': \'MAST\', \'schema_explanation\': \'이 스키마는 완성품(또는 반제품)을 구성하는 BOM(자재 명세서) 상의 최상위 구조를 정의합니다. 각 제품(MATNR)은 고유한 BOM 번호(STLNR)를 가지며, 이를 통해 제품이 어떤 하위 자재들로 구성되는지를 탐색할 수 있는 진입점이 됩니다. 이 구조는 완성품 기준 역추적, 제품 계층 분석, 구조 비교 등 조립 기반 reasoning에 필수적입니다. — This schema defines the top-level BOM structure for finished or semi-finished products. Each material number (MATNR) is linked to a unique BOM ID (STLNR), which serves as the entry point for traversing its component breakdown. This enables reasoning over product decomposition, variant comparison, and dependency tracing.\', \'nodes\': [[\'label\': \'Material\', \'primary_key\': \'STLNR\', \'description\': "이 노드는 BOM에서 완성품 또는 반제품으로 정의되는 상위 자재를 나타냅니다. \'MATNR\'은 자재 번호이며, \'STLNR\'은 해당 자재 구조를 참조하는 BOM 번호입니다. 이 노드는 STPO와의 연결을 통해 구성 요소를 탐색하는 출발점이 됩니다. — This node represents a top-level product or semi-finished material as defined in the BOM. \'MATNR\' is the material number, and \'STLNR\' is the BOM ID used to link to component structures 

In [44]:
reasoning_context = replace_curly_braces_with_square_braces(reasoning_context)

In [53]:
cypher_schema = matched_schemas_str

In [56]:
CYPHER_TEMPLATES = """
Task: Generate a VALID Cypher statement for querying a Neo4j graph database from a user input.
---
You are an expert in generating Cypher queries for a Neo4j graph database that stores Bill of Materials (BoM) data.

---

Schema:
"""+cypher_schema+"""


Input:
{query_text}


Do not use any properties or relationships not included in the schema. Never make up the labels on your own.
Do not translate any words used in the query into English (ex. DO NOT '울산광역시' -> 'Ulsan')
Do not include triple backticks ``` or any additional text except the generated Cypher statemity name(ORT01) the user usedent in your response.

Cypher query:
"""

In [46]:
NEO4J_EXAMPLES = [
  "USER_INPUT: 'BoM 넘버가 41일 때 어떤 구성품들로 이루어져 있어?' QUERY: MATCH (p:Material {{STLNR: '41'}})-[:HAS_COMPONENT]->(c:Material) RETURN c.IDNRK AS Component",

  "USER_INPUT: '46번 BOM 구조와 각 구성품의 수량을 보여줘.' QUERY: MATCH (p:Material {{STLNR: '46'}})-[r:HAS_COMPONENT]->(c:Material) RETURN c.IDNRK AS Component, r.MENGE AS Quantity, r.MEINS AS Unit ORDER BY r.STPOZ",

  "USER_INPUT: 'GE612B2000 자재가 어떤 BoM에 쓰이는지 알려줘.' QUERY: MATCH (p:Material)-[r:HAS_COMPONENT]->(c:Material {{IDNRK: 'GE612B2000'}}) RETURN p.STLNR AS BOMNumber, r.MENGE AS Quantity",

  "USER_INPUT: '100번 BoM 전체 트리를 그래프로 보여줘.' QUERY: MATCH path = (p:Material {{STLNR: '100'}})-[:HAS_COMPONENT*]->(c:Material) RETURN path",

  "USER_INPUT: '서울특별시에 문제가 생기면 ZFERT101 생산에 어떤 영향이 있어?' QUERY: MATCH (m:Material {{MATNR: 'ZFERT101'}})-[:HAS_COMPONENT*1..5]->(part:Material)<-[:SUPPLIED_BY]-(s:Supplier)-[:LOCATED_IN]->(city:City {{ORT01: '서울특별시'}}) RETURN DISTINCT s.LIFNR AS Supplier, part.IDNRK AS Component"
]

In [None]:
def create_tools():
    @tool("text2Cypher")
    def text2Cypher(query: str) -> str:
        """
        Translates a natural language question into a Cypher query, executes it on the graph DB,
        and returns both the generated Cypher and search results.
        """
        try:
            retriever = Text2CypherRetriever(
                driver=driver,
                llm=llm,  # type: ignore
                neo4j_schema=config.MATCHED_SCHEMA,
                examples=NEO4J_EXAMPLES,
                custom_prompt =CYPHER_TEMPLATES
            )
            
            result = retriever.search(query_text=query)

            # Cypher 쿼리 추출
            cypher_query = result.metadata.get("cypher", "[No Cypher query generated]")

            # Cypher 결과 포맷팅
            if not result.items:
                results_str = "NO INFORMATION FOUND"
            else:
                formatted_items = [
                    f"[{i+1}] {item.content}" for i, item in enumerate(result.items)
                ]
                results_str = "\n".join(formatted_items)

            
            output = (
                f"[Cypher Query]\n{cypher_query}\n\n"
                f"[Results]\n{results_str}"
            )
            return output

        except Exception as e:
            return f"text2Cypher failed for query '{query}'. Error: {str(e)}"

    
    return [text2Cypher]
        

In [48]:
def get_system_prompt():
    system_prompt = """
    You are an intelligent agent that helps route natural language queries to the appropriate data system: GraphDB.
    
    ## Your Goal
    Given a user's question, follow these steps to determine how to answer it:

    1. Analyze the question and identify what information is required.
    2. Refer to the schemas below to understand what data is available in:
        - The **Graph Database (Neo4j)**.
    3. Determine which system(s) contain the necessary data.
    4. Use the appropriate tool:
        - Use `text2Cypher` if the required data is in the graph database.
    5. **Important**: Only use the columns explicitly listed in the schema below.  
        - **Do not assume or generate column names that are not present.**
        - If the required information is not in the schema, Use other data to gain the information!
    6. The reasoning information is provided to help you get the idea of constructing your actions.
    
    Ontology reasoning context : """ +reasoning_context+ """
    
    Then call the corresponding tool among:
    {tools}

    ##  Database Information in Graph DB (Neo4j)
    This database models product structure (BoM) as a graph.
    Schema : """ +matched_schemas_str+ """

    ---
    Use the following format:

    Question: the input question you must answer
    Thought: you should always think about what to do
    Action: the action to take, should be one of [{tool_names}] 
    Action Input: the input to the action
    Observation: the result of the action
    ... (this Thought/Action/Action Input/Observation can repeat maximum 5 times)
    Thought: I now know the final answer
    Final Answer: the final answer to the original input question

     IMPORTANT: 
    - Do NOT include code blocks, backticks in your Action Input
    
    Now, respond with the correct function call based on the user's query.

    """

    return ChatPromptTemplate.from_messages(
        [
            ("system", system_prompt),
            ("human", "{input}"),
            ("assistant", "{agent_scratchpad}"),
        ]
    )

In [49]:
def create_agent(model, tools, prompt):
    """Create the OpenAI Functions agent."""
    return create_react_agent(model, tools, prompt)

def create_agent_executor(agent, tools):
    """Create the agent executor."""
    return AgentExecutor(
        agent=agent,
        tools=tools,
        verbose=True,
        return_intermediate_steps=True,
        handle_parsing_errors=True
    ) 

In [50]:
def initialize_agent():
    """Initialize the agent and return the executor."""

    tools = create_tools()
    print(tools)
    prompt = get_system_prompt()
    agent = create_agent(llm, tools, prompt)
    agent_executor = create_agent_executor(agent, tools)
    return agent_executor

In [51]:
def run_SAP_llm(query):
    
    agent_executor = initialize_agent()
    result = agent_executor.invoke({"input": query})

    return result

In [57]:
result=run_SAP_llm(ms_query)

[StructuredTool(name='text2Cypher', description='Translates a natural language question into a Cypher query, executes it on the graph DB,\nand returns both the generated Cypher and search results.', args_schema=<class 'langchain_core.utils.pydantic.text2Cypher'>, func=<function create_tools.<locals>.text2Cypher at 0x7eff8615b6a0>)]


[1m> Entering new AgentExecutor chain...[0m


2025-07-21T11:29:30 httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3mQuestion: MATNR이 ZHALB13인 제품은 어떤 자재들로 구성되어있나요?
Thought: To answer this question, I need to find the components of the product with MATNR "ZHALB13". This requires finding the STLNR associated with MATNR "ZHALB13" in the MAST table and then using that STLNR to find all components (IDNRK) in the STPO table.
Action: text2Cypher
Action Input: MATCH (m:Material {MATNR: "ZHALB13"}) RETURN m.STLNR AS stlnr[0m

2025-07-21T11:29:32 httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[36;1m[1;3m[Cypher Query]
MATCH (m:Material {MATNR: "ZHALB13"}) RETURN m.STLNR AS stlnr

[Results]
[1] <Record stlnr='00000003'>[0m

2025-07-21T11:29:33 httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3mAction: text2Cypher
Action Input: MATCH (parent:Material {STLNR: '00000003'})-[:HAS_COMPONENT]->(component:Material) RETURN component.IDNRK AS component_materials[0m

2025-07-21T11:29:35 httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[36;1m[1;3m[Cypher Query]
MATCH (parent:Material {STLNR: '00000003'})-[:HAS_COMPONENT]->(component:Material) RETURN component.IDNRK AS component_materials

[Results]
[1] <Record component_materials='ZHALB22'>
[2] <Record component_materials='ZROH24'>
[3] <Record component_materials='ZROH25'>[0m

2025-07-21T11:29:36 httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3mFinal Answer: 제품 MATNR이 ZHALB13인 경우, 해당 제품은 다음과 같은 자재들로 구성되어 있습니다: ZHALB22, ZROH24, ZROH25.[0m

[1m> Finished chain.[0m


In [58]:
print(result["output"])

제품 MATNR이 ZHALB13인 경우, 해당 제품은 다음과 같은 자재들로 구성되어 있습니다: ZHALB22, ZROH24, ZROH25.


# ---------------------------------------------------------

In [None]:
    
    @tool("get_exchange_rate")
    def get_exchange_rate(query: str) -> str:
        """
        Retrieve real-time exchange rate. Format: 'USD to KOR'
        """
        try:
            base, _, target = query.upper().partition(" TO ")
            res = requests.get(f"https://api.exchangerate.host/latest?base={base}&symbols={target}").json()
            rate = res["rates"].get(target)
            return f"Exchange rate from {base} to {target} is {rate}"
        except Exception as e:
            return f"Failed to retrieve exchange rate: {str(e)}"

    @tool("calculate_cost")
    def calculate_cost(
        bom_data: str,
        usd_to_krw: float = 1400.0,
        weather: str = "none",
        raw_price_increase_percent: float = 0.0,
        semi_price_increase_percent: float = 0.0,
    ) -> dict:
        """
        Calculate final cost of product in KRW and return cost + change rate (%).

        Parameters:
        - bom_data: JSON string (Cypher result) with fields: MATNR, IDNRK, MENGE -> need to query cypher first.
        - usd_to_krw: exchange rate (e.g. 1400)
        - weather: 'rainy' = +10%, 'sunny' = -5%, 'none' = 0%
        - raw_price_increase_percent: float, raw material price change
        - semi_price_increase_percent: float, semi-finished price change

        Returns:
        - dict with keys:
            - "final_krw_cost": float (after all adjustments)
            - "change_percent": float (% change from base)
        """
        try:
            bom_result = json.loads(bom_data)
        except Exception as e:
            raise ValueError(f"Invalid bom_data JSON: {e}")

        if not isinstance(bom_result, list) or not bom_result:
            raise ValueError("bom_data must be a non-empty JSON array")

        raw_base_usd = 100.0
        semi_proc_multiplier = 1.2
        finished_proc_multiplier = 1.1

        def calc_usd_cost(raw_up=0.0, semi_up=0.0):
            raw_costs, semi_costs = {}, {}
            for item in bom_result:
                if item["IDNRK"].startswith("RM"):
                    qty = float(item["MENGE"])
                    unit_price = raw_base_usd * (1 + raw_up / 100)
                    raw_costs[item["IDNRK"]] = raw_costs.get(item["IDNRK"], 0) + qty * unit_price
            for item in bom_result:
                if item["IDNRK"].startswith("SF"):
                    sf = item["IDNRK"]
                    sf_comps = [i for i in bom_result if i["MATNR"] == sf]
                    cost = 0
                    for c in sf_comps:
                        if c["IDNRK"].startswith("RM"):
                            qty = float(c["MENGE"])
                            unit_price = raw_base_usd * (1 + raw_up / 100)
                            cost += qty * unit_price
                    semi_costs[sf] = cost * semi_proc_multiplier * (1 + semi_up / 100)
            root = bom_result[0]["MATNR"]
            final_comps = [i for i in bom_result if i["MATNR"] == root]
            total = 0
            for c in final_comps:
                qty = float(c["MENGE"])
                if c["IDNRK"].startswith("RM"):
                    unit_price = raw_base_usd * (1 + raw_up / 100)
                    total += qty * unit_price
                elif c["IDNRK"].startswith("SF"):
                    total += semi_costs.get(c["IDNRK"], 0)
            return total * finished_proc_multiplier

        # Base USD → base KRW
        base_usd = calc_usd_cost()
        adjusted_usd = calc_usd_cost(raw_price_increase_percent, semi_price_increase_percent)

        base_krw = base_usd * usd_to_krw
        final_krw = adjusted_usd * usd_to_krw

        # Weather adjustment
        weather = weather.lower()
        if "rain" in weather:
            final_krw *= 1.1
        elif "sunny" in weather:
            final_krw *= 0.95

        change = final_krw - base_krw
        change_pct = (change / base_krw * 100) if base_krw else 0.0

        return {
            "final_krw_cost": round(final_krw, 2),
            "change_percent": round(change_pct, 2)
        }

기타 함수 정의

In [None]:
from datetime import datetime, timedelta
import requests

df_grid = pd.read_excel("/home/pjtl2w01admin/csm/graphDB_pjt/data/location_mapping_data.xlsx")
df_grid[["1단계", "2단계", "3단계"]] = df_grid[["1단계", "2단계", "3단계"]].fillna("")

# 질의 파싱 함수
def find_best_location_match(query: str) -> tuple[str, int, int]:
    if "full_location" not in df_grid.columns:
        df_grid["full_location"] = (
            df_grid["1단계"].fillna("").astype(str) +
            df_grid["2단계"].fillna("").astype(str) +
            df_grid["3단계"].fillna("").astype(str)
        )

    for _, row in df_grid.iterrows():
        full_loc = row["full_location"]
        if full_loc in query or query in full_loc:
            location_str = " ".join([str(row["1단계"]), str(row["2단계"]), str(row["3단계"])]).strip()
            return location_str, row["격자 X"], row["격자 Y"]

    raise ValueError("입력한 위치를 기반으로 일치하는 행정구역을 찾을 수 없습니다.")

# 좌표 조회 함수
def parse_datetime(query: str) -> datetime:
    if "내일" in query:
        target_date = datetime.now().timedelta(days=1)
    else:
        target_date = datetime.now()
    
    if "오전" in query:
        hour = 9
    elif "오후" in query:
        hour = 15
    else:
        hour = target_date.hour
    
    return target_date.replace(hour=hour, minute=0, second=0, microsecond=0)


# 기상청 api 호출 함수
def fetch_korea_weather(nx: int, ny: int, dt: datetime):
    base_date = dt.strftime("%Y%m%d")
    base_time = f"{dt.hour:02d}00"

    #초단기실황조회 url
    url = "http://apis.data.go.kr/1360000/VilageFcstInfoService_2.0/getUltraSrtNcst"
    #초단기예보조회 url = "http://apis.data.go.kr/1360000/VilageFcstInfoService_2.0/getUltraSrtFcst"
    params = {
        "serviceKey" : os.environ["KMA_SERVICE_KEY"],
        "numbOfRows": "60",
        "pageNo" : "1",
        "dataType": "JSON",
        "base_date": base_date,
        "base_time": base_time,
        "nx": nx,
        "ny": ny
    }

    res = requests.get(url, params=params)
    data = res.json()
    if "response" not in data or data["response"]["header"]["resultCode"] != "00":
        raise RuntimeError("기상청 API 응답 오류 발생")
    
    return data["reponse"]["body"]["items"]["item"]

In [None]:
query="서울특별시 역삼2동"
dt = parse_datetime(query)
print(dt)
location_str, nx, ny = find_best_location_match(query)
print(f"location: {location_str}, nx: {nx}, ny:{ny}")
#items = fetch_korea_weather(nx, ny, dt)
#print(items)

In [None]:
df_grid.head()

## LangGraph Multi Agent

상태 설정

In [None]:
import os
os.environ['TAVILY_API_KEY'] = "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"
os.environ['OPENWEATHERMAP_API_KEY'] = "3dc94afe001b7646ae38661419697f18"
os.environ["KMA_SERVICE_KEY"] = "WQ3Abccbzez/pI1ffAxH4Ax9CosVcIjOPIJzy21aFAi5Z/62otDWrQcxE/GU3zDckP4fJiE7dqrp2IJxfZnG7Q=="

In [None]:
from typing import Annotated
from typing_extensions import TypedDict
from langgraph.graph.message import add_messages

class State(TypedDict):
    messages: Annotated[list, add_messages]

tool 정의

In [None]:
from typing import Annotated, Literal
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_experimental.tools import PythonREPLTool

tavily_search_tool = TavilySearchResults(max_results=2)
python_repl_tool = PythonREPLTool()

In [None]:
from langchain.tools import BaseTool, StructuredTool, tool
from langchain.utilities.openweathermap import OpenWeatherMapAPIWrapper
import pandas as pd

@tool("weather_abroad")
def weather_abroad(query: str) -> str:
    """
    Retrieves the current weather conditions for a specific location.

    Input: A location name (e.g., 'Seoul', 'New York') written in English.
    Output: A brief weather report including temperature, humidity, and general conditions.

    Use this tool when the user is asking about the weather in a specific place.
    """

    try:
        weather  = OpenWeatherMapAPIWrapper()
        return weather.run(query)
    except Exception as e:
        return f"Weather tool faild for query '{query}'.\nProceed again or try search tool. Error: {str(e)}"

@tool("korea_weather")
def korea_weather(query: str) -> str:
    """
    Retrieves current weather information from the Korea Meteorological Administration (KMA) based on user query.

    Input: A natural language question including a Korean location name and time (e.g., "부산 내일 오전 9시 날씨 알려줘").
    Output: A human-readable weather summary for that location and time, including temperature and rainfall status.
    """
    try:
        location, dt = parse_query(query)
        nx, ny = get_xy(location)
        items = fetch_korea_weather(nx, ny, dt)

에이전트 정의

시각적으로 확인

실행