In [1]:
import importlib
import json
import logging
import os
import re
import sys
from collections import defaultdict
from io import BytesIO
from typing import List, Dict, Any
from uuid import uuid4

import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from fastapi import UploadFile
from IPython.display import Markdown, display
from langchain_community.vectorstores import Chroma, ElasticsearchStore
from langchain_core.documents import Document
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langchain_elasticsearch import ElasticsearchRetriever
from langchain_openai import ChatOpenAI, OpenAI
from langchain_openai import OpenAIEmbeddings

# Project-local modules: the path must be registered before they are imported.
sys.path.append(r'/home/pjtl2w01admin/csm/graphDB_pjt')
import config
importlib.reload(config)
from rag.retrieve_schema import search_related_ontologies

In [2]:
# Logging configuration: INFO level, ISO-like timestamps, logger name in every record.
logging.basicConfig(level='INFO', format='%(asctime)s %(name)s: %(message)s', datefmt='%Y-%m-%dT%H:%M:%S')
logger = logging.getLogger(__name__)

# Temporary working directory, created under the current working directory.
TMP_DIR = os.path.join(os.getcwd(), "tmp")
os.makedirs(TMP_DIR, exist_ok=True)  # create the directory if it does not exist

# VectorDB persistence path (relative to the current working directory).
PERSIST_DIRECTORY = "inforactive_vdb"
os.makedirs(PERSIST_DIRECTORY, exist_ok=True) 

In [3]:
# Connection settings are read from the environment, with local defaults.
ELASTICSEARCH_URL = os.getenv("ELASTICSEARCH_URL", "http://localhost:9200")
ELASTICSEARCH_INDEX_NAME = os.getenv("ELASTICSEARCH_INDEX_NAME", "inforactive_vdb")
# No default here: this is None when the variable is not set.
OPEN_API_KEY_SERVER = os.getenv("OPEN_API_KEY_SERVER")

# Field names; TEXT_FIELD and CONTENT_FIELD are both set to "text".
TEXT_FIELD = "text"
CONTENT_FIELD = "text"
VECTOR_FIELD = "vector"

## Ontology Selection

In [4]:
# System prompt for the schema-selection step: the model must list the schemas
# required to answer a question and describe the reasoning chain linking them.
# The headings "Required schemas" and "Reasoning chain" are load-bearing:
# extract_required_schemas() parses the model output by looking for them.
ontology_instruction = """
You are a schema reasoning assistant.

Given a user's question and a list of available graph schemas (each with nodes, relationships, and descriptions), your job is :
- to select **all schemas that are needed** to answer the question. 
- explain how they are connected together to form a logical reasoning chain

---

### What you must do:

1. Understand the user's question and break it down into the key data elements needed to answer it.
2. **Determine which schemas are required,** including those that provide:
  - Final answers
  - Intermediate values necessary to reach them
3. **Describe the reasoning chain**, step by step:
  - What data is needed at each step
  - Which schema provides it
  - How outputs from one schema connect to inputs in the next
  - Include any field mappings where names differ but meanings align
---

### Output Format (in natural language, not JSON):

**Required schemas (in order):**  
- FirstSchemaName  
- SecondSchemaName  
- ThirdSchemaName  
...

**Reasoning chain:**  
1. First, use `FirstSchemaName` to find X (e.g., the components of a product).  
2. Then, map the value `component_id` from `FirstSchemaName` to `material_id` in `SecondSchemaName`, which allows you to find Y (e.g., which plant produces it).  
3. Finally, use `ThirdSchemaName` to find the location of the plant using `plant_id`.  

Be precise and include field name mappings if relevant.

"""

In [5]:
# Load the few-shot example schemas from a JSON file in the working directory.
with open("example_schema.json", "r", encoding="utf-8") as f:
    example_schemas = json.load(f)

In [6]:
# Few-shot example *data* (nodes and relationships) for six toy schemas:
# ProductStructure, ProductSupplier, PlantLocation, GeoLocation,
# PersonalNetwork and EnergyUsagePlan.
# NOTE(review): this is a plain (non-f) string, so the doubled braces are kept
# literally when the value is interpolated into the prompt — confirm that the
# doubling is intended and not a leftover from a str.format()/template version.
ontology_example = """
## ProductStructure
- Material {{parent_id: "m1", material_id: "m1", name: "Cake"}}
- Material {{component_id: "m2", name: "Flour"}}
- Material {{component_id: "m3", name: "Egg"}}
- Material {{component_id: "m4", name: "Whipped Cream"}}
- Material {{component_id: "m5", name: "Milk"}}

- MADE_OF {{from: "parent_id:m1", to: "component_id:m2"}}
- MADE_OF {{from: "parent_id:m1", to: "component_id:m3"}}
- MADE_OF {{from: "parent_id:m1", to: "component_id:m4"}}
- MADE_OF {{from: "component_id:m4", to: "component_id:m5"}}

## ProductSupplier
- Material {{material_id: "m2", name: "Flour"}}
- Material {{material_id: "m3", name: "Egg"}}
- Material {{material_id: "m4", name: "Whipped Cream"}}
- Material {{material_id: "m5", name: "Milk"}}
- Plant {{plant_id: "plnt1", name: "CJ"}}
- Plant {{plant_id: "plnt2", name: "Otoki"}}
- Plant {{plant_id: "plnt3", name: "Lotte"}}

- PRODUCED_IN {{from: "material_id:m2", to: "plant_id:plnt1"}}
- PRODUCED_IN {{from: "material_id:m3", to: "plant_id:plnt2"}}
- PRODUCED_IN {{from: "material_id:m4", to: "plant_id:plnt3"}}
- PRODUCED_IN {{from: "material_id:m5", to: "plant_id:plnt3"}}

## PlantLocation
- Plant {{plant_id: "plnt1", name: "CJ"}}
- Plant {{plant_id: "plnt2", name: "Otoki"}}
- Plant {{plant_id: "plnt3", name: "Lotte"}}
- City {{city_id: "c1", name: "Seoul"}}
- City {{city_id: "c2", name: "Busan"}}
- City {{city_id: "c3", name: "Ulsan"}}
- City {{city_id: "c4", name: "Gumi"}}

- LOCATED_IN {{from: "plant_id:plnt1", to: "city_id:c4"}}
- LOCATED_IN {{from: "plant_id:plnt2", to: "city_id:c3"}}
- LOCATED_IN {{from: "plant_id:plnt3", to: "city_id:c1"}}
- LOCATED_IN {{from: "plant_id:plnt3", to: "city_id:c2"}}

## GeoLocation
- City {{city_id: "c1"}}
- City {{city_id: "c2"}}
- City {{city_id: "c3"}}
- City {{city_id: "c4"}}
- City {{city_id: "c5"}}
- Country {{country_id: "k1"}}
- Country {{country_id: "k2"}}

- PART_OF {{from: "city_id:c1", to: "country_id:k1"}}
- PART_OF {{from: "city_id:c2", to: "country_id:k1"}}
- PART_OF {{from: "city_id:c3", to: "country_id:k1"}}
- PART_OF {{from: "city_id:c4", to: "country_id:k1"}}
- PART_OF {{from: "city_id:c5", to: "country_id:k2"}}

## PersonalNetwork
- Person {{person_id: "p1", id: "p1", name: "Amy", job: "Engineer"}}
- Person {{person_id: "p2", id: "p2", name: "Bob", job: "Designer"}}
- Person {{person_id: "p3", id: "p3", name: "Charlie", job: "Chef"}}
- Person {{person_id: "p4", id: "p4", name: "Diana", job: "Manager"}}
- Hobby {{hobby_id: "h1", name: "Climbing"}}
- Hobby {{hobby_id: "h2", name: "Cooking"}}
- Address {{address_id: "a1", city_id: "c1", name: "Seoul"}}
- Address {{address_id: "a2", city_id: "c3", name: "Ulsan"}}
- Address {{address_id: "a3", city_id: "c5", name: "Osaka"}}

- FRIEND_OF {{from: "person_id:p1", to: "person_id:p2"}}
- FRIEND_OF {{from: "person_id:p2", to: "person_id:p4"}}
- COWORKER_OF {{from: "person_id:p2", to: "person_id:p3"}}
- RELATIVE_OF {{from: "person_id:p1", to: "person_id:p3"}}
- FAMILY_OF {{from: "person_id:p3", to: "person_id:p4"}}
- ENJOYS {{from: "person_id:p1", to: "hobby_id:h1"}}
- ENJOYS {{from: "person_id:p3", to: "hobby_id:h2"}}
- LIVES_IN {{from: "person_id:p1", to: "address_id:a1"}}
- LIVES_IN {{from: "person_id:p2", to: "address_id:a1"}}
- LIVES_IN {{from: "person_id:p3", to: "address_id:a2"}}
- LIVES_IN {{from: "person_id:p4", to: "address_id:a3"}}

## EnergyUsagePlan
- Material {{material_id: "m1", name: "Cake"}}
- TimePeriod {{period_id: "tp1", year: "2025", quarter: "Q1"}}
- UsagePlan {{plan_id: "plan1", kwh: 1200, unit_cost: 0.2, estimated_cost: 240}}

- HAS_USAGE_PLAN {{from: "material_id:m1", to: "plan_id:plan1"}}
- FOR_PERIOD {{from: "plan_id:plan1", to: "period_id:tp1"}}
"""

In [7]:
# Worked few-shot answers for two questions over the toy example schemas.
# Each answer must model the exact output format requested by the system
# prompt: a "Required schemas (in order)" list whose order matches the steps
# of the "Reasoning chain" that follows it.
# Q2 follows the COWORKER_OF edge from Bob (p2) to Charlie (p3), which is the
# only Bob -> Charlie relationship present in the example data.
applied_example = """
Q1. 무슨 지역에 홍수가 나면 Cake라는 완성품을 위한 공급에 차질이 생길 수 있나요?

Answer :
**Required schemas (in order):**
- ProductStructure  
- ProductSupplier  
- PlantLocation  
- GeoLocation

**Reasoning chain:**  
1. First, use `ProductStructure` to find which sub-materials (components) are used to make the product `"Cake"`.  
   → This is done by traversing `MADE_OF` relationships from `parent_id = m1`.

2. Then, map each `component_id` from `ProductStructure` to `material_id` in `ProductSupplier`.  
   → This lets us find the `plant_id` where each component is produced via the `PRODUCED_IN` relationship.

3. Next, use `PlantLocation` to find which `city_id` each `plant_id` is located in via the `LOCATED_IN` relationship.

4. Finally, use `GeoLocation` to map each `city_id` to its `country_id`, which allows us to understand the regional scope of supply risk (e.g., flood-prone countries or cities).

-----
Q2. Bob과 동료 관계에 있는 사람이 즐기는 취미가 Cooking일 때, 그 사람이 사는 지역의 공장에서 생산되는 원재료가 들어간 완제품은 무엇인가요?

Answer :
**Required schemas (in order):**
- PersonalNetwork  
- PlantLocation  
- ProductSupplier  
- ProductStructure

**Reasoning chain:**  
1. First, use `PersonalNetwork` to find Bob (`person_id:p2`) and trace all people who are `COWORKER_OF` Bob.  
   → This leads to `person_id:p3` (Charlie).

2. Still in `PersonalNetwork`, check if that person (Charlie) has the hobby `Cooking` via the `ENJOYS` relationship.  
   → Confirmed: `person_id:p3` → `hobby_id:h2` (Cooking)

3. Use the `LIVES_IN` relationship in `PersonalNetwork` to get the `address_id` of Charlie.  
   → Charlie lives at `address_id:a2`, which has `city_id:c3`.

4. Use `PlantLocation` to find all plants located in that same `city_id:c3`.  
   → `plant_id:plnt2` is located in Ulsan.

5. Use `ProductSupplier` to find materials that are `PRODUCED_IN` `plant_id:plnt2`.  
   → This gives `material_id:m3` (Egg)

6. Use `ProductStructure` to find any `parent_id` (product) that is `MADE_OF` the given `component_id = m3`.  
   → `m1 (Cake)` is composed of `m3 (Egg)`

7. Therefore, the final product affected is `"Cake"` — it contains raw material(s) produced in the region where Charlie lives.
"""

In [8]:
# Sample user questions (in Korean) for exercising the pipeline.
# Only query2 is passed to the selection and retrieval calls in this notebook.
query = "2025년 3분기 매출 계획에서 부산에 있는 공급처 문제가 발생하면 가장 큰 영향을 받는 제품은?"
query2 = "2025년 1분기 매출이 가장 높은 제품이 어떤 자재들로 구성되어 있고, 그 자재들을 공급하는 공급처는 어느 지역에 있어?"
query3 = "matnr이 MAT0001와 관련된 모든 자재를 알려줘. 하위자재의 하위자재들까지 다"
query4 = "BoM 85822413번의 수량은 몇개야?"
query5 = "공급업체 VN-1200이 가동불가능하면 영향을 받을 가능성이 있는 모든 자재명을 알려줘"
query6 = "Dallas 지역에서 생산이 불가능할 때 해당 자재를 생산할 수 있는 다른 공급업체를 찾아줘. 지역도 알려줘"
query7 = "한국에 있는 고객 리스트를 보여줘"

LLM 판단

In [9]:
def get_model():
    """Create the chat model used by every prompt in this notebook.

    The API key is read from the environment (``OPEN_API_KEY_SERVER`` first,
    then ``OPENAI_API_KEY``) rather than being embedded in the notebook.
    SECURITY: a literal key was previously committed in this cell; treat it
    as compromised and revoke it.

    Returns:
        ChatOpenAI: a ``gpt-4o`` client with ``temperature=0`` and
        ``max_tokens=4048``.

    Raises:
        ValueError: if neither environment variable is set.
        Exception: any error raised while constructing the client is logged
            and re-raised unchanged.
    """
    try:
        api_key = os.getenv("OPEN_API_KEY_SERVER") or os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError(
                "OpenAI API key not found: set OPEN_API_KEY_SERVER or OPENAI_API_KEY"
            )
        model = ChatOpenAI(
            openai_api_key=api_key,
            model="gpt-4o",
            temperature=0,
            max_tokens=4048
        )
        logger.info("OpenAI 모델이 성공적으로 초기화되었습니다.")
        return model
    except Exception as e:
        logger.error(f"모델 초기화 중 오류 발생: {e}")
        # Bare raise keeps the original traceback intact.
        raise

In [10]:
def run_ontology_selection_prompt(
        ontology_instruction,
        example_schemas,
        ontology_example,
        applied_example,
        schema_data,
        mapping_information,
        user_query
):
    """Ask the chat model which schemas are needed to answer ``user_query``.

    The conversation has four turns: the system instruction, a human turn
    carrying the few-shot material, a canned assistant acknowledgement, and a
    human turn carrying the real schemas, the linking information and the
    question.

    Args:
        ontology_instruction: system prompt text.
        example_schemas: few-shot example schemas (interpolated via f-string).
        ontology_example: few-shot example values.
        applied_example: few-shot worked answers.
        schema_data: the actual schemas to choose from.
        mapping_information: linking information between schemas.
        user_query: the question to answer.

    Returns:
        The ``content`` attribute of the model response.
    """
    llm = get_model()

    # Few-shot turn: example ontology, example values, worked answers.
    few_shot_turn = f"""
Here is an example ontology and possible answers to potential questions:
## example schemas : {example_schemas}

## example values : {ontology_example}

## answers to potential questions : {applied_example}

IMPORTANT:
- Study the examples provided above to get the idea of answering upcoming questions
- Remember these are the examples, not the actual schema or data that you should examine through!
"""

    # Task turn: the real schemas, linking information and the user question.
    task_turn = f"""
Here are the schemas, linking schemas information, and query that you should look for :
## Schemas : {schema_data}

## linking schemas information : {mapping_information}

Now answer this question using the schema and mapping schemas information => Q : {user_query}
"""

    conversation = [
        SystemMessage(content=ontology_instruction),
        HumanMessage(content=few_shot_turn),
        AIMessage(content="Understood. I will follow this structure in future reasoning"),
        HumanMessage(content=task_turn),
    ]

    reply = llm.invoke(conversation)
    return reply.content

In [11]:
# Load the actual graph schemas from a JSON file in the working directory.
with open("schema_data.json", "r", encoding="utf-8") as f:
    schema_data = json.load(f)

# Schema-linking information comes from the project config module.
mapping_information = config.SCHEMA_LINK_INFO

In [12]:
# LLM-only selection: hand the model every schema in schema_data for query2.
selected_schemas = run_ontology_selection_prompt(
    ontology_instruction=ontology_instruction,
    example_schemas=example_schemas,
    ontology_example=ontology_example,
    applied_example=applied_example,
    schema_data=schema_data,
    mapping_information=mapping_information,
    user_query=query2,
)
selected_schemas

2025-07-29T08:17:40 __main__: OpenAI 모델이 성공적으로 초기화되었습니다.
2025-07-29T08:17:52 httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


'**Required schemas (in order):**  \n- CUSTOM_SALES_PLAN  \n- MAST  \n- STPO  \n- EINA  \n- LFA1  \n\n**Reasoning chain:**  \n1. **Identify the Product with Highest Sales:**  \n   - Use `CUSTOM_SALES_PLAN` to find the product (`MATNR`) with the highest sales amount in the first quarter of 2025.  \n   - This is done by examining the `SalesPlan` node for `year = 2025` and `quarter = Q1`, and selecting the `MATNR` with the highest `amount`.\n\n2. **Find the BOM Structure for the Product:**  \n   - Use `MAST` to map the identified `MATNR` to its corresponding `STLNR` (BOM number).  \n   - This step identifies the BOM structure used by the product.\n\n3. **Identify Components of the Product:**  \n   - Use `STPO` to find all components (`IDNRK`) included in the BOM (`STLNR`) identified in the previous step.  \n   - This step reveals the materials that make up the product.\n\n4. **Map Components to Suppliers:**  \n   - Use `EINA` to map each component (`IDNRK`) to its supplier (`LIFNR`).  \n 

In [13]:
# Render the model's Markdown answer (Markdown/display come from the import cell).
display(Markdown(selected_schemas))

**Required schemas (in order):**  
- CUSTOM_SALES_PLAN  
- MAST  
- STPO  
- EINA  
- LFA1  

**Reasoning chain:**  
1. **Identify the Product with Highest Sales:**  
   - Use `CUSTOM_SALES_PLAN` to find the product (`MATNR`) with the highest sales amount in the first quarter of 2025.  
   - This is done by examining the `SalesPlan` node for `year = 2025` and `quarter = Q1`, and selecting the `MATNR` with the highest `amount`.

2. **Find the BOM Structure for the Product:**  
   - Use `MAST` to map the identified `MATNR` to its corresponding `STLNR` (BOM number).  
   - This step identifies the BOM structure used by the product.

3. **Identify Components of the Product:**  
   - Use `STPO` to find all components (`IDNRK`) included in the BOM (`STLNR`) identified in the previous step.  
   - This step reveals the materials that make up the product.

4. **Map Components to Suppliers:**  
   - Use `EINA` to map each component (`IDNRK`) to its supplier (`LIFNR`).  
   - This step identifies which suppliers provide the materials.

5. **Determine Supplier Locations:**  
   - Use `LFA1` to find the location (`ORT01`) of each supplier (`LIFNR`).  
   - This step provides the city where each supplier is located.

By following these steps, you can determine which materials compose the product with the highest sales in Q1 2025 and where the suppliers of these materials are located.

-------------------------------

RAG 판단 -> LLM 최종 답변

In [14]:
# Retrieve the schemas related to query2 through the project's retrieval helper
# (search_related_ontologies, imported from rag.retrieve_schema).
# NOTE(review): the captured log below shows the Elasticsearch index being
# deleted and recreated during this call — confirm that is acceptable on every run.
retrieved_schemas = search_related_ontologies(query=query2)
retrieved_schemas

  _embeddings_cache = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
2025-07-29T08:17:55 sentence_transformers.SentenceTransformer: Use pytorch device_name: cuda:0
2025-07-29T08:17:55 sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: BAAI/bge-base-en-v1.5
2025-07-29T08:17:59 rag.retrieve_schema: OpenAI Embedding 모델이 성공적으로 초기화되었습니다.
2025-07-29T08:17:59 elastic_transport.transport: HEAD http://localhost:9200/inforactive_vdb [status:200 duration:0.003s]
2025-07-29T08:17:59 rag.retrieve_schema: Elasticsearch 인덱스 'inforactive_vdb' 가 존재합니다. 삭제 후 재생성합니다.
2025-07-29T08:17:59 elastic_transport.transport: DELETE http://localhost:9200/inforactive_vdb [status:200 duration:0.078s]
2025-07-29T08:17:59 rag.retrieve_schema: 인덱스 'inforactive_vdb' 삭제 완료.
2025-07-29T08:18:00 elastic_transport.transport: PUT http://localhost:9200/inforactive_vdb [status:200 duration:0.681s]
2025-07-29T08:18:00 rag.retrieve_schema: 인덱스 'inforactive_vdb' 768차원으로 재생성 완료

DEBUG: type(data) = <class 'list'>
DEBUG: schema count = 6
총 36개 문서를 처리중입니다


2025-07-29T08:18:00 elastic_transport.transport: HEAD http://localhost:9200/inforactive_vdb [status:200 duration:0.002s]
2025-07-29T08:18:01 elastic_transport.transport: PUT http://localhost:9200/_bulk?refresh=true [status:200 duration:0.238s]
2025-07-29T08:18:01 elastic_transport.transport: GET http://localhost:9200/ [status:200 duration:0.003s]
  return forward_call(*args, **kwargs)
2025-07-29T08:18:01 elastic_transport.transport: POST http://localhost:9200/inforactive_vdb/_search [status:200 duration:0.006s]
2025-07-29T08:18:01 elastic_transport.transport: GET http://localhost:9200/ [status:200 duration:0.002s]
2025-07-29T08:18:01 elastic_transport.transport: POST http://localhost:9200/inforactive_vdb/_search [status:200 duration:0.006s]


벡터 유사도 : [Document(metadata={'_index': 'inforactive_vdb', '_id': '1a5b9df9-540d-4065-938b-1ceba3a6520f', '_score': 0.8813863, '_ignored': ['text.keyword', 'metadata.properties_list.keyword'], '_source': {'metadata': {'table_name': 'KNA1', 'chunk_type': 'node', 'label': 'Extra', 'properties_list': 'MANDT, BBBNR, BBSNR, BUBKZ, UMSAT, UMJAH, JMZAH, JMJAH, UMSA1, HZUOR, J_1KFREPRE, J_1KFTBUS, J_1KFTIND, UPTIM, RIC, LEGALNAT, _VSO_R_PALHGT, _VSO_R_I_NO_LYR, _VSO_R_ULD_SIDE, _VSO_R_LOAD_PREF, PSPNR, J_3GSTDMON, J_3GSTDTAG, J_3GTAGMON, J_3GVMONAT, J_3GEMINBE, J_3GFMGUE, J_3GZUSCHUE'}, 'vector': [-0.011191862635314465, -0.01804390177130699, 0.025941655039787292, -0.006314030848443508, 0.054101359099149704, 0.034678857773542404, 0.027651650831103325, 0.01184951514005661, -0.003872875589877367, -0.03651507943868637, -0.01872679404914379, -0.0077191805467009544, -0.0843634307384491, 0.013264224864542484, -0.01439246628433466, 0.05844118073582649, 0.06626918166875839, 0.011700737290084362, 0.01912

[{'table_name': 'KNA1',
  'schema_explanation': '이 스키마는 SAP 고객(KNA1) 마스터 데이터를 중심으로 고객의 위치, 분류, 세금, 생애주기 정보를 통합 모델링합니다. 이를 통해 고객 기반 지리 추론, 규제 준수 여부 판단, 고객 세분화 등의 작업이 가능합니다. ',
  'nodes': [{'label': 'Customer',
    'primary_key': 'KUNNR',
    'properties': ['KUNNR', 'NAME1', 'NAME2', 'SORTL', 'ANRED'],
    'description': 'SAP 시스템의 고객 또는 비즈니스 파트너 식별자'},
   {'label': 'Address',
    'primary_key': 'ADRNR',
    'properties': ['ADRNR', 'STRAS', 'PSTLZ'],
    'description': '고객의 물리적 주소 정보'},
   {'label': 'City',
    'primary_key': 'ORT01',
    'properties': ['ORT01', 'REGIO'],
    'description': '주소가 위치한 도시'},
   {'label': 'Country',
    'primary_key': 'LAND1',
    'properties': ['LAND1'],
    'description': '도시가 속한 국가'},
   {'label': 'TaxInfo',
    'primary_key': 'concatenatedFields',
    'properties': ['STCD2', 'STCD5'],
    'description': '고객의 세금 정보'},
   {'label': 'Lifecycle',
    'primary_key': 'concatenatedFields',
    'properties': ['ERDAT', 'ERNAM', 'DUEFL'],
    'description': '고객 생성일

In [15]:
# RAG -> LLM: same selection prompt, but fed only the retrieved schemas.
selected_schemas = run_ontology_selection_prompt(
    ontology_instruction=ontology_instruction,
    example_schemas=example_schemas,
    ontology_example=ontology_example,
    applied_example=applied_example,
    schema_data=retrieved_schemas,
    mapping_information=mapping_information,
    user_query=query2,
)
selected_schemas

2025-07-29T08:18:55 __main__: OpenAI 모델이 성공적으로 초기화되었습니다.


2025-07-29T08:19:22 httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


'**Required schemas (in order):**  \n- CUSTOM_SALES_PLAN  \n- MAST  \n- STPO  \n- EINA  \n- LFA1  \n\n**Reasoning chain:**  \n1. **Identify the Product with Highest Sales:**  \n   - Use `CUSTOM_SALES_PLAN` to find the product (`MATNR`) with the highest sales amount in the first quarter of 2025.  \n   - This involves checking the `SalesPlan` node for `year = 2025` and `quarter = Q1`, and identifying the `MATNR` with the maximum `amount`.\n\n2. **Find the BOM Structure for the Product:**  \n   - Use `MAST` to map the identified `MATNR` to its corresponding `STLNR` (BOM number).  \n   - This step establishes the BOM structure for the product.\n\n3. **Identify Components of the Product:**  \n   - Use `STPO` to find all components (`IDNRK`) included in the BOM (`STLNR`) identified in the previous step.  \n   - This step provides the list of materials that make up the product.\n\n4. **Map Components to Suppliers:**  \n   - Use `EINA` to map each component (`IDNRK`) to its supplier (`LIFNR`).

In [16]:
# Render the model's Markdown answer (Markdown/display come from the import cell).
display(Markdown(selected_schemas))

**Required schemas (in order):**  
- CUSTOM_SALES_PLAN  
- MAST  
- STPO  
- EINA  
- LFA1  

**Reasoning chain:**  
1. **Identify the Product with Highest Sales:**  
   - Use `CUSTOM_SALES_PLAN` to find the product (`MATNR`) with the highest sales amount in the first quarter of 2025.  
   - This involves checking the `SalesPlan` node for `year = 2025` and `quarter = Q1`, and identifying the `MATNR` with the maximum `amount`.

2. **Find the BOM Structure for the Product:**  
   - Use `MAST` to map the identified `MATNR` to its corresponding `STLNR` (BOM number).  
   - This step establishes the BOM structure for the product.

3. **Identify Components of the Product:**  
   - Use `STPO` to find all components (`IDNRK`) included in the BOM (`STLNR`) identified in the previous step.  
   - This step provides the list of materials that make up the product.

4. **Map Components to Suppliers:**  
   - Use `EINA` to map each component (`IDNRK`) to its supplier (`LIFNR`).  
   - This step identifies which suppliers provide the materials for the product.

5. **Determine Supplier Locations:**  
   - Use `LFA1` to find the location (`ORT01`) of each supplier (`LIFNR`) identified in the previous step.  
   - This step provides the geographic location of each supplier, indicating where the materials are sourced from.

By following these steps, you can determine the composition of the product with the highest sales in Q1 2025 and the locations of the suppliers for each component material.

----------------------------------------

RAG + LLM 판단

In [17]:
def run_determining_schemas_prompt(
        ontology_instruction,
        example_schemas,
        ontology_example,
        applied_example,
        retrieved_schemas,
        mapping_information,
        user_query
):
    """Ask the chat model to pick the needed schemas out of retrieved candidates.

    Same four-turn conversation as the plain selection prompt, except that the
    final human turn warns the model that some of the supplied schemas may be
    unrelated to the question.

    Args:
        ontology_instruction: system prompt text.
        example_schemas: few-shot example schemas (interpolated via f-string).
        ontology_example: few-shot example values.
        applied_example: few-shot worked answers.
        retrieved_schemas: candidate schemas to choose from.
        mapping_information: linking information between schemas.
        user_query: the question to answer.

    Returns:
        The ``content`` attribute of the model response.
    """
    llm = get_model()

    # Few-shot turn: example ontology, example values, worked answers.
    few_shot_turn = f"""
Here is an example ontology and possible answers to potential questions:
## example schemas : {example_schemas}

## example values : {ontology_example}

## answers to potential questions : {applied_example}

IMPORTANT:
- Study the examples provided above to get the idea of answering upcoming questions
- Remember these are the examples, not the actual schema or data that you should examine through!
"""

    # Task turn: candidate schemas (possibly noisy), linking info, question.
    task_turn = f"""
Here are the schemas, linking schemas information, and query that you should look for.
There can be unrelated schema in answering the question, so examine carefully in selecting the schema and reasoning actions :
## Schemas : {retrieved_schemas}

## linking schemas information : {mapping_information}

Now answer this question using the schema and mapping schemas information => Q : {user_query}
"""

    conversation = [
        SystemMessage(content=ontology_instruction),
        HumanMessage(content=few_shot_turn),
        AIMessage(content="Understood. I will follow this structure in future reasoning"),
        HumanMessage(content=task_turn),
    ]

    reply = llm.invoke(conversation)
    return reply.content

In [18]:
#실행
selected_schemas = run_determining_schemas_prompt(
    ontology_instruction,
    example_schemas,
    ontology_example,
    applied_example,
    retrieved_schemas,
    mapping_information,
    query2
)
selected_schemas

2025-07-29T08:23:55 __main__: OpenAI 모델이 성공적으로 초기화되었습니다.
2025-07-29T08:24:09 httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


'To answer the question, we need to determine which schemas are required and how they are connected to form a logical reasoning chain.\n\n**Required schemas (in order):**  \n- CUSTOM_SALES_PLAN  \n- MAST  \n- STPO  \n- EINA  \n- LFA1  \n\n**Reasoning chain:**  \n1. **Identify the Product with Highest Sales:**\n   - Use `CUSTOM_SALES_PLAN` to find the product (`MATNR`) with the highest sales amount in the first quarter of 2025.\n   - This involves checking the `SalesPlan` node for the `amount` property and filtering by `TimePeriod` with `year = 2025` and `quarter = Q1`.\n\n2. **Find BOM Structure for the Product:**\n   - Use `MAST` to map the identified `MATNR` to its corresponding `STLNR` (BOM number).\n   - This step identifies the BOM structure used by the product.\n\n3. **Identify Components of the Product:**\n   - Use `STPO` to find all components (`IDNRK`) included in the BOM (`STLNR`) identified in the previous step.\n   - This step provides the list of materials that make up the

In [19]:
# Render the model's Markdown answer (Markdown/display come from the import cell).
display(Markdown(selected_schemas))

To answer the question, we need to determine which schemas are required and how they are connected to form a logical reasoning chain.

**Required schemas (in order):**  
- CUSTOM_SALES_PLAN  
- MAST  
- STPO  
- EINA  
- LFA1  

**Reasoning chain:**  
1. **Identify the Product with Highest Sales:**
   - Use `CUSTOM_SALES_PLAN` to find the product (`MATNR`) with the highest sales amount in the first quarter of 2025.
   - This involves checking the `SalesPlan` node for the `amount` property and filtering by `TimePeriod` with `year = 2025` and `quarter = Q1`.

2. **Find BOM Structure for the Product:**
   - Use `MAST` to map the identified `MATNR` to its corresponding `STLNR` (BOM number).
   - This step identifies the BOM structure used by the product.

3. **Identify Components of the Product:**
   - Use `STPO` to find all components (`IDNRK`) included in the BOM (`STLNR`) identified in the previous step.
   - This step provides the list of materials that make up the product.

4. **Map Components to Suppliers:**
   - Use `EINA` to map each component (`IDNRK`) to its supplier (`LIFNR`).
   - This step identifies which suppliers provide the materials for the product.

5. **Determine Supplier Locations:**
   - Use `LFA1` to find the location (`ORT01`) of each supplier (`LIFNR`) identified in the previous step.
   - This step provides the geographic location of each supplier.

By following these steps, we can determine the composition of the product with the highest sales in Q1 2025 and the locations of the suppliers for its components.

-------------------------------------------

## Ontology Combination

In [14]:
def extract_required_schemas(text: str) -> List[str]:
    """Pull the schema names out of a model answer.

    Scans the lines between the one containing ``Required schemas`` and the
    one containing ``Reasoning chain`` and collects every bullet item.

    Compared with a plain ``startswith("-")`` scan this also:
      * accepts ``*`` and ``+`` bullets (followed by whitespace),
      * strips Markdown decoration (backticks, bold/italic asterisks),
      * ignores horizontal rules such as ``---`` and empty bullets,
      * drops duplicate names while preserving first-seen order.

    Args:
        text: the model answer; literal ``\\n`` sequences are treated as
            line breaks.

    Returns:
        Schema names in the order they were listed; empty if the
        ``Required schemas`` section is missing.
    """
    # Turn literal backslash-n sequences into real newlines.
    text = text.replace("\\n", "\n")

    schemas: List[str] = []
    seen = set()
    parsing = False

    for line in text.split("\n"):
        line = line.strip()

        # Section start.
        if "Required schemas" in line:
            parsing = True
            continue

        # Section end: stop as soon as the reasoning part begins.
        if "Reasoning chain" in line:
            break

        if not parsing:
            continue

        # "-" may be glued to the name; "*" / "+" need a following space so that
        # bold text such as "**Note**" is not mistaken for a bullet.
        bullet = re.match(r"(?:-\s*|[*+]\s+)(.+)", line)
        if bullet is None:
            continue

        name = bullet.group(1).strip("`* \t")
        # Skip empty items and horizontal rules ("---" leaves only dashes).
        if not name or set(name) == {"-"}:
            continue

        if name not in seen:
            seen.add(name)
            schemas.append(name)

    return schemas

In [15]:
def load_matching_schemas(schema_data, selected_schemas):
    """Return the schema entries named in the model's selection answer.

    Args:
        schema_data: iterable of schema dicts, matched on their ``table_name``.
        selected_schemas: model answer text, parsed by
            ``extract_required_schemas``.

    Returns:
        The first matching entry for each requested name, in the requested
        order. Names without a match are reported with a printed warning
        and skipped.
    """
    found = []
    for wanted in extract_required_schemas(selected_schemas):
        hit = None
        # Take the first entry whose table name equals the requested one.
        for candidate in schema_data:
            if candidate.get("table_name") == wanted:
                hit = candidate
                break

        if not hit:
            print(f"Warning: '{wanted}' not found in schema_data.json")
            continue
        found.append(hit)

    return found

In [16]:
def get_prompt():
    """Return the system prompt for the schema-merging step.

    The prompt asks the model to merge several schemas into one and to answer
    with raw JSON only. The embedded output template is itself well-formed
    (comma after the ``nodes`` array, properly quoted ``"relationships"`` key)
    so the model is not shown a malformed shape to imitate; the answer is
    later fed to ``json.loads``.

    Returns:
        str: the system prompt text.
    """
    system_prompt = """
You are an expert in schema integration.

## Given multiple selected schemas(with nodes and relationships), your task is to merge them into a single unified schema by: 
1. Merging semantically equivalent nodes (even if they have different names or field names).
2. Unifying similar fields under consistent naming conventions.
3. Reconstructing all meaningful relationships across nodes, including:
   - direct relationships,
   - hierarchical or nested structures,
   - and self-referencing (recursive) relationships, if present.

## DO NOT OMIT:
- Any important node or relationship that conveys structural, temporal, or dependency information.
- If a schema contains a hierarchy (e.g., parent-child, container-component), ensure it's expressed explicitly as a relationship.
- If nodes refer to each other recursively (e.g., items composed of other items), represent this using self-referencing relationships.

## Output a unified schema in this format:
{
    "nodes" : [
        {
            "label": string,
            "primary_key": string,
            "properties": [string],
            "description": string
        },
        ...
        {
            "label": string,
            "primary_key": string,
            "properties": [string],
            "description": string
        }
    ],
    "relationships": [
        {
            "from": string,
            "to": string,
            "type": string,
            "properties": [string],
            "description": string
        },
        ...
        {
            "from": string,
            "to": string,
            "type": string,
            "properties": [string],
            "description": string
        }
    ]
}

## IMPORTANT
Do not include markdown formatting such as backticks.
Only return raw JSON.
Ensure all merged concepts are clearly and consistently described.
"""

    return system_prompt

In [17]:
def run_combining_schema(
        system_prompt,
        mapping_information,
        matched_schemas
):
    """Ask the chat model to merge the matched schemas into one.

    Args:
        system_prompt: system instruction for the merge step.
        mapping_information: linking information between schemas.
        matched_schemas: the schemas to merge (interpolated via f-string).

    Returns:
        The ``content`` attribute of the model response.
    """
    llm = get_model()

    # Single human turn carrying the linking information and the schemas.
    merge_request = f"""
This is all the schemas needed for the user.
Now combine them into one schema using following information:
## schema linking information : {mapping_information}

## all the schemas to combine : {matched_schemas}
"""

    conversation = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=merge_request),
    ]

    reply = llm.invoke(conversation)
    return reply.content

In [18]:
# Build the merge prompt, resolve the schema names chosen by the model against
# schema_data, then ask the model to merge the matched schemas.
system_prompt = get_prompt()
matched_schemas = load_matching_schemas(schema_data, selected_schemas)
combined_schema = run_combining_schema(system_prompt, mapping_information, matched_schemas)

2025-07-28T10:01:00 __main__: OpenAI 모델이 성공적으로 초기화되었습니다.


2025-07-28T10:01:08 httpx: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


In [19]:
# Show the raw model answer as the cell's value.
combined_schema

'{\n    "nodes": [\n        {\n            "label": "Material",\n            "primary_key": "MATNR",\n            "properties": ["MATNR", "STLNR"],\n            "description": "제품 및 구성품 식별자. 완제품 또는 반제품으로 BOM에서 상위 자재를 나타냄."\n        },\n        {\n            "label": "Component",\n            "primary_key": "IDNRK",\n            "properties": ["IDNRK"],\n            "description": "BOM에 포함된 구성 자재 번호. 하위 자재를 나타냄."\n        },\n        {\n            "label": "Supplier",\n            "primary_key": "LIFNR",\n            "properties": ["LIFNR"],\n            "description": "공급처 ID. 자재를 공급하는 공급처를 식별."\n        },\n        {\n            "label": "City",\n            "primary_key": "ORT01",\n            "properties": ["ORT01"],\n            "description": "공급처가 위치한 도시."\n        },\n        {\n            "label": "TimePeriod",\n            "primary_key": "concatenatedFields",\n            "properties": ["year", "quarter"],\n            "description": "판매계획이 적용되는 연도와 분기 (ex. 2025, Q1)."\n  

In [20]:
# Render the combined-schema answer (Markdown/display come from the import cell).
display(Markdown(combined_schema))

{
    "nodes": [
        {
            "label": "Material",
            "primary_key": "MATNR",
            "properties": ["MATNR", "STLNR"],
            "description": "제품 및 구성품 식별자. 완제품 또는 반제품으로 BOM에서 상위 자재를 나타냄."
        },
        {
            "label": "Component",
            "primary_key": "IDNRK",
            "properties": ["IDNRK"],
            "description": "BOM에 포함된 구성 자재 번호. 하위 자재를 나타냄."
        },
        {
            "label": "Supplier",
            "primary_key": "LIFNR",
            "properties": ["LIFNR"],
            "description": "공급처 ID. 자재를 공급하는 공급처를 식별."
        },
        {
            "label": "City",
            "primary_key": "ORT01",
            "properties": ["ORT01"],
            "description": "공급처가 위치한 도시."
        },
        {
            "label": "TimePeriod",
            "primary_key": "concatenatedFields",
            "properties": ["year", "quarter"],
            "description": "판매계획이 적용되는 연도와 분기 (ex. 2025, Q1)."
        },
        {
            "label": "SalesPlan",
            "primary_key": "concatenatedFields",
            "properties": ["quantity", "unitPrice", "amount"],
            "description": "판매계획 - 수량, 단가, 금액 포함."
        }
    ],
    "relationships": [
        {
            "from": "Material",
            "to": "SalesPlan",
            "type": "HAS_SALES_PLAN",
            "properties": [],
            "description": "특정 제품/자재의 판매계획을 나타냄."
        },
        {
            "from": "SalesPlan",
            "to": "TimePeriod",
            "type": "PLANNED_FOR",
            "properties": [],
            "description": "해당 판매계획이 어느 시점(연도+분기)에 해당하는지 명시."
        },
        {
            "from": "Material",
            "to": "Component",
            "type": "HAS_COMPONENT",
            "properties": ["MENGE"],
            "description": "상위 자재(MATNR)가 하위 자재(IDNRK)를 어떤 수량(MENGE)으로 포함하는지 정의."
        },
        {
            "from": "Component",
            "to": "Supplier",
            "type": "PRODUCED_AT",
            "properties": [],
            "description": "특정 자재/원자재/재료가 만들어지고 공급되는 공급처와 그 자재를 연결."
        },
        {
            "from": "Supplier",
            "to": "City",
            "type": "IS_LOCATED_AT",
            "properties": [],
            "description": "공급처가 위치한 도시를 연결."
        }
    ]
}

In [23]:
# Render the combined graph schema as an interactive pyvis network (HTML file).

from pyvis.network import Network
from IPython.display import IFrame

# The model answer arrives as a JSON string; parse it once. The isinstance
# guard keeps this cell re-runnable after combined_schema has become a dict.
if isinstance(combined_schema, str):
    raw_schema = combined_schema.strip()
    # Tolerate a Markdown code fence around the JSON even though the prompt
    # asks for raw JSON only.
    if raw_schema.startswith("```"):
        raw_schema = re.sub(r"^```[A-Za-z]*\s*|\s*```$", "", raw_schema)
    combined_schema = json.loads(raw_schema)

# Create the network object.
net = Network(height="750px", width="100%", notebook=False, directed=True)

# 1. Add one box per schema node; the description (if any) is the tooltip.
known_labels = set()
for node in combined_schema["nodes"]:
    label = node["label"]
    net.add_node(label, label=label, title=node.get("description", ""), shape="box")
    known_labels.add(label)

# 2. Add the relationships as directed, labelled edges.
for rel in combined_schema["relationships"]:
    source = rel["from"]
    target = rel["to"]
    # pyvis refuses edges whose endpoints were never added as nodes, so create
    # a bare node for any label the model used only inside a relationship.
    for endpoint in (source, target):
        if endpoint not in known_labels:
            net.add_node(endpoint, label=endpoint, shape="box")
            known_labels.add(endpoint)
    net.add_edge(source, target, label=rel["type"], title=rel.get("description", ""), arrows="to")

# 3. Save as an HTML file in the working directory.
net.write_html("schema_graph.html")

# 4. Optionally embed the result inside the notebook.
#display(IFrame("schema_graph.html", width="100%", height="600px"))