# 모델 구조

In [27]:
from dotenv import load_dotenv

load_dotenv(override=True)

True

## Global State

In [28]:
from langgraph.graph import MessagesState
import pandas as pd

# Shared graph state passed between the pipeline nodes defined in this notebook.
# It extends MessagesState with the fields each node reads or writes.
# NOTE(review): the `= ...` values below are class-level defaults; whether
# LangGraph applies them to a fresh state depends on how MessagesState is
# declared -- confirm. The two DataFrame defaults are single instances created
# once at class-definition time.
class State(MessagesState):

    # ID of the input record
    input_id: str = ''

    # Title and description obtained from the metadata lookup
    subject: str = ''
    description: str = ''

    # List of search queries generated by the LLM
    query : list[str] = []

    # All records retrieved with the search queries
    data: pd.DataFrame = pd.DataFrame()

    # Top-K records chosen by the relevance-selection step
    relevant_df: pd.DataFrame = pd.DataFrame()


## 논문 - API 호출용 토큰 생성

In [29]:
import json, base64, requests
from Crypto.Cipher import AES
from datetime import datetime

class AESTestClass:
    """AES-CBC encryptor that yields a URL-safe Base64 string.

    The plaintext is PKCS#7-padded, encrypted with ``key`` and a fixed IV, and
    the ciphertext is returned URL-safe Base64 encoded.

    Args:
        plain_txt: Text to encrypt (``str``).
        key: AES key as ``str``; it is UTF-8 encoded before use.
    """

    def __init__(self, plain_txt, key):
        # The IV and block size are fixed values.
        self.iv = 'jvHJ1EFA0IXBrxxz'
        self.block_size = 16
        self.plain_txt = plain_txt
        self.key = key

    def pad(self):
        """Return ``plain_txt`` with PKCS#7 padding appended.

        The pad length is derived from the UTF-8 *byte* length, because that
        is what ``encrypt`` feeds to the cipher. Counting characters instead
        leaves non-ASCII text misaligned with the block size. The pad
        character ``chr(n)`` (1 <= n <= 16) is ASCII, so it encodes to exactly
        one byte.
        """
        byte_len = len(self.plain_txt.encode('utf-8'))
        pad_len = self.block_size - byte_len % self.block_size
        return self.plain_txt + chr(pad_len) * pad_len

    def encrypt(self):
        """Encrypt the padded plaintext and return it as URL-safe Base64 text."""
        cipher = AES.new(self.key.encode('utf-8'), AES.MODE_CBC, self.iv.encode('utf-8'))
        padded_bytes = self.pad().encode('utf-8')
        encrypted_bytes = cipher.encrypt(padded_bytes)
        # URL-safe Base64
        return base64.urlsafe_b64encode(encrypted_bytes).decode('utf-8')
    
def call_access_token(MAC_ADDRESS, API_KEY, CLIENT_ID):
    """Request an access token from the KISTI API gateway.

    A JSON payload holding the normalised MAC address and the current local
    timestamp is AES-encrypted with ``API_KEY`` and sent, together with
    ``CLIENT_ID``, to the token endpoint.

    Args:
        MAC_ADDRESS: MAC address string; may be ``None``, quoted, or use ``:``.
        API_KEY: Key handed to ``AESTestClass`` for encryption.
        CLIENT_ID: Client identifier sent as ``client_id``.

    Returns:
        The ``access_token`` value from the JSON response.

    Raises:
        SystemExit: If the MAC address is empty after normalisation.
        requests.HTTPError: If the endpoint answers with an error status.
    """
    # Normalise the MAC address: trim whitespace and surrounding quotes,
    # upper-case it, and use "-" as the separator.
    mac = (MAC_ADDRESS or "").strip()
    for quote_char in ('"', "'"):
        mac = mac.strip(quote_char)
    mac = mac.upper().replace(":", "-")
    if not mac:
        raise SystemExit("MAC_ADDRESS가 비어있음")

    # Compact JSON payload carrying the MAC address and a timestamp.
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    plain_json = json.dumps(
        {"mac_address": mac, "datetime": timestamp},
        separators=(',', ':'),
    )

    # AES encryption followed by URL-safe Base64.
    accounts = AESTestClass(plain_txt=plain_json, key=API_KEY).encrypt()

    # Token request.
    response = requests.get(
        "https://apigateway.kisti.re.kr/tokenrequest.do",
        params={"accounts": accounts, "client_id": CLIENT_ID},
        timeout=10,
    )
    response.raise_for_status()

    return response.json()['access_token']

## 논문 - 함수

In [30]:
import xml.etree.ElementTree as ET
import pandas as pd

def xml_to_df(xml):
    """Convert an XML response into a DataFrame with one row per ``record``.

    Looks for a ``recordList`` element directly under the root. Each of its
    ``record`` children becomes a row; each ``item`` child of a record
    contributes one column named after its ``metaCode`` attribute, holding the
    item's stripped text (``''`` when the item has no text).

    Args:
        xml: XML document as a string.

    Returns:
        A ``pandas.DataFrame``; empty when there is no ``recordList``.
    """
    record_list = ET.fromstring(xml).find('recordList')
    if record_list is None:
        return pd.DataFrame([])

    rows = [
        {
            item.get('metaCode'): (item.text.strip() if item.text else '')
            for item in record.findall('item')
        }
        for record in record_list.findall('record')
    ]
    return pd.DataFrame(rows)

## 논문 - 제목, 초록 가져오기

In [None]:
import os, requests, xmltodict

def ARTI_browse(state: State):
    """Fetch the title and abstract of the article identified by ``state['input_id']``.

    Obtains an access token, calls the gateway's ``browse`` action for target
    ``ARTI``, dumps the parsed response to ``../data/input_data.json`` and
    reads ``Title`` / ``Abstract`` from the first returned record.

    Args:
        state: Graph state; ``state['input_id']`` is sent as the ``cn`` parameter.

    Returns:
        ``{'subject': <Title>, 'description': <Abstract>}``.

    Raises:
        requests.HTTPError: If the gateway answers with an error status.
        KeyError: If the response holds no record with ``Title`` and ``Abstract``.
    """
    CLIENT_ID = os.getenv("SCIENCEON_CLIENT_ID")
    ARTI_KEY = os.getenv("SCIENCEON_API_KEY")
    MAC_ADDRESS = os.getenv("MAC_ADDRESS")

    access_token = call_access_token(MAC_ADDRESS, ARTI_KEY, CLIENT_ID)

    url = "https://apigateway.kisti.re.kr/openapicall.do"
    params = {
        "client_id": CLIENT_ID,
        "token": access_token,
        "version": 1.0,
        "action": "browse",
        "target": "ARTI",
        "cn": state['input_id'],
        "include": "",
        "exclude": None,
    }

    res = requests.get(url, params=params, timeout=20)
    # Fail on HTTP errors here, as call_access_token does, rather than trying
    # to parse an error page as a result document.
    res.raise_for_status()
    xml = res.text

    dict_data = xmltodict.parse(xml)
    with open("../data/input_data.json", "w", encoding="utf-8") as f:
        json.dump(dict_data, f, ensure_ascii=False, indent=4)

    df = xml_to_df(xml)

    # Without any record the frame has no columns; report that explicitly
    # (same exception type as the plain column lookup would raise).
    missing = [col for col in ('Title', 'Abstract') if col not in df.columns]
    if missing:
        raise KeyError(
            f"ARTI browse response for cn={state['input_id']!r} has no {missing} field"
        )

    subject, description = df['Title'].iloc[0], df['Abstract'].iloc[0]

    print('\n[subject]\n', subject)
    print('\n[description]\n', description)

    return {'subject': subject, 'description': description}


## 데이터셋 - 제목, 설명 가져오기

In [32]:
import os, requests, json

def DATA_browse(state: State):
    """Fetch the title and description of the dataset ``state['input_id']``.

    Calls the DataON dataset endpoint with the key from the
    ``DATAON_META_API_KEY`` environment variable, saves the returned
    ``records`` object to ``../data/input_data.json`` and extracts the title
    and description fields from it.

    Args:
        state: Graph state; ``state['input_id']`` is appended to the endpoint URL.

    Returns:
        ``{'subject': ..., 'description': ...}``.

    Raises:
        AssertionError: If ``DATAON_META_API_KEY`` is unset or blank.
        requests.HTTPError: If the endpoint answers with an error status.
        KeyError: If the response lacks ``records`` or the expected fields.
    """
    API_KEY = os.getenv("DATAON_META_API_KEY")
    # Raised explicitly instead of via `assert` so the check is not stripped
    # under `python -O`; the exception type is kept for compatibility.
    if not (API_KEY and API_KEY.strip()):
        raise AssertionError("환경변수(DATAON_META_API_KEY)가 비어있어요!")

    url = "https://dataon.kisti.re.kr/rest/api/search/dataset/" + state["input_id"]
    params = {"key": API_KEY}

    res = requests.get(url, params=params, timeout=20)
    res.raise_for_status()

    # Look the payload up before opening the file so that a malformed
    # response does not truncate a previously saved result.
    records = res.json()['records']

    # Save the raw records as JSON.
    with open("../data/input_data.json", "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=4)

    subject = records['dataset_title_etc_main']
    description = records['dataset_expl_etc_main']
    print('\n[subject]\n', subject)
    print('\n[description]\n', description)

    return {'subject': subject, 'description': description}

## 검색어 생성

In [49]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from typing_extensions import Annotated
from pydantic import Field, BaseModel

# Schema
# Structured-output schema for the query-generation step: a single `query`
# field holding a list of 3 to 5 search strings (see min_length / max_length).
# The Field description is written in Korean and says the same thing.
# (Kept as `#` comments rather than a class docstring so the schema itself is
# left untouched.)
class QueryResult(BaseModel):
    query: Annotated[
        list[str],
        Field(
            ..., 
            max_length=5, 
            min_length=3,
            description="가장 적절한 검색어들의 리스트, 길이 최소 3개/최대 5개", 
        )
    ]
    
# Prompt
# Prompt text (in Korean) for the query-generation step. It asks for search
# queries that will retrieve the papers and datasets most relevant to the
# given title and description, under two stated conditions: each query is
# 2-3 words long, and the search backend returns items that contain the query
# verbatim. The requested output is 3-5 queries as JSON.
# Placeholders: {subject}, {description}.
query_template = '''
주어진 제목과 설명을 바탕으로, 가장 연관성이 높은 논문과 데이터셋을 검색하려 합니다.

가장 의미적으로 관련성이 높은 논문과 데이터셋을 검색할 수 있을 쿼리를 만들어 주세요.

[조건]
1. 쿼리는 2~3단어로 구성되어야 합니다.
2. 검색 방식은 쿼리와 정확히 일치하는 내용이 있는 논문이나 데이터를 반환하는 형식입니다.

[Input]
연구 주제: {subject}
연구 설명: {description}

[Output]
가장 적절한 3~5개의 쿼리를 JSON으로 출력해주세요.
'''

# Template object built from the text above; invoked with `subject` and
# `description` in generate_query.
query_prompt = PromptTemplate.from_template(query_template)

# Node
def generate_query(state: State):
    """Generate search queries from the subject and description in ``state``.

    Fills ``query_prompt`` with ``state['subject']`` and
    ``state['description']``, sends it to a ``gpt-4o-mini`` chat model bound
    to the ``QueryResult`` schema, prints the resulting list and returns it.

    Args:
        state: Graph state providing ``subject`` and ``description``.

    Returns:
        ``{'query': <list of query strings>}``.
    """
    prompt_inputs = {
        'subject': state['subject'],
        'description': state['description'],
    }
    filled_prompt = query_prompt.invoke(prompt_inputs)

    # sLLM
    structured_sllm = ChatOpenAI(
        model='gpt-4o-mini',
        temperature=0,
    ).with_structured_output(QueryResult)

    query = structured_sllm.invoke(filled_prompt).query
    print('\n[query]\n', query)

    return {'query': query}


## 논문 - 쿼리에 따른 검색

In [34]:
import os, requests

def ARTI_search(state: State):
    """Search articles for every query in ``state['query']`` and merge the hits.

    For each query the gateway's ``search`` action (target ``ARTI``, first
    page, 10 rows) is called and the XML response is converted with
    ``xml_to_df``. The per-query frames are concatenated, de-duplicated on the
    control-number column, written to ``../data/search_results_article.csv``
    and returned.

    Fixes relative to the previous version:
      * the accumulator was bound to the ``pd.DataFrame`` class instead of an
        instance, so every call raised ``TypeError`` and the function could
        never return;
      * the HTTP status is now checked;
      * de-duplication is skipped when no control-number column exists
        (e.g. no hits at all) instead of raising ``KeyError``;
      * the result is returned as ``{'data': df}``, the same state-update
        shape ``DATA_search`` uses.

    Args:
        state: Graph state providing the ``query`` list.

    Returns:
        ``{'data': <DataFrame of unique search hits>}``.

    Raises:
        requests.HTTPError: If the gateway answers with an error status.
    """
    CLIENT_ID = os.getenv("SCIENCEON_CLIENT_ID")
    ARTI_KEY = os.getenv("SCIENCEON_API_KEY")
    MAC_ADDRESS = os.getenv("MAC_ADDRESS")

    access_token = call_access_token(MAC_ADDRESS, ARTI_KEY, CLIENT_ID)

    url = "https://apigateway.kisti.re.kr/openapicall.do"
    frames = []

    for query in state['query']:
        params = {
            "client_id": CLIENT_ID,
            "token": access_token,
            "version": 1.0,
            "action": "search",
            "target": "ARTI",
            "searchQuery": query,
            'curPage': 1,  # current page number
            'rowCount': 10,  # rows per page (default 10, max 100)
        }

        res = requests.get(url, params=params, timeout=20)
        res.raise_for_status()
        frames.append(xml_to_df(res.text))

    # Concatenate once; pd.concat rejects an empty list, so fall back to an
    # empty frame when there were no queries.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # NOTE(review): column names come from the response's metaCode attributes,
    # whose exact casing is not visible here -- match 'cn' case-insensitively
    # and confirm against a real response.
    id_col = next((col for col in df.columns if str(col).lower() == 'cn'), None)
    if id_col is not None:
        df = df.drop_duplicates(subset=id_col)

    df.to_csv('../data/search_results_article.csv', index=False, encoding='utf-8')
    print('\n[total article length]\n', len(df))

    return {'data': df}

## 데이터셋 - 쿼리에 따른 검색

In [35]:
import os, requests
import pandas as pd

def DATA_search(state: State):
    """Search DataON datasets for every query in ``state['query']``.

    Each query is sent to the dataset search endpoint (first 10 hits). The
    returned ``records`` are tagged with the originating query, combined,
    de-duplicated on ``svc_id``, written to
    ``../data/search_results_dataset.csv`` and returned.

    Args:
        state: Graph state providing the ``query`` list.

    Returns:
        ``{'data': <DataFrame of unique dataset hits>}``; the frame is empty
        when no query produced records.

    Raises:
        AssertionError: If ``DATAON_SEARCH_API_KEY`` is unset or blank.
        requests.HTTPError: If the endpoint answers with an error status.
    """
    API_KEY = os.getenv("DATAON_SEARCH_API_KEY")
    # Explicit raise (not `assert`) so the check survives `python -O`. The
    # message now names the variable that is actually read.
    if not (API_KEY and API_KEY.strip()):
        raise AssertionError("환경변수(DATAON_SEARCH_API_KEY)가 비어있어요!")

    url = "https://dataon.kisti.re.kr/rest/api/search/dataset/"
    print('\n[response]')
    frames = []
    for query in state['query']:
        # Request parameters:
        #   key   / CHAR / required / API key
        #   query / CHAR / required / search keyword
        #   from  / CHAR / optional / page start offset
        #   size  / CHAR / optional / page size
        params = {"key": API_KEY, "query": query, "from": 0, "size": 10}

        res = requests.get(url, params=params, timeout=20)
        res.raise_for_status()
        data = res.json()
        print(data.get('response'))

        if "records" in data:
            tmp = pd.DataFrame(data["records"])
            tmp["query"] = query
            frames.append(tmp)

    # Concatenate once instead of growing a frame inside the loop; pd.concat
    # rejects an empty list, hence the fallback.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    # With no hits there is no 'svc_id' column to de-duplicate on.
    if 'svc_id' in df.columns:
        df = df.drop_duplicates(subset='svc_id')
    df.to_csv('../data/search_results_dataset.csv', index=False, encoding='utf-8')
    print('\n[total dataset length]\n', len(df))

    return {'data': df}

## 연관된 데이터 선정

In [41]:
from typing_extensions import Annotated
from pydantic import Field, BaseModel
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# Schema
# Structured-output schema for the relevance-selection step. It holds three
# parallel lists, each constrained to 3-5 entries:
#   relevant_id - IDs of the selected items
#   relevance   - integer score per ID, each constrained to 0..100
#   reason      - free-text justification per ID
# The (Korean) descriptions tell the model to keep the lists index-aligned;
# nothing in this class enforces that the three lists have equal length.
# NOTE(review): the two adjacent string literals in the relevant_id
# description are concatenated with no separator between the sentences --
# confirm whether that is intended.
# (Kept as `#` comments rather than a class docstring so the schema itself is
# left untouched.)
class IDRelevance(BaseModel):
    relevant_id: Annotated[
        list[str],
        Field(
            ..., 
            max_length=5, 
            min_length=3,
            description=(
                "가장 관련성이 높은 데이터의 ID 목록, 길이 최소 3개/최대 5개"
                "각 ID에 대해 relevance(관련도 점수)와 reason(선정 사유)을 동일한 인덱스 위치에 작성합니다. "
            ), 
        )
    ]
    relevance: Annotated[
        list[
            Annotated[
                int, 
                Field(ge=0, le=100, description="0~100 정수")
            ]
        ],
        Field(
            ..., 
            max_length=5, 
            min_length=3,
            description="각 ID에 대응하는 관련도 점수 (0~100 정수)",
        )
    ]
    reason: Annotated[
        list[str],
        Field(
            ..., 
            max_length=5, 
            min_length=3,
            description="각 ID가 선정된 이유를 설명하는 문자열 목록. relevant_id와 인덱스가 일치해야 합니다.",
        )
    ]

# Prompt
# Prompt text (in Korean) for the relevance-selection step. It presents a
# list of research data items (ID, title, description) and asks the model to
# pick the 3-5 items most relevant to the given research subject, giving a
# 0-100 relevance score and a selection reason for each, as a JSON object
# with the keys relevant_id / relevance / reason.
# Placeholders: {subject}, {description}, {data}. The doubled braces are
# escapes for literal braces in the template.
relevance_template = '''
당신은 데이터 과학자입니다. 아래는 연구 데이터 목록입니다.

각 데이터 항목은 다음 컬럼을 가지고 있습니다:
- ID: 각 데이터의 고유키
- 제목
- 설명

[목표]
주어진 연구 주제와 가장 관련성 높은 3~5개의 데이터 항목을 선택하세요.
각 항목에 대해 관련도 점수(0~100)와 선정 사유를 함께 작성하세요.


[Input]
연구 주제: {subject}
연구 설명: {description}

[Data]
데이터 목록:
{data}

[Output]
다음 형식의 JSON을 출력하세요:

{{
  "relevant_id": [],
  "relevance": [],
  "reason": []
}}
'''

# Template object built from the text above; invoked in select_relevance.
relevance_prompt = PromptTemplate.from_template(relevance_template)

# Node
def select_relevance(state: State):
    """Ask the LLM to pick the search hits most relevant to the subject.

    The ``svc_id``, title and description columns of ``state['data']`` are
    passed to the model together with the subject and description. The
    structured answer (``IDRelevance``) is turned into a DataFrame with the
    columns ``relevant_id``, ``relevance`` and ``reason``.

    Args:
        state: Graph state providing ``data``, ``subject`` and ``description``.

    Returns:
        ``{'relevant_df': <DataFrame>}``.

    Raises:
        KeyError: If ``state['data']`` lacks one of the three expected columns.
    """
    df, subject, description = state['data'], state['subject'], state['description']

    candidates = df[
        ['svc_id', 'dataset_title_etc_main', 'dataset_expl_etc_main']
    ].to_dict(orient="records")

    prompt = relevance_prompt.invoke(
        {
            'subject': subject,
            'description': description,
            'data': candidates,
        }
    )

    sllm = ChatOpenAI(model='gpt-4o-mini', temperature=0)

    structured_sllm = sllm.with_structured_output(IDRelevance)
    res = structured_sllm.invoke(prompt)

    # The schema validates each list's length separately, so the model may
    # return lists of different lengths. Building rows with zip keeps the
    # index-aligned entries and avoids the "arrays must be of the same
    # length" failure of the dict-of-lists constructor.
    rows = list(zip(res.relevant_id, res.relevance, res.reason))
    result_df = pd.DataFrame(rows, columns=['relevant_id', 'relevance', 'reason'])

    return {'relevant_df': result_df}

# 그래프 구조

In [43]:
from langgraph.graph import START, END, StateGraph

def build_graph():
    """Assemble and compile the dataset recommendation graph.

    The nodes run in sequence: ``DATA_browse`` -> ``generate_query`` ->
    ``DATA_search`` -> ``select_relevance``, with ``START`` wired to the first
    node and the last node wired to ``END``.

    Returns:
        The compiled graph.
    """
    pipeline = [DATA_browse, generate_query, DATA_search, select_relevance]

    workflow = StateGraph(State)
    workflow.add_sequence(pipeline)

    workflow.add_edge(START, 'DATA_browse')
    workflow.add_edge('select_relevance', END)

    return workflow.compile()


# Compiled graph used by the cells below.
graph = build_graph()

In [38]:
# Sample identifier; this cell only prints it.
# NOTE(review): presumably an article control number intended as `input_id`
# for ARTI_browse -- confirm.
cn = 'JAKO200411922932805'
print('[input_id]\n', cn)



[input_id]
 JAKO200411922932805


In [48]:
# Run the compiled graph for one dataset ID and display the recommendations.
svc_id = 'b37f0c9413eeb7c45f6fe31cbe3a41ef'
print('[input_id]\n', svc_id)

res = graph.invoke({'input_id': svc_id})

print('\n[return]')

# Attach title, description and URL from the search hits to each selected ID.
dataset_cols = ['svc_id', 'dataset_title_etc_main', 'dataset_expl_etc_main', 'dataset_lndgpg']
merged_df = res['relevant_df'].merge(
    res['data'][dataset_cols],
    how='left',
    left_on='relevant_id',
    right_on='svc_id',
)

# Source column -> display label, in display order.
column_labels = {
    'dataset_title_etc_main': '제목',
    'dataset_expl_etc_main': '설명',
    'relevance': '점수',
    'reason': '추천 사유',
    'dataset_lndgpg': 'URL',
}
output_df = merged_df[list(column_labels)]
output_df = output_df.rename(columns=column_labels)

display(output_df)

[input_id]
 b37f0c9413eeb7c45f6fe31cbe3a41ef

[subject]
 Architectural Urbanism: Melbourne/Seoul - KTA projects

[description]
 BACKGROUND: 'Architectural Urbanism: Melbourne/Seoul' was a two-city exhibition funded by RMIT University and the Korean National University, supported by the Australian Government through the Australian International Cultural Council. Kerstin Thompson Architecture (KTA) showed five works - Carrum Downs Police Station, MUMA Gallery, Lake Conneware House, Napier Street Housing and Royal Botanic Gardens Visitor Centre - as one of ten architectural firms (5 from Melbourne, 5 from Seoul) selected to exhibit. The exhibited work included large scale photographs, working drawings and exegetical text. CONTRIBUTION: The exhibition explored architectural approaches that worked 'within the city rather than upon it', architecture that 'intervenes and inserts, rather than overlays or eradicates'. Within this context, the public projects exhibited by KTA explore relationshi

Unnamed: 0,제목,설명,점수,추천 사유,URL
0,Architectural Urbanism: Melbourne/Seoul - KTA ...,BACKGROUND: 'Architectural Urbanism: Melbourne...,100,This dataset directly pertains to the 'Archite...,https://doi.org/10.25439/rmt.27353883.v1
1,"Urban Architectures (Court House, George+Murph...","RESEARCH BACKGROUND: Court House, George+Murph...",90,This dataset includes works by Baracco + Wrigh...,https://doi.org/10.25439/rmt.27347361.v1
2,Grace Park House,RESEARCH BACKGROUND: Grace Park House is an am...,85,Grace Park House is a significant project that...,https://doi.org/10.25439/rmt.27347415.v1
3,Urban box,RESEARCH BACKGROUND: Urban Box was one of 4 pr...,80,Urban Box is another project featured in the '...,https://doi.org/10.25439/rmt.27347322.v1
4,Geelong Library and Heritage Centre,RESEARCH BACKGROUND: The Geelong Library and H...,75,The Geelong Library and Heritage Centre repres...,https://doi.org/10.25439/rmt.27355041.v1
