# 추천사유 생성

## 결과물 형태

In [1]:
# Schema
from typing_extensions import Annotated
from pydantic import Field, BaseModel

class IDRelevance(BaseModel):
    # Structured-output schema made of two parallel lists: reason[i] is meant to
    # explain relevant_id[i]. The alignment is stated in the field description
    # only; nothing in this class enforces equal lengths.

    # Required list of item IDs.
    relevant_id: Annotated[
        list[str],
        Field(default=..., description="데이터의 ID 목록"),
    ]

    # Required list of per-ID explanations, index-aligned with relevant_id.
    reason: Annotated[
        list[str],
        Field(
            default=...,
            description="각 ID가 선정된 이유를 설명하는 문자열 목록. relevant_id와 인덱스가 일치해야 합니다.",
        ),
    ]

## 프롬프트

In [2]:
# Prompt
from langchain_core.prompts import PromptTemplate

# Prompt text asking the model for exactly one recommendation reason per data item,
# returned as JSON with the keys "relevant_id" and "reason".
# Placeholders filled at invoke time: {title}, {description}, {keyword}, {data}.
# The doubled braces ({{ ... }}) around the output example are brace escapes, so the
# JSON sample is emitted literally instead of being treated as a placeholder.
# NOTE(review): the three elements listed under guideline 3 carry no bullet or indent
# markers — confirm this is intentional and not formatting lost in editing.
reason_template = '''
You are a data scientist. Your task is to explain clearly why each listed paper or dataset
was selected for recommendation, so that users can easily understand the reasoning.

[Writing Guidelines]
1) You must provide exactly one explanation for each ID. Do not give overall summaries.
2) Each reason must explicitly describe the connection between the research topic and the data item.
3) Each reason should include at least two of the following elements:
   
Keyword or topical similarity
Alignment in methodology or model
Match in domain or application context
4) Use an objective, descriptive tone. Avoid exaggeration or subjective evaluation.
5) Each reason should be 1–2 sentences long, about 60 words or fewer.
6) The number of items in "relevant_id" and "reason" must be identical,
   and the index i of each list must correspond to the same item.
7) Do not output any text other than JSON, and use only the keys "relevant_id" and "reason".

[Self-check]
Before finalizing your output, verify that the lengths of "relevant_id" and "reason" are equal.
If they differ, adjust the list of reasons to match the number of IDs.
If a reason sounds too generic, directly reference at least one supporting term
from the input topic, or from that item’s title or keywords.

[Input]
Research Topic: {title}
Research Description: {description}
Keywords: {keyword}

[Data]
Data list:
{data}

[Output(JSON)]
{{
  "relevant_id": ["ID1", "ID2", "ID3"],
  "reason": [
    "ID1 aligns with the keyword 'Transformer' from the input topic and covers a similar NLP task.",
    "ID2 applies the same 'image classification' methodology described in the input.",
    "ID3 belongs to the 'climate data analysis' domain and shares the same application context as the topic."
  ]
}}
'''


# Wrap the raw text in a PromptTemplate so it can be invoked with a dict of values.
reason_prompt = PromptTemplate.from_template(reason_template)


## 예시 데이터

In [3]:
# title, description, keyword
import json

# Load the query record: the item for which recommendation reasons are generated.
with open("../data/input_data.json", "r", encoding="utf-8") as f:
    input_data = json.load(f)

# Two record layouts are supported:
#   * flat layout:   top-level `dataset_*` keys plus `svc_id`
#   * nested layout: MetaData -> recordList -> record -> item, where every item
#                    carries an "@metaCode" tag and its value under "#text"
# The layout is detected explicitly. The previous bare `except:` used exceptions as
# control flow and also swallowed unrelated errors (even KeyboardInterrupt).
if 'dataset_title_etc_main' in input_data:
    title = input_data['dataset_title_etc_main']
    description = input_data['dataset_expl_etc_main']
    # Fix: this used to re-read 'dataset_expl_etc_main', which made the keywords a
    # copy of the description. 'dataset_kywd_etc_main' is the keyword field of the
    # dataset schema (the same column is mapped to 'keyword' for search results).
    keyword = input_data['dataset_kywd_etc_main']
    input_id = input_data['svc_id']
else:
    items = input_data["MetaData"]["recordList"]["record"]["item"]
    # For each field take the first item whose metaCode matches; a missing
    # metaCode surfaces as StopIteration rather than being silently ignored.
    title = next(i["#text"] for i in items if i["@metaCode"] == "Title")
    description = next(i["#text"] for i in items if i["@metaCode"] == "Abstract")
    keyword = next(i["#text"] for i in items if i["@metaCode"] == "Keyword")
    input_id = next(i["#text"] for i in items if i["@metaCode"] == "CN")
    

In [4]:
# data
import pandas as pd

# Raw search results: one CSV for articles and one for datasets.
df_article = pd.read_csv(
    '../data/search_results_article.csv', encoding='UTF-8', low_memory=False
)
df_data = pd.read_csv(
    '../data/search_results_dataset.csv', encoding='UTF-8', low_memory=False
)

# Source column -> shared column name. The dict order is also the order in which
# the columns are selected; 'query' is kept under its original name.
dataset_column_map = {
    'svc_id': 'ID',
    'dataset_title_etc_main': 'title',
    'dataset_expl_etc_main': 'description',
    'dataset_pub_dt_pc': 'pubyear',
    'dataset_kywd_etc_main': 'keyword',
    'dataset_creator_etc_main': 'author',
    'dataset_lndgpg': 'URL',
}
article_column_map = {
    'CN': 'ID',
    'Title': 'title',
    'Abstract': 'description',
    'Pubyear': 'pubyear',
    'Keyword': 'keyword',
    'Author': 'author',
    'ContentURL': 'URL',
}

# Datasets: select, rename to the shared schema, and tag the source type.
cleaned_df_data = (
    df_data
    .loc[:, [*dataset_column_map, 'query']]
    .rename(columns=dataset_column_map)
    .assign(category='dataset')
)

# Articles: same treatment.
cleaned_df_arti = (
    df_article
    .loc[:, [*article_column_map, 'query']]
    .rename(columns=article_column_map)
    .assign(category='article')
)

# Single candidate table, articles first, with a fresh 0..n-1 index.
df = pd.concat([cleaned_df_arti, cleaned_df_data], ignore_index=True)


In [5]:
# relevance_data
# Load the relevance results table; later code reads its 'ID' and 'relevance' columns.
relevance_df = pd.read_csv('../data/relevance_results.csv', encoding='UTF-8', low_memory=False)

## 작동 방식

In [6]:
# Node
from langchain_openai import ChatOpenAI

# Only the rows whose ID appears in the relevance results are sent to the LLM.
relevant_ids = relevance_df['ID'].tolist()
filtered_df = df[df['ID'].isin(relevant_ids)]

# Records shown to the model, cleaned in two ways:
#   * one record per ID — the candidate table keeps a `query` column, so the same
#     item can presumably occur once per search query; repeats only cost tokens
#     and invite duplicated IDs in the answer
#   * missing cells become '' — otherwise they are rendered as the literal `nan`
candidate_records = (
    filtered_df[['ID', 'title', 'description', 'keyword']]
    .drop_duplicates(subset='ID')
    .fillna('')
    .to_dict(orient="records")
)

prompt = reason_prompt.invoke(
    {
        'title': title,
        'description': description,
        'keyword': keyword,
        'data': candidate_records,
    }
)

# temperature=0 is kept from the original configuration.
sllm = ChatOpenAI(model='gpt-4o-mini', temperature=0)

# Parse the model response straight into the IDRelevance schema.
structured_sllm = sllm.with_structured_output(IDRelevance)
res = structured_sllm.invoke(prompt)

# The schema does not enforce equal list lengths. Fail with a clear message instead
# of pandas' generic "All arrays must be of the same length" (still a ValueError).
if len(res.relevant_id) != len(res.reason):
    raise ValueError(
        f"LLM returned {len(res.relevant_id)} IDs but {len(res.reason)} reasons; "
        "the two lists must be index-aligned."
    )

# One reason per ID: a repeated ID in the response would otherwise multiply rows
# in the left merge below.
reason_df = (
    pd.DataFrame({'ID': res.relevant_id, 'reason': res.reason})
    .drop_duplicates(subset='ID')
)

# Left merge keeps every scored item; IDs the model skipped get a NaN reason.
relevance_df = pd.merge(
    relevance_df[['ID', 'relevance']],
    reason_df,
    on='ID',
    how='left',
)

# relevance_df.to_csv('../data/relevance_results.csv', index=False, encoding='utf-8')

display(relevance_df)

Failed to send compressed multipart ingest: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('422 Client Error: unknown for url: https://api.smith.langchain.com/runs/multipart', '{"error":"Unprocessable entity: invalid JSON part for post.620177fb-aff8-455d-bc35-a849c1c8fd0a: could not unmarshal run: invalid character \'-\' after top-level value"}\n')


KeyboardInterrupt: 