# 검색어 생성

## 결과물 형태

In [6]:
# Schema
from typing_extensions import Annotated
from pydantic import Field, BaseModel

# Structured-output schema: the shape the model's answer is parsed into
# (it is passed to `with_structured_output` further down in this file).
# NOTE(review): documented with `#` comments on purpose; a class docstring
# would presumably be copied into the generated JSON schema by pydantic and so
# change what is sent to the model -- confirm before adding one.
class QueryResult(BaseModel):
    # `query` is the list of generated search strings.
    # `...` marks the field as required, and min_length == max_length == 5 pins
    # the list to exactly five entries. Keep this in step with `query_length`,
    # which the prompt uses for its final selection count.
    # The description text is Korean for "list of the most appropriate search
    # queries"; it is runtime text, so it is left untouched.
    query: Annotated[
        list[str],
        Field(
            ..., 
            max_length=5, 
            min_length=5,
            description="가장 적절한 검색어들의 리스트", 
        )
    ]
    

## 프롬프트

In [7]:
from langchain_core.prompts import PromptTemplate

# Number of queries the model is told to keep in the final selection step.
# NOTE: QueryResult pins its list length to 5 as well; change both together.
query_length = 5

# Prompt
# Instruction half of the prompt. This is an f-string, so {query_length} is
# substituted right here. The resulting text must stay free of literal braces,
# because PromptTemplate parses the concatenated string below and would treat
# any `{name}` as an input variable.
#
# The balance rule in step 3 uses ranges (1–2 of each kind) so that it can be
# satisfied by the 8–10 candidates produced in step 2 and by the final top-N.
query_template_1 = f'''
Generate search queries to find the most semantically relevant papers and datasets based on the given title, description and keywords.

[Search Engine Constraints]
- The search is exact-match based.
- Keep queries short and cohesive.
- Each query should consist of 2–3 words, preferably a combination of proper nouns or technical topics.

[Generation Procedure]
1. Identify Core Keywords: Summarize the main objects, methodology, domain, and application context of the research topic in one sentence.
2. Generate Candidates: Tentatively generate 8–10 query candidates, each with 2–3 words.
3. Apply Filtering Rules:
    - Remove generic, overly broad, or ambiguous expressions (e.g., “AI model”, “data analysis”).
    - Keep a balanced mix — 1–2 method-centered, 1–2 domain-centered, and 1–2 application-scenario-centered queries.
    - Exclude entries consisting only of abbreviations, but allow widely used ones (e.g., “BERT” is allowed, “ML” alone is not).
    - Do not use hyphens, special characters, or quotation marks.
4. Final Selection: Retain only the top {query_length}.

[Prohibitions]
- Single-word queries are not allowed.
- Queries longer than 4 words are not allowed.
- Do not leave only stopword combinations (e.g., “for research”).
- Do not violate the JSON schema.

'''

# Input/output half of the prompt. This is a plain (non-f) string: {title},
# {description} and {keyword} are left as placeholders for PromptTemplate to
# fill at invoke time, and the doubled braces are escapes that render as the
# literal braces of the JSON example.
query_template_2 = '''
[Input]
- Research Topic: {title}
- Research Description: {description}
- Keywords: {keyword}

[Output]
Output in the following JSON format:

{{
"query": []
}}
'''

# Full template = instructions + input/output section.
query_prompt = PromptTemplate.from_template(query_template_1 + query_template_2)

## 예시 데이터

In [8]:
import json

# title/description/keyword
# Load the input record. Two layouts are handled below: a flat record with
# `dataset_*_etc_main` keys, and a nested `MetaData` record whose items are
# tagged with `@metaCode`.
with open("../data/input_data.json", "r", encoding="utf-8") as f:
    input_data = json.load(f)

try:
    # Flat layout: all three fields are top-level keys. The tuple assignment
    # binds nothing unless every key is present.
    title, description, keyword = (
        input_data['dataset_title_etc_main'],
        input_data['dataset_expl_etc_main'],
        input_data['dataset_kywd_etc_main'],
    )
except KeyError:
    # Only a missing key means "this is the other layout". Anything else
    # (e.g. KeyboardInterrupt) must propagate rather than be swallowed by a
    # bare `except`.
    items = input_data["MetaData"]["recordList"]["record"]["item"]
    # Take the text of the first item carrying each metaCode; `next` raises
    # StopIteration if a code is absent from the record.
    title = next(i["#text"] for i in items if i["@metaCode"] == "Title")
    description = next(i["#text"] for i in items if i["@metaCode"] == "Abstract")
    keyword = next(i["#text"] for i in items if i["@metaCode"] == "Keyword")


## 작동 방식

In [10]:
from langchain_openai import ChatOpenAI

# Fill the template with the three fields taken from the input record.
# (All top-level names below are kept as-is; later cells may read them.)
prompt = query_prompt.invoke(
    dict(title=title, description=description, keyword=keyword)
)

# sLLM: chat model at temperature 0, wrapped so that its reply is parsed
# into a QueryResult instance.
sllm = ChatOpenAI(temperature=0, model='gpt-4o-mini')
structured_sllm = sllm.with_structured_output(QueryResult)

# Single call to the model; `query` is the list of generated search strings.
res = structured_sllm.invoke(prompt)
query = res.query

# Optional: persist the queries, one per line.
# with open('../data/queries.txt', 'w', encoding='utf-8') as sf:
#     sf.write('\n'.join(query))

print(query)

['Ross Sea core', 'Antarctic climate', 'gravity core', 'marine sediments', 'sedimentation study']
