# 연관성 점수 부여

## 예시 데이터

In [None]:
# title, description, keyword
import json

with open("../data/input_data.json", "r", encoding="utf-8") as f:
    input_data = json.load(f)

try:
    title = input_data['dataset_title_etc_main']
    description = input_data['dataset_expl_etc_main']
    keyword = input_data['dataset_expl_etc_main']
    input_id = input_data['svc_id']

except:
    items = input_data["MetaData"]["recordList"]["record"]["item"]
    title = next(i["#text"] for i in items if i["@metaCode"] == "Title")
    description = next(i["#text"] for i in items if i["@metaCode"] == "Abstract")
    keyword = next(i["#text"] for i in items if i["@metaCode"] == "Keyword")
    input_id = next(i["#text"] for i in items if i["@metaCode"] == "CN")
    
state = {'title': title, 'description': description, 'keyword': keyword, 'input_id': input_id}


In [None]:
# data
import pandas as pd

df_article = pd.read_csv('../data/search_results_article.csv', encoding='UTF-8', low_memory=False)
df_data = pd.read_csv('../data/search_results_dataset.csv', encoding='UTF-8', low_memory=False)

cleaned_df_data = (
    df_data[
        ['svc_id', 'dataset_title_etc_main', 'dataset_expl_etc_main','dataset_pub_dt_pc', 'dataset_kywd_etc_main', 'dataset_creator_etc_main', 'dataset_lndgpg', 'query']
    ]
    .rename(
        columns={
            'svc_id': 'ID',
            'dataset_title_etc_main': 'title',
            'dataset_expl_etc_main': 'description',
            'dataset_pub_dt_pc': 'pubyear',
            'dataset_kywd_etc_main': 'keyword',
            'dataset_creator_etc_main': 'author',
            'dataset_lndgpg': 'URL',
        }
    )
)
cleaned_df_data['category'] = 'dataset'

cleaned_df_arti = (
    df_article[
        ['CN', 'Title', 'Abstract', 'Pubyear', 'Keyword', 'Author', 'ContentURL', 'query']
    ]
    .rename(
        columns={
            'CN': 'ID',
            'Title': 'title',
            'Abstract': 'description',
            'Pubyear': 'pubyear',
            'Keyword': 'keyword',
            'Author': 'author',
            'ContentURL': 'URL'
        }
    )
)
cleaned_df_arti['category'] = 'article'

df = pd.concat([cleaned_df_arti, cleaned_df_data], ignore_index=True)

## 임베딩을 통한 연관성 점수 계산

In [8]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
import pandas as pd
from tqdm import tqdm

targets = ['title', 'description', 'keyword']

MAX_LENGTH = 2000

dfs = {}

for target in targets:
    print(f'\n[embedding_{target}]')
    
    df[target] = df[target].fillna('')
    df[target] = df[target].str.slice(0, MAX_LENGTH)

    texts = df[target].tolist()

    embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

    docs = [Document(page_content=text, metadata={"ID": row.ID}) 
            for text, row in zip(texts, df.itertuples())]

    batch_size = 500
    stores = []
    for i in tqdm(range(0, len(docs), batch_size)):
        batch = docs[i:i+batch_size]
        store = FAISS.from_documents(batch, embeddings)
        stores.append(store)

    vectorstore = stores[0]
    for s in stores[1:]:
        vectorstore.merge_from(s)

    query = state[target]
    query_embedding = embeddings.embed_query(query)

    results_with_score = vectorstore.similarity_search_with_score_by_vector(query_embedding, k=20)
    
    dfs[f'df_{target}'] = pd.DataFrame([
        {
            "ID": r.metadata.get("ID"),
            "relevance": score,
            "target": target
        }
        for r, score in results_with_score
    ])
    
merged_df = dfs['df_title'].merge(
    dfs['df_description'][["ID", "relevance"]], 
    on="ID",
    how="outer",
    suffixes=("_title", "_desc")
).merge(
    dfs['df_keyword'][["ID", "relevance"]].rename(columns={"relevance": "relevance_key"}), 
    on="ID",
    how="outer"
)

merged_df = merged_df.fillna(2.0)

merged_df = merged_df.drop_duplicates(subset='ID')

merged_df = merged_df[merged_df['ID'] != state['input_id']]

# 2. 가중치 합산
a, b, c = 10, 3, 1
merged_df["relevance_raw"] = (
    merged_df["relevance_title"] * a + 
    merged_df["relevance_desc"] * b + 
    merged_df["relevance_key"] * c
) / (a + b + c)

merged_df["relevance"] = 100 * (1 - merged_df["relevance_raw"] / 2)

result_df = (merged_df[["ID", "relevance"]]
            .sort_values("relevance", ascending=False)
            .reset_index(drop=True)).head(5)

result_df.to_csv('../data/relevance_results.csv', index=False, encoding='utf-8')

display(result_df)


[embedding_title]


100%|██████████| 1/1 [00:00<00:00,  2.06it/s]



[embedding_description]


100%|██████████| 1/1 [00:01<00:00,  1.90s/it]



[embedding_keyword]


100%|██████████| 1/1 [00:01<00:00,  1.47s/it]


Unnamed: 0,ID,relevance
0,d023e479d6a3e09f0d7988cf38a4436b,92.56636
1,ff96e62579ae3046d133440562968c39,92.310448
2,21f628ecb675030dedda1149f466adae,91.947922
3,e83e64b3b4ea6a9982da08310ea27b1b,91.604317
4,83d26621eaf49e20d987f3d7d4005122,91.574333
