# 연관성 점수 계산 평가

## 평가용 데이터

In [34]:
# title, description, keyword
import json

# Load the record whose related items we want to find.
with open("input_data.json", "r", encoding="utf-8") as f:
    input_data = json.load(f)

# The record is read in one of two layouts: a flat record keyed by
# dataset_* fields, or a nested record whose fields are entries of
# MetaData -> recordList -> record -> item, tagged by "@metaCode".
try:
    title = input_data['dataset_title_etc_main']
    description = input_data['dataset_expl_etc_main']
    # Keywords live in their own field. This previously re-read the
    # description field, so the keyword query duplicated the description.
    keyword = input_data['dataset_kywd_etc_main']
    input_id = input_data['svc_id']
except KeyError:
    # Only a missing flat key triggers the fallback; any other error
    # (e.g. a malformed file) is allowed to surface.
    items = input_data["MetaData"]["recordList"]["record"]["item"]
    title = next(i["#text"] for i in items if i["@metaCode"] == "Title")
    description = next(i["#text"] for i in items if i["@metaCode"] == "Abstract")
    keyword = next(i["#text"] for i in items if i["@metaCode"] == "Keyword")
    input_id = next(i["#text"] for i in items if i["@metaCode"] == "CN")

# Query texts per field plus the record's own ID.
state = {'title': title, 'description': description, 'keyword': keyword, 'input_id': input_id}

print(title)
print(description)
print(keyword)


한국인의 3차원 무릎관절 구축 및 형상 측정
It is necessary to have a model that describes the feature of the knee Joint with a sufficient accuracy. Koreans, however, do not have their own knee joint model to be used in the total knee replacement arthroplasty. They have to use European or American models which do not match Koreans. Three-dimensional visualization techniques are found to be useful in a wide range of medical applications. Three-dimensional imaging studies such as CT(computed tomography) and MRI(magnetic resonance image) provide the primary source of patient-specific data. Three-dimensional knee joint models were constructed by image processing of the CT data of 10 subjects. Using the constructed model, the dimensions of Korean knee joint were measured. And this study proposed a three-dimensional model and data, which can be helpful to develop Korean knee implants and to analyze knee joint movements.
단층촬영사진 . 한국인 무릎 관절 모델 . 인공 관절 치환술


In [45]:
# data
import pandas as pd
import json

# Raw search results, one CSV per source type.
df_article = pd.read_csv('search_results_article.csv', encoding='UTF-8', low_memory=False)
df_data = pd.read_csv('search_results_dataset.csv', encoding='UTF-8', low_memory=False)

# Source column -> unified column name, in output column order.
# The 'query' column is carried over under its own name.
dataset_columns = {
    'svc_id': 'ID',
    'dataset_title_etc_main': 'title',
    'dataset_expl_etc_main': 'description',
    'dataset_pub_dt_pc': 'pubyear',
    'dataset_kywd_etc_main': 'keyword',
    'dataset_creator_etc_main': 'author',
    'dataset_lndgpg': 'URL',
}
article_columns = {
    'CN': 'ID',
    'Title': 'title',
    'Abstract': 'description',
    'Pubyear': 'pubyear',
    'Keyword': 'keyword',
    'Author': 'author',
    'ContentURL': 'URL',
}

# Select the relevant columns, rename them to the shared schema and tag
# every row with the source it came from.
cleaned_df_data = df_data[[*dataset_columns, 'query']].rename(columns=dataset_columns)
cleaned_df_data['category'] = 'dataset'

cleaned_df_arti = df_article[[*article_columns, 'query']].rename(columns=article_columns)
cleaned_df_arti['category'] = 'article'

# Articles first, then datasets, with a fresh 0..n-1 index.
original_df = pd.concat([cleaned_df_arti, cleaned_df_data], ignore_index=True)

# Flag rows 0, 5 and 10 as relevant; all other rows get no value in
# the new 'relevance_check' column.
original_df.loc[[0, 5, 10], 'relevance_check'] = True

# Persist the combined table as a list of row dicts.
records = original_df.to_dict(orient='records')
with open('search_data.json', 'w', encoding='utf-8') as f:
    json.dump(records, f, ensure_ascii=False, indent=4)

In [46]:
# Reload the combined search results from the JSON file.
with open('search_data.json', 'r', encoding='utf-8') as f:
    records = json.load(f)

original_df = pd.DataFrame(records)

## 연관성 점수 계산 코드

In [36]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
import pandas as pd
from tqdm import tqdm

# Fields compared between the input record (state) and every candidate row.
targets = ['title', 'description', 'keyword']

# Candidate texts are cut to this many characters before being embedded.
MAX_LENGTH = 2000
# Number of documents passed to each FAISS.from_documents call.
batch_size = 500
# Number of nearest neighbours retrieved per target field.
top_k = 20

# The embedding client does not depend on the target, so build it once
# instead of once per loop iteration.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

if original_df.empty:
    # Without this check the code below fails on stores[0] with an
    # uninformative IndexError.
    raise ValueError("original_df has no rows to embed")

# Per-target search results, keyed 'df_<target>'.
dfs = {}

for target in targets:
    print(f'\n[embedding_{target}]')

    # NOTE: this normalises the column of original_df in place
    # (missing values become '', text is truncated to MAX_LENGTH).
    original_df[target] = original_df[target].fillna('').str.slice(0, MAX_LENGTH)

    texts = original_df[target].tolist()

    # One document per row, carrying the row's ID as metadata so search
    # hits can be mapped back to rows.
    docs = [Document(page_content=text, metadata={"ID": row_id})
            for text, row_id in zip(texts, original_df['ID'])]

    # Build one store per batch, then fold them all into the first one.
    stores = [
        FAISS.from_documents(docs[i:i + batch_size], embeddings)
        for i in tqdm(range(0, len(docs), batch_size))
    ]
    vectorstore = stores[0]
    for s in stores[1:]:
        vectorstore.merge_from(s)

    # Embed the input record's text for this field and fetch its
    # nearest neighbours together with their scores.
    query = state[target]
    query_embedding = embeddings.embed_query(query)

    results_with_score = vectorstore.similarity_search_with_score_by_vector(
        query_embedding, k=top_k
    )

    # NOTE(review): the score is treated downstream as a distance
    # (smaller = more similar) - confirm against the FAISS store settings.
    dfs[f'df_{target}'] = pd.DataFrame([
        {
            "ID": r.metadata.get("ID"),
            "relevance": score,
            "target": target
        }
        for r, score in results_with_score
    ])
    
# Outer-join the per-target results on ID so a row found for any target
# is kept. Score columns end up as relevance_title / relevance_desc /
# relevance_key.
merged_df = dfs['df_title'].merge(
    dfs['df_description'][["ID", "relevance"]],
    on="ID",
    how="outer",
    suffixes=("_title", "_desc")
).merge(
    dfs['df_keyword'][["ID", "relevance"]].rename(columns={"relevance": "relevance_key"}),
    on="ID",
    how="outer"
)

# A row missing from one target's search result has no score for it.
# Give it 2.0, which the formula below maps to a relevance of 0.
# Only the score columns are filled: a frame-wide fillna would also
# write the float 2.0 into the string 'target' column.
score_columns = ["relevance_title", "relevance_desc", "relevance_key"]
merged_df[score_columns] = merged_df[score_columns].fillna(2.0)

# Keep a single row per ID.
merged_df = merged_df.drop_duplicates(subset='ID')

# The input record must not be recommended to itself.
merged_df = merged_df[merged_df['ID'] != state['input_id']]

# Weighted mean of the three scores (title : description : keyword).
a, b, c = 10, 3, 1
merged_df["relevance_raw"] = (
    merged_df["relevance_title"] * a +
    merged_df["relevance_desc"] * b +
    merged_df["relevance_key"] * c
) / (a + b + c)

# Rescale so that a raw score of 0 becomes 100 and a raw score of 2
# becomes 0.
merged_df["relevance"] = 100 * (1 - merged_df["relevance_raw"] / 2)

# Final ranking, most relevant first.
result_df = (merged_df[["ID", "relevance"]]
            .sort_values("relevance", ascending=False)
            .reset_index(drop=True))

# result_df.to_csv('../data/relevance_results.csv', index=False, encoding='utf-8')

display(result_df)


[embedding_title]


100%|██████████| 1/1 [00:02<00:00,  2.17s/it]



[embedding_description]


100%|██████████| 1/1 [00:01<00:00,  1.44s/it]



[embedding_keyword]


100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


Unnamed: 0,ID,relevance
0,DIKO0010027329,92.396706
1,NART73608690,53.660782
2,NART51664194,51.526726
3,f1bd34e52434a57a01b9ab55a6670891,51.36689
4,NPAP13253595,50.31488
5,NART116074894,48.328461
6,NART92905321,47.865391
7,7ce0df55f74ec28dcb81928845f63ec4,46.194683
8,NART68187596,46.167149
9,90fdbc2e1fd69bafb7e5cff6ea228f15,45.372944


## Recall@10 평가 결과

In [44]:
# Ground truth: IDs of the rows flagged with relevance_check == True.
df_true = original_df[['ID', 'relevance_check']]

# De-duplicate (keeping first-seen order) so each relevant ID counts
# once. The hit count below is set-based, so a repeated ID in this list
# would inflate only the denominator and understate recall.
true_ids = list(dict.fromkeys(
    df_true.loc[df_true["relevance_check"] == True, "ID"]
))
pred_ids = result_df['ID'].tolist()

# Recall@k = (relevant IDs among the top-k predictions) / (all relevant IDs).
k = 10
topk_pred = pred_ids[:k]

hit = len(set(topk_pred) & set(true_ids))
# With no relevant IDs, recall is reported as 0.
recall_at_k = hit / len(true_ids) if true_ids else 0


print(f"Recall@{k}: {recall_at_k:.4f}")

Recall@10: 0.0693
