In [13]:
# input data
import json
import pandas as pd

with open("../data/input_data.json", "r", encoding="utf-8") as f:
    input_data = json.load(f)
    
subject, description = input_data['dataset_title_etc_main'], input_data['dataset_expl_etc_main']

df_article = pd.read_csv('../data/search_results_article.csv', encoding='UTF-8', low_memory=False)
df_data = pd.read_csv('../data/search_results_dataset.csv', encoding='UTF-8', low_memory=False)

cleaned_df_data = (
    df_data[
        ['svc_id', 'dataset_title_etc_main', 'dataset_expl_etc_main','dataset_pub_dt_pc', 'dataset_kywd_etc_main', 'dataset_creator_etc_main', 'dataset_lndgpg', 'query']
    ]
    .rename(
        columns={
            'svc_id': 'ID',
            'dataset_title_etc_main': 'title',
            'dataset_expl_etc_main': 'description',
            'dataset_pub_dt_pc': 'pubyear',
            'dataset_kywd_etc_main': 'keyword',
            'dataset_creator_etc_main': 'author',
            'dataset_lndgpg': 'URL',
        }
    )
)
cleaned_df_data['category'] = 'dataset'

cleaned_df_arti = (
    df_article[
        ['CN', 'Title', 'Abstract', 'Pubyear', 'Keyword', 'Author', 'ContentURL', 'query']
    ]
    .rename(
        columns={
            'CN': 'ID',
            'Title': 'title',
            'Abstract': 'description',
            'Pubyear': 'pubyear',
            'Keyword': 'keyword',
            'Author': 'author',
            'ContentURL': 'URL'
        }
    )
)
cleaned_df_arti['category'] = 'article'

df = pd.concat([cleaned_df_arti, cleaned_df_data], ignore_index=True)

# 임베딩을 통한 점수 생성 방법

In [14]:
# text-embedding-3-small 사용

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
import pandas as pd
from tqdm import tqdm

df['title'] = df['title'].fillna('')
df['description'] = df['description'].fillna('')

MAX_LENGTH = 1000
df['description'] = df['description'].str.slice(0, MAX_LENGTH)

texts = (df['title'] + " " +
        df['description']).tolist()

# 2. 임베딩 객체 생성
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# 3. Document 객체로 변환
docs = [Document(page_content=text, metadata={"ID": row.ID}) 
        for text, row in zip(texts, df.itertuples())]

batch_size = 500
stores = []
for i in tqdm(range(0, len(docs), batch_size)):
    batch = docs[i:i+batch_size]
    store = FAISS.from_documents(batch, embeddings)
    stores.append(store)

# 부분 벡터스토어 통합
vectorstore = stores[0]
for s in stores[1:]:
    vectorstore.merge_from(s)

# 5. 쿼리 임베딩 생성
query = subject + " " + description
query_embedding = embeddings.embed_query(query)

# 6. 유사 문서 검색
results_with_score = vectorstore.similarity_search_with_score_by_vector(query_embedding, k=20)

# 7. 결과 출력
relevance_df = pd.DataFrame([
    {
        "ID": r.metadata.get("ID"),
        "relevance": score,
    }
    for r, score in results_with_score
]).sort_values(by="relevance", ascending=True)

relevance_df = relevance_df.merge(
    df[['ID', 'title', 'description']],
    on='ID',
    how='left'
)

display(relevance_df)

100%|██████████| 1/1 [00:01<00:00,  1.30s/it]


Unnamed: 0,ID,relevance,title,description
0,b37f0c9413eeb7c45f6fe31cbe3a41ef,0.032391,Architectural Urbanism: Melbourne/Seoul - KTA ...,BACKGROUND: 'Architectural Urbanism: Melbourne...
1,a83eb22a4a62949d83c70662acdf439a,0.572106,"Urban Architectures (Court House, George+Murph...","RESEARCH BACKGROUND: Court House, George+Murph..."
2,4f8e04de8044dd201965353514748c13,0.765774,Urban box,RESEARCH BACKGROUND: Urban Box was one of 4 pr...
3,8a296da6f0f22b607daeabd129f789c3,0.815034,Saturation City,BACKGROUND In 2010 the researcher's collaborat...
4,27be08ea55b6e29be994c74b13b82595,0.817217,Crownie,'Peter Corrigan: cities of hope' exhibition at...
5,35eeb76af7e1bdb0e31f1d28dbd315fb,0.841749,Section (Charles Street House) + Model (Tarilt...,RESEARCH BACKGROUND: X-field is an interdiscip...
6,c9956a897c5342cadc56442d408817a2,0.874056,Urban Still Lifes (with mops and brooms): Beij...,RESEARCH BACKGROUND: The 'Urban Still Life (wi...
7,5af7ee092c0425a4424cd6fec7a3e1ac,0.914127,K2K Design Proposal,"BACKGROUND: The K2K design proposal, developed..."
8,87b79f3640c97ad515c13351fe526224,0.918718,Carved Ground: residence and studio,RESEARCH BACKGROUND: 'Carved ground' is one st...
9,NART70277933,0.929998,Large screens as creative clusters,This paper begins by situating the large scree...


## 분야별로 나누기

In [15]:
# 제목

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
import pandas as pd
from tqdm import tqdm

df['title'] = df['title'].fillna('')

texts = (df['title']).tolist()

# 2. 임베딩 객체 생성
embeddings_title = OpenAIEmbeddings(model="text-embedding-3-small")

# 3. Document 객체로 변환
docs_title = [Document(page_content=text, metadata={"ID": row.ID}) 
        for text, row in zip(texts, df.itertuples())]

batch_size = 500
stores = []
for i in tqdm(range(0, len(docs_title), batch_size)):
    batch = docs_title[i:i+batch_size]
    store = FAISS.from_documents(batch, embeddings_title)
    stores.append(store)

# 부분 벡터스토어 통합
vectorstore = stores[0]
for s in stores[1:]:
    vectorstore.merge_from(s)

# 5. 쿼리 임베딩 생성
query = subject
query_embedding = embeddings_title.embed_query(query)

# 6. 유사 문서 검색
results_with_score = vectorstore.similarity_search_with_score_by_vector(query_embedding, k=len(texts))

# 7. 결과 출력
relevance_df_title = pd.DataFrame([
    {
        "ID": r.metadata.get("ID"),
        "relevance": score,
    }
    for r, score in results_with_score
]).sort_values(by="relevance", ascending=True)

relevance_df_title = relevance_df_title.merge(
    df[['ID', 'category',  'title', 'description']],
    on='ID',
    how='left'
)

display(relevance_df_title)

100%|██████████| 1/1 [00:01<00:00,  1.21s/it]


Unnamed: 0,ID,relevance,category,title,description
0,b37f0c9413eeb7c45f6fe31cbe3a41ef,8.833030e-07,dataset,Architectural Urbanism: Melbourne/Seoul - KTA ...,BACKGROUND: 'Architectural Urbanism: Melbourne...
1,a83eb22a4a62949d83c70662acdf439a,7.597810e-01,dataset,"Urban Architectures (Court House, George+Murph...","RESEARCH BACKGROUND: Court House, George+Murph..."
2,NPAP14424286,8.514366e-01,article,Innovative tools for implementing the smart ci...,
3,NART90662159,8.575770e-01,article,Urban Sustainability through Public Architecture,<P>As the sustainability of contemporary citie...
4,NART130570112,8.681008e-01,article,Sustainable Urbanism and Architectural Design:...,<P>This academic article delves into the inter...
...,...,...,...,...,...
160,384ac3e2a4cf0d16d11758f78c26426c,1.873814e+00,dataset,"Additional file 23 of Implicating genes, pleio...",<b>External Organisations</b><br/>Queen Mary U...
161,0d0b434d5bc6537b3d40f690a425179a,1.874088e+00,dataset,"Additional file 25 of Implicating genes, pleio...",<b>External Organisations</b><br/>Queen Mary U...
162,0efc6d4fbaf6ae848785b3f7716cafa7,1.875190e+00,dataset,"Additional file 29 of Implicating genes, pleio...",<b>External Organisations</b><br/>Queen Mary U...
163,5d24ce68a454432a4d8934cd99f506a1,1.875922e+00,dataset,"Additional file 8 of Implicating genes, pleiot...",<b>External Organisations</b><br/>Queen Mary U...


In [16]:
# 설명

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
import pandas as pd
from tqdm import tqdm

df['description'] = df['description'].fillna('')

MAX_LENGTH = 1000
df['description'] = df['description'].str.slice(0, MAX_LENGTH)

texts = (df['description']).tolist()

# 2. 임베딩 객체 생성
embeddings_description = OpenAIEmbeddings(model="text-embedding-3-small")

# 3. Document 객체로 변환
docs_description = [Document(page_content=text, metadata={"ID": row.ID}) 
        for text, row in zip(texts, df.itertuples())]

batch_size = 500
stores = []
for i in tqdm(range(0, len(docs_description), batch_size)):
    batch = docs_description[i:i+batch_size]
    store = FAISS.from_documents(batch, embeddings_description)
    stores.append(store)

# 부분 벡터스토어 통합
vectorstore = stores[0]
for s in stores[1:]:
    vectorstore.merge_from(s)

# 5. 쿼리 임베딩 생성
query = description
query_embedding = embeddings_description.embed_query(query)

# 6. 유사 문서 검색
results_with_score = vectorstore.similarity_search_with_score_by_vector(query_embedding, k=len(texts))

# 7. 결과 출력
relevance_df_description = pd.DataFrame([
    {
        "ID": r.metadata.get("ID"),
        "relevance": score,
    }
    for r, score in results_with_score
]).sort_values(by="relevance", ascending=True)

relevance_df_description = relevance_df_description.merge(
    df[['ID', 'category',  'title', 'description']],
    on='ID',
    how='left'
)

display(relevance_df_description)

100%|██████████| 1/1 [00:01<00:00,  1.09s/it]


Unnamed: 0,ID,relevance,category,title,description
0,b37f0c9413eeb7c45f6fe31cbe3a41ef,0.045485,dataset,Architectural Urbanism: Melbourne/Seoul - KTA ...,BACKGROUND: 'Architectural Urbanism: Melbourne...
1,a83eb22a4a62949d83c70662acdf439a,0.536102,dataset,"Urban Architectures (Court House, George+Murph...","RESEARCH BACKGROUND: Court House, George+Murph..."
2,4f8e04de8044dd201965353514748c13,0.689401,dataset,Urban box,RESEARCH BACKGROUND: Urban Box was one of 4 pr...
3,8a296da6f0f22b607daeabd129f789c3,0.722418,dataset,Saturation City,BACKGROUND In 2010 the researcher's collaborat...
4,27be08ea55b6e29be994c74b13b82595,0.748554,dataset,Crownie,'Peter Corrigan: cities of hope' exhibition at...
...,...,...,...,...,...
160,NART80860267,1.699124,article,Reduced orbitofrontal cortical thickness in ma...,<P><B>Background</B></P><P>The orbitofrontal c...
161,NART70983995,1.699389,article,COMT genotype affects brain white matter pathw...,<P><B>Abstract</B></P><P>Increased dopamine av...
162,NART56990519,1.719598,article,Point target probabilistic multiple hypothesis...,<P>Probabilistic Multiple Hypothesis Tracking ...
163,NPAP13611771,1.738940,article,Movie Recommendation System Using Deep Learning,<P>It is a challenge to design a movie recomme...


In [17]:
# 키워드

from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
import pandas as pd
from tqdm import tqdm

df['keyword'] = df['keyword'].fillna('')

texts = (df['keyword']).tolist()

# 2. 임베딩 객체 생성
embeddings_keyword = OpenAIEmbeddings(model="text-embedding-3-small")

# 3. Document 객체로 변환
docs_keyword = [Document(page_content=text, metadata={"ID": row.ID}) 
        for text, row in zip(texts, df.itertuples())]

batch_size = 500
stores = []
for i in tqdm(range(0, len(docs_keyword), batch_size)):
    batch = docs_keyword[i:i+batch_size]
    store = FAISS.from_documents(batch, embeddings_keyword)
    stores.append(store)

# 부분 벡터스토어 통합
vectorstore = stores[0]
for s in stores[1:]:
    vectorstore.merge_from(s)

# 5. 쿼리 임베딩 생성
query = description
query_embedding = embeddings_keyword.embed_query(query)

# 6. 유사 문서 검색
results_with_score = vectorstore.similarity_search_with_score_by_vector(query_embedding, k=len(texts))

# 7. 결과 출력
relevance_df_keyword = pd.DataFrame([
    {
        "ID": r.metadata.get("ID"),
        "relevance": score,
    }
    for r, score in results_with_score
]).sort_values(by="relevance", ascending=True)

relevance_df_keyword = relevance_df_keyword.merge(
    df[['ID', 'category',  'title', 'description']],
    on='ID',
    how='left'
)

display(relevance_df_keyword)

100%|██████████| 1/1 [00:01<00:00,  1.00s/it]


Unnamed: 0,ID,relevance,category,title,description
0,NART70277933,1.012413,article,Large screens as creative clusters,This paper begins by situating the large scree...
1,b64e2464140cba265a4242ff3f39f786,1.073443,dataset,Data underlying the PhD thesis: Improving Resi...,"<p>The dataset comprises questionnaire data, a..."
2,16a6f3a0aca07cdb01f995ff888abde2,1.073497,dataset,Data underlying the PhD thesis: Improving Resi...,"<p>The dataset comprises questionnaire data, a..."
3,NART98585046,1.253776,article,A guide to architecture for the public health ...,<P><B>Abstract</B></P> <P><B>Background</B></...
4,22a8ba2b86a9dd291f48148552bbd942,1.259563,dataset,Data underlying the publication: The role of u...,<p>This dataset contains the results of a pape...
...,...,...,...,...,...
160,06de034416353eceb124aaee9b78efd1,1.659591,dataset,Stadtplanungsplan Sant Pau de Segúries - März ...,Le Plan d’urbanisme municipal (POUM) de Sant P...
161,NART70983995,1.665227,article,COMT genotype affects brain white matter pathw...,<P><B>Abstract</B></P><P>Increased dopamine av...
162,eda8eb47c92f99bc97c1ad14fa1c4887,1.683394,dataset,Origin of minicircular mitochondrial genomes i...,Sample preparation Culture strains of Tsunamia...
163,NART80860267,1.705854,article,Reduced orbitofrontal cortical thickness in ma...,<P><B>Background</B></P><P>The orbitofrontal c...


In [18]:
# 1. 세 개의 데이터프레임 한 번에 병합
merged_df = relevance_df_title.merge(
    relevance_df_description[["ID", "relevance"]], 
    on="ID",
    suffixes=("_title", "_desc")
).merge(
    relevance_df_keyword[["ID", "relevance"]].rename(columns={"relevance": "relevance_key"}), 
    on="ID",
)

# 2. 가중치 합산
a, b, c = 10, 3, 1
merged_df["relevance_raw"] = (
    merged_df["relevance_title"] * a + 
    merged_df["relevance_desc"] * b + 
    merged_df["relevance_key"] * c
) / (a + b + c)

merged_df["relevance_score"] = 100 * (1 - merged_df["relevance_raw"] / 2)

# 3. 필요한 열만 선택 및 정렬
result_df = (merged_df[["ID", "relevance_score", "category", "title", "description"]]
             .rename(columns={"relevance_score": "relevance"})
             .sort_values("relevance", ascending=False)
             .reset_index(drop=True))


display(result_df)

Unnamed: 0,ID,relevance,category,title,description
0,b37f0c9413eeb7c45f6fe31cbe3a41ef,93.680328,dataset,Architectural Urbanism: Melbourne/Seoul - KTA ...,BACKGROUND: 'Architectural Urbanism: Melbourne...
1,a83eb22a4a62949d83c70662acdf439a,61.288715,dataset,"Urban Architectures (Court House, George+Murph...","RESEARCH BACKGROUND: Court House, George+Murph..."
2,NART90662159,52.417946,article,Urban Sustainability through Public Architecture,<P>As the sustainability of contemporary citie...
3,NART130570112,51.114548,article,Sustainable Urbanism and Architectural Design:...,<P>This academic article delves into the inter...
4,175af073d420cd3aa8b18fc8f7360e4e,50.354046,dataset,Master plan for Dandenong Civic Centre - Rush\...,RESEARCH BACKGROUND: Paul Carter's Material Th...
...,...,...,...,...,...
160,0d0b434d5bc6537b3d40f690a425179a,11.634874,dataset,"Additional file 25 of Implicating genes, pleio...",<b>External Organisations</b><br/>Queen Mary U...
161,5d24ce68a454432a4d8934cd99f506a1,11.631500,dataset,"Additional file 8 of Implicating genes, pleiot...",<b>External Organisations</b><br/>Queen Mary U...
162,4af1b0a382e6e6eb8d45722bc496798b,11.513578,dataset,"Additional file 27 of Implicating genes, pleio...",<b>External Organisations</b><br/>Queen Mary U...
163,b33846af9bccac4abbce4af23a43aa0a,9.950542,dataset,He must not cry,An inaugural partnership between ACMI and Asia...


In [None]:
# 로컬 모델 사용
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
import pandas as pd

# 검색할 주제와 설명
subject = '기후 데이터 기반의 도시 열섬 현상 분석'
description = '위성 원격탐사 데이터와 국내 기상 관측소 데이터를 결합하여 도시 열섬 현상의 시공간적 패턴을 분석하고, 인공지능 기반 예측 모델을 통해 기후 변화 대응 전략을 모색하는 연구'

# 데이터 로드
processed_df = pd.read_csv('./generated_input_data.csv', encoding='UTF-8', low_memory=False)
vector_df = processed_df[['ID', '제목', '설명', '키워드']]

# 텍스트 결합
texts = (vector_df['제목'] + " " + vector_df['설명'] + " " + vector_df['키워드']).tolist()

# 임베딩 객체 생성
embeddings = HuggingFaceEmbeddings(
    model_name="../model/all-mpnet-base-v2",
    model_kwargs={"device": "cpu"}
)

# Document 객체 생성
docs = [Document(page_content=text, metadata={"id": row.ID}) 
        for text, row in zip(texts, vector_df.itertuples())]

# Chroma 벡터스토어 생성
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    # persist_directory="./chroma_db"
)

# 쿼리 생성
query = subject + " " + description

# 유사 문서 20개 검색
k = 20
similar_docs = vectorstore.similarity_search(query, k=k)

# 결과 DataFrame 생성
df_results = pd.DataFrame([
    {
        "ID": doc.metadata.get("id"),
        "제목": vector_df.loc[vector_df['ID'] == doc.metadata.get("id"), "제목"].values[0],
        "설명": vector_df.loc[vector_df['ID'] == doc.metadata.get("id"), "설명"].values[0]
    }
    for doc in similar_docs
])

display(df_results)
