### Embeddings

In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
sys.path.append(os.path.abspath(os.path.join(os.path.join(os.getcwd(), '..'), '..')))

from pathlib import Path
from vectorstore import VectorDB
from schemas.models import get_milvus_settings
from langchain_openai.embeddings import OpenAIEmbeddings

BASE_DIR = Path.cwd().parent
data_folder = os.path.join(BASE_DIR,'vectordb', 'data') 
file_list = os.listdir(data_folder)

milvus_settings = get_milvus_settings()
vector_db = VectorDB(settings=milvus_settings, embedding=OpenAIEmbeddings())

[ 2024-10-17 20:34:27.942505 ] : connect_to_milvus()


In [2]:
retriever = vector_db.get_retriever(collection_name="document_embeddings")
retriever.invoke("안녕하세요")

[ 2024-10-17 20:34:31.349486 ] : get_vectorstore()


[Document(metadata={'doc_name': 'guide.pdf', 'page_number': 21, 'creation_time': 1729026894.7537396, 'modification_time': 1728916150.4873421, 'key': 'guide.pdf_21_a40e5ec5ad704498f5ce554ec271953985290c9860974d38125d32dea0167615', 'pk': 453224949084361492}, page_content='※ 개인신용정보 전송 요구로부터 5분이 경과하였을 경우 지연으로 봄'),
 Document(metadata={'doc_name': 'guide.pdf', 'page_number': 92, 'creation_time': 1729026894.7537396, 'modification_time': 1728916150.4873421, 'key': 'guide.pdf_92_9fe4a62eaaac988dcdc8b0fb1cf824de4ec654ea130cf9b71426fa4ac52c761c', 'pk': 453224949084361799}, page_content='되었을 시, 서면, 전화, 전자우편, 휴대전화 문자메시지(SMS) 등을 통해 지체없이 해당 \n고객에게 통지하여야 한다.\n개인신용정보 누설의 예\n∙ 신용정보회사등이 개인신용정보에 대하여 통제를 상실하거나 권한 없는 자의 접근을 허용한 \n경우로서 아래의 예시 및 이와 유사한 경우 등에는 개인신용정보 누설로 볼 수 있음'),
 Document(metadata={'doc_name': 'api.pdf', 'page_number': 307, 'creation_time': 1729026884.7069488, 'modification_time': 1728916150.4709036, 'key': 'api.pdf_307_714dd023c10cf0d6efec63f7043e5efbd262ae7aac9e4a5fb70c32091be97fc3', 'pk':

In [2]:
vector_db._drop_database(collection_name="document_embeddings")

[ 2024-10-16 08:07:40.757646 ] : drop_database()


In [3]:
for file in file_list:
    print(file)
    file_path = os.path.join(data_folder, file)
    vector_db.core_embedding(collection_name="document_embeddings", file_path=file_path)

guide.pdf
[ 2024-10-16 08:07:44.134683 ] : core_embedding()
[ 2024-10-16 08:07:44.336309 ] : get_filtered_docs()
[ 2024-10-16 08:07:44.336330 ] : check_collection_exists()
[ 2024-10-16 08:07:51.549148 ] : Milvus collection 'document_embeddings' updated.
api.pdf
[ 2024-10-16 08:07:51.549353 ] : core_embedding()
[ 2024-10-16 08:07:53.883038 ] : get_filtered_docs()
[ 2024-10-16 08:07:53.883118 ] : check_collection_exists()
[ 2024-10-16 08:07:54.045926 ] : get_collection()


100%|██████████| 2019/2019 [01:01<00:00, 32.90it/s]


[ 2024-10-16 08:09:15.847838 ] : Milvus collection 'document_embeddings' updated.


### Embedding 수행 결과 확인

In [5]:
from collections import Counter

collection = vector_db._get_collection(collection_name="document_embeddings")
results = collection.query(
    expr="",                    # 조건 없이 모든 데이터 가져옴
    output_fields=["doc_name"],
    limit=10000       
)
# doc_name 필드를 카운트
doc_names = [result["doc_name"] for result in results]
doc_name_counts = Counter(doc_names)

# 결과 출력
for doc_name, count in doc_name_counts.items():
    print(f"Document Name: {doc_name}, Count: {count}")

[ 2024-10-16 08:10:39.187087 ] : get_collection()
Document Name: guide.pdf, Count: 544
Document Name: api.pdf, Count: 2019
