In [None]:
# 벡터DB
# pip install faiss-cpu

In [1]:
import os
import warnings
from dotenv import load_dotenv

# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# Environment variable needed to use ChromaDB and FAISS together.
# NOTE(review): presumably works around a duplicate OpenMP runtime being loaded — confirm.
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})

# Read settings from a .env file; the call's return value is the cell output.
load_dotenv()

True

In [5]:
# 문서 읽기
# pip install pymupdf tiktoken
from langchain_community.document_loaders import PyMuPDFLoader

# Read every PDF found under the dataset folder (sub-folders included).
rag_folder_path = 'rag-dataset'             # folder path
if not os.path.isdir(rag_folder_path):
    # os.walk() ignores a missing folder and simply yields nothing, which would
    # leave `docs` empty and only fail in a later cell; stop here instead.
    raise FileNotFoundError(f"RAG dataset folder not found: {rag_folder_path!r}")

# Match the extension case-insensitively ('.PDF' as well as '.pdf') and sort the
# paths so the document order does not depend on the file system's listing order.
pdfs = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(rag_folder_path)
    for file in files
    if file.lower().endswith('.pdf')
)

# Load each file and collect the resulting Document objects into one flat list.
docs = []
for pdf in pdfs:
    docs.extend(PyMuPDFLoader(pdf).load())

docs
# Alternative quick checks:
# len(docs)
# docs[0].page_content

[Document(metadata={'producer': 'iLovePDF', 'creator': '', 'creationdate': '', 'source': 'rag-dataset\\gym supplements\\1. Analysis of Actual Fitness Supplement.pdf', 'file_path': 'rag-dataset\\gym supplements\\1. Analysis of Actual Fitness Supplement.pdf', 'total_pages': 15, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-10-21T11:38:50+00:00', 'trapped': '', 'page': 0}, page_content='Citation: Espeño, P.R.; Ong, A.K.S.;\nGerman, J.D.; Gumasing, M.J.J.; Casas,\nE.S. Analysis of Actual Fitness\nSupplement Consumption among\nHealth and Fitness Enthusiasts. Foods\n2024, 13, 1424. https://doi.org/\n10.3390/foods13091424\nAcademic Editors: Ilija Djekic\nand Nada Smigic\nReceived: 30 March 2024\nRevised: 15 April 2024\nAccepted: 18 April 2024\nPublished: 6 May 2024\nCopyright: © 2024 by the authors.\nLicensee MDPI, Basel, Switzerland.\nThis article is an open access article\ndistributed\nunder\nthe\nterms\nand\nconditions of the Creative Commo

In [6]:
# 문서 청크나누기
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Character-based chunk sizes: 1000 characters is roughly a 200-300 token chunk,
# and 100 characters is roughly a 20-30 token overlap between neighbouring chunks.
# Tune both by inspecting the actual text so that context is preserved.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the loaded documents into chunks.
chunks = text_splitter.split_documents(docs)

chunks

[Document(metadata={'producer': 'iLovePDF', 'creator': '', 'creationdate': '', 'source': 'rag-dataset\\gym supplements\\1. Analysis of Actual Fitness Supplement.pdf', 'file_path': 'rag-dataset\\gym supplements\\1. Analysis of Actual Fitness Supplement.pdf', 'total_pages': 15, 'format': 'PDF 1.7', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-10-21T11:38:50+00:00', 'trapped': '', 'page': 0}, page_content='Citation: Espeño, P.R.; Ong, A.K.S.;\nGerman, J.D.; Gumasing, M.J.J.; Casas,\nE.S. Analysis of Actual Fitness\nSupplement Consumption among\nHealth and Fitness Enthusiasts. Foods\n2024, 13, 1424. https://doi.org/\n10.3390/foods13091424\nAcademic Editors: Ilija Djekic\nand Nada Smigic\nReceived: 30 March 2024\nRevised: 15 April 2024\nAccepted: 18 April 2024\nPublished: 6 May 2024\nCopyright: © 2024 by the authors.\nLicensee MDPI, Basel, Switzerland.\nThis article is an open access article\ndistributed\nunder\nthe\nterms\nand\nconditions of the Creative Commo

In [8]:
import tiktoken

# [Debug] Inspect token counts; they vary with the kind of words in the text.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

# Token counts of the first two chunks and of the first loaded document, in that order.
tuple(
    len(encoding.encode(sample.page_content))
    for sample in (chunks[0], chunks[1], docs[0])
)

(294, 219, 968)

In [18]:
# 문서 벡터DB 임베딩
from langchain_ollama import OllamaEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

In [19]:
# Connection settings for the local Ollama server.
ollama_url = 'http://localhost:11434'           # Ollama must be installed
embed_model = 'nomic-embed-text'                # must be pulled first: ollama pull nomic-embed-text

# Embedding client bound to the model and server configured above.
embeddings = OllamaEmbeddings(
    base_url=ollama_url,
    model=embed_model,
)

In [20]:
# Embed a throwaway query; its length gives the embedding dimension for the index.
vector = embeddings.embed_query("hello world")
embedding_dim = len(vector)

# L2-distance FAISS index sized to the embedding dimension.
index = faiss.IndexFlatL2(embedding_dim)

# Build the (initially empty) vector store around that index.
store_options = dict(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),                # kept in RAM
    index_to_docstore_id={},
)
vector_store = FAISS(**store_options)

In [21]:
# Insert the PDF chunks into the vector store and keep the returned ids.
# NOTE(review): re-running this cell presumably adds the same chunks a second time — confirm.
ids = vector_store.add_documents(chunks)

In [22]:
### Retrieval
# NOTE: despite its name, `pdf_file_count` is passed as `k`, i.e. the number of
# best-matching chunks to return from the store, not a count of PDF files.
pdf_file_count = 5
question = "how to gain muscle mass?"

# Keep the hits in their own variable. Assigning them to `docs` would overwrite the
# loaded PDF pages, so re-running the chunking or token-count cells afterwards
# would silently operate on these few search results instead.
retrieved_docs = vector_store.search(query=question, k=pdf_file_count, search_type="similarity")

retrieved_docs

[Document(id='8bad2d12-2318-401d-bb4e-dc36b513dc78', metadata={'producer': 'iLovePDF', 'creator': '', 'creationdate': '', 'source': 'rag-dataset\\gym supplements\\2. High Prevalence of Supplement Intake.pdf', 'file_path': 'rag-dataset\\gym supplements\\2. High Prevalence of Supplement Intake.pdf', 'total_pages': 11, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-10-21T11:39:04+00:00', 'trapped': '', 'page': 8}, page_content='and strength gain among men. We detected more prevalent protein and creatine supplementation\namong younger compared to older ﬁtness center users, whereas the opposite was found for vitamin\nsupplementation. Other authors made similar observations [23] and it might reﬂect the diﬀerent\ntraining goals among age groups, i.e., more focus on strength and muscles among the younger and\nmore focus on health among the older age groups.\nComparable to other studies [4], we detected a positive correlation between training fre

In [23]:
# Export the vector store that was set up in RAM to a DB file on disk.
db_name = "health_supplements"
vector_store.save_local(folder_path=db_name)