In [None]:
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import pickle
from tqdm import tqdm
from dotenv import load_dotenv


# Read key=value pairs from a .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key used by
# OpenAIEmbeddings comes from — confirm against the local setup.
load_dotenv()


True

In [2]:
def embed_pdf_documents(pdf_paths, save_path):
    """Chunk PDF files, embed the chunks with OpenAI, and pickle the results.

    Each PDF is loaded with ``PyPDFLoader``, split into overlapping chunks,
    and tagged with its file name and a ``"pdf"`` type marker. The chunk
    texts are then embedded with ``text-embedding-3-small``. Two files are
    written into ``save_path``:

    * ``pdf_documents.pkl``  -- the list of chunked ``Document`` objects
    * ``pdf_embeddings.pkl`` -- the list of embedding vectors, same order

    Args:
        pdf_paths: Iterable of paths to the PDF files to process.
        save_path: Directory that receives the two pickle files. It is
            created if it does not exist yet.

    Raises:
        RuntimeError: If the embedding model returns a different number of
            vectors than there are chunks; nothing is written in that case.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=150,
        # NOTE(review): ". " is listed before the paragraph/newline
        # separators, so sentence boundaries are preferred over paragraph
        # breaks when splitting — confirm this ordering is intentional.
        separators=[". ", "\n\n", "\n", " "]
    )

    docs = []
    for path in tqdm(pdf_paths, desc="PDF 로딩"):
        # Computed once per file rather than once per chunk.
        source_name = os.path.basename(path)
        # load() returns a list of Document objects (one per page).
        pages = PyPDFLoader(path).load()
        chunks = text_splitter.split_documents(pages)
        # Overwrite "source" with the bare file name and tag the chunk type
        # so PDF chunks can be told apart from other sources later on.
        for chunk in chunks:
            chunk.metadata["source"] = source_name
            chunk.metadata["type"] = "pdf"
        docs.extend(chunks)
    print(f"PDF 청크 총 {len(docs)}개")

    # Embedding. Skip building the client and calling the API when there
    # is nothing to embed.
    texts = [doc.page_content for doc in docs]
    if texts:
        model = OpenAIEmbeddings(model="text-embedding-3-small")
        vectors = model.embed_documents(texts)
    else:
        vectors = []

    # The two pickles are only useful when they are index-aligned, so refuse
    # to persist a mismatched pair.
    if len(vectors) != len(docs):
        raise RuntimeError(
            f"Embedding count mismatch: {len(vectors)} vectors "
            f"for {len(docs)} chunks"
        )

    # Saving. Create the target directory first so the (paid) embedding work
    # is not lost to a missing folder; an empty save_path means the current
    # directory, which os.makedirs would reject.
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    with open(os.path.join(save_path, "pdf_documents.pkl"), "wb") as f:
        pickle.dump(docs, f)
    with open(os.path.join(save_path, "pdf_embeddings.pkl"), "wb") as f:
        pickle.dump(vectors, f)
    print("✅ PDF 임베딩 및 저장 완료")



In [None]:
def main():
    """Embed the hard-coded list of PDFs and store the result in ``precomputed``.

    Creates the output directory when it is missing, then hands the PDF
    paths and that directory to ``embed_pdf_documents``.
    """
    # Prepare the output folder up front.
    output_dir = "precomputed"
    os.makedirs(output_dir, exist_ok=True)

    # 1. PDF embedding — absolute paths on the author's machine.
    source_pdfs = [
        "C:/Users/sbin0/Desktop/3-1/인공지능서비스개발/민법(법률)(제20432호)(20250131).pdf",
        "C:/Users/sbin0/Desktop/3-1/인공지능서비스개발/민사집행법(법률)(제20434호)(20250131).pdf",
        "C:/Users/sbin0/Desktop/3-1/인공지능서비스개발/민사조정법(법률)(제16910호)(20200305).pdf",
        "C:/Users/sbin0/Desktop/3-1/인공지능서비스개발/민사소송법(법률)(제20003호)(20250301).pdf",
        "C:/Users/sbin0/Desktop/3-1/인공지능서비스개발/합의서.pdf",
        "C:/Users/sbin0/Desktop/3-1/인공지능서비스개발/고소장.pdf",
    ]
    embed_pdf_documents(source_pdfs, output_dir)


if __name__ == "__main__":
    main()


PDF 로딩:   0%|          | 0/4 [00:00<?, ?it/s]

PDF 로딩: 100%|██████████| 4/4 [00:01<00:00,  2.19it/s]


PDF 청크 총 1385개
✅ PDF 임베딩 및 저장 완료


JSON 로딩 (1): 100%|██████████| 19050/19050 [02:43<00:00, 116.77it/s]


폴더 1: JSON 청크 445219개
✅ json_001 임베딩 및 저장 완료


JSON 로딩 (2): 100%|██████████| 19111/19111 [02:55<00:00, 108.59it/s]


폴더 2: JSON 청크 552634개
✅ json_001_002 임베딩 및 저장 완료


JSON 로딩 (3): 100%|██████████| 19103/19103 [16:56<00:00, 18.80it/s]  


폴더 3: JSON 청크 621104개
✅ json_001_002_003 임베딩 및 저장 완료


JSON 로딩 (4): 100%|██████████| 19027/19027 [23:00<00:00, 13.79it/s]  


폴더 4: JSON 청크 623994개
✅ json_001_002_003_004 임베딩 및 저장 완료
