In [5]:
from glob import glob

for g in glob('../data/*.pdf'):
    print(g)

../data\2040_seoul_plan.pdf
../data\OneNYC_2050_Strategic_Plan.pdf


In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def read_pdf_and_split_text(pdf_path, chunk_size=1000, chunk_overlap=100):
    """
        주어진 PDF 파일을 읽고 텍스트를 분할합니다.

        매개변수 :
            pdf_path(str) :PDF파일의 경로
            chunk_size(int, 선택적) : 각 텍스트 청크의 크기, 기본 값은 1000입니다.
            chunk_overlap(int, 선택적) : 청크 간의 중첩 크기. 기본 값은 100입니다.
        반환값:
            list : 분할된 텍스트 청크의 리스트
    """
    print(f"PDF : {pdf_path}----------------------")

    pdf_loader = PyPDFLoader(pdf_path)
    data_from_pdf = pdf_loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,
        chunk_overlap = chunk_overlap
    )

    splits = text_splitter.split_documents(data_from_pdf)

    print(f"Numer of splits : {len(splits)}\n")
    
    return splits




  from .autonotebook import tqdm as notebook_tqdm


In [3]:
%pip install langchain_huggingface

Collecting langchain_huggingface
  Downloading langchain_huggingface-1.2.0-py3-none-any.whl.metadata (2.8 kB)
Downloading langchain_huggingface-1.2.0-py3-none-any.whl (30 kB)
Installing collected packages: langchain_huggingface
Successfully installed langchain_huggingface-1.2.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.3 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

Looking in indexes: https://download.pytorch.org/whl/cu124
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.3 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
%pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.2.2-py3-none-any.whl.metadata (16 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.8.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy (from sentence-transformers)
  Downloading scipy-1.17.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.3.0 (from scikit-learn->sentence-transformers)
  Downloading joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn->sentence-transformers)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading sentence_transformers-5.2.2-py3-none-any.whl (494 kB)
Downloading scikit_learn-1.8.0-cp312-cp312-win_amd64.whl (8.0 MB)
   ---------------------------------------- 0.0/8.0 MB ? eta -:--:--
   ------------- -------------------------- 2.6/8.0 MB 13.7 MB/s eta 0:00:01
   --------------------------- ------------ 5.5/8.0 MB 13.4 MB/s eta 0:00:01
   ---------------


[notice] A new release of pip is available: 25.3 -> 26.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(
    model_name = "BAAI/bge-m3",
    model_kwargs = {'device':'cpu'},
    encode_kwargs = {'normalize_embeddings': True}


)

In [3]:
embeddings.embed_documents("안녕하세요")

[[0.002456849440932274,
  0.03226263448596001,
  -0.007424331270158291,
  0.005268442444503307,
  -0.05809727683663368,
  -0.03042869083583355,
  -0.00830000638961792,
  0.03674247860908508,
  0.009622770361602306,
  -0.007633928209543228,
  0.017204441130161285,
  0.03887706995010376,
  -0.029649006202816963,
  -0.021820027381181717,
  -0.0008699532481841743,
  -0.032562535256147385,
  0.03178010880947113,
  -0.02246292680501938,
  0.023502610623836517,
  -0.012333753518760204,
  -0.03881017863750458,
  -0.017907725647091866,
  0.03872939199209213,
  0.01371566392481327,
  0.025562576949596405,
  0.022995982319116592,
  -0.027591736987233162,
  0.027073420584201813,
  -0.009775801561772823,
  -0.026509815827012062,
  -0.006837829481810331,
  -0.02930246852338314,
  0.028823168948292732,
  -0.07535600662231445,
  -0.03274376690387726,
  -0.004341269377619028,
  -0.023116104304790497,
  0.021545160561800003,
  -0.054298121482133865,
  0.05281256511807442,
  0.04333125799894333,
  -0.020

In [8]:
from langchain_chroma import Chroma
import os

persist_directory = '../chroma_store'

if os.path.exists(persist_directory):
    print("Loading existing Chroma store")
    vectorstore = Chroma(
        persist_directory=persist_directory,
        embedding_function=embeddings
    )
else:
    print("Creating new Chroma store")

    all_splits = []
    for g in glob('../data/*.pdf'):
        all_splits.extend(read_pdf_and_split_text(g))

    print(f"Total number of splits : {len(all_splits)}")

    vectorstore = Chroma.from_documents(
        documents = all_splits,
        embedding=embeddings,
        persist_directory=persist_directory
    )

Loading existing Chroma store


In [None]:
retriever = vectorstore.as_retrieve(search_kwargs={"k":5})

chunks = retriever.invoke("서울시 온실가스 저감 정책")

for chunk in chunks:
    print(chunk.metadata)
    print(chunk.page_content)