## 로컬 환경에서 PDF 검색하기 2단계 step1
- PDF 문서를 로드하고 한국어 임베딩 모델을 사용하여 임베딩 데이터 생성
- 임베딩 데이터를 csv 파일로 만들어 저장하기 
- 저장한 csv 데이터를 읽어서 FAISS 인덱스 생성하기
- 생성한 FAISS 인덱스를 검색하기 

- 생성한 FAISS 인덱스에 langchain 프레임워크 적용하여 llm 검색하기 

### - 사용한 임베딩 모델 jhgan/ko-sroberta-multitask
### - 사용한 LLM 모델 llama3.2

In [1]:
# 라이브러리 설치 (필요한 경우 실행)
%pip install -U langchain langchain_core sentence-transformers faiss-cpu pymupdf

Collecting langchain
  Downloading langchain-0.3.17-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_core
  Downloading langchain_core-0.3.34-py3-none-any.whl.metadata (5.9 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-3.4.1-py3-none-any.whl.metadata (10 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (4.4 kB)
Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Collecting PyYAML>=5.3 (from langchain)
  Downloading PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.38-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.6 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Downloading aiohttp-3.11.12-cp312-cp312-macosx_11_0_arm64.whl.metadata (7.7 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.3 (from langchain)
  Downloading langchain_text_splitters-0.3.6-py3-none-an

In [3]:
%pip install -U langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.16-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB

In [5]:
# 필요한 라이브러리 임포트
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document
from langchain_core.embeddings import Embeddings  
from sentence_transformers import SentenceTransformer

# Step 1: 문서 로드
pdf_file_path = "data/"
pdf_file_name = "AI기반_인파분석플랫폼구축_제안서"

loader = PyMuPDFLoader(pdf_file_path + pdf_file_name + ".pdf") 
docs = loader.load()

# Step 2: 문서 분할
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
split_documents = text_splitter.split_documents(docs)

In [6]:
# Step 3: SentenceTransformer 모델을 LangChain의 Embeddings 클래스로 감싸기
class KoSentenceTransformerEmbeddings(Embeddings):
    def __init__(self, model_name):
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """문서 리스트를 벡터로 변환"""
        return self.model.encode(texts, convert_to_numpy=True).tolist()

    def embed_query(self, text):
        """검색 쿼리를 벡터로 변환"""
        return self.model.encode([text], convert_to_numpy=True).tolist()[0]


# Step 4: 모델 로드 및 FAISS 인덱스 생성
embedding_model = KoSentenceTransformerEmbeddings("jhgan/ko-sroberta-multitask")
# faiss_index = FAISS.from_documents(split_documents, embedding_model)

In [8]:
%pip install pandas

Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (89 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp312-cp312-macosx_11_0_arm64.whl (11.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.4/11.4 MB[0m [31m19.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pytz-2025.1-py2.py3-none-any.whl (507 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.9/507.9 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tzdata-2025.1-py2.py3-none-any.whl (346 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m346.8/346.8 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25

In [9]:
import csv
import pandas as pd
import os

# Step 5: 문서 임베딩 및 CSV 저장
# 원본 문서와 임베딩 데이터를 CSV 에 함께 저장함
# 원본 문서를 저장하는 이유 : 검색 결과를 보여줘야 하기 때문. 
def save_embeddings_to_csv(documents, embedding_model, filename=pdf_file_name+".csv", file_path="./csv/"):
    # 경로가 존재하지 않은 경우 디렉토리 생성
    os.makedirs(file_path, exist_ok=True)
    full_path = os.path.join(file_path, filename)

    # 문서 임베딩 수행
    embeddings = embedding_model.embed_documents([doc.page_content for doc in documents])
    
    # CSV 저장
    with open(full_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["document", "embedding"])
        
        for doc, embedding in zip(documents, embeddings):
            writer.writerow([doc.page_content, embedding])
    
    print(f"임베딩 데이터가 {full_path} 파일에 저장되었습니다.")
    return full_path

# 함수 실행
documents = split_documents  # FAISS에 넣은 문서 리스트 사용
full_path = save_embeddings_to_csv(documents, embedding_model)

임베딩 데이터가 ./csv/AI기반_인파분석플랫폼구축_제안서.csv 파일에 저장되었습니다.


In [10]:
import faiss
import numpy as np
import pandas as pd


# CSV 파일 불러오기 
def load_embeddings_from_csv(filepath):
    df = pd.read_csv(filepath)
    df["embedding"] = df["embedding"].apply(lambda x: np.fromstring(x[1:-1], sep=','))  # 문자열을 numpy 배열로 변환
    return df

# FAISS 인덱스 생성
def create_faiss_index(embedding_dim, df):
    index = faiss.IndexFlatL2(embedding_dim)  # L2 거리 기반 인덱스
    embeddings = np.vstack(df["embedding"].values).astype("float32")
    index.add(embeddings)  
    return index, df

# CSV에서 데이터 불러오기
df_embeddings = load_embeddings_from_csv(full_path)

# FAISS 인덱스 생성
embedding_dim = len(df_embeddings["embedding"].iloc[0])  # 벡터 차원 수 확인
faiss_index, df_embeddings = create_faiss_index(embedding_dim, df_embeddings)

print("FAISS 인덱스가 성공적으로 생성되었습니다!")

FAISS 인덱스가 성공적으로 생성되었습니다!


In [11]:
# 생성한 FAISS 인덱스로 검색하기 
def search_faiss_index(query_embedding, index, df, k=5):
    query_vector = np.array(query_embedding).astype("float32").reshape(1, -1)
    distances, indices = index.search(query_vector, k)
    
    results = []
    for i in range(k):
        idx = indices[0][i]
        results.append((df.iloc[idx]["document"], distances[0][i]))  # (문서 내용, 거리) 반환
    return results

# 예제 쿼리 실행
query_text = "포항에서 열리는 축제의 이름은은?"  # 검색할 문장
query_embedding = embedding_model.embed_query(query_text)  # 쿼리를 임베딩

search_results = search_faiss_index(query_embedding, faiss_index, df_embeddings)

# 결과 출력
for rank, (doc, distance) in enumerate(search_results):
    print(f"Rank {rank+1}: {doc} (Score: {distance})")



Rank 1: 방문객수
연도
151만명
2019 년
189만명
2018 년
203만명
2017 년
187만명
2016 년
103만명
2015 년
방문객수
행사명
연도
3.5만명
청년문화페스티벌
2023 년
1.5만명
붐업페스티벌
2018 년
-
-
-
-
-
- (Score: 150.69439697265625)
Rank 2: SFR-002
Sec 
1-2
위치: 경북포항시북구상대로
59-1
상태: 정상 (Score: 161.13368225097656)
Rank 3: SFR-006
•
1/1.8” CMOS Image Sensor
•
야간초저조도기능
•
최대30fps, 3840x2160
•
지능형영상분석기능(가상선/영역, 출입감지, 방향감지, 
움직임감지등)
•
AI 기반객체감지기능(사람, 자동차, 오토바이, 자전거, 
번호판등)
* 객체상세분류: 성별,색상,모자,마스크,가방등
•
AI 기반분석속성값제공
실종자인상착의
안경:  
없음
상의색상: 파란셔츠
하의색상:     검은색
가방:        있음
사람
자동차
동물
기타
성별, 상/하의색상, 
바지/치마, 마스크, 안경, 
가방, 모자, 얼굴등
번호판, 자전거, 색상, 
차종등
개, 고양이, 멧돼지, 
고라니등
비행기, 선박등 (Score: 189.25851440429688)
Rank 4: SFR-006
심각(170%∼)
혼잡(∼170%)
주의(∼150%)
보통(∼130%)
기준(100%)
5.6 이상
5.6 이하
5.0 이하
4.3 이하
3.3명
심각(170%∼)
혼잡(∼170%)
주의(∼150%)
보통(∼130%)
기준(100%)
136 이상
136 이하
120 이하
104 이하
80
1구역
2구역
3구역
4구역
Sec.1-1
Sec. 1-2
Sec. 2-1
Sec. 2-2
Sec. 3-1
Sec. 3-2
Sec. 4-1
Sec. 3-3
Sec. 4-3
Sec. 4-2 (Score: 195.55972290039062)
Rank 5: SFR-007
! (Score: 198.93283081054688)


In [12]:
# Step 6 : 프롬프트 생성 
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate.from_template(
    """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Answer in Korean,and make sure the answer ends with '입니다'.

#Context: 
{context}

#Question:
{question}

#Answer(Ensure the response ends with '입니다'):"""
)

In [16]:
%pip install -U langchain-ollama


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting langchain-ollama
  Downloading langchain_ollama-0.2.3-py3-none-any.whl.metadata (1.9 kB)
Collecting ollama<1,>=0.4.4 (from langchain-ollama)
  Downloading ollama-0.4.7-py3-none-any.whl.metadata (4.7 kB)
Downloading langchain_ollama-0.2.3-py3-none-any.whl (19 kB)
Downloading ollama-0.4.7-py3-none-any.whl (13 kB)
Installing collected packages: ollama, langchain-ollama
Successfully installed langchain-ollama-0.2.3 ollama-0.4.7

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [21]:
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama import OllamaLLM

# LLM 및 프롬프트 설정
llm = OllamaLLM(model="llama3.2")
prompt = PromptTemplate.from_template("Context: {context}\nQuestion: {question}")

# 체인 구성
chain = (
    {"context": RunnablePassthrough(), "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# 입력 데이터
context_text = "Artificial Intelligence is transforming the world."
query_text = "How is AI impacting businesses?"

# 실행 및 출력
llm_input = {"context": context_text, "question": query_text}
response = chain.invoke(llm_input)

print(response)


The impact of Artificial Intelligence (AI) on businesses is multifaceted and can be seen in various aspects, including:

1. **Automation**: AI-powered automation is being used to streamline processes, reduce manual labor, and increase efficiency. This enables businesses to focus on more strategic tasks.

2. **Data Analysis**: AI algorithms can quickly analyze vast amounts of data, providing valuable insights that help businesses make informed decisions. This can be particularly useful in industries such as finance, healthcare, and marketing.

3. **Customer Service**: Chatbots and virtual assistants powered by AI are being used to provide 24/7 customer support, improving the overall customer experience and reducing response times.

4. **Predictive Maintenance**: AI-powered predictive maintenance is helping businesses reduce downtime, extend equipment lifespan, and improve overall operational efficiency.

5. **Innovation and Product Development**: AI is enabling businesses to develop new

In [18]:
# Step 7 : 언어 모델 (LLM) 생성
from langchain_ollama import OllamaLLM

llm = OllamaLLM(model="llama3.2")

response = llm.invoke("Tell me about AI")
print(response)


Artificial intelligence (AI) refers to the development of computer systems that can perform tasks that would typically require human intelligence, such as learning, problem-solving, decision-making, and perception. The term "artificial" is used because these systems are not human-made but rather machine-made, whereas "intelligence" suggests a level of cognitive ability.

There are several key types of AI:

1. **Narrow or Weak AI**: This type of AI is designed to perform a specific task, such as facial recognition, language translation, or playing chess.
2. **General or Strong AI**: This type of AI aims to create machines that can perform any intellectual task that humans can, including reasoning, problem-solving, and learning.
3. **Superintelligence**: This type of AI is significantly more intelligent than the best human minds, potentially able to solve complex problems that are unsolvable by humans.

AI has many applications:

1. **Virtual Assistants**: AI-powered virtual assistants l

In [20]:
%pip install from langchain.schema.runnable import RunnablePassthrough

UsageError: Line magic function `%from` not found.


In [19]:
# 8. LangChain 체인 구성
chain = (
    {"context": RunnablePassthrough(), "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# 9. LLM 실행 및 응답 출력
llm_input = {"context": context_text, "question": query_text}
response = chain.invoke(llm_input)

NameError: name 'RunnablePassthrough' is not defined

In [15]:
print("💡 LLM 응답:")
print(response)

💡 LLM 응답:


NameError: name 'response' is not defined