# 환경설정

In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [3]:
!pip install pymupdf

Collecting pymupdf
  Downloading PyMuPDF-1.24.12-cp39-abi3-win_amd64.whl.metadata (3.4 kB)
Downloading PyMuPDF-1.24.12-cp39-abi3-win_amd64.whl (16.0 MB)
   ---------------------------------------- 0.0/16.0 MB ? eta -:--:--
   ------------- -------------------------- 5.5/16.0 MB 30.5 MB/s eta 0:00:01
   -------------------------------- ------- 12.8/16.0 MB 32.2 MB/s eta 0:00:01
   ---------------------------------------  15.7/16.0 MB 33.0 MB/s eta 0:00:01
   ---------------------------------------- 16.0/16.0 MB 25.2 MB/s eta 0:00:00
Installing collected packages: pymupdf
Successfully installed pymupdf-1.24.12


### 1. 문서 로드 (Load Documents)

In [2]:
from langchain_community.document_loaders import PyMuPDFLoader

# PYMUPDFLoader 객체 정의
loader = PyMuPDFLoader("data/snow-white.pdf")

# 문서 로드
docs = loader.load()

print(f"문서의 페이지수 : {len(docs)}")

문서의 페이지수 : 6


In [3]:
print(docs[0].page_content)

백설공주
옛날어느왕국에공주님이태어났어요.
“어쩜이렇게어여쁠까? 살결이눈처럼하얗구나. 백
설공주라고불러야겠다.”
왕과왕비는갓태어난딸을보며기뻐했어요.
하지만기쁨도잠시, 왕비는곧세상을떠나고말았어
요.



In [4]:
# 메타데이터
print(docs[0].__dict__)

{'id': None, 'metadata': {'source': 'data/snow-white.pdf', 'file_path': 'data/snow-white.pdf', 'page': 0, 'total_pages': 6, 'format': 'PDF 1.5', 'title': 'PowerPoint 프레젠테이션', 'author': 'PC', 'subject': '', 'keywords': '', 'creator': 'Microsoft® PowerPoint® 2013', 'producer': 'Microsoft® PowerPoint® 2013', 'creationDate': "D:20230912112024+09'00'", 'modDate': "D:20230912112024+09'00'", 'trapped': ''}, 'page_content': '백설공주\n옛날어느왕국에공주님이태어났어요.\n“어쩜이렇게어여쁠까? 살결이눈처럼하얗구나. 백\n설공주라고불러야겠다.”\n왕과왕비는갓태어난딸을보며기뻐했어요.\n하지만기쁨도잠시, 왕비는곧세상을떠나고말았어\n요.\n', 'type': 'Document'}


### 2. 문서 분할(split Documents)

In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=10)

split_documents = text_splitter.split_documents(docs)

print(f"분할된 청크의수 : {len(split_documents)}")

분할된 청그의수 : 21


### 3. 임베딩(Embedding) 생성

In [6]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

### 4. DB 생성(벡터스토어 생성) 및 저장
* FAISS(Facebook AI Similarity Search)
    * 페이스북에서 개발한 유사도 검색 및 클러스트링 라이브러리
    * 벡터 데이터셋에서 빠른 유사도 검색

In [11]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp311-cp311-win_amd64.whl.metadata (4.5 kB)
Downloading faiss_cpu-1.9.0-cp311-cp311-win_amd64.whl (14.9 MB)
   ---------------------------------------- 0.0/14.9 MB ? eta -:--:--
   ------------------ --------------------- 6.8/14.9 MB 34.9 MB/s eta 0:00:01
   ---------------------------------------  14.7/14.9 MB 44.0 MB/s eta 0:00:01
   ---------------------------------------- 14.9/14.9 MB 34.6 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0


In [7]:
from langchain_community.vectorstores import FAISS

vectorstore = FAISS.from_documents(documents=split_documents, embedding=embeddings)

In [8]:
for doc in vectorstore.similarity_search("난쟁이"):
    print(doc.page_content)

저녁이되자, 일곱난쟁이가돌아왔어요.
난쟁이들은쓰러진백설공주를보고엉엉울었어요.
백설공주는깊은잠에빠진것처럼보였지요.
“백설공주님, 못된왕비의꾐에넘어갔군요.”
간사과랍니다. 잠깐문을열어보세요.”
백설공주는고개를저었어요.
“난쟁이들이문을열어주지말라고했어요.”
백설공주가거절하자, 왕비는창문틈새로사과를쑥내밀었어
요.
왕자는깨어난백설공주를보고기뻐했어요.
“공주님, 나는이웃나라왕자입니다.”
“왕자님이나를다시살려주셨군요.”
“나와결혼해주시겠어요?”
“네, 좋아요!”
밤이되자오두막주인인일곱난쟁이가돌아왔어요.
난쟁이들은집안이어질러진것을보고깜짝놀랐지요.
일곱째난쟁이가큰소리로외쳤어요.
“누가내침대에서자고있어!”


### 5. 검색기(Retriever) 생성

In [9]:
# 벡터스토어에 있는 정보를 검색하고 생성
retriever = vectorstore.as_retriever()

In [10]:
# 임베딩해서 수치한 값과 유사한 doc를 찾아온다
retriever.invoke("백설공주와 일곱난쟁이는 어디서 만났어?")

[Document(metadata={'source': 'data/snow-white.pdf', 'file_path': 'data/snow-white.pdf', 'page': 0, 'total_pages': 6, 'format': 'PDF 1.5', 'title': 'PowerPoint 프레젠테이션', 'author': 'PC', 'subject': '', 'keywords': '', 'creator': 'Microsoft® PowerPoint® 2013', 'producer': 'Microsoft® PowerPoint® 2013', 'creationDate': "D:20230912112024+09'00'", 'modDate': "D:20230912112024+09'00'", 'trapped': ''}, page_content='백설공주\n옛날어느왕국에공주님이태어났어요.\n“어쩜이렇게어여쁠까? 살결이눈처럼하얗구나. 백\n설공주라고불러야겠다.”\n왕과왕비는갓태어난딸을보며기뻐했어요.'),
 Document(metadata={'source': 'data/snow-white.pdf', 'file_path': 'data/snow-white.pdf', 'page': 4, 'total_pages': 6, 'format': 'PDF 1.5', 'title': 'PowerPoint 프레젠테이션', 'author': 'PC', 'subject': '', 'keywords': '', 'creator': 'Microsoft® PowerPoint® 2013', 'producer': 'Microsoft® PowerPoint® 2013', 'creationDate': "D:20230912112024+09'00'", 'modDate': "D:20230912112024+09'00'", 'trapped': ''}, page_content='저녁이되자, 일곱난쟁이가돌아왔어요.\n난쟁이들은쓰러진백설공주를보고엉엉울었어요.\n백설공주는깊은잠에빠진것처럼보였지요.\n“백설공주님, 못된왕비의꾐에넘어갔군요.

### 6. 프롬프트 생성

당신은 질문-답변 작업을 위한 어시스턴트 입니다. <br>
주어진 문맥을 사용하여 질문에 답변하세요 <br>
유치원 선생님이 아이에게 말하는 것처럼 매우 친절하고 부드러운 어조를 사용하세요 <br>
따뜻하고 친근한 방식으로 말하세요 <br>
답을 모르는 경우에는 모른다고 말씀하세요 <br>
한국어로 답변하세요 <br>

In [11]:
from langchain_core.prompts import PromptTemplate
prompt = PromptTemplate.from_template(
    """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question.
Use a very kind and gentle tone like a kindergarten teacher talking to a child.
Speak in a warm and friendly way.
If you don't know the answer, just say that you don't know. 
Answer in Korean.

#Context: 
{context}

#Question:
{question}

#Answer:"""
)

### 7. LLM 모델 생성

In [12]:
from langchain_openai import ChatOpenAI
llm = ChatOpenAI(mode="gpt-4o", temperature=0)

                mode was transferred to model_kwargs.
                Please confirm that mode is what you intended.
  if await self.run_code(code, result, async_=asy):


### 8. Chain생성 

In [13]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [14]:
# 체인 실행

question = "백설공주랑 왕비중 누가 더 아름다워?"
response = chain.invoke(question)

print(response)

TypeError: Completions.create() got an unexpected keyword argument 'mode'