참고 링크: https://github.com/TeamOTK/Search/blob/main/src/Search.py

In [1]:
from pypdf import PdfReader
from openai import OpenAI
import os
from dotenv import load_dotenv
import numpy as np

# Load variables from a local .env file into the process environment, then
# create the OpenAI client with the key stored under OPENAI_API_KEY.
load_dotenv()
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

PDF에서 텍스트를 읽어와서 청크로 분할

In [2]:
# Read the PDF and cut each page's extracted text into consecutive slices of
# at most `chunk_length` characters. Slicing restarts on every page, so a
# chunk never spans a page boundary and the last chunk of a page may be short.
chunk_length = 1000
reader = PdfReader("data/spiderman1.pdf")
chunks = []

for pdf_page in reader.pages:
    page_text = pdf_page.extract_text()
    start = 0
    while start < len(page_text):
        chunks.append(page_text[start:start + chunk_length])
        start += chunk_length

FAISS 벡터 스토어에 임베딩 저장

In [3]:
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
# Embed every text chunk with the default OpenAI embedding model and index the
# resulting vectors in a FAISS vector store.
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_texts(texts=chunks, embedding=embeddings)

In [4]:
# Expose the FAISS store through LangChain's retriever interface. No search
# arguments are passed, so the library's default search settings apply.
retriever = vectorstore.as_retriever()

In [5]:
from langchain.memory import ConversationBufferMemory

# Conversation buffer that is handed to the agent executor further down.
# History is exposed under "chat_history" as message objects
# (return_messages=True); "input" / "output" select which entries of the
# executor's inputs and outputs are recorded for each turn.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
    input_key="input",
    output_key="output",
)

In [14]:
from langchain_openai.chat_models import ChatOpenAI

# Chat model shared by the RetrievalQA chain and the agent: a fine-tuned
# gpt-4o-mini checkpoint (the "ft:" model id), run at temperature 0.
llm = ChatOpenAI(
    temperature=0,
    model_name="ft:gpt-4o-mini-2024-07-18:personal:spiderman-1:9s1amhcg",
)

In [15]:
from langchain.chains import RetrievalQA

# Retrieval-augmented QA chain: the retrieved chunks are "stuffed" into a
# single prompt for the LLM. Because return_source_documents=True, the chain
# returns a dict that carries the source documents next to the answer rather
# than a bare string.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    # Reuse the retriever already built from the FAISS store instead of
    # constructing a second, identical one.
    retriever=retriever,
    verbose=True,
    return_source_documents=True,
)

In [17]:
from langchain.agents import Tool

def _search_docs(query: str) -> str:
    """Run the RetrievalQA chain and return only its answer text.

    ``qa`` is created with ``return_source_documents=True``, so invoking it
    yields a dict (query, result, source documents). Handing the chain object
    itself to ``Tool`` would make that whole dict the agent's observation;
    the agent only needs the answer string stored under ``"result"``.
    """
    return qa.invoke({"query": query})["result"]


# Single tool exposed to the agent: a lookup against the PDF knowledge base.
tools = [
    Tool(
        name="doc_search_tool",
        func=_search_docs,
        description=(
            "This tool is used to retrieve information from the knowledge base"
        ),
    )
]

In [28]:
# System prompt passed to the agent: answer only from retrieved context, keep
# replies short, respond in Korean, and speak in first person as Spider-Man.
# Two typos in the prompt text are corrected here ("refering" -> "referring",
# "in Koreans" -> "in Korean") so the language instruction is unambiguous.
system_message = """
    You always follow these guidelines:

        -If the answer isn't available within the context, state that fact
        -Otherwise, answer to your best capability, referring to source of documents provided
        -Only use examples if explicitly requested
        -Do not introduce examples outside of the context
        -Do not answer if context is absent
        -Limit responses to three or four sentences for clarity and conciseness
        -You must answer in Korean
        -You must answer like you're spider-man
        -Use a first-person perspective. Do not say "peter parker ~"
"""

In [29]:
from langchain.agents import initialize_agent, AgentType

# Conversational ReAct agent that can call the document-search tool and keeps
# chat history in `memory`.
#
# Extra keyword arguments are forwarded to the agent executor:
# - `return_intermediate_steps` was previously misspelled
#   ("return_intermediated_steps"), so the option never took effect.
# - `return_source_documents` is not an executor option and has been dropped;
#   source documents are a setting of the RetrievalQA chain, not of the agent.
# With intermediate steps enabled the executor returns more than one output
# key, which is why `memory` is configured with output_key="output".
agent = initialize_agent(
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,
    tools=tools,
    llm=llm,
    memory=memory,
    return_intermediate_steps=True,
    agent_kwargs={"system_message": system_message},
)

In [30]:
# Ask the agent one question and print only its final answer.
# `invoke` replaces the deprecated direct call `agent(query)`; the question is
# passed under "input", the same key the conversation memory records.
query = "피터, 다른 우주의 피터 파커를 만났을 때 무슨 일을 했어?"
result = agent.invoke({"input": query})
print(result["output"])



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
피터는 다른 우주의 피터 파커와 함께 여러 가지 일을 했고, 그 중 가장 중요한 것은 악당을 물리치는 것이었어.


참고
https://rimiyeyo.tistory.com/entry/RAG%EB%A5%BC-%EC%88%98%ED%96%89%ED%95%98%EA%B8%B0-%EC%9C%84%ED%95%9C-LangChain%EC%9D%98-RetrievalQA-%EA%B5%AC%EC%A1%B0%EC%99%80-%EA%B5%AC%ED%98%84%EB%B0%A9%EB%B2%95