In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [7]:
# Environment setup: read settings (presumably including OPENAI_API_KEY, which a
# later cell fetches with os.getenv) from a local .env file, so that no
# credential has to be hardcoded in the notebook.
import os
from dotenv import load_dotenv

# Kept as the final expression so its return value is shown as the cell output.
load_dotenv()

# Debug aid, intentionally left disabled: evaluating os.getenv("OPENAI_API_KEY")
# here would store the key in the saved cell output.

True

In [None]:
# Recommended embedding model

# BGE-M3 (the original note read "BGM 3m"; presumably BAAI/bge-m3 — confirm)


In [8]:
# Builds the whole RAG pipeline in one pass:
# load PDF -> split -> embed -> index -> retrieve -> prompt -> LLM -> plain string.

# Step 1: load the source PDF into LangChain documents.
loader = PyMuPDFLoader("data/SPRI_AI_Brief_2023년12월호_F.pdf")
docs = loader.load()

# Step 2: split the documents into chunks of up to 1000 characters with a
# 50-character overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
split_documents = text_splitter.split_documents(docs)
if not split_documents:
    # Fail early with a readable message instead of an obscure error from
    # inside the vector store when there is nothing to index.
    raise ValueError(
        "No text chunks were produced from the PDF; cannot build the vector store."
    )

# Step 3: embedding model (default OpenAI embedding settings).
embeddings = OpenAIEmbeddings()

# Step 4: embed every chunk and index the vectors in an in-memory FAISS store.
vectorstore = FAISS.from_documents(documents=split_documents, embedding=embeddings)

# Step 5: retriever over the vector store (default search settings).
retriever = vectorstore.as_retriever()


def format_docs(documents):
    """Join the text of the retrieved documents into a single context string.

    Args:
        documents: Iterable of retrieved documents exposing ``page_content``.

    Returns:
        The documents' ``page_content`` values separated by blank lines.
    """
    return "\n\n".join(document.page_content for document in documents)


# Step 6: prompt template; {question} and {context} are filled in by the chain.
prompt = PromptTemplate.from_template(
    """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Answer in Korean.

#Question:
{question}

#Context:
{context}

#Answer:"""
)

# Step 7: chat model.
# NOTE(review): GPT-5 family models reportedly accept only the default
# temperature — confirm that temperature=0 is actually applied for this model.
llm = ChatOpenAI(model_name="gpt-5-nano", temperature=0, api_key=os.getenv("OPENAI_API_KEY"))

# Step 8: assemble the chain.
# The retriever's output is piped through format_docs so that the prompt receives
# plain text rather than the repr of a list of Document objects (which would also
# inject metadata noise into the context). The raw input string is forwarded
# unchanged as the question.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Run the chain
# Streaming mode: the answer is consumed chunk by chunk instead of all at once.
# stream_response comes from the third-party package: pip install langchain_teddynote
from langchain_teddynote.messages import stream_response

question = "삼성전자가 자체 개발한 AI 의 이름은?"
# chain.stream(...) is handed directly to stream_response, which presumably
# prints the chunks as they arrive — confirm against the helper's documentation.
response = chain.stream(question)
stream_response(response)

삼성 가우스입니다.

In [None]:
# Run the chain
# Blocking mode: invoke() returns the complete answer in one call; the chain ends
# with StrOutputParser, so the result is printed as plain text.
question = "삼성전자가 자체 개발한 AI 의 이름은?"
response = chain.invoke(question)
print(response)

삼성 가우스입니다.


In [12]:
# Ask a second question in streaming mode.
# The import is repeated so this cell also runs when the earlier streaming cell
# has not been executed in the current kernel session.
from langchain_teddynote.messages import stream_response

# Spelling fixed from "엔스로픽" to "앤스로픽" (Anthropic) so the query matches the
# spelling used in the chain's recorded answer, which helps embedding-based retrieval.
question = "구글은 앤스로픽에 얼마를 투자했나요?"
response = chain.stream(question)
stream_response(response)

구글은 앤스로픽에 최대 20억 달러를 투자하기로 합의했으며, 그 중 5억 달러를 우선 투자했고 향후 15억 달러를 추가로 투자할 계획입니다.