In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.documents import Document

In [2]:
import os
import warnings
from dotenv import load_dotenv

# Load environment variables (such as OPENAI_API_KEY) from a local .env file.
load_dotenv()

# Suppress all warnings for the rest of the session.
warnings.simplefilter("ignore")

# Use ./cache/ as the Hugging Face home directory (where models get downloaded).
os.environ.update(HF_HOME="./cache/")


In [3]:
# 한국어 임베딩 모델 추천
# https://github.com/teddylee777/Kor-IR?tab=readme-ov-file
# HUGGINGFACEHUB_API_TOKEN

# intfloat/multilingual-e5-large-instruct
# intfloat/multilingual-e5-large
# BAAI/bge-m3


In [None]:
# Step 1: load the source PDF.
loader = PyMuPDFLoader("data/SPRI_AI_Brief_2023년12월호_F.pdf")
docs = loader.load()

# Step 2: split the loaded documents into chunks (size 1000, overlap 50).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
)
split_documents = text_splitter.split_documents(docs)

# Step 3: embedding model (OpenAI).
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Step 4: build the FAISS vector store from the chunks.
vectorstore = FAISS.from_documents(split_documents, embeddings)

# Step 5: expose the vector store as a retriever.
retriever = vectorstore.as_retriever()

# Step 6: prompt template with `question` and `context` placeholders.
rag_template = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Answer in Korean.

#Question:
{question}

#Context:
{context}

#Answer:"""
prompt = PromptTemplate.from_template(rag_template)

# Step 7: chat model; the API key is read from the environment.
llm = ChatOpenAI(
    model_name="gpt-5-nano",
    temperature=0,
    api_key=os.getenv("OPENAI_API_KEY"),
)

# Step 8: assemble the chain.
# The incoming question feeds the retriever (as `context`) and is also passed
# through unchanged (as `question`); the result flows prompt -> LLM -> string.
rag_inputs = {"context": retriever, "question": RunnablePassthrough()}
chain = rag_inputs | prompt | llm | StrOutputParser()

In [5]:
# Run the chain in streaming mode and render the chunks as they arrive.
# Requires: pip install langchain_teddynote
from langchain_teddynote.messages import stream_response

question = "삼성전자가 자체 개발한 AI 의 이름은?"
response = chain.stream(input=question)
stream_response(response)

삼성 가우스. (page+1=13)

In [6]:
# Run the chain synchronously and print the complete answer in one go.
question = "삼성전자가 자체 개발한 AI 의 이름은?"
response = chain.invoke(input=question)
print(response)

삼성전자가 자체 개발한 AI의 이름은 '삼성 가우스'입니다.
참고: 해당 내용은 문서의 12페이지에 기재되어 있으며, page+1은 13입니다.


In [7]:
from langchain_teddynote.messages import stream_response

# Stream the answer to a second question.
# NOTE(review): "엔스로픽" looks like a misspelling of "앤스로픽" (the spelling
# used in the model's answer) — confirm whether the typo is intentional.
question = "구글은 엔스로픽에 얼마를 투자했나요?"
response = chain.stream(input=question)
stream_response(response)

구글은 앤스로픽에 최대 20억 달러를 투자하기로 합의했고, 그 중 우선 5억 달러를 투자했습니다. 향후 15억 달러를 추가로 투자할 계획도 밝혔습니다. (page+1: 14)

In [None]:
# Hugging Face embedding model setup.
# Download models into ./cache/ (Hugging Face home directory).
os.environ["HF_HOME"] = "./cache/"

# Requires: pip install langchain-huggingface sentence-transformers
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Candidate multilingual models (uncomment one to switch):
# model_name = "intfloat/multilingual-e5-large-instruct"
# model_name = "intfloat/multilingual-e5-large"
model_name = "BAAI/bge-m3"

# Choose the device at runtime instead of hard-coding "cuda", so the notebook
# also runs on Apple Silicon (mps) and on CPU-only machines.
import torch  # installed as a dependency of sentence-transformers

if torch.cuda.is_available():
    embedding_device = "cuda"
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    embedding_device = "mps"
else:
    embedding_device = "cpu"

hf_embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": embedding_device},  # cuda, mps or cpu
    encode_kwargs={"normalize_embeddings": True},
)

In [17]:
%time
# Document
embedded_documents1 = hf_embeddings.embed_documents(texts)

print(f'Model: \t\t{model_name}')
print(f'Dimension: \t{len(embedded_documents1[0])}')

CPU times: total: 0 ns
Wall time: 0 ns
Model: 		BAAI/bge-m3
Dimension: 	1024


In [23]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda

# Step 1: load the source PDF.
loader = PyMuPDFLoader("data/SPRI_AI_Brief_2023년12월호_F.pdf")
docs = loader.load()

# Step 2: split the loaded documents into chunks (size 1000, overlap 50).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
)
split_documents = text_splitter.split_documents(docs)

# Step 3: reuse the Hugging Face embedding model created earlier.
embeddings = hf_embeddings

# Step 4: build the FAISS vector store from the chunks.
vectorstore = FAISS.from_documents(split_documents, embeddings)

# Step 5: expose the vector store as a retriever.
retriever = vectorstore.as_retriever()

# Step 6: prompt template; this variant also asks for the page number.
rag_template = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
You must include `page` number in your answer.
Answer in Korean.

#Question:
{question}

#Context:
{context}

#Answer:"""
prompt = PromptTemplate.from_template(rag_template)

# Step 7: chat model; the API key is read from the environment.
llm = ChatOpenAI(
    model_name="gpt-5-nano",
    temperature=0,
    api_key=os.getenv("OPENAI_API_KEY"),
)

# 8단계 : chain
def format_docs(docs):
    """Render retrieved documents as one context string with page labels.

    Each document becomes ``"[page N] <page_content>"`` and the entries are
    joined with a blank line. ``N`` is the document's ``page`` metadata plus
    one (presumably the loader stores a zero-based page index — confirm).

    Args:
        docs: Iterable of objects exposing ``metadata`` (a dict) and
            ``page_content`` (a string).

    Returns:
        The formatted context string; an empty string when ``docs`` is empty.
    """
    entries = []
    for doc in docs:
        page = doc.metadata.get("page")
        # Do not invent "page 1" for chunks without a usable page index: the
        # prompt asks the model to cite pages, so a fake label would be cited.
        label = page + 1 if isinstance(page, int) else "unknown"
        entries.append(f"[page {label}] {doc.page_content}")
    return "\n\n".join(entries)

# Assemble the chain: retrieved documents are formatted into a single context
# string, the question is passed through unchanged, then prompt -> LLM -> string.
rag_inputs = {
    "context": retriever | RunnableLambda(format_docs),
    "question": RunnablePassthrough(),
}
chain = rag_inputs | prompt | llm | StrOutputParser()

In [27]:
# Smoke test: ask one question and print the complete answer.
question = "삼성전자가 자체 개발한 AI 의 이름은?"
response = chain.invoke(input=question)
print(response)

삼성전자가 자체 개발한 AI의 이름은 '삼성 가우스'입니다. (page 13)


In [None]:
import os
import getpass
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_google_genai import ChatGoogleGenerativeAI       # pip install -U langchain-google-genai

# Step 1: load the source PDF.
loader = PyMuPDFLoader("data/SPRI_AI_Brief_2023년12월호_F.pdf")
docs = loader.load()

# Step 2: split the loaded documents into chunks (size 1000, overlap 50).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=50,
)
split_documents = text_splitter.split_documents(docs)

# Step 3: reuse the Hugging Face embedding model created earlier.
embeddings = hf_embeddings

# Step 4: build the FAISS vector store from the chunks.
vectorstore = FAISS.from_documents(split_documents, embeddings)

# Step 5: expose the vector store as a retriever.
retriever = vectorstore.as_retriever()

# Step 6: prompt template; this variant also asks for the page number.
rag_template = """You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
You must include `page` number in your answer.
Answer in Korean.

#Question:
{question}

#Context:
{context}

#Answer:"""
prompt = PromptTemplate.from_template(rag_template)

# Step 7: create the LLM (Google Gemini).
# Ask for the key interactively only when it is not already available from
# the environment / .env file.
if not os.getenv("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Google AI API key: ")

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",  # the documented field is `model`, not `model_name`
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    api_key=os.getenv("GOOGLE_API_KEY"),
)

# 8단계 : chain
def format_docs(docs):
    """Render retrieved documents as one context string with page labels.

    Each document becomes ``"[page N] <page_content>"`` and the entries are
    joined with a blank line. ``N`` is the document's ``page`` metadata plus
    one (presumably the loader stores a zero-based page index — confirm).

    Args:
        docs: Iterable of objects exposing ``metadata`` (a dict) and
            ``page_content`` (a string).

    Returns:
        The formatted context string; an empty string when ``docs`` is empty.
    """
    entries = []
    for doc in docs:
        page = doc.metadata.get("page")
        # Do not invent "page 1" for chunks without a usable page index: the
        # prompt asks the model to cite pages, so a fake label would be cited.
        label = page + 1 if isinstance(page, int) else "unknown"
        entries.append(f"[page {label}] {doc.page_content}")
    return "\n\n".join(entries)

# Assemble the chain: retrieved documents are formatted into a single context
# string, the question is passed through unchanged, then prompt -> LLM -> string.
rag_inputs = {
    "context": retriever | RunnableLambda(format_docs),
    "question": RunnablePassthrough(),
}
chain = rag_inputs | prompt | llm | StrOutputParser()