In [5]:
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import WebBaseLoader
import bs4
from langchain import hub

In [2]:
# Install the `pymupdf` package into the running kernel's environment
# (the %pip magic targets the kernel, which is why it is used instead of !pip).
# NOTE(review): the cells visible here read the PDF through PyPDFLoader and never
# reference pymupdf directly — confirm this install is still required, and
# consider pinning a version so the notebook stays reproducible.
%pip install pymupdf

Note: you may need to restart the kernel to use updated packages.


In [10]:
# Step 1: Load Documents
# Read the PDF paper from disk so it can be chunked and indexed below.
file_path = "data/CHI_2025_LBW_Korlingma.pdf"
loader = PyPDFLoader(file_path=file_path)
raw_docs = loader.load()


# Step 2: Split Documents
# Overlapping chunks (chunk_size=1000, chunk_overlap=50) so that text cut at a
# chunk boundary still appears in full in one of the neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)

# Explicit load() + split_documents() instead of loader.load_and_split(),
# which the LangChain loader API documents as deprecated.
split_docs = text_splitter.split_documents(raw_docs)

# Nothing to index means no text came out of the PDF; stop here with a clear
# message instead of failing later while the vector store is being built.
if not split_docs:
    raise ValueError(f"No text chunks could be extracted from {file_path!r}")

# Step 3: Embedding & Create Vectorstore
# Embed every chunk with OpenAI embeddings and store the vectors in a FAISS index.
vectorstore = FAISS.from_documents(documents=split_docs, embedding=OpenAIEmbeddings())

# Step 4: Search
# Expose the vector store as a retriever that looks up chunks of the paper
# for a given query (default retriever settings).
retriever = vectorstore.as_retriever()

# Step 5: Create Prompt
# Pull the shared "rlm/rag-prompt" template from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")

# Step 6: Create LLM
# temperature=0 keeps the answers as repeatable as the model allows.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)


def format_docs(docs):
    """Combine retrieved documents into a single block of text.

    Takes the ``page_content`` of every item in ``docs``, in order, and
    returns them joined with a blank line between consecutive entries.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


# Step 7: Create Chain
# The question is fed to two branches: the retriever output is merged by
# format_docs into the "context" slot, while RunnablePassthrough hands the
# question itself to the "question" slot. The filled prompt then goes to the
# LLM, and StrOutputParser turns the model reply into a plain string.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

# Step 8: Run Chain
# Ask a question about the document and keep the answer for display.
question = "Who is the author of this paper?"
response = rag_chain.invoke(question)

# Show the source file, a separator rule, then the question and the answer.
print(
    f"PDF Path: {file_path}",
    "===" * 20,
    f"[HUMAN]\n{question}\n",
    f"[AI]\n{response}",
    sep="\n",
)

PDF Path: data/CHI_2025_LBW_Korlingma.pdf
[HUMAN]
Who is the author of this paper?

[AI]
The author of the paper is not explicitly mentioned in the provided context.


In [11]:
# Second query against the same chain: request a short summary of the paper.
question = "Can you briefly summarize this paper?"
response = rag_chain.invoke(question)

# Show the source file, a separator rule, then the question and the answer.
print(
    f"PDF Path: {file_path}",
    "===" * 20,
    f"[HUMAN]\n{question}\n",
    f"[AI]\n{response}",
    sep="\n",
)

PDF Path: data/CHI_2025_LBW_Korlingma.pdf
[HUMAN]
Can you briefly summarize this paper?

[AI]
The paper investigates the integration of pragmatic markers (PMs) in conversational agents, focusing on their use in both emotional and work-related contexts. It employs a study design that compares interactions with and without PMs, using a real-time agent that adjusts its tone based on sentiment analysis. The key contribution is the 'passive yet attentive listener' framework, which aims to balance human-like interaction with appropriate agent role perception, highlighting the importance of context-awareness in PM implementation.
