# 02-rag.ipynb

# 문서 로드&저장 파트

In [16]:
from dotenv import load_dotenv

# Load the environment variables defined in the .env file
# (presumably this is where the OpenAI credentials used by later cells come from — confirm)
load_dotenv()

True

In [17]:
# 1. Document Load (PDF)
# Supported document loaders: https://docs.langchain.com/oss/python/integrations/document_loaders
from langchain_community.document_loaders import PyPDFLoader

# Location of the file to load
file_path = './nke-10k-2023.pdf'
# Loader that will convert the PDF
loader = PyPDFLoader(file_path)
# The loader converts the PDF into objects usable from Python (1 PDF page -> 1 Document)
docs = loader.load()

print(len(docs)) # Check the page count of the original PDF

107


In [18]:
# 2. Splitting
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter that cuts the Documents into smaller pieces
# (add_start_index=True is why each chunk's metadata carries a 'start_index' entry)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
# Split the loaded pages into chunks
chunks = text_splitter.split_documents(docs)

print(len(chunks)) # Total number of chunks
print(chunks[0].page_content) # Original text content of the first chunk

516
Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE FISCAL YEAR ENDED MAY 31, 2023
OR
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE TRANSITION PERIOD FROM                         TO                         .
Commission File No. 1-10635
NIKE, Inc.
(Exact name of Registrant as specified in its charter)
Oregon 93-0584541
(State or other jurisdiction of incorporation) (IRS Employer Identification No.)
One Bowerman Drive, Beaverton, Oregon 97005-6453
(Address of principal executive offices and zip code)
(503) 671-6453
(Registrant's telephone number, including area code)
SECURITIES REGISTERED PURSUANT TO SECTION 12(B) OF THE ACT:
Class B Common Stock NKE New York Stock Exchange
(Title of each class) (Trading symbol) (Name of each exchange on which registered)


In [19]:
# 3. Embedding (turn text into numbers)
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model='text-embedding-3-small')

# The lines below are for testing only (they show actual text being turned into a vector)
v1 = embeddings.embed_query(chunks[0].page_content) # Convert chunk 1 into a vector
v2 = embeddings.embed_query(chunks[1].page_content) # Convert chunk 2 into a vector
# The number of dimensions must be the same.
print(len(v1) == len(v2))
print(v1[:10])

True
[0.0536494180560112, 0.04885420575737953, 0.018872251734137535, -0.004884233698248863, 0.025020569562911987, -0.008195779286324978, -0.006397574674338102, 0.024189716205000877, -0.0011713554849848151, 0.010801100172102451]


In [20]:
# 4. Save into a Vector Store
from langchain_core.vectorstores import InMemoryVectorStore

# In-memory vector store intended for testing/development
vector_store = InMemoryVectorStore(embeddings)

# Store the chunks split from the PDF in the vector store (the call returns the stored IDs)
ids = vector_store.add_documents(documents=chunks)

# 검색 파트

In [21]:
# Vector store -> retriever
retriever = vector_store.as_retriever(
    search_type='similarity', # Search method: similarity
    search_kwargs={'k':3} # Number of search results: 3
)

# Run a search (the Korean query asks: "How many U.S. stores does Nike have?")
retriever.invoke('나이키의 미국 영업점 개수?')

[Document(id='02c2739d-ab2b-41b4-bd14-72fe97c7611d', metadata={'producer': 'EDGRpdf Service w/ EO.Pdf 22.0.40.0', 'creator': 'EDGAR Filing HTML Converter', 'creationdate': '2023-07-20T16:22:00-04:00', 'title': '0000320187-23-000039', 'author': 'EDGAR Online, a division of Donnelley Financial Solutions', 'subject': 'Form 10-K filed on 2023-07-20 for the period ending 2023-05-31', 'keywords': '0000320187-23-000039; ; 10-K', 'moddate': '2023-07-20T16:22:08-04:00', 'source': './nke-10k-2023.pdf', 'total_pages': 107, 'page': 4, 'page_label': '5', 'start_index': 3125}, page_content='direct to consumer operations sell products through the following number of retail stores in the United States:\nU.S. RETAIL STORES NUMBER\nNIKE Brand factory stores 213 \nNIKE Brand in-line stores (including employee-only stores) 74 \nConverse stores (including factory stores) 82 \nTOTAL 369 \nIn the United States, NIKE has eight significant distribution centers. Refer to Item 2. Properties for further informati

## Agent 통합

In [None]:
# Turn the retriever step into a Tool (a plain function)

# Takes the search text (query) as its argument
def search_vectorstore(query: str) -> str:
    """Retrieve info to help answer a query about NIKE"""
    # NOTE(review): this docstring is presumably what the agent receives as the
    # tool description when the function is passed via `tools=[...]`, so it is
    # left exactly as written — confirm against the LangChain tools docs.

    # Use the vector store directly instead of the retriever (top 2 chunks only).
    hits = vector_store.similarity_search(query, k=2)

    # Every chunk's text is followed by a blank line, the last one included;
    # no hits yields an empty string.
    return ''.join(hit.page_content + '\n\n' for hit in hits)
    
# Call the tool function directly to inspect the text it returns
# (the Korean query means "number of Nike stores")
print(search_vectorstore('나이키 영업점 개수'))

Table of Contents
ITEM 1B. UNRESOLVED STAFF COMMENTS
None.
ITEM 2. PROPERTIES
The following is a summary of principal properties owned or leased by NIKE:
The NIKE World Campus, owned by NIKE and located near Beaverton, Oregon, USA, is an approximately 400-acre site consisting of over 40 buildings which, together
with adjacent leased properties, functions as our world headquarters and is occupied by approximately 11,400 employees engaged in management, research, design,
development, marketing, finance and other administrative functions serving nearly all of our segments. We lease a similar, but smaller, administrative facility in Hilversum,
the Netherlands, which serves as the headquarters for our Europe, Middle East & Africa geography and management of certain brand functions for our non-U.S.
operations. We also lease an office complex in Shanghai, China, our headquarters for our Greater China geography, occupied by employees focused on implementing our




In [31]:
from langchain.agents import create_agent

# System prompt (Korean). It tells the model that it can use a tool that searches
# the 2023 Nike 10-K report, that it should use the tool when needed to answer the
# user's question, and that it should answer like an economic-analysis expert.
prompt = """너는 2023 나이키 10k 보고서를 검색하는 도구를 다룰 수 있어. 
사용자 질문에 답변하기 위해 필요하면 사용해. 경제분석 전문가처럼 답변해."""

# Build the agent from the model id, the search function as its only tool,
# and the system prompt defined above.
agent = create_agent(
    model="openai:gpt-4.1-mini",
    tools=[search_vectorstore],
    system_prompt=prompt
)

In [32]:
# User question (Korean): asks for the number of Nike stores and the average
# revenue per store.
content = "나이키 영업점 숫자와 각 영업점 평균 매출액이 궁금함."

# Send the question as a single user message; the result is a dict whose
# 'messages' list holds the conversation, including the tool call.
agent.invoke(
    {
        "messages": [
            {"role": "user", "content": content}
        ]
    }
)

{'messages': [HumanMessage(content='나이키 영업점 숫자와 각 영업점 평균 매출액이 궁금함.', additional_kwargs={}, response_metadata={}, id='f82e71b7-6374-42e2-b84c-f17accab7fe3'),
  AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 34, 'prompt_tokens': 118, 'total_tokens': 152, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_d94e893512', 'id': 'chatcmpl-DCfzNFkTXP8j2ZPMiBC9RjPlbZ95j', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--019c8e49-92c9-7760-9653-faacdc7c38d0-0', tool_calls=[{'name': 'search_vectorstore', 'args': {'query': '나이키 2023년 영업점 수와 각 영업점 평균 매출액'}, 'id': 'call_oXOPM6qism4nrsEgaTo9kM36', 'type': 'tool_call'}], invalid_tool_calls=[], usage_m

# Web문서(HTML) RAG + Agent

In [None]:
# HTML carries a lot of content besides the document body that is not needed. Preprocessing is required!
import bs4
from langchain_community.document_loaders import WebBaseLoader

# Preprocessing: only keep elements with these CSS classes when parsing
bs4_strainer = bs4.SoupStrainer(class_=('post-title', 'post-header', 'post-content'))
# Loader
# NOTE: `loader` and `docs` are rebound here, replacing the PDF loader/pages from the earlier section.
loader = WebBaseLoader(
    web_path="https://lilianweng.github.io/posts/2023-06-23-agent/", # Use a list when there are several URLs
    bs_kwargs={'parse_only': bs4_strainer}, # Pass in the strainer
)

docs = loader.load()
# Number of loaded documents, and total character count of the first one
print(len(docs), len(docs[0].page_content))

1 43047


In [39]:
# Split
# NOTE: `chunks`, `embeddings` and `vector_store` are rebound in this cell, replacing
# the PDF versions; any function that reads the global `vector_store` searches the
# web page from here on.
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Same chunk size as the PDF section, but with an overlap of 100 instead of 200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000, chunk_overlap = 100, add_start_index=True
)
chunks = text_splitter.split_documents(docs)

# Embedding
from langchain_openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Store
from langchain_core.vectorstores import InMemoryVectorStore
vector_store = InMemoryVectorStore(embeddings)
# As the last expression of the cell, the returned list of stored IDs is displayed as output
vector_store.add_documents(documents=chunks)

['e0943d34-d031-4ba8-8608-339e19709e00',
 '7761e951-137c-45f3-8adc-10fcebd25ffa',
 '578f7c3b-0356-4908-a3fa-667557914710',
 '837aeae1-4c97-455d-b91e-8d1b7ad6b192',
 '723786ee-bfb3-4db4-89c8-dcc238343d8b',
 'ef7df6ef-d464-45fb-ab72-a384f124cf1e',
 '2615bd28-5d7f-482d-b575-d61636ab4de9',
 'ebd15514-10d5-4ebd-860c-af37a951f258',
 '2ad3bbfc-24de-4fa4-9330-a6ef63f126e0',
 'd225468c-1977-42f3-81da-72ee8754ae20',
 'bb49158e-b510-424d-a673-fa11470d8765',
 'c9e00954-f8c2-4ca2-be7e-a4eade550948',
 'af09cf0a-68e4-4f61-8908-33206846aa62',
 '79503bf9-1365-4d89-8170-3c0dee21799a',
 'a739d4ef-86a3-41f6-8b9f-ad91e2327ab1',
 '94687cdf-1dde-401c-b947-8bac06123681',
 '9aaa166a-3c06-424f-a0b5-70d43df3da01',
 'b40b2ee6-e1e6-4856-8c29-0107fca54d29',
 'aa5ae900-54f8-47fa-ab50-642c32780843',
 '13f5b0f2-2137-49f7-9cdf-35f69481d06b',
 '6619a218-03c4-42cc-9a10-59ae64b3103a',
 '4e2ebe87-eceb-472e-96e4-c2f4e3eeaa6f',
 '143a4eeb-83e3-4cd2-b40a-a6ad390d3074',
 '93580048-5075-4c1c-8f62-9e4e89d3fbec',
 '48a9df94-eab6-

In [None]:
# Search -> not a required step when using an Agent (can be skipped)
retriever = vector_store.as_retriever(
    search_type = 'similarity',
    search_kwargs={'k':3}
)
# The Korean query means "summarize the page content"
retriever.invoke('페이지 내용 요약')

[Document(id='b8384484-dedd-43de-8bfa-3655008978d4', metadata={'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'start_index': 31986}, page_content='},\n  {\n    "role": "user",\n    "content": "{{There are 10 levels in total. The main character is a plumber named Mario, who can walk and jump. It is a classical platform game just like Super Mario. The main character moves from left to right, trying to get to the destination, where there are many obstacles and attacks from enemies in the process.}}\\n\\nIs anything else unclear? If yes, only answer in the form:\\n{remaining unclear areas} remaining questions.\\n{Next question}\\nIf everything is sufficiently clear, only answer \\"Nothing more to clarify.\\"."\n  },\n  {\n    "role": "assistant",\n    "content": "Remaining unclear areas: 2 remaining questions.\\nCan you provide more information about how the MVC components are split into separate files?"\n  },\n  {\n    "role": "user",\n    "content": "{{Make your own as

In [None]:
# Agent
def search_vectorstore(query: str) -> str:
    """검색한 내용을 분류해서 답변을 내놓는 도구"""
    # The Korean docstring above reads roughly "a tool that classifies the
    # retrieved content and produces an answer".
    # NOTE(review): it is presumably consumed as the tool description when the
    # function is passed via `tools=[...]`, so it is kept byte-for-byte. The
    # function itself only returns concatenated chunk text — consider rewording.

    # Fetch the 2 chunks most similar to the query from the global vector store.
    matches = vector_store.similarity_search(query, k=2)

    # Each chunk's text is followed by a blank line, the last one included;
    # no matches yields an empty string.
    return ''.join(match.page_content + '\n\n' for match in matches)

from langchain.agents import create_agent

# System prompt (Korean): "You are a tool that answers the user's questions
# based on the content of this web page."
prompt = "너는 이 웹페이지 내용을 기반으로 사용자의 질문에 대답하는 도구야"

# Rebuild the agent with the search function as its only tool
# (`agent` and `prompt` are rebound, replacing the ones from the PDF section).
agent = create_agent(
    model='openai:gpt-4.1-mini',
    tools = [search_vectorstore],
    system_prompt=prompt
)

# User request (Korean): "Please summarize the entire page content."
content = '페이지 내용 전체 요약 부탁해'

# Send the request as a single user message; the result is a dict whose
# 'messages' list holds the conversation, including the tool call.
agent.invoke(
    {
        "messages": [
            {"role": "user", "content": content}
        ]
    }
)

{'messages': [HumanMessage(content='페이지 내용 전체 요약 부탁해', additional_kwargs={}, response_metadata={}, id='be61d208-67f0-43ab-8b5d-66f3f64de24f'),
  AIMessage(content='', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 19, 'prompt_tokens': 79, 'total_tokens': 98, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4.1-mini-2025-04-14', 'system_fingerprint': 'fp_a391f2cee0', 'id': 'chatcmpl-DCi95YOEzO4iBmlT2KCZOwtIlTAfV', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None}, id='lc_run--019c8ec8-11db-7510-8c4e-6e8e1c15f259-0', tool_calls=[{'name': 'search_vectorstore', 'args': {'query': '페이지 내용 전체 요약'}, 'id': 'call_ijP2tqB58cKL9DQHYFhanEwG', 'type': 'tool_call'}], invalid_tool_calls=[], usage_metadata={'input_tokens': 79, 'out