## 신민석(20239469)

In [48]:
from langchain_community.document_loaders import PyPDFLoader

In [51]:
# Create a PDF loader for the Nike 10-K filing. The path is relative, so the
# PDF is expected next to wherever the notebook kernel is running.
loader = PyPDFLoader('./nike-10k-2023.pdf')

In [52]:
# Load the PDF into `docs`; each item exposes `page_content` and `metadata`,
# both of which are inspected in the cells below.
docs = loader.load()

In [53]:
# Number of documents the loader produced.
# NOTE(review): the `page` key in the metadata suggests one document per PDF page — confirm.
len(docs)

107

In [8]:
# Preview the first 200 characters of the first document's text.
print(docs[0].page_content[:200])

Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
F


In [9]:
# Show the first document's metadata (source path and page index, per the output below).
print(docs[0].metadata)

{'source': './nike-10k-2023.pdf', 'page': 0}


## split documents into smaller chunks

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [12]:
# Configure the text splitter: chunks of size 1000 with an overlap of 200
# between neighbouring chunks.
# NOTE(review): add_start_index=True presumably records each chunk's offset in
# the source text in its metadata — confirm against the splitter docs.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)

In [13]:
# Split every loaded document into smaller chunks; `splits` feeds the
# embedding and vector-store cells further down.
splits = splitter.split_documents(docs)

In [14]:
# Number of chunks produced by the splitter.
len(splits)

516

## Generate Embedding Vectors

In [15]:
from langchain_openai import OpenAIEmbeddings

In [16]:
# Embedding client backed by OpenAI's `text-embedding-3-large` model.
# NOTE(review): no API key is passed here, so it is presumably read from the
# environment (e.g. OPENAI_API_KEY) — confirm before running on a fresh kernel.
embeddings = OpenAIEmbeddings(model='text-embedding-3-large')

In [17]:
# Embed the text of the first two chunks so their vectors can be inspected
# and compared in the following cells.
vec1 = embeddings.embed_query(splits[0].page_content)
vec2 = embeddings.embed_query(splits[1].page_content)

In [18]:
# Dimensionality of a single embedding vector.
len(vec1)

3072

In [19]:
import numpy as np

In [20]:
# Dot product of the two chunk embeddings. vec2 has norm ~1 (checked in the
# next cell); if vec1 is unit-length as well, this value equals their cosine
# similarity. vec1's norm is not verified in this notebook.
np.dot(vec1, vec2)

0.7443979585703858

In [21]:
# Euclidean (L2) norm of vec2 — a value of ~1 means the embedding is unit-length.
np.linalg.norm(vec2)

0.9999999502824668

## Store embedding vectors in an in-memory DB

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore

In [26]:
# In-memory vector store built on the `embeddings` client; used by every
# similarity search below. Nothing is persisted, so the store must be rebuilt
# after a kernel restart.
store = InMemoryVectorStore(embeddings)

In [27]:
# Add all chunks to the store and keep the returned value in `ids`
# (presumably the IDs assigned to the stored chunks — confirm).
# NOTE(review): this presumably embeds every chunk through the embeddings API,
# so re-running the cell may be slow and incur API cost.
ids = store.add_documents(documents=splits)

## semantic search

In [9]:
## keyword search - BM25
## semantic search - similarity between embedding vectors (the key idea is to retrieve the entries closest to the user's query) - cosine similarity

In [29]:
# Retrieve the chunks most similar to an English question. No `k` is passed,
# so the store's default number of results applies.
results = store.similarity_search(
    "how many distribution centers does Nike have in the US?"
)

In [30]:
# Show the text of the first returned hit.
print(results[0].page_content)

operations. We also lease an office complex in Shanghai, China, our headquarters for our Greater China geography, occupied by employees focused on implementing our
wholesale, NIKE Direct and merchandising strategies in the region, among other functions.
In the United States, NIKE has eight significant distribution centers. Five are located in or near Memphis, Tennessee, two of which are owned and three of which are
leased. Two other distribution centers, one located in Indianapolis, Indiana and one located in Dayton, Tennessee, are leased and operated by third-party logistics
providers. One distribution center for Converse is located in Ontario, California, which is leased. NIKE has a number of distribution facilities outside the United States,
some of which are leased and operated by third-party logistics providers. The most significant distribution facilities outside the United States are located in Laakdal,


In [31]:
# Repeat the search with a Korean question ("When was Nike founded?") against
# the English filing. This overwrites `results`, which the chatbot cells below
# reuse as their context source.
results = store.similarity_search(
    "나이키가 언제 설립됐어?"
)

# Show the text of the first returned hit.
print(results[0].page_content)

Table of Contents
PART I
ITEM 1. BUSINESS
GENERAL
NIKE, Inc. was incorporated in 1967 under the laws of the State of Oregon. As used in this Annual Report on Form 10-K (this "Annual Report"), the terms "we," "us," "our,"
"NIKE" and the "Company" refer to NIKE, Inc. and its predecessors, subsidiaries and affiliates, collectively, unless the context indicates otherwise.
Our principal business activity is the design, development and worldwide marketing and selling of athletic footwear, apparel, equipment, accessories and services. NIKE is
the largest seller of athletic footwear and apparel in the world. We sell our products through NIKE Direct operations, which are comprised of both NIKE-owned retail stores
and sales through our digital platforms (also referred to as "NIKE Brand Digital"), to retail accounts and to a mix of independent distributors, licensees and sales


## chatbot agent to answer user questions

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import HumanMessage, SystemMessage

In [36]:
# Chat model used to generate answers from the retrieved context.
model = ChatOpenAI(model='gpt-4o-mini')

In [None]:
# Use the text of the first hit as grounding context.
# NOTE: this reads whatever `results` currently holds — when cells run in
# order, that is the Korean founding-date search above.
context = results[0].page_content

In [39]:
# System prompt: the Korean instruction means "Answer the user's question
# based on the content below", followed by the retrieved context.
system = f"아래 내용을 바탕으로 사용자의 질문에 대답해줘. \n\n{context}"

In [40]:
# Conversation sent to the chat model: the context-bearing system prompt plus
# the user's question (Korean for "When was Nike founded?").
messages = [
    SystemMessage(system),
    HumanMessage("Nike가 언제 설립됐어?")
]

In [41]:
# Send the messages to the chat model and keep its response object.
resp = model.invoke(messages)

In [42]:
# Print only the text of the model's reply.
print(resp.content)

Nike는 1967년에 오리건 주의 법률에 따라 설립되었습니다.


## Generalize as a function

In [46]:
def query(question, k=1):
    """Answer ``question`` from the vector store and print the model's reply.

    The ``k`` chunks most similar to ``question`` are retrieved from the
    notebook-level ``store``, their text is joined into a single context that
    is embedded in the system prompt, and the notebook-level ``model`` is asked
    to answer. Relies on ``store``, ``model``, ``SystemMessage`` and
    ``HumanMessage`` already being defined in the notebook.

    Args:
        question: The user's question. Used both as the similarity-search
            query and as the human message sent to the chat model.
        k: Number of retrieved chunks to include in the context. The default
            of 1 keeps the original behaviour of using only the top hit;
            pass 2 or 3 to give the model more context.

    Returns:
        None. The answer is printed. If the search returns no chunks, a short
        notice is printed instead and the model is not called.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    results = store.similarity_search(question, k=k)
    if not results:
        # An empty store (or an empty search result) would otherwise surface as
        # a bare IndexError; tell the user instead of querying without context.
        print("관련 문서를 찾지 못했습니다.")
        return

    # With k == 1 this is exactly the text of the top hit, as before.
    context = "\n\n".join(doc.page_content for doc in results)

    system = f"아래 내용을 바탕으로 사용자의 질문에 대답해줘. \n\n{context}"

    messages = [
        SystemMessage(system),
        HumanMessage(question),
    ]

    resp = model.invoke(messages)

    print(resp.content)

In [None]:
# Try the helper with a Korean question ("What is Nike's most recent revenue?").
query('Nike 최근 매출은 얼마야?')

Nike의 최근 매출은 2023년 기준으로 총 51,217 백만 달러입니다.


In [None]:
## Few-shot learning