In [3]:
from langchain_community.document_loaders import PyPDFLoader

In [7]:
pdf_file_path = r'E:\Others\document_assistant\nke-10k-2023.pdf'

loader = PyPDFLoader(pdf_file_path)
docs = loader.load()

In [9]:
print(docs[:2])

[Document(metadata={'source': 'E:\\Others\\document_assistant\\nke-10k-2023.pdf', 'page': 0}, page_content="Table of Contents\nUNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\nFORM 10-K\n(Mark One)\n☑  ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934\nFOR THE FISCAL YEAR ENDED MAY 31, 2023\nOR\n☐  TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934\nFOR THE TRANSITION PERIOD FROM                         TO                         .\nCommission File No. 1-10635\nNIKE, Inc.\n(Exact name of Registrant as specified in its charter)\nOregon 93-0584541\n(State or other jurisdiction of incorporation) (IRS Employer Identification No.)\nOne Bowerman Drive, Beaverton, Oregon 97005-6453\n(Address of principal executive offices and zip code)\n(503) 671-6453\n(Registrant's telephone number, including area code)\nSECURITIES REGISTERED PURSUANT TO SECTION 12(B) OF THE ACT:\nClass B Common Stock NKE New Y

In [11]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [12]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=100)

chunks = text_splitter.split_documents(docs)
len(chunks)

460

In [14]:
from langchain_huggingface import HuggingFaceEmbeddings

In [18]:
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')

In [15]:
from langchain_community.vectorstores import FAISS

In [24]:
from langchain.schema import Document as LangchainDocument

In [35]:
vectorstore = FAISS.from_documents([LangchainDocument(page_content=chunk.page_content) for chunk in chunks],embeddings)

In [37]:
results = vectorstore.similarity_search(
    "What is the net profit in the fiscal 2023?"
)

print(results[0])

page_content='Table of Contents
RESULTS OF OPERATIONS
(Dollars in millions, except per share data) FISCAL 2023 FISCAL 2022 % CHANGE FISCAL 2021 % CHANGE
Revenues $ 51,217 $ 46,710 10 %$ 44,538 5 %
Cost of sales 28,925 25,231 15 % 24,576 3 %
Gross profit 22,292 21,479 4 % 19,962 8 %
Gross margin 43.5 % 46.0 % 44.8 %
Demand creation expense 4,060 3,850 5 % 3,114 24 %
Operating overhead expense 12,317 10,954 12 % 9,911 11 %
Total selling and administrative expense 16,377 14,804 11 % 13,025 14 %
% of revenues 32.0 % 31.7 % 29.2 %
Interest expense (income), net (6) 205 — 262 — 
Other (income) expense, net (280) (181) — 14 — 
Income before income taxes 6,201 6,651 -7 % 6,661 0 %
Income tax expense 1,131 605 87 % 934 -35 %
Effective tax rate 18.2 % 9.1 % 14.0 %
NET INCOME $ 5,070 $ 6,046 -16 %$ 5,727 6 %
Diluted earnings per common share $ 3.23 $ 3.75 -14 %$ 3.56 5 %
2023 FORM 10-K 31'


In [38]:
def retrieve(query: str):
    """Retrieve information related to a query."""
    retrieved_docs = vectorstore.similarity_search(query, k=2)
    serialized = "\n\n".join(
        (f"Source: {doc.metadata}\n" f"Content: {doc.page_content}")
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs

In [39]:
retrieve("What is the net profit in the fiscal 2023?")

('Source: {}\nContent: Table of Contents\nRESULTS OF OPERATIONS\n(Dollars in millions, except per share data) FISCAL 2023 FISCAL 2022 % CHANGE FISCAL 2021 % CHANGE\nRevenues $ 51,217 $ 46,710 10 %$ 44,538 5 %\nCost of sales 28,925 25,231 15 % 24,576 3 %\nGross profit 22,292 21,479 4 % 19,962 8 %\nGross margin 43.5 % 46.0 % 44.8 %\nDemand creation expense 4,060 3,850 5 % 3,114 24 %\nOperating overhead expense 12,317 10,954 12 % 9,911 11 %\nTotal selling and administrative expense 16,377 14,804 11 % 13,025 14 %\n% of revenues 32.0 % 31.7 % 29.2 %\nInterest expense (income), net (6) 205 — 262 — \nOther (income) expense, net (280) (181) — 14 — \nIncome before income taxes 6,201 6,651 -7 % 6,661 0 %\nIncome tax expense 1,131 605 87 % 934 -35 %\nEffective tax rate 18.2 % 9.1 % 14.0 %\nNET INCOME $ 5,070 $ 6,046 -16 %$ 5,727 6 %\nDiluted earnings per common share $ 3.23 $ 3.75 -14 %$ 3.56 5 %\n2023 FORM 10-K 31\n\nSource: {}\nContent: Table of Contents\nNOTE 10 — EARNINGS PER SHARE\nThe follo