#### Tuesday, November 5, 2024

Environment: `mamba activate langchain`

[Build a PDF ingestion and Question/Answering system](https://python.langchain.com/docs/tutorials/pdf_qa/)

Everything in this notebook runs top to bottom in one pass.

In [1]:
from langchain_community.document_loaders import PyPDFLoader

In [2]:
# Relative path to the Nike Form 10-K PDF that the rest of the notebook queries.
file_path = "../414759-1-_5_Nike-NPS-Combo_Form-10-K_WR.pdf"
loader = PyPDFLoader(file_path)

# Read the PDF into a list of Document objects (each exposes page_content and metadata).
docs = loader.load()

# Report how many Documents were loaded.
print(len(docs))

106


In [3]:
# Sanity check on the first loaded Document: the opening 100 characters of its
# text, followed by its metadata dict.
print(docs[0].page_content[:100])
print(docs[0].metadata)

FORM 10-KFORM 10-K
{'source': '../414759-1-_5_Nike-NPS-Combo_Form-10-K_WR.pdf', 'page': 0}


#### Question Answering with RAG

In [4]:
import getpass
import os

from langchain_openai import ChatOpenAI

# Chat model reached through an OpenAI-compatible endpoint on localhost.
# NOTE(review): the "lm-studio" key suggests an LM Studio server and is
# presumably just a placeholder value - confirm.
# Hosted-OpenAI alternative: llm = ChatOpenAI(model="gpt-3.5-turbo")
llm = ChatOpenAI(
    base_url="http://localhost:1234/v1",
    # model="hermes-3-llama-3.1-8b",  # do not pass in an unrecognized model name ...
    api_key="lm-studio",
    temperature=0,
)


In [5]:
from langchain_huggingface import HuggingFaceEmbeddings

import torch  # dependency of sentence-transformers; used here only to probe for a GPU

# Sentence-transformers checkpoint used to embed the document chunks.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Prefer the GPU, but fall back to the CPU so this cell also runs on machines
# without CUDA (a hard-coded 'cuda' device makes model loading fail there).
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
# Embeddings are left un-normalized.
encode_kwargs = {'normalize_embeddings': False}
hfEmbeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

  from tqdm.autonotebook import tqdm, trange


In [6]:
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the loaded Documents into overlapping chunks (size 1000, overlap 200).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(docs)

# Embed every chunk with the local HuggingFace model and keep the vectors in memory.
# (Hosted alternative: embedding=OpenAIEmbeddings())
vectorstore = InMemoryVectorStore.from_documents(documents=splits, embedding=hfEmbeddings)

# Retriever view over the store; used below to fetch context for a question.
retriever = vectorstore.as_retriever()

In [7]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the answering model, one sentence per fragment.
# The trailing {context} placeholder is left for the prompt template to fill.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\n\n{context}"
)

# Two-message chat prompt: the system message carries system_prompt (with its
# {context} placeholder); the human message carries the question via {input}.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)


# Inner chain: built from the LLM and the prompt defined above.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
# Outer chain: pairs the retriever with the inner chain.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

# Run the RAG chain once; the question is passed under the "input" key.
results = rag_chain.invoke(
    {"input": "What was Nike's revenue in 2023?"}
)

# Bare expression so the notebook displays the returned dict.
results

{'input': "What was Nike's revenue in 2023?",
 'context': [Document(id='e5c476e9-d819-490b-abb9-5628c099f3c1', metadata={'source': '../414759-1-_5_Nike-NPS-Combo_Form-10-K_WR.pdf', 'page': 89}, page_content='YEAR ENDED MAY 31,\n(Dollars in millions) 2023 2022 2021\nREVENUES\nNorth America $ 21,608 $ 18,353 $ 17,179 \nEurope, Middle East & Africa  13,418  12,479  11,456 \nGreater China  7,248  7,547  8,290 \nAsia Pacific & Latin America  6,431  5,955  5,343 \nGlobal Brand Divisions  58  102  25 \nTotal NIKE Brand  48,763  44,436  42,293 \nConverse  2,427  2,346  2,205 \nCorporate  27  (72)  40 \nTOTAL NIKE, INC. REVENUES $ 51,217 $ 46,710 $ 44,538 \nEARNINGS BEFORE INTEREST AND TAXES\nNorth America $ 5,454 $ 5,114 $ 5,089 \nEurope, Middle East & Africa  3,531  3,293  2,435 \nGreater China  2,283  2,365  3,243 \nAsia Pacific & Latin America  1,932  1,896  1,530 \nGlobal Brand Divisions  (4,841)  (4,262)  (3,656) \nConverse  676  669  543 \nCorporate  (2,840)  (2,219)  (2,261) \nInterest 

In [8]:
# Show the full text of the first Document under the result's "context" key.
print(results["context"][0].page_content)

YEAR ENDED MAY 31,
(Dollars in millions) 2023 2022 2021
REVENUES
North America $ 21,608 $ 18,353 $ 17,179 
Europe, Middle East & Africa  13,418  12,479  11,456 
Greater China  7,248  7,547  8,290 
Asia Pacific & Latin America  6,431  5,955  5,343 
Global Brand Divisions  58  102  25 
Total NIKE Brand  48,763  44,436  42,293 
Converse  2,427  2,346  2,205 
Corporate  27  (72)  40 
TOTAL NIKE, INC. REVENUES $ 51,217 $ 46,710 $ 44,538 
EARNINGS BEFORE INTEREST AND TAXES
North America $ 5,454 $ 5,114 $ 5,089 
Europe, Middle East & Africa  3,531  3,293  2,435 
Greater China  2,283  2,365  3,243 
Asia Pacific & Latin America  1,932  1,896  1,530 
Global Brand Divisions  (4,841)  (4,262)  (3,656) 
Converse  676  669  543 
Corporate  (2,840)  (2,219)  (2,261) 
Interest expense (income), net  (6)  205  262 
TOTAL NIKE, INC. INCOME BEFORE INCOME TAXES $ 6,201 $ 6,651 $ 6,661 
ADDITIONS TO PROPERTY, PLANT AND EQUIPMENT
North America $ 283 $ 146 $ 98 
Europe, Middle East & Africa  215  197  153


In [9]:
# Show the metadata (source file and page) of that same first context Document.
print(results["context"][0].metadata)

{'source': '../414759-1-_5_Nike-NPS-Combo_Form-10-K_WR.pdf', 'page': 89}
