In [1]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "/Users/reejungkim/Documents/Git/HuggingFace/Amazon-2024-Annual-Report.pdf"
loader = PyPDFLoader(file_path)

docs = loader.load()
pages = loader.load_and_split()

# for page in pages:
#     print(page.metadata, page)

In [2]:
for page in pages[:5]:
    print(page.page_content, page)

ANNUAL REPORT
2 0 2 4 page_content='ANNUAL REPORT
2 0 2 4' metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.0 (Macintosh)', 'creationdate': '2022-02-14T21:08:55-06:00', 'author': 'T&C Composition', 'gts_pdfxconformance': 'PDF/X-1a:2001', 'gts_pdfxversion': 'PDF/X-1:2001', 'keywords': '25-4123-1_2', 'moddate': '2025-04-09T12:45:58-07:00', 'subject': 'Annual Report', 'title': 'Amazon.com, Inc.', 'trapped': '/False', 'source': '/Users/reejungkim/Documents/Git/HuggingFace/Amazon-2024-Annual-Report.pdf', 'total_pages': 91, 'page': 0, 'page_label': '1'}
Dear Shareholders:
2024 was a strong year for Amazon.
Our total revenue grew 11% year-over-year (“Y oY”) from $575B to $638B. By segment, North America
revenue increased 10% Y oY from $353B to $387B, International revenue grew 9% Y oY from $131B to $143B,
and AWS revenue increased 19% Y oY , from $91B to $108B. For perspective, just 10 years ago, AWS
revenue was $4.6B; and in that same year, Amazon’s total revenue

# 문서 추가
loader2 = PyPDFLoader('/Users/reejungkim/Documents/Git/HuggingFace/Amazon-2025-Proxy-Statement.pdf')
docs2 = loader2.load()

docs.extend(docs2)

# 문서 분할

In [38]:
# Add this before your BM25 retriever creation
print("Debugging docs:")
print(f"Type of docs: {type(docs)}")
print(f"Length of docs: {len(docs) if docs else 0}")

if docs:
    print(f"First doc type: {type(docs[0])}")
    if hasattr(docs[0], 'page_content'):
        print(f"First doc content: {docs[0].page_content[:200]}...")
    else:
        print(f"First doc: {docs[0]}")


Debugging docs:
Type of docs: <class 'list'>
Length of docs: 91
First doc type: <class 'langchain_core.documents.base.Document'>
First doc content: ANNUAL REPORT
2 0 2 4...


In [52]:
docs

[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.0 (Macintosh)', 'creationdate': '2022-02-14T21:08:55-06:00', 'author': 'T&C Composition', 'gts_pdfxconformance': 'PDF/X-1a:2001', 'gts_pdfxversion': 'PDF/X-1:2001', 'keywords': '25-4123-1_2', 'moddate': '2025-04-09T12:45:58-07:00', 'subject': 'Annual Report', 'title': 'Amazon.com, Inc.', 'trapped': '/False', 'source': '/Users/reejungkim/Documents/Git/HuggingFace/Amazon-2024-Annual-Report.pdf', 'total_pages': 91, 'page': 0, 'page_label': '1'}, page_content='ANNUAL REPORT\n2 0 2 4'),
 Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.0 (Macintosh)', 'creationdate': '2022-02-14T21:08:55-06:00', 'author': 'T&C Composition', 'gts_pdfxconformance': 'PDF/X-1a:2001', 'gts_pdfxversion': 'PDF/X-1:2001', 'keywords': '25-4123-1_2', 'moddate': '2025-04-09T12:45:58-07:00', 'subject': 'Annual Report', 'title': 'Amazon.com, Inc.', 'trapped': '/False', 'source': '/Users/reejungki

In [54]:
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
texts = text_splitter.split_documents(docs)

# 임베딩

In [42]:
from langchain_huggingface import HuggingFaceEmbeddings

model_huggingface = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask'
                                          , model_kwargs = {'device':'cpu'}
                                          , encode_kwargs = {'normalize_embeddings':True})

# 벡터 저장소

In [43]:
from langchain_community.vectorstores import FAISS

vectorstore = FAISS.from_documents(texts, 
                                   model_huggingface)

In [44]:
results = vectorstore.similarity_search("how much was the revenue in 2024?, k=2")

In [45]:
for r in results:
    print(r.page_content)

2023, and 2024. Expected future amortization expense of acquired finite-lived intangible assets as of December 31, 2024 is as 
follows (in millions):
 
Year Ended December 31,
2025 $ 994 
2026  918 
2027  796 
2028  632 
2029  615 
Thereafter  3,484 
$ 7,439 
56
General and Administrative
The decrease in general and administrative costs in 2024, compared to the prior year, is primarily due to a decrease in 
payroll and related expenses.
Other Operating Expense (Income), Net
Other operating expense (income), net was $767 million and $763 million during 2023 and 2024, and was primarily 
related to asset impairments and the amortization of intangible assets.
Operating Income (Loss) 
Operating income (loss) by segment is as follows (in millions):
Year Ended December 31,
2023 2024
Operating Income (Loss)
North America $ 14,877 $ 24,967 
International  (2,656)  3,792 
AWS  24,631  39,834 
Consolidated $ 36,852 $ 68,593 
Operating income was $36.9 billion and $68.6 billion for 2023 and 2024. 

# 문서 저장소 ID 확인
vectorstore.index_to_docstore_id


# 저장된 문서의 ID: Document 확인
vectorstore.docstore._dict

In [55]:
# Retriever: 문서에 포함되어 있는 정보를 검색하고 생성


# Sparse Retriever 
# TF-IDF 또는 BM25와 같은 전통적 정보검색 기법
# 키워드 선택이 검색 품질을 좌우 (간단하고 명확한 키워드 검색에 유리)
from langchain_community.retrievers import BM25Retriever
bm25_retriever = BM25Retriever.from_documents(texts)
bm25_retriever.k = 4 # set number of documents to retrieve 

# Dense Retriever 
# vector간의 거리 (ex.cosine similarity)
# 키워드가 일치 하지 않아도 의미적으로 관련(뉘앙스와 문맥 일치) 문서 검색
# 복잡한 쿼리 유리
faiss_retriever = vectorstore.as_retriever()


In [56]:
# 검색기에 쿼리를 날려 검색된 chunk 결과를 확인합니다.
bm25_retriever.invoke("what was the revenue of Amazon in 2024")


[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.0 (Macintosh)', 'creationdate': '2022-02-14T21:08:55-06:00', 'author': 'T&C Composition', 'gts_pdfxconformance': 'PDF/X-1a:2001', 'gts_pdfxversion': 'PDF/X-1:2001', 'keywords': '25-4123-1_2', 'moddate': '2025-04-09T12:45:58-07:00', 'subject': 'Annual Report', 'title': 'Amazon.com, Inc.', 'trapped': '/False', 'source': '/Users/reejungkim/Documents/Git/HuggingFace/Amazon-2024-Annual-Report.pdf', 'total_pages': 91, 'page': 61, 'page_label': '62'}, page_content='changes in estimates were not material to our consolidated results of operations for the years ended December 31, 2023 and \n2024. As of December 31, 2023 and 2024, our total self-insurance liabilities were $6.3 billion and $8.5 billion and are included \nin “Accrued expenses and other” on our consolidated balance sheets.  \nUnearned Revenue\nUnearned revenue is recorded when payments are received or due in advance of performing our service obli

In [57]:
from langchain_core.prompts import PromptTemplate


    
from langchain.prompts import PromptTemplate

template = """<|system|>
You are an assistant for question-answering tasks. 
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If you don't know the answer, just say that you don't know. 
Answer in Korean. <|end|>

<|user|>
{question}<|end|>
<|assistant|>"""

prompt = PromptTemplate.from_template(template)



In [64]:
# Make sure these are all properly defined:
print("bm25_retriever:", type(bm25_retriever))
print("prompt:", type(prompt))
print("llm:", type(llm))

bm25_retriever: <class 'langchain_community.retrievers.bm25.BM25Retriever'>
prompt: <class 'langchain_core.prompts.prompt.PromptTemplate'>
llm: <class 'langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint'>


In [65]:
import os
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import HuggingFaceEndpoint
from dotenv import load_dotenv

os.chdir('/Users/reejungkim/Documents/Git/working-in-progress')
load_dotenv()

# 사용할 모델의 저장소 ID를 설정합니다.
repo_id = "microsoft/Phi-3-mini-4k-instruct"

llm = HuggingFaceEndpoint(
    repo_id=repo_id,  # 모델 저장소 ID를 지정합니다.
    max_new_tokens=256,  # 생성할 최대 토큰 길이를 설정합니다.
    temperature=0.1,
    huggingfacehub_api_token=os.environ["huggingface_read"],  # 허깅페이스 토큰
)

# LLMChain을 초기화하고 프롬프트와 언어 모델을 전달합니다.
# 체인(Chain) 생성
# 단계 8: 체인(Chain) 생성
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Format retrieved documents into a single string"""
    return "\n\n".join([f"Document {i+1}: {doc.page_content}" for i, doc in enumerate(docs)])



chain = (
    {
        "context": lambda x: format_docs(bm25_retriever.invoke(x["question"])),
        "question": lambda x: x["question"]
    }
    | prompt
    | llm
    | StrOutputParser()
)


# 질문을 전달하여 LLMChain을 실행하고 결과를 출력합니다.

# Simpler approach that's easier to debug
def retrieve_and_format(question):
    docs = bm25_retriever.invoke(question)
    return format_docs(docs)

# Test this function first
test_context = retrieve_and_format("what is the revenue of Amazon in 2024")
print("Context:", test_context[:500])

Context: Document 1: changes in estimates were not material to our consolidated results of operations for the years ended December 31, 2023 and 
2024. As of December 31, 2023 and 2024, our total self-insurance liabilities were $6.3 billion and $8.5 billion and are included 
in “Accrued expenses and other” on our consolidated balance sheets.  
Unearned Revenue
Unearned revenue is recorded when payments are received or due in advance of performing our service obligations and is 
recognized over the service


In [31]:
print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} template="You are an assistant for question-answering tasks. \nUsing the information contained in the context,\ngive a comprehensive answer to the question.\nRespond only to the question asked, response should be concise and relevant to the question.\nProvide the number of the source document when relevant.\nIf you don't know the answer, just say that you don't know. \nAnswer in Korean.\n\n#Question: \n{question} \n#Context: \n{context} \n\n#Answer:"


### Chroma 기반 문서 벡터화 

from langchain_chroma import Chroma  
db = Chroma.from_documents(texts, model_huggingface )

retriever = Chroma.as_retriever()

question = 'who is Brad D. Smith'
answer = retriever.invoke(question) 


In [None]:
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)