In [1]:
import os
from pathlib import Path

from langchain_community.document_loaders import PyPDFLoader

# Directory that holds the source PDFs. The default is the original author's
# local checkout; set the AMAZON_DOCS_DIR environment variable to run this
# notebook on another machine without editing the code.
DATA_DIR = Path(os.environ.get("AMAZON_DOCS_DIR", "/Users/reejungkim/Documents/Git/HuggingFace"))

# First document: the Amazon 2024 annual report.
file_path = str(DATA_DIR / "Amazon-2024-Annual-Report.pdf")
loader = PyPDFLoader(file_path)

docs = loader.load()
# NOTE(review): `pages` is not referenced by any other cell visible in this
# notebook, and this call presumably parses the PDF a second time. Kept so
# that any cell relying on it still works; confirm and drop if unused.
pages = loader.load_and_split()

# Uncomment to inspect the metadata and content of every split page:
# for page in pages:
#     print(page.metadata, page)

# Add a second document: the Amazon 2025 proxy statement.
loader2 = PyPDFLoader(str(DATA_DIR / "Amazon-2025-Proxy-Statement.pdf"))
docs2 = loader2.load()

# Append in place so `docs` holds the documents of both PDFs.
docs.extend(docs2)

In [2]:
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter

In [3]:
# Splitter configuration used for every document in this notebook:
# chunk_size=1000 with a chunk_overlap of 20.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=20,
)

In [4]:
# Split the loaded documents into chunks with the splitter configured above;
# `texts` is what the vector stores below are built from.
texts = text_splitter.split_documents(docs)

In [5]:
# Number of loaded documents (before chunking).
# NOTE(review): the recorded output (91) equals `total_pages` of the annual
# report alone, although `docs` is extended with the proxy statement in the
# first cell. The output looks stale; re-run after a kernel restart to confirm.
len(docs)

91

In [6]:
# Peek at the first chunk to check its metadata and page_content.
texts[0]

Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.0 (Macintosh)', 'creationdate': '2022-02-14T21:08:55-06:00', 'author': 'T&C Composition', 'gts_pdfxconformance': 'PDF/X-1a:2001', 'gts_pdfxversion': 'PDF/X-1:2001', 'keywords': '25-4123-1_2', 'moddate': '2025-04-09T12:45:58-07:00', 'subject': 'Annual Report', 'title': 'Amazon.com, Inc.', 'trapped': '/False', 'source': '/Users/reejungkim/Documents/Git/HuggingFace/Amazon-2024-Annual-Report.pdf', 'total_pages': 91, 'page': 0, 'page_label': '1'}, page_content='ANNUAL REPORT\n2 0 2 4')

In [8]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model shared by the FAISS and Chroma stores below. It runs on the
# CPU and is asked to normalize the embeddings it produces.
# NOTE(review): the model name suggests a Korean-focused sentence model while
# the indexed PDFs are English; confirm this is the intended model.
model_huggingface = HuggingFaceEmbeddings(
    model_name='jhgan/ko-sroberta-multitask',
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True},
)

In [11]:
from langchain_community.vectorstores import FAISS

In [12]:
# Build the FAISS vector store from the chunked documents, embedding each
# chunk with the HuggingFace model defined above.
vectorstore = FAISS.from_documents(documents=texts, embedding=model_huggingface)

In [18]:
# Retrieve the two chunks most similar to the question.
# `k` must be passed as a keyword argument: it was previously written inside
# the query string, so it never limited the number of results and the literal
# text ", k=2" was embedded as part of the query.
results = vectorstore.similarity_search("how much was the revenue in 2024?", k=2)

In [20]:
# Print the page_content of every retrieved document.
for r in results:
    print(r.page_content)

2023, and 2024. Expected future amortization expense of acquired finite-lived intangible assets as of December 31, 2024 is as 
follows (in millions):
 
Year Ended December 31,
2025 $ 994 
2026  918 
2027  796 
2028  632 
2029  615 
Thereafter  3,484 
$ 7,439 
56
General and Administrative
The decrease in general and administrative costs in 2024, compared to the prior year, is primarily due to a decrease in 
payroll and related expenses.
Other Operating Expense (Income), Net
Other operating expense (income), net was $767 million and $763 million during 2023 and 2024, and was primarily 
related to asset impairments and the amortization of intangible assets.
Operating Income (Loss) 
Operating income (loss) by segment is as follows (in millions):
Year Ended December 31,
2023 2024
Operating Income (Loss)
North America $ 14,877 $ 24,967 
International  (2,656)  3,792 
AWS  24,631  39,834 
Consolidated $ 36,852 $ 68,593 
Operating income was $36.9 billion and $68.6 billion for 2023 and 2024. 

# Check the document store IDs held by the vector store.
# NOTE(review): this displays the whole mapping; consider showing only a
# small slice before sharing the notebook.
vectorstore.index_to_docstore_id


# Check the stored documents as an ID: Document mapping.
# NOTE(review): `_dict` is an underscore-prefixed (private by convention)
# attribute, and this displays every stored document; confirm this is only
# meant for ad-hoc inspection.
vectorstore.docstore._dict

In [17]:
# Retriever: searches for (and surfaces) information contained in the documents.


# Sparse retriever
# Traditional information-retrieval techniques such as TF-IDF or BM25.
# Keyword choice drives search quality (works well for simple, explicit keyword queries).
# Requires the `rank_bm25` package (`pip install rank_bm25`); without it the
# construction below raises ImportError.
from langchain_community.retrievers import BM25Retriever
# Index the split chunks (`texts`), the same units the FAISS store was built
# from, rather than the unsplit `docs`, so that both retrievers return
# comparable chunk-sized passages.
bm25_retriever = BM25Retriever.from_documents(texts)

# Dense retriever
# Based on the distance between vectors (e.g. cosine similarity).
# Finds semantically related documents (matching nuance and context) even when keywords do not match.
# Works well for complex queries.
faiss_retriever = vectorstore.as_retriever()


ImportError: Could not import rank_bm25, please install with `pip install rank_bm25`.

In [None]:
# Send a query to the retriever and inspect the retrieved chunk results.
bm25_retriever.invoke("what was the revenue of Amazon in 2024")


### Chroma 기반 문서 벡터화 

In [9]:
from langchain_chroma import Chroma  
# Build a Chroma vector store from the same chunks and embedding model that
# were used for the FAISS store.
db = Chroma.from_documents(
    documents=texts,
    embedding=model_huggingface,
)

KeyboardInterrupt: 

In [None]:
# Create the retriever from the Chroma store instance (`db`). `as_retriever`
# is an instance method, so calling it on the `Chroma` class itself fails
# because no store is bound to the call.
retriever = db.as_retriever()

# Ask a question and collect the documents the retriever returns for it.
question = 'who is Brad D. Smith'
answer = retriever.invoke(question)
