In [1]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "/Users/reejungkim/Documents/Git/HuggingFace/Amazon-2024-Annual-Report.pdf"
loader = PyPDFLoader(file_path)

docs = loader.load()
pages = loader.load_and_split()

# for page in pages:
#     print(page.metadata, page)

In [2]:
for page in pages[:1]:
    print(page.page_content, page)

ANNUAL REPORT
2 0 2 4 page_content='ANNUAL REPORT
2 0 2 4' metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.0 (Macintosh)', 'creationdate': '2022-02-14T21:08:55-06:00', 'author': 'T&C Composition', 'gts_pdfxconformance': 'PDF/X-1a:2001', 'gts_pdfxversion': 'PDF/X-1:2001', 'keywords': '25-4123-1_2', 'moddate': '2025-04-09T12:45:58-07:00', 'subject': 'Annual Report', 'title': 'Amazon.com, Inc.', 'trapped': '/False', 'source': '/Users/reejungkim/Documents/Git/HuggingFace/Amazon-2024-Annual-Report.pdf', 'total_pages': 91, 'page': 0, 'page_label': '1'}


# 문서 추가
loader2 = PyPDFLoader('/Users/reejungkim/Documents/Git/HuggingFace/Amazon-2025-Proxy-Statement.pdf')
docs2 = loader2.load()

docs.extend(docs2)

# 문서 분할

# Add this before your BM25 retriever creation
print("Debugging docs:")
print(f"Type of docs: {type(docs)}")
print(f"Length of docs: {len(docs) if docs else 0}")

if docs:
    print(f"First doc type: {type(docs[0])}")
    if hasattr(docs[0], 'page_content'):
        print(f"First doc content: {docs[0].page_content[:200]}...")
    else:
        print(f"First doc: {docs[0]}")


In [3]:
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
texts = text_splitter.split_documents(docs)

# 임베딩

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings

model_huggingface = HuggingFaceEmbeddings(model_name='jhgan/ko-sroberta-multitask'
                                          , model_kwargs = {'device':'cpu'}
                                          , encode_kwargs = {'normalize_embeddings':True})

# 벡터 저장소

In [5]:
from langchain_community.vectorstores import FAISS

vectorstore = FAISS.from_documents(texts, 
                                   model_huggingface)

In [None]:
results = vectorstore.similarity_search("How much of the cash used in Operating activities in the year of 2024?, k=5")
#answer: $ 115,877

In [16]:
for r in results:
    print(r.page_content)

generally corresponds to our net sales. The increase in operating cash flow in 2024, compared to the prior year, was due to an 
increase in net income (loss), excluding non-cash expenses, and changes in working capital. Working capital at any specific 
point in time is subject to many variables, including variability in demand, inventory management and category expansion, the 
timing of cash receipts and payments, customer and vendor payment terms, and fluctuations in foreign exchange rates.
Cash provided by (used in) investing activities corresponds with cash capital expenditures, including leasehold 
improvements, incentives received from property and equipment vendors, proceeds from asset sales, cash outlays for 
22
with first quarter 2024. This guidance anticipates an unusually large, unfavorable impact of approximately $2.1 
billion, or 150 basis points, from foreign exchange rates. Also, as a reminder, in first quarter 2024 the impact from 
Leap Year added approximately $1.5 bill

# 문서 저장소 ID 확인
vectorstore.index_to_docstore_id


# 저장된 문서의 ID: Document 확인
vectorstore.docstore._dict

In [8]:
# Retriever: 문서에 포함되어 있는 정보를 검색하고 생성


# Sparse Retriever 
# TF-IDF 또는 BM25와 같은 전통적 정보검색 기법
# 키워드 선택이 검색 품질을 좌우 (간단하고 명확한 키워드 검색에 유리)
from langchain_community.retrievers import BM25Retriever
bm25_retriever = BM25Retriever.from_documents(texts)
bm25_retriever.k = 4 # set number of documents to retrieve 

# Dense Retriever 
# vector간의 거리 (ex.cosine similarity)
# 키워드가 일치 하지 않아도 의미적으로 관련(뉘앙스와 문맥 일치) 문서 검색
# 복잡한 쿼리 유리
faiss_retriever = vectorstore.as_retriever()


In [19]:
# 검색기에 쿼리를 날려 검색된 chunk 결과를 확인합니다.
bm25_retriever.invoke("How much of the cash used in Operating activities in the year of 2024?")


[Document(metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 15.0 (Macintosh)', 'creationdate': '2022-02-14T21:08:55-06:00', 'author': 'T&C Composition', 'gts_pdfxconformance': 'PDF/X-1a:2001', 'gts_pdfxversion': 'PDF/X-1:2001', 'keywords': '25-4123-1_2', 'moddate': '2025-04-09T12:45:58-07:00', 'subject': 'Annual Report', 'title': 'Amazon.com, Inc.', 'trapped': '/False', 'source': '/Users/reejungkim/Documents/Git/HuggingFace/Amazon-2024-Annual-Report.pdf', 'total_pages': 91, 'page': 33, 'page_label': '34'}, page_content='income tax liabilities against us. Developments in an audit, investigation, or other tax controversy could have a material effect \non our operating results or cash flows in the period or periods for which that development occurs, as well as for prior and \nsubsequent periods. We regularly assess the likelihood of an adverse outcome resulting from these proceedings to determine the \nadequacy of our tax accruals. Although we believe our tax esti

In [20]:
from langchain_core.prompts import PromptTemplate


    
from langchain.prompts import PromptTemplate

template = """<|system|>
You are an assistant for question-answering tasks. 
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If you don't know the answer, just say that you don't know. 
Answer in Korean. <|end|>

<|user|>
{question}<|end|>
<|assistant|>"""

prompt = PromptTemplate.from_template(template)



In [21]:
# Make sure these are all properly defined:
print("bm25_retriever:", type(bm25_retriever))
print("prompt:", type(prompt))
print("llm:", type(llm))

bm25_retriever: <class 'langchain_community.retrievers.bm25.BM25Retriever'>
prompt: <class 'langchain_core.prompts.prompt.PromptTemplate'>
llm: <class 'langchain_huggingface.llms.huggingface_endpoint.HuggingFaceEndpoint'>


In [22]:
import os
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import HuggingFaceEndpoint
from dotenv import load_dotenv

os.chdir('/Users/reejungkim/Documents/Git/working-in-progress')
load_dotenv()

# 사용할 모델의 저장소 ID를 설정합니다.
repo_id = "microsoft/Phi-3-mini-4k-instruct"

llm = HuggingFaceEndpoint(
    repo_id=repo_id,  # 모델 저장소 ID를 지정합니다.
    max_new_tokens=256,  # 생성할 최대 토큰 길이를 설정합니다.
    temperature=0.1,
    huggingfacehub_api_token=os.environ["huggingface_read"],  # 허깅페이스 토큰
)

# LLMChain을 초기화하고 프롬프트와 언어 모델을 전달합니다.
# 체인(Chain) 생성
# 단계 8: 체인(Chain) 생성
from langchain_core.runnables import RunnablePassthrough

def format_docs(docs):
    """Format retrieved documents into a single string"""
    return "\n\n".join([f"Document {i+1}: {doc.page_content}" for i, doc in enumerate(docs)])



chain = (
    {
        "context": lambda x: format_docs(bm25_retriever.invoke(x["question"])),
        "question": lambda x: x["question"]
    }
    | prompt
    | llm
    | StrOutputParser()
)


# 질문을 전달하여 LLMChain을 실행하고 결과를 출력합니다.

# Simpler approach that's easier to debug
def retrieve_and_format(question):
    docs = bm25_retriever.invoke(question)
    return format_docs(docs)

# Test this function first
test_context = retrieve_and_format("what is the revenue of Amazon in 2024")
print("Context:", test_context[:500])

Context: Document 1: changes in estimates were not material to our consolidated results of operations for the years ended December 31, 2023 and 
2024. As of December 31, 2023 and 2024, our total self-insurance liabilities were $6.3 billion and $8.5 billion and are included 
in “Accrued expenses and other” on our consolidated balance sheets.  
Unearned Revenue
Unearned revenue is recorded when payments are received or due in advance of performing our service obligations and is 
recognized over the service


In [13]:
print(prompt)

input_variables=['question'] input_types={} partial_variables={} template="<|system|>\nYou are an assistant for question-answering tasks. \nUsing the information contained in the context,\ngive a comprehensive answer to the question.\nRespond only to the question asked, response should be concise and relevant to the question.\nProvide the number of the source document when relevant.\nIf you don't know the answer, just say that you don't know. \nAnswer in Korean. <|end|>\n\n<|user|>\n{question}<|end|>\n<|assistant|>"


### Chroma 기반 문서 벡터화 

from langchain_chroma import Chroma  
db = Chroma.from_documents(texts, model_huggingface )

retriever = Chroma.as_retriever()

question = 'who is Brad D. Smith'
answer = retriever.invoke(question) 


In [14]:
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
