In [10]:
import os
import getpass
import logging
import pdfplumber
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS

# Configure the root logger at INFO level so the progress messages logged by
# the cells below (and INFO records from libraries) show up in cell output.
logging.basicConfig(level=logging.INFO)
# Logger used throughout this notebook; it is named after the current module.
logger = logging.getLogger(__name__)

In [2]:
#Function to Extract Tables as Text
#Extract tables from a pdf_page object and convert them into a readable text format
def extract_tables_as_text(pdf_page):
    """Render every table found on a PDF page as plain text.

    Each table row becomes one line with its cells joined by ", ". Rows of
    the same table are separated by a newline and consecutive tables by a
    blank line.

    Args:
        pdf_page: Object exposing ``extract_tables()``, which returns a list
            of tables; each table is a list of rows and each row is a list
            of cells (a cell may be ``None``).

    Returns:
        str: The combined text of all tables, or "" when there are none.
    """
    # Treat a None result the same as "no tables" instead of crashing the loop.
    tables = pdf_page.extract_tables() or []
    table_texts = []
    for table in tables:
        row_strings = [
            # None cells become empty strings; str() keeps join() from raising
            # TypeError if a cell is not already a string.
            ", ".join("" if cell is None else str(cell) for cell in row)
            for row in table
        ]
        table_texts.append("\n".join(row_strings))
    return "\n\n".join(table_texts)


# Load the report page by page and build one Document per non-empty page.
# NOTE(review): "ammazon" looks like a typo of "amazon" - confirm the file name on disk.
pdf_path = "./ammazon_annual.pdf"
documents = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        # extract_text() may return None; fall back to an empty string.
        text = page.extract_text() or ""
        table_text = extract_tables_as_text(page)

        if table_text.strip():
            # Header to separate table data from the page's main text.
            text += "\n\n[Table Data]\n" + table_text

        if text.strip():
            # Record where the text came from so retrieved chunks can be
            # traced back to a page of the source PDF.
            documents.append(
                Document(
                    page_content=text,
                    metadata={"source": pdf_path, "page": page.page_number},
                )
            )

In [3]:
# Chunking trade-off:
#   - smaller chunks give finer-grained retrieval but can lose context;
#   - larger chunks keep more context but risk overloading the prompt.
CHUNK_SIZE = 400
CHUNK_OVERLAP = 50

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
docs = text_splitter.split_documents(documents)
logger.info(f"Split documents into {len(docs)} chunks.")

INFO:__main__:Split documents into 952 chunks.


In [26]:
# Embedding model and vector store setup.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
logger.info(f"Loaded HuggingFace Embeddings model: {embedding_model_name}")

# Build a FAISS index over the chunks and expose it as a retriever that is
# configured with k=6.
search_options = {"k": 6}
vectorstore = FAISS.from_documents(docs, embeddings)
retriever = vectorstore.as_retriever(search_kwargs=search_options)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:__main__:Loaded HuggingFace Embeddings model: sentence-transformers/all-MiniLM-L6-v2


In [18]:
# Reuse a key that is already exported in the environment so the notebook can
# be re-run non-interactively; only prompt when no key is available.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass.getpass("Enter your OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Chat model used to generate answers.
# NOTE(review): temperature=0.7 makes answers vary between runs; consider a
# lower value if reproducible answers are wanted.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.7,
    max_tokens=500,
    openai_api_key=OPENAI_API_KEY
)

# Prompt with two placeholders: {context} and {question}.
prompt_template = """You are a helpful assistant. Using the provided context, write a complete, well-structured, and detailed answer to the question. Incorporate any relevant information from the context into your answer. Be direct, accurate, and friendly in tone. Provide dates, positions, and context as necessary.

Context:
{context}

Question: {question}

Please provide a detailed and well-structured answer:
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# Retrieval QA chain built with chain_type="stuff", wired to the retriever and
# the custom prompt defined above.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt}
)

In [25]:
# Smoke test: confirm the API key and model work before running the QA chain.
# invoke() replaces the deprecated predict(); the reply text is in .content.
test_response = llm.invoke("Hello, how are you?").content
print("Test Response:", test_response)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Test Response: Hello! I'm just a computer program, so I don't have feelings, but I'm here to help you. How can I assist you today?


In [23]:
# Ask a single question through the retrieval QA chain.
question = "What position in the company does Jeffrey P. Bezos takes and since what time?"
qa_inputs = {"query": question}
response = qa_chain.invoke(qa_inputs)

# The generated answer is stored under the 'result' key of the response dict.
print("Answer:", response["result"])

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Answer: Jeffrey P. Bezos holds the position of President, Chief Executive Officer, and Chairman of the Board at Amazon.com. He has been the Chairman of the Board since founding the company in 1994 and took on the role of Chief Executive Officer in May 1996. Mr. Bezos also served as President of the Company from its founding until June 1999, and then again from October 2000 to the present. As of January 30, 2020, Jeffrey P. Bezos continues to hold the positions of Chairman of the Board, President, and Chief Executive Officer at Amazon.com.


Print the question and show the retrieved documents

In [22]:
question = "What position in the company does Jeffrey P. Bezos hold and since when?"

# Fetch the matching chunks separately so they can be displayed below.
# invoke() replaces the deprecated get_relevant_documents().
retrieved_docs = retriever.invoke(question)
retrieved_chunks = [doc.page_content for doc in retrieved_docs]

# Invoke the chain to get the final answer.
response = qa_chain.invoke({"query": question})
answer = response["result"]

print("Question:", question)
print("Answer:", answer)
print("\nRetrieved Chunks:")
for i, chunk in enumerate(retrieved_chunks, start=1):
    print(f"Chunk {i}:\n{chunk}\n{'-' * 40}")

Question: What position in the company does Jeffrey P. Bezos hold and since when?
Answer: Jeffrey P. Bezos holds the position of President, Chief Executive Officer, and Chairman of the Board at Amazon.com. He has been Chairman of the Board since founding the company in 1994 and has served as Chief Executive Officer since May 1996. Additionally, Mr. Bezos has held the title of President from the company's inception until June 1999 and resumed the role in October 2000, holding it to the present day. Therefore, Jeffrey P. Bezos has been serving in his current positions at Amazon.com since May 1996.

Retrieved Chunks:
Chunk 1:
Board of Directors
Name Age Position
Jeffrey P. Bezos 56 President, Chief Executive Officer, and Chairman of the Board
Rosalind G. Brewer 57 Group President, Americas and Chief Operating Officer, Starbucks Corporation
Jamie S. Gorelick 69 Partner, Wilmer Cutler Pickering Hale and Dorr LLP
Daniel P. Huttenlocher 61 Dean, MIT Schwarzman College of Computing
-----------

In [24]:
question = "What was Amazon’s net sales in 2019?"

# Fetch the matching chunks separately so they can be displayed below.
# invoke() replaces the deprecated get_relevant_documents().
retrieved_docs = retriever.invoke(question)
retrieved_chunks = [doc.page_content for doc in retrieved_docs]

# Invoke the chain to get the final answer.
response = qa_chain.invoke({"query": question})
answer = response["result"]

print("Question:", question)
print("Answer:", answer)
print("\nRetrieved Chunks:")
for i, chunk in enumerate(retrieved_chunks, start=1):
    print(f"Chunk {i}:\n{chunk}\n{'-' * 40}")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Question: What was Amazon’s net sales in 2019?
Answer: In 2019, Amazon's net sales totaled $280,522 million. This amount represents the consolidated net sales figure for the year ended December 31, 2019, as disclosed in the financial information provided. 

To break down the net sales further, the sales were composed of $170,773 million from North America, $74,723 million from International markets, and $35,026 million from AWS (Amazon Web Services). The North America segment saw a 21% year-over-year growth, while the International segment grew by 13%. 

Overall, Amazon's net sales in 2019 showed a significant increase compared to the previous year, reflecting the company's continued expansion and success across various segments, including e-commerce, cloud services, and digital content subscriptions.

Retrieved Chunks:
Chunk 1:
AWS sales, Amazon Prime membership fees, advertising services, and certain digital content subscriptions. Net sales information is as follows (in millions):
Ye