In [5]:
import os
import pdfplumber
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from langchain.vectorstores import FAISS
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
#Function to Extract Tables as Text
#Extract tables from a pdf_page object and convert them into a readable text format
def extract_tables_as_text(pdf_page):
    tables = pdf_page.extract_tables()
    table_texts = []
    for table in tables:
        #Each table is a list of rows, where each row is a list of cells.
        row_strings = []
        for row in table:
            #Handle None values and join cells with commas
            cleaned_row = [cell if cell is not None else "" for cell in row]
            row_str = ", ".join(cleaned_row)
            row_strings.append(row_str)
        #Combine all rows into a single chunk of text.
        combined_table_text = "\n".join(row_strings)
        table_texts.append(combined_table_text)
    return "\n\n".join(table_texts)

pdf_path = "C:/Users/Admin/Desktop/ammazon_annual.pdf"
documents = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text() or ""
        table_text = extract_tables_as_text(page)
        if table_text.strip():
            #Add some delimiter or header to clearly separate table data
            text += "\n\n[Table Data]\n" + table_text

        if text.strip():
            documents.append(Document(page_content=text))

In [30]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=50
)
docs = text_splitter.split_documents(documents)
logger.info(f"Split documents into {len(docs)} chunks.")

INFO:__main__:Split documents into 952 chunks.


In [31]:
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
logger.info(f"Loaded HuggingFace Embeddings model: {embedding_model_name}")

vectorstore = FAISS.from_documents(docs, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:__main__:Loaded HuggingFace Embeddings model: sentence-transformers/all-MiniLM-L6-v2


In [35]:
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

#truncation to avoid length errors
llm_pipeline = pipeline(
    "text2text-generation", 
    model=model, 
    tokenizer=tokenizer,
    max_length=512,
    truncation=True,
    max_new_tokens=200,
    min_length=20,
    do_sample=True,
    # temperature=0.7,
    # top_p=0.9,
    # top_k=200
)
llm = HuggingFacePipeline(pipeline=llm_pipeline)



prompt_template = """You are a helpful assistant. Using the following context, write a complete, well-structured sentence (or short paragraph) that answers the question in detail. If there is relevant information in the context, incorporate it into your answer. Be direct, accurate, and use a friendly tone.

Context:
{context}

Question: {question}
Please provide a detailed answer:
"""

prompt = PromptTemplate(
    template=prompt_template, 
    input_variables=["context", "question"]
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt}
)

Device set to use cpu


In [38]:
question = "What position in the company does Jeffrey P. Bezos takes and since what time?"
response = qa_chain.invoke({"query": question})

#dict 'query' and 'result'
print("Answer:", response["result"])

Both `max_new_tokens` (=200) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer: President, Chief Executive Officer, and Chairman of the Board. Mr. Bezos has been Chairman of the Board of Amazon.com since founding it in 1994 and Chief Executive Officer since May 1996.


In [36]:
# question = input("Enter your question")
question = "What position in the company does Jeffrey P. Bezos hold and since when?"

#retrieve documents to show retrieved chunks
retrieved_docs = retriever.get_relevant_documents(question)
retrieved_chunks = [doc.page_content for doc in retrieved_docs]

response = qa_chain.invoke({"query": question})
answer = response["result"]

print("Question:", question)
print("Answer:", answer)
print("\nRetrieved Chunks:")
for i, chunk in enumerate(retrieved_chunks, start=1):
    print(f"Chunk {i}:\n{chunk}\n{'-' * 40}")

Both `max_new_tokens` (=200) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Question: What position in the company does Jeffrey P. Bezos hold and since when?
Answer: President, Chief Executive Officer, and Chair of the Board. Welch has been Chairman of the Board of Amazon.com since 1994 and Senior Executive Officer from May 1996.

Retrieved Chunks:
Chunk 1:
Board of Directors
Name Age Position
Jeffrey P. Bezos 56 President, Chief Executive Officer, and Chairman of the Board
Rosalind G. Brewer 57 Group President, Americas and Chief Operating Officer, Starbucks Corporation
Jamie S. Gorelick 69 Partner, Wilmer Cutler Pickering Hale and Dorr LLP
Daniel P. Huttenlocher 61 Dean, MIT Schwarzman College of Computing
----------------------------------------
Chunk 2:
and Chairman of the Board
Pursuant to the requirements of the Securities Exchange Act of 1934, this Report has been signed below by the following persons on behalf of the registrant
and in the capacities indicated as of January 30, 2020.
Signature Title
/s/ Jeffrey P. Bezos
Jeffrey P. Bezos Chairman of the 