In [4]:
import os
import pdfplumber
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
from langchain.vectorstores import FAISS

import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
pdf_path = "C:/Users/Admin/Desktop/ammazon_annual.pdf"
documents = []
with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        text = page.extract_text() or ""
        # If tables exist and you want to incorporate them:
        tables = page.extract_tables()
        for table in tables:
            table_text = "\n".join([", ".join(row) for row in table])
            text += "\n" + table_text
        if text.strip():
            documents.append(Document(page_content=text))

In [5]:
import spacy
import subprocess
import sys

try:
    nlp = spacy.load('en_core_web_sm')
except OSError:
    subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
    nlp = spacy.load('en_core_web_sm')

def preprocess_text(text: str) -> str:          #tokenize, convert to lovercase, remove whitespace
    doc = nlp(text)
    sentences = [sent.text.strip().lower() for sent in doc.sents]
    return ' '.join(sentences)

#preprocess all documents
for doc in documents:
    original_text = doc.page_content
    processed_text = preprocess_text(original_text)
    doc.page_content = processed_text

logger.info("Preprocessing of text completed.")

INFO:__main__:Preprocessing of text completed.


In [26]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=30
)
docs = text_splitter.split_documents(documents)
logger.info(f"Split documents into {len(docs)} chunks.")

INFO:__main__:Split documents into 1535 chunks.


In [27]:
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
logger.info(f"Loaded HuggingFace Embeddings model: {embedding_model_name}")

vectorstore = FAISS.from_documents(docs, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: cpu
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: sentence-transformers/all-MiniLM-L6-v2
INFO:__main__:Loaded HuggingFace Embeddings model: sentence-transformers/all-MiniLM-L6-v2


In [28]:
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

#truncation to avoid length errors
llm_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    truncation=True,
    max_new_tokens=100,
    min_length=20,
    do_sample=True,
    # temperature=0.7,
    # top_p=0.9,
    # top_k=200
)
llm = HuggingFacePipeline(pipeline=llm_pipeline)
logger.info(f"Initialized HuggingFace pipeline with model: {model_name}")

Device set to use cpu
INFO:__main__:Initialized HuggingFace pipeline with model: google/flan-t5-base


In [29]:
#prompt and QA Chain
prompt_template = """You are a helpful assistant. Using the following context, write a complete, well-structured sentence (or short paragraph) that answers the question in detail. If there is relevant information in the context, incorporate it into your answer. Be direct, accurate, and use a formal tone.

Context:
{context}

Question: {question}
Please provide a detailed answer:
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt}
)

In [30]:
question = "What position in the company does the Jeffrey P. Bezos takes and since what time?"
response = qa_chain.invoke({"query": question})

print("Answer:", response["result"])  #dictionary

Both `max_new_tokens` (=100) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Answer: Jeff Roberts Bezos is the President and Chief Executive Officer of Amazon. Amazon.com was founded in 1994 as a company.


In [31]:
# question = input("Enter your question")
question = "What position in the company does Jeffrey P. Bezos hold and since when?"

retrieved_docs = retriever.get_relevant_documents(question)
retrieved_chunks = [doc.page_content for doc in retrieved_docs]

response = qa_chain.invoke({"query": question})

answer = response["result"]  #dictionary query and result

print("Question:", question)
print("Answer:", answer)
print("\nRetrieved Chunks:")
for i, chunk in enumerate(retrieved_chunks, start=1):
    print(f"Chunk {i}:\n{chunk}\n{'-' * 40}")

Both `max_new_tokens` (=100) and `max_length`(=512) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Question: What position in the company does Jeffrey P. Bezos hold and since when?
Answer: He is currently chairman of the board of Amazon. He is currently chairman of the board of amazon.com since may 1996.

Retrieved Chunks:
Chunk 1:
and secretary from september 2012 to may 2014, and as vice president and associate general counsel for litigation and regulatory matters from april 2002
until september 2012. board of directors
name age position
jeffrey p. bezos 56 president, chief executive officer, and chairman of the board
----------------------------------------
Chunk 2:
and in the capacities indicated as of january 30, 2020. signature title
/s/ jeffrey p. bezos
jeffrey p. bezos chairman of the board, president, and chief executive officer (principal executive
officer)
/s/ brian t. olsavsky
----------------------------------------
Chunk 3:
request. /s/ jeffrey p. bezos
jeffrey p. bezos
chairman and chief executive officer
(principal executive officer) date: january 30, 2020
----------