In [2]:
import glob
import os
import re
import dotenv
from langchain.document_loaders import UnstructuredPDFLoader
from unstructured.cleaners.core import clean_extra_whitespace
from langchain.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field
from typing import List
from langchain.retrievers.multi_query import MultiQueryRetriever
# from langchain.chat_models import ChatOpenAI
from langchain_community.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables import RunnablePassthrough
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser

In [3]:
# Get a list of all .pdf files in the current directory
pdf_files = glob.glob('*.pdf')

pdf_documents = [file for file in pdf_files]
pdf_documents

['2022-annual-report-bofa.pdf', '2022-annual-report-wf.pdf']

### 1. Indexing : Load & Split

We need to first load the contents from the PDF file. We will use the [DocumentLoaders](https://python.langchain.com/docs/modules/data_connection/document_loaders/), which are objects that load in data from a source and return a list of Documents. A Document is an object with some page_content (str) and metadata (dict). <br>
The UnstructuredPDFLoader is used in the usecase.

In [4]:
def remove_one_characters(text):
    if re.match('^.$', text):
        return ''
    else:
        return text

def remove_orientation_text(text):
    # There are texts in the pdf that is written vertically which makes the text to appear as
    # character space character & so on. This function removes those texts.
    if re.match('^(.\s)+.$', text):
        return ''
    else:
        return text

def remove_page_number_details(text):
    pattern1 = r'\b\d+\s*\|?\s*bank of america'
    if re.search(pattern1, text, re.IGNORECASE):
        return ''
    
    pattern2 = r'bank of america\s*2022\s*\|?\s*\d+'
    if re.search(pattern2, text, re.IGNORECASE):
        return ''
    
    return text 

In [16]:
# While using hi_res_model_name="detectron2_onnx" it will give error - "Unable to get page count. Is poppler installed and in PATH?"
# perform steps mentioned in [pdf2image](https://github.com/Belval/pdf2image?tab=readme-ov-file#windows) to install poppler
loaders = [UnstructuredPDFLoader(os.path.join(os.getcwd(),file), mode = 'elements', \
                                 strategy = 'hi_res', hi_res_model_name="detectron2_onnx", \
                                post_processors = [remove_one_characters, remove_orientation_text,
                                                    remove_page_number_details, \
                                                   clean_extra_whitespace]) for file in pdf_documents]

In [17]:
# Number of data loaders created is proportional to the number of documents
loaders

[<langchain_community.document_loaders.pdf.UnstructuredPDFLoader at 0x1f28476e140>,
 <langchain_community.document_loaders.pdf.UnstructuredPDFLoader at 0x1f28476dba0>]

In [18]:
# Asked to install pdf2image, pdfminer.six (do not install pdfminer as it is not actively maintained instead
# use pdfminer.six), opencv-python, unstructured_inference & many more
# Instead of installing individual dependent libraries for UnstructuredPDF, install unstructured using
# pip install unstructured[pdf] 
doc_0 = loaders[0].load_and_split()

In [20]:
# To avoid printing multiple text on GitHub
for doc in doc_0:
    if False:
        print(doc.page_content)

In [9]:
doc_1 = loaders[1].load_and_split()

In [11]:
# To avoid printing multiple text on GitHub
for doc1 in doc_1:
    if False:
        print(doc1.page_content)

In [12]:
# Remove the page numbers along with text "Wells Fargo & Company" in the footer.
previous_text = ''
for i, doc1 in enumerate(doc_1):
    if re.match('^\d+$', doc1.page_content):
        previous_text = doc1.page_content
        continue
    
    if re.match('^\d+$', previous_text) and re.search('^Wells Fargo & Company$', doc1.page_content, re.IGNORECASE):  
        doc1.page_content = ''           
    else:
        previous_text = ''   

In [13]:
previous_text = ''
for i, doc1 in enumerate(doc_1):
    if re.search('^Wells Fargo & Company$', doc1.page_content, re.IGNORECASE):
        previous_text = doc1.page_content
        continue
    
    if re.search('^Wells Fargo & Company$', previous_text, re.IGNORECASE) and re.match('^\d+$', doc1.page_content):        
        doc1.page_content = '' 
    else:
        previous_text = ''    

In [14]:
doc1 = list(filter(lambda x: x.page_content != '', doc_1))

In [15]:
for doc1 in doc_1:
    # To avoid printing multiple text on GitHub
    if False:
        print(doc1.page_content)

In [16]:
max_length = max(len(content_bofa.page_content) for content_bofa in doc_0)
print(max_length)

4000


In [17]:
max_length = max(len(content_wf.page_content) for content_wf in doc_1)
print(max_length)

3935


### 2. Indexing : Store
Creating embeddings for the splitted data and store the documents and it's corresponsing embeddings in a vector store. At this point we have a query-able vector store containing the chunked contents of our PDF's. Given a user question, we should ideally be able to return the snippets of the text that answer the question.

In [23]:
vectorstore = FAISS.from_documents(documents=doc_0+doc_1, embedding=OpenAIEmbeddings())

In [24]:
vectorstore.save_local('vectorstore')

In [31]:
vectorstore = FAISS.load_local("./vectorstore",OpenAIEmbeddings())

In [32]:
# Similarity Search
query = "Bank of America Chair & CEO?"
docs = vectorstore.similarity_search(query, k=3)
print(docs)

[Document(page_content='Brian T. Moynihan Chair of the Board and Chief Executive Oﬃcer, Bank of America Corporation', metadata={'source': 'c:\\Users\\baira\\Desktop\\Infy_Tech_Pioneer\\InfyTech_Docs_ChatBot\\2022-annual-report-bofa.pdf', 'detection_class_prob': 0.8070501685142517, 'coordinates': {'points': ((107.71715497970581, 278.45842002094656), (107.71715497970581, 401.0500022409043), (392.23714805555574, 401.0500022409043), (392.23714805555574, 278.45842002094656)), 'system': 'PixelSpace', 'layout_width': 1700, 'layout_height': 2200}, 'last_modified': '2024-01-04T14:51:31', 'filetype': 'application/pdf', 'languages': ['eng'], 'page_number': 223, 'parent_id': 'dc1fb50caf875b9f261d70b0be8ba6e3', 'file_directory': 'c:\\Users\\baira\\Desktop\\Infy_Tech_Pioneer\\InfyTech_Docs_ChatBot', 'filename': '2022-annual-report-bofa.pdf', 'category': 'NarrativeText'}), Document(page_content='Bank of America Corporation', metadata={'source': 'c:\\Users\\baira\\Desktop\\Infy_Tech_Pioneer\\InfyTech_

### 3. Retrieval and Generation: Retrieve
Different ways to retrive documents based on query - [Link](https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore)

<b> Using the query as is to retrieve the relevant documents using the search type "similarity_score_threshold"

In [33]:
retriever_similarity = vectorstore.as_retriever(search_type="similarity_score_threshold",
                                                 search_kwargs={"score_threshold": 0.70})
# retrieved_docs = retriever.invoke("How innovation is driven at bank of america?")
retrieved_docs = retriever_similarity.get_relevant_documents("What is the full-form of ROTCE?")

In [30]:
len(retrieved_docs)

2

In [31]:
for doc in retrieved_docs:
    print(doc.page_content)

Return on average tangible common equity (ROTCE)3
1 Return on tangible common equity (ROTCE) is a non-GAAP financial measure. For additional information, including a corresponding reconciliation to GAAP financial measures, see the “Financial Review – Capital Management – Tangible Common Equity” section in this Report.


<b> Using the Multi Query retriver which will create variants of queries based on the prompt to retrieve the relevant documents 

In [41]:
# supply a prompt along with an output parser to split the results into a list of queries.
# Output parser will split the LLM result into a list of queries
class LineList(BaseModel):
    # "lines" is the key (attribute name) of the parsed output
    lines: List[str] = Field(description="Lines of text")


class LineListOutputParser(PydanticOutputParser):
    def __init__(self) -> None:
        super().__init__(pydantic_object=LineList)

    def parse(self, text: str) -> LineList:
        lines = text.strip().split("\n")
        return LineList(lines=lines)


output_parser = LineListOutputParser()

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate four \ 
    different versions of the given user question to retrieve relevant documents from a vector \
    database. By generating multiple perspectives on the user question, your goal is to help \
    the user overcome some of the limitations of the distance-based similarity search. \ 
    Provide these alternative questions separated by newlines. \
    Original question: {question}""",
)
llm = ChatOpenAI(temperature=0, model='gpt-3.5-turbo-16k')


# Chain
llm_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)

In [34]:
# Run
retriever = MultiQueryRetriever(
    include_original=True,
    retriever=vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.70}),
    llm_chain=llm_chain, parser_key="lines",
    verbose=True
)  # "lines" is the key (attribute name) of the parsed output

# Results
unique_docs = retriever.get_relevant_documents(
    query="What is tuition assistance?"
)
len(unique_docs)

8

In [42]:
# To check different queries generated by the llm
llm_chain.invoke("What is tuition assistance?")

{'question': 'What is tuition assistance?',
 'text': LineList(lines=['1. Can you explain the concept of tuition assistance?', '2. How does tuition assistance work?', '3. What are the benefits of tuition assistance programs?', '4. Can you provide an overview of tuition assistance options available?'])}

### 3. Retrieval and Generation: Generate
In this step everything will be put together into a chain. A chain will take question, will retrieve relevant documents, will construct a prompt, will pass to the llm model and will parse the output.

In [36]:
contextualize_ques_system_prompt = """Given a chat history and the latest user question \
which might refer to a context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_ques_system_prompt),
        # Prompt template that assumes variable is already list of messages.
        # We provide the variable name to be used as messages
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)

contextualize_q_chain = contextualize_q_prompt | llm | StrOutputParser()

In [37]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [41]:
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know.

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)


def contextualized_question(input: dict):
    if input.get("chat_history"):        
        return contextualize_q_chain
    else:
        return input["question"]


rag_chain = (
    RunnablePassthrough.assign(
        context=contextualized_question | retriever_similarity
    )
    | qa_prompt
    | llm
)

In [42]:
rag_chain

RunnableAssign(mapper={
  context: RunnableLambda(contextualized_question)
           | VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000022A090A4550>, search_type='similarity_score_threshold', search_kwargs={'score_threshold': 0.7})
})
| ChatPromptTemplate(input_variables=['chat_history', 'context', 'question'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.\n\n{

In [43]:
chat_history = []

question = "How Risk Management is done in Wells Fargo?"
ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=question), ai_msg])

In [44]:
ai_msg.content

"Wells Fargo manages a variety of risks that can significantly affect its financial performance and its ability to meet the expectations of its customers, shareholders, regulators, and other stakeholders. The company's top priority is to strengthen its risk and control infrastructure. Wells Fargo continues to enhance and mature its risk management programs, including operational and compliance risk management programs as required by regulatory orders. Additionally, Wells Fargo is actively involved in industry cybersecurity efforts and works with third-party service providers and governmental agencies to enhance defenses and improve resiliency to information security threats."

In [45]:
second_question = "How Wells Fargo protect against unauthorized access?"
new_msg = rag_chain.invoke({"question": second_question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=second_question), new_msg])

In [46]:
new_msg.content

"Wells Fargo prioritizes the protection of its networks, computers, software, and data from unauthorized access. The company implements various controls, processes, and systems to enhance security and prevent unauthorized access. These measures include:\n\n1. Proactive involvement in industry cybersecurity efforts: Wells Fargo collaborates with other parties, including third-party service providers and governmental agencies, to enhance defenses and improve resiliency to information security threats.\n\n2. Robust risk management programs: Wells Fargo has established risk management programs to identify, assess, and mitigate risks associated with unauthorized access. These programs help in strengthening the risk and control infrastructure of the company.\n\n3. Continuous development and enhancement of controls: Wells Fargo continuously develops and enhances controls, processes, and systems to protect its networks, computers, software, and data. This includes implementing advanced securit

In [47]:
third_question = "What are different types of Risk Management at Wells Fargo?"
rag_chain.invoke({"question": third_question, "chat_history": chat_history}).content

"Wells Fargo employs various types of risk management to effectively identify, assess, and mitigate risks. Some of the different types of risk management at Wells Fargo include:\n\n1. Operational Risk Management: Wells Fargo focuses on managing risks associated with its day-to-day operations. This includes identifying and mitigating risks related to internal processes, systems, people, and external events that could impact the bank's operations and reputation.\n\n2. Compliance Risk Management: Wells Fargo has robust compliance risk management programs in place to ensure adherence to regulatory requirements and industry standards. This involves monitoring and managing risks related to compliance with laws, regulations, and internal policies.\n\n3. Credit Risk Management: Wells Fargo manages credit risk, which is the risk of financial loss arising from a borrower's failure to repay a loan or meet contractual obligations. The bank employs various credit risk assessment and mitigation tech