In [31]:
import json
from langchain_core.documents import Document
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS

### 1. Indexing : Load & Split

We need to first load the contents from the PDF file. We will use the [DocumentLoaders](https://python.langchain.com/docs/modules/data_connection/document_loaders/), which are objects that load in data from a source and return a list of Documents. A Document is an object with some page_content (str) and metadata (dict). <br>
The UnstructuredPDFLoader is used in the usecase.

In [35]:
class ParsedPDFLoader:
    def __init__(self, path):
        self.path = path
        self.documents = []

        self._text_splitter = SemanticChunker(OllamaEmbeddings())

    def _load_file(self):
        with open(self.path, "r") as f:
            return json.loads(f.read())

    def _combine_siblings(self, data, parent_titles=[]):
        # Prepare text elements using semantic chunking
        
        whole_content = ""
        metadata = {
            "page_numbers": [],
            "filename": "",
            "languages": [],
            "title": parent_titles + [data["title"]],
        }
        
        # Select only text elements
        text_elements = [
            element for element in data["elements"] if element['category'] not in ["Table", "Image", "Figure"]
        ]
        for element in text_elements:
            whole_content += element["text"] + " "
            metadata["page_numbers"].append(element["metadata"]["page_number"])
            metadata["filename"] = element["metadata"]["filename"]
            metadata["languages"].extend(element["metadata"]["languages"])

        # Perform semantic chunking and create documents
        if whole_content != "":
            print(whole_content)
            docs = self._text_splitter.create_documents([whole_content], [metadata])
            print(docs)
            print("-----")
            for doc in docs:
                # Add parent titles to the document
                doc.page_content = "{}\n\n{}".format("/".join(metadata["title"]), doc.page_content)
            self.documents.extend(docs)

        # Create separate documents for media elements
        media_elements = [element for element in data["elements"] if element['category'] in ["Table", "Image", "Figure"]]
        for element in media_elements:
            metadata = {
                "page_numbers": [element["metadata"]["page_number"]],
                "filename": element["metadata"]["filename"],
                "languages": element["metadata"]["languages"],
                "title": parent_titles + [data["title"]],
                "image_path": element["metadata"]["image_path"],
            }
            text = "{}\n\n{}".format("/".join(metadata["title"]), element["text"])
            doc = Document(text, metadata=metadata)
            self.documents.append(doc)

        # Recursively combine children
        for child in data["children"]:
            self._combine_siblings(child, parent_titles + [data["title"]])

    def load(self):
        data = self._load_file()

        # Combine all siblings together and perform semantic chunking
        self._combine_siblings(data)

    def get_documents(self):
        return self.documents

In [36]:
loader = ParsedPDFLoader("parsed/wf.json")
loader.load()

Dear Shareholders, I'm proud to report that Wells Fargo continued to make progress on our priorities in 2022. Our underlying financial performance is improving, we are moving forward on our risk, control and regulatory agenda, we are focusing on businesses where we can generate appropriate risk-adjusted returns, we continue to strengthen the leadership team, and we are executing on our strategic objectives. While we have made progress, our work is not complete and we remain focused on successful and timely execution of our multi-year journey to complete our risk and control work and to move forward with our businesses. 
[Document(page_content="Dear Shareholders, I'm proud to report that Wells Fargo continued to make progress on our priorities in 2022. Our underlying financial performance is improving, we are moving forward on our risk, control and regulatory agenda, we are focusing on businesses where we can generate appropriate risk-adjusted returns, we continue to strengthen the lead

IndexError: index -1 is out of bounds for axis 0 with size 0

### 2. Indexing : Store
Creating embeddings for the splitted data and store the documents and it's corresponsing embeddings in a vector store. At this point we have a query-able vector store containing the chunked contents of our PDF's. Given a user question, we should ideally be able to return the snippets of the text that answer the question.

In [None]:
vectorstore = FAISS.from_documents(documents=loader.get_documents(), embedding=OllamaEmbeddings())
vectorstore.save_local('vectorstores/wf_vectorstore')

In [None]:
vectorstore = FAISS.load_local("./vectorstore",OllamaEmbeddings())

In [None]:
# Similarity Search
query = "Bank of America Chair & CEO?"
docs = vectorstore.similarity_search(query, k=3)
print(docs)

In [None]:
# Similarity Search
query = "Who is the CEO of Amazon?"
docs = vectorstore.similarity_search(query, k=3)
print(docs)

### 3. Retrieval and Generation: Retrieve
Different ways to retrive documents based on query - [Link](https://python.langchain.com/docs/modules/data_connection/retrievers/vectorstore)

<b> Using the query as is to retrieve the relevant documents using the search type "similarity_score_threshold"

In [None]:
retriever_similarity = vectorstore.as_retriever(search_type="similarity_score_threshold",
                                                 search_kwargs={"score_threshold": 0.70})
# retrieved_docs = retriever.invoke("How innovation is driven at bank of america?")
retrieved_docs = retriever_similarity.get_relevant_documents("Who is the CEO of Amazon") #What is the full-form of ROTCE?

In [None]:
len(retrieved_docs)

In [None]:
for doc in retrieved_docs:
    print(doc.page_content)

<b> Using the Multi Query retriver which will create variants of queries based on the prompt to retrieve the relevant documents 

In [None]:
# supply a prompt along with an output parser to split the results into a list of queries.
# Output parser will split the LLM result into a list of queries
class LineList(BaseModel):
    # "lines" is the key (attribute name) of the parsed output
    lines: List[str] = Field(description="Lines of text")


class LineListOutputParser(PydanticOutputParser):
    def __init__(self) -> None:
        super().__init__(pydantic_object=LineList)

    def parse(self, text: str) -> LineList:
        lines = text.strip().split("\n")
        return LineList(lines=lines)


output_parser = LineListOutputParser()

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate four \ 
    different versions of the given user question to retrieve relevant documents from a vector \
    database. By generating multiple perspectives on the user question, your goal is to help \
    the user overcome some of the limitations of the distance-based similarity search. \ 
    Provide these alternative questions separated by newlines. \
    Original question: {question}""",
)
llm = ChatOpenAI(temperature=0, model='gpt-3.5-turbo-16k')


# Chain
llm_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)

In [None]:
# Run
retriever = MultiQueryRetriever(
    include_original=True,
    retriever=vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": 0.70}),
    llm_chain=llm_chain, parser_key="lines",
    verbose=True
)  # "lines" is the key (attribute name) of the parsed output

# Results
unique_docs = retriever.get_relevant_documents(
    query="What is tuition assistance?"
)
len(unique_docs)

In [None]:
# To check different queries generated by the llm
llm_chain.invoke("What is tuition assistance?")

### 3. Retrieval and Generation: Generate
In this step everything will be put together into a chain. A chain will take question, will retrieve relevant documents, will construct a prompt, will pass to the llm model and will parse the output.

In [None]:
# contextualize_ques_system_prompt = """Given a chat history and the latest user question \
# which might refer to a context in the chat history, formulate a standalone question \
# which can be understood without the chat history. Do NOT answer the question, \
# just reformulate it if needed and otherwise return it as is."""

contextualize_ques_system_prompt = """ Check if the latest user question refers to a context in the chat history. \
If it does, formulate a standalone question which can be understood without the chat history. \
If it does not, return the question as is. Your task is to just reformulate the question if needed.
"""

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_ques_system_prompt),
        # Prompt template that assumes variable is already list of messages.
        # We provide the variable name to be used as messages
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)

contextualize_q_chain = contextualize_q_prompt | llm | StrOutputParser()

In [None]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [None]:
# just say that you don't know.
# qa_system_prompt = """You are an assistant for question-answering tasks. \
# Use the following pieces of retrieved context to answer the question. \
# If you don't know the answer, provide a response as - "I don't know." \

# {context}"""
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following context or document embeddings to answer the question. \
Don't justify your answers and don't search the internet.If the answer could not be found in the documents,\
ay the words "Sorry, I am unable to answer your question with the information available to me"\

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)


def contextualized_question(input: dict):
    if input.get("chat_history"):        
        return contextualize_q_chain
    else:
        return input["question"]


rag_chain = (
    RunnablePassthrough.assign(
        context=contextualized_question | retriever_similarity
    )
    | qa_prompt
    | llm
)

In [None]:
rag_chain

In [None]:
chat_history = []

question = "How Risk Management is done in Wells Fargo?"
ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=question), ai_msg])

In [None]:
ai_msg.content

In [None]:
second_question = "How Wells Fargo protect against unauthorized access?"
new_msg = rag_chain.invoke({"question": second_question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=second_question), new_msg])

In [None]:
new_msg.content

In [None]:
third_question = "Who is the CEO of Amazon?"
new_msg = rag_chain.invoke({"question": third_question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=third_question), new_msg])

In [None]:
new_msg.content

In [None]:
fourth_question = "What did Sheri Bronstein wrote in her letter in Bank of America document?"
new_msg = rag_chain.invoke({"question": fourth_question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=fourth_question), new_msg])

In [None]:
new_msg.content

In [None]:
fifth_question = "What is Emotional Wellness?"
new_msg = rag_chain.invoke({"question": fifth_question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=fifth_question), new_msg])
new_msg.content

In [None]:
third_question = "What are different types of Risk Management at Wells Fargo?"
rag_chain.invoke({"question": third_question, "chat_history": chat_history}).content

In [None]:
# Clear chat history
chat_history.clear()