### Imports and API key

In [2]:
import os

from langchain.llms import OpenAI, HuggingFacePipeline
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.agents import tool, AgentType
from langchain.agents.initialize import initialize_agent
from langchain.agents.agent_toolkits import (
    create_vectorstore_agent,
    VectorStoreToolkit,
    VectorStoreInfo,
)

from transformers import (
    PegasusTokenizer,
    PegasusForConditionalGeneration,
    BertTokenizer,
    BertForSequenceClassification,
    pipeline
)

# Prefer an OPENAI_API_KEY that is already exported in the environment, so a
# real key is never overwritten by the placeholder below. The placeholder is
# only used as a fallback when the variable is not set at all.
# Do not commit a real key in this cell.
os.environ.setdefault('OPENAI_API_KEY', 'YOUR_API_KEY')

### Load and store documents in VectorDB

In [4]:
# Source document: where the PDF lives, plus the name and description that
# are later handed to the vector-store toolkit.
doc_info = dict(
    path='../documents/financial_annual_report_2022.pdf',
    name='financial_annual_report_2022',
    description="a finance company's annual report documentation for the year 2022",
)

In [5]:
# Upper bound on how many split documents are embedded and indexed.
# Set to None to index the whole document (slicing with None keeps everything).
MAX_PAGES = 10

# load and split PDF document
loader = PyPDFLoader(doc_info['path'])
all_pages = loader.load_and_split()
print(f'num_pages: {len(all_pages)}')

# use only the first MAX_PAGES entries to save compute; the untruncated list
# stays available as `all_pages`, so re-running later cells never depends on
# whether `pages` has already been cut down.
# NOTE: the agents can only retrieve from what is indexed here.
pages = all_pages[:MAX_PAGES]

# add document to vector database
faiss_index = FAISS.from_documents(pages, OpenAIEmbeddings())

# describe the index (name + description from `doc_info`) and expose it as a toolkit
vectorstore_info = VectorStoreInfo(name=doc_info['name'],
                                   description=doc_info['description'],
                                   vectorstore=faiss_index)
toolkit = VectorStoreToolkit(vectorstore_info=vectorstore_info)

num_pages: 352


### Initialize models

In [6]:
# Base LLM shared by all agents (temperature fixed at 0).
llm = OpenAI(temperature=0)

# Financial summarizer: load the Pegasus checkpoint and its tokenizer, build a
# Hugging Face 'summarization' pipeline, and wrap it for LangChain.
summary_model_name = "human-centered-summarization/financial-summarization-pegasus"
summary_model = PegasusForConditionalGeneration.from_pretrained(summary_model_name)
summary_tokenizer = PegasusTokenizer.from_pretrained(summary_model_name)
summary_hf = HuggingFacePipeline(
    pipeline=pipeline(
        'summarization',
        model=summary_model,
        tokenizer=summary_tokenizer,
    )
)

# Financial sentiment analysis: FinBERT-tone checkpoint loaded as a 3-label
# sequence classifier and exposed as a plain Hugging Face pipeline.
sentiment_model_name = 'yiyanghkust/finbert-tone'
finbert_tokenizer = BertTokenizer.from_pretrained(sentiment_model_name)
finbert = BertForSequenceClassification.from_pretrained(
    sentiment_model_name,
    num_labels=3,
)
finbert_pipe = pipeline(
    "sentiment-analysis",
    model=finbert,
    tokenizer=finbert_tokenizer,
)

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at human-centered-summarization/financial-summarization-pegasus and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Create custom tools

In [7]:
# define custom langchain tools
# NOTE(review): the docstring below is presumably consumed by `@tool` as the
# tool description shown to the agent's LLM, so it is effectively prompt text;
# confirm against the LangChain docs before rewording it.
@tool
def financial_summary(text: str) -> str:
    """Returns the financial summary of the input text, \
    use this for any queries related to finding the summary \
    of the input. However, remember to extract the relevant snippet \
    from `financial_annual_report_2022` first, before using this function.
    The input should always be a string related \
    to financial information, and this function always returns \
    the summary of this financial information."""
    # Delegate to `summary_hf`, the LangChain wrapper around the Pegasus
    # summarization pipeline created in the model-initialisation cell.
    return summary_hf.predict(text)

# NOTE: the docstring is kept verbatim because `@tool` is expected to use it
# as the tool description presented to the agent's LLM.
@tool
def financial_sentiment(text: str) -> str:
    """Returns the financial sentiment of the input text, \
    use this for any queries related to finding the sentiment \
    of the input. However, remember to extract the relevant snippet \
    from `financial_annual_report_2022` first, before using this function.
    The input should always be a string related \
    to financial information, and this function always returns \
    the sentiment of this financial information."""
    # Cap the input at the classifier's position-embedding limit so that a long
    # retrieved snippet is truncated instead of failing inside the model.
    prediction = finbert_pipe(
        text,
        truncation=True,
        max_length=finbert_pipe.model.config.max_position_embeddings,
    )
    # The pipeline yields a list of {'label', 'score'} dicts, but the function
    # is declared to return `str`. Serialising with str() honours that
    # annotation and yields the same text the agent saw as its observation.
    return str(prediction)

### Initialize agents

In [8]:
def _build_chat_agent(extra_tools):
    """Build a chat zero-shot ReAct agent over the vector-store tools plus `extra_tools`.

    `extra_tools` is a list of additional tools appended after the tools
    returned by `toolkit.get_tools()`. Parsing errors are handled by the
    executor and the reasoning trace is printed (verbose).
    """
    return initialize_agent(
        toolkit.get_tools() + extra_tools,
        llm,
        agent=AgentType.CHAT_ZERO_SHOT_REACT_DESCRIPTION,
        handle_parsing_errors=True,
        verbose=True,
    )


# retrieval only
retrieval_agent = create_vectorstore_agent(llm=llm, toolkit=toolkit, verbose=True)

# retrieval + summary
summary_agent = _build_chat_agent([financial_summary])

# retrieval + summary + sentiment analysis
sentiment_agent = _build_chat_agent([financial_summary, financial_sentiment])

### Results

#### Retrieval-only agent

In [9]:
# Question reused by every agent in the Results section; here it is answered
# by the retrieval-only agent.
query = (
    "How much was Capital One's loan growth and revenue "
    "in 2022 compared to 2021?"
)
retrieval_agent.run(query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to find the financial information for Capital One in 2022
Action: financial_annual_report_2022
Action Input: Capital One's loan growth and revenue in 2022[0m
Observation: [36;1m[1;3m Capital One's loan growth in 2022 was 13% and revenue was $34.3 billion, up 12.5% from 2021.[0m
Thought:[32;1m[1;3m I now know the final answer
Final Answer: Capital One's loan growth in 2022 was 13% and revenue was $34.3 billion, up 12.5% from 2021.[0m

[1m> Finished chain.[0m


"Capital One's loan growth in 2022 was 13% and revenue was $34.3 billion, up 12.5% from 2021."

#### Retrieval + summary agent

In [10]:
# Same question, with an extra instruction asking the agent to summarize what it retrieved.
summary_agent.run(query + ' After retrieving, summarize the information.')



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: I need to use the financial_annual_report_2022 and financial_summary tools to answer this question.

Action:
```
{
  "action": "financial_annual_report_2022",
  "action_input": "How much was Capital One's loan growth and revenue in 2022 compared to 2021?"
}
```
[0m
Observation: [36;1m[1;3m Loan growth was 13% and revenue was 12.5% higher than 2021.[0m
Thought:

Your max_length is set to 64, but your input_length is only 12. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=6)


[32;1m[1;3m I now need to summarize this information.

Action:
```
{
  "action": "financial_summary",
  "action_input": "Loan growth was 13% and revenue was 12.5% higher than 2021."
}
```

[0m
Observation: [38;5;200m[1;3mLoan growth was 13%, revenue was 12.5% higher in 2021[0m
Thought:[32;1m[1;3m I now know the final answer.

Final Answer: Capital One's loan growth and revenue in 2022 were 13% and 12.5% higher than 2021, respectively.[0m

[1m> Finished chain.[0m


"Capital One's loan growth and revenue in 2022 were 13% and 12.5% higher than 2021, respectively."

#### Retrieval + summary + sentiment analysis agent

In [11]:
# Same question, with an extra instruction asking for the financial sentiment
# of the retrieved answer. The instruction is part of the prompt sent to the
# LLM, so the spelling matters ("financial", matching the `financial_sentiment` tool name).
sentiment_agent.run(f'{query} After retrieving, provide the financial sentiment.')



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: I need to use the financial_annual_report_2022 and financial_sentiment tools
Action:
```
{
  "action": "financial_annual_report_2022",
  "action_input": "How much was Capital One's loan growth and revenue in 2022 compared to 2021?"
}
```
[0m
Observation: [36;1m[1;3m Loan growth was 13% and revenue was 12.5% higher than 2021.[0m
Thought:[32;1m[1;3m I need to use the financial_sentiment tool
Action:
```
{
  "action": "financial_sentiment",
  "action_input": "Loan growth was 13% and revenue was 12.5% higher than 2021."
}
```

[0m
Observation: [36;1m[1;3m[{'label': 'Positive', 'score': 1.0}][0m
Thought:[32;1m[1;3m I now know the final answer
Final Answer: Capital One's loan growth and revenue in 2022 was 13% and 12.5% higher than 2021 respectively, with a positive sentiment.[0m

[1m> Finished chain.[0m


"Capital One's loan growth and revenue in 2022 was 13% and 12.5% higher than 2021 respectively, with a positive sentiment."