# Notebook to collect financial information

## Import libraries

In [1]:
from pathlib import Path

from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.document_loaders import WebBaseLoader
from langchain.document_loaders import TextLoader

from langchain_community.document_loaders import BSHTMLLoader
from langchain_community import embeddings
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain_community.document_loaders import PyMuPDFLoader

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import CharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.vectorstores import DuckDB
from langchain.vectorstores import Chroma

from langchain.tools import DuckDuckGoSearchRun
from langchain.tools import Tool
from langchain.docstore import InMemoryDocstore
from langchain.agents import Tool

import uuid
import time
import pymupdf
from crewai import Agent, Task, Crew
from crewai.task import TaskOutput



USER_AGENT environment variable not set, consider setting it to identify your requests.
* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


## Load and split one document

In [2]:
# Source document: the 10-K PDF, given relative to the notebook's working directory.
# Earlier experiments loaded the filing as HTML (BSHTMLLoader / WebBaseLoader); the PDF
# route below is the one in use.
relative_path_sec_file = Path("data/sec-edgar-filings/O/10-K/realty_income.pdf")
# Optional second source, currently not loaded:
#relative_path_investor_pres = Path("data/sec-edgar-filings/O/10-K/investor-presentation-q4-2023.pdf")

# The path is relative, so a wrong working directory is the most likely failure;
# report it with the resolved absolute path instead of a loader-internal error.
if not relative_path_sec_file.is_file():
    raise FileNotFoundError(f"The file {relative_path_sec_file.resolve()} does not exist.")

# str(): the loader's file_path parameter is documented as a string, and not every
# release accepts a pathlib.Path.
data = PyMuPDFLoader(str(relative_path_sec_file)).load()
#data.extend(PyMuPDFLoader(str(relative_path_investor_pres)).load())

# Concatenate the text of all loaded documents (treated as pages here) into one string,
# with a visible marker between them.
all_page_content = [page.page_content for page in data]
full_data = ' - NEW PAGE - '.join(all_page_content)

# Splitter limits are measured in characters.
# NOTE(review): the "* 4" presumably converts a token budget (50,000 / 5,000) into
# characters at roughly 4 characters per token - confirm.
chunk_size = 50000 * 4
chunk_overlap = 5000 * 4

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap)

final_splits = text_splitter.split_text(full_data)

print(f"Total number of splits: {len(final_splits)}")
print("Sample split:")
#print(final_splits[11])
# Length of the first chunk; 0 when the PDF yielded no text, instead of an IndexError.
len(final_splits[0]) if final_splits else 0

Total number of splits: 4
Sample split:


199891

In [3]:
# Questions put to the LLM, followed by the answer layout it is asked to use.
# The string is interpolated verbatim into the prompt built by create_enhanced_prompt
# and into the description / expected output of the crew task further down, so any
# edit here changes every one of those prompts.
required_items = """
1. What is the name of company this statement is about?
2. What is the industry in which the company in this statement is operating?
3. Net income?
4. Risks?
5. One important piece of information an Investor should know about the company.

Format your response as follows:
- Name of the company: [Single line answer]
- Industry: [Single line answer]
- Net income: [Single number]
- Risks: [Bullet list of the top 3 risks, each 1 line]
- Important fact: [Single line answer]
"""

In [4]:
def create_enhanced_prompt(retrieved_info, required_items):
    """Build the extraction prompt for one excerpt of the report.

    Args:
        retrieved_info: Text of the report excerpt the model should rely on.
        required_items: The questions / answer format to embed in the prompt.

    Returns:
        The complete prompt string. Every line after the first carries a
        four-space indent, and the prompt ends with a newline followed by
        four spaces.
    """
    prompt_lines = (
        "You are a financial analyst tasked with extracting key information from a company's 10-K or 10-Q report. Focus solely on the provided excerpts to answer the following questions.",
        "    ",
        "    Here are the relevant excerpts from the report:",
        f"    {retrieved_info}",
        "    ",
        "    ",
        "    Please provide information about the following:",
        f"    {required_items}",
        '    If any requested information is not explicitly stated in the provided excerpts, respond with "Information not provided in the given context."',
        "    ",
        "    Base your answers strictly on the provided context. Keep your responses brief and focused.",
        "    ",
        "    For each answer, indicate your confidence level (High/Medium/Low) based on how explicitly the information is stated in the text.",
        "    ",
    )
    return "\n".join(prompt_lines)

## Asking the mighty LLM

In [5]:
# Model used for the per-chunk extraction pass.
# Alternative tried earlier: Ollama(model="llama3.1", temperature=0.5)
# NOTE(review): the splitter produces chunks of up to 200,000 characters; confirm the
# model's context window (Ollama `num_ctx`) is large enough, otherwise the prompt may
# be truncated before the model sees all of it.
llm = Ollama(
    model="dolphin-2.9.3-mistral-nemo-12b",
    temperature=0.3,
)

In [6]:
# Ask the model the same questions once per chunk and keep every response.
# Previously `answer` was overwritten on each pass, so the crew task that interpolates
# `answer` into its description only ever saw the response for the final chunk.
chunk_answers = []
for retrieved_info in final_splits:
    enhanced_prompt = create_enhanced_prompt(retrieved_info, required_items)

    chunk_answer = llm.invoke(enhanced_prompt)
    chunk_answers.append(chunk_answer)

    print(chunk_answer)
    print('-------------------------------------------------------------------')

# `answer` stays a single string for the cells below; each response is labelled with
# the 1-based number of the chunk it came from.
answer = "\n\n".join(
    f"[Chunk {chunk_number}]\n{chunk_answer}"
    for chunk_number, chunk_answer in enumerate(chunk_answers, start=1)
)

Name of the company: Information not provided in the given context.
Industry: Financial services industry
Net income: Information not provided in the given context.
Risks: 
- Counterparty credit risk
- Interest rate risk
- Liquidity risk
Important fact: The company has entered into interest rate swaps to manage its debt obligations and mitigate risks associated with variable interest rates.
-------------------------------------------------------------------
- Name of the company: VEREIT Operating Partnership, L.P. 
- Industry: Real Estate Investment Trusts (REITs) 
- Net income: Information not provided in the given context. 
- Risks: 
    - Market risk due to fluctuations in real estate markets. 
    - Interest rate risk due to variable interest rates on debt obligations. 
    - Credit risk from tenants failing to pay rent or defaulting on lease agreements. 
- Important fact: The company has issued multiple series of notes with varying maturity dates and interest rates, including a re

In [7]:
# NOTE(review): `horst` is not defined anywhere in this notebook, so this cell raises
# NameError. Presumably a deliberate stopper so that "Run All" halts before the crew
# section - confirm, and consider replacing it with an explicit, documented stop.
horst

NameError: name 'horst' is not defined

Here are the answers:

1. **Realty Income Corporation** (Confidence: High)
2. **Real Estate Investment Trust (REIT)** (Confidence: Medium)
3. **$272,083,100** (Confidence: High)
4. 
* **Continued qualification as a real estate investment trust**
* **General domestic and foreign business, economic, or financial conditions**
* **Potential liability relating to environmental matters** (Confidence: Low)

Note that the provided text does not explicitly mention net income in the usual sense, but rather mentions "operating results" which is related to the overall performance of the company.

## Let the crew figure this out

In [None]:
# Model that drives the crew agent. This rebinds `llm`, the same name the extraction
# cells above use, so re-running those cells afterwards picks up this model instead.
llm = Ollama(
    model="llama3.1",
    temperature=1.0,
)

# DuckDuckGo web-search tool; its `run` method is exposed to the agent below.
duckduckgo_search = DuckDuckGoSearchRun()

def on_tool_start(tool_name):
    """Print a one-line notice that the tool named `tool_name` has started."""
    message = f"Tool {tool_name} started"
    print(message)

def on_tool_error(tool_name, error):
    """Print a one-line notice that the tool named `tool_name` failed with `error`."""
    message = f"Tool {tool_name} encountered an error: {error}"
    print(message)

def on_tool_end(tool_name, result):
    """Print a one-line notice that the tool named `tool_name` finished, with its `result`."""
    message = f"Tool {tool_name} completed with result: {result}"
    print(message)


def callback_function(output: TaskOutput):
    """Report a finished task on stdout.

    Passed to the task as its `callback`; this is the place to hook in any
    follow-up action (for example notifying someone). It currently only prints
    the `description` and `summary` attributes of `output`.
    """
    report = f"""
        Task completed!
        Task: {output.description}
        Output: {output.summary}
    """
    print(report)

# Function to query the vector database
def query_vector_db(query: str) -> str:
    """Return the text of the two best matches for `query` from the vector store.

    The store is read from the notebook-level variable `db`, which must offer
    `similarity_search(query, k=...)` returning objects with a `page_content`
    attribute. Nothing in this notebook creates `db`, so it has to be built
    (for example with one of the imported vector-store classes) before the
    crew is started.

    Args:
        query: Free-text search query.

    Returns:
        The `page_content` of the matches, joined with newlines; an empty
        string when there are no matches.

    Raises:
        NameError: If `db` has not been defined. Same exception type as an
            unguarded lookup, but with a message that says what to do.
    """
    # Resolve `db` at call time so the function can be defined before the store exists.
    store = globals().get("db")
    if store is None:
        raise NameError(
            "name 'db' is not defined: build the vector store and assign it to `db` "
            "before using the \"Vector Database\" tool."
        )
    results = store.similarity_search(query, k=2)
    return "\n".join(doc.page_content for doc in results)

# Create tools
def _with_tool_hooks(tool_name, func):
    """Wrap `func` so the on_tool_start / on_tool_error / on_tool_end hooks fire around each call.

    LangChain's `Tool` has no documented on_start / on_error / on_end parameters, so
    passing the hooks as keyword arguments never runs them. Wrapping the tool function
    is what actually gets them called.
    """
    import functools

    # wraps() keeps the wrapped callable's signature visible to introspection, so
    # optional arguments the caller decides to pass are still forwarded.
    @functools.wraps(func)
    def hooked(*args, **kwargs):
        on_tool_start(tool_name)
        try:
            result = func(*args, **kwargs)
        except Exception as error:
            on_tool_error(tool_name, error)
            raise  # report the failure, then let the caller handle it as before
        on_tool_end(tool_name, result)
        return result

    return hooked


tools = [
    Tool(
        name="DuckDuckGo Search",
        func=_with_tool_hooks("DuckDuckGo Search", duckduckgo_search.run),
        description="Useful for when you need to search the internet for current information.",
    ),
    Tool(
        name="Vector Database",
        func=_with_tool_hooks("Vector Database", query_vector_db),
        description="Useful for retrieving specific information from the company's knowledge base.",
    ),
]


# The single agent of the crew: a financial analyst that works with the two tools
# defined above and the crew LLM. Delegation is switched off and the agent may take
# up to 50 iterations.
financial_analyst = Agent(
    role="Financial Analyst",
    goal="Provide accurate and insightful financial analysis based on given prompts with all the tools provided to you.",
    backstory="""You are an experienced financial analyst with a strong 
    background in reading financial statements and company related financial reports. You are also eager to use tools.
    Use the tools provided to you ("DuckDuckGo Search" and "Vector Database") for your work.
    The correctness of your reports is key. If you get information wrong you'll receive salary cuts.
    The provided tools will help you to ensure the correctness of your report.
    You like your reports to be brief and to the point.""",
    llm=llm,
    tools=tools,
    allow_delegation=False,
    max_iter=50,
    verbose=True,
)
#                    You are given this enhanced prompt: >>>{enhanced_prompt}<<< 
#                    It has a basic input template and included are outputs from a vector database, which was
#                    filled with chunks of A. financial statements and B. the company presentation of the company we are interested in.
# Verification task: the agent receives the questions and the answer(s) already produced
# by the extraction pass, and has to confirm, correct or complete them using its tools.
# - `agent` is the financial analyst defined above; the name used previously,
#   `prompt_expert`, is not defined anywhere in this notebook.
# - `expected_output` describes the verified answers, in line with the task description
#   (it used to ask for "a well-crafted prompt").
task1 = Task(
    description=f"""

                    The goal is to provide this information: >>>{required_items}<<<

                    The enhanced prompt was already given to an LLM to extract the information form the vector database chunks included in
                    the enhanced prompt. This is the output from the LLM >>>{answer}<<<.

                    This information might be:
                        A. Correct
                        B. Incorrect
                        C. Incomplete

                    Your task is to verify the output from the LLM and check with the tools given to you if the given information is correct.
                    If the information is correct and complete indicate the information as correct and complete.
                    If the information is incorrect, please correct the information and indicate that you had to correct the information.
                    If the information is incomplete, please complete the information and indicate what additional information you added.

                    Use the tools provided to you ("DuckDuckGo Search" and "Vector Database") for your work. As a minimum you have to use
                    "DuckDuckGo Search" to verify the Net income.

                    When you think you are done review the information you have. Check again with a tool if the Net income in in line with DuckDuckGo.
    
                    Only then provide this information >>>{required_items}<<< at the end.

                   """,
    agent=financial_analyst,
    expected_output=f"""The verified answers (corrected or completed where necessary) to
                        {required_items}
                        for the company in question, with a note on what was corrected or added.
                        """,
    callback=callback_function
)

# Assemble the crew: one agent (the financial analyst) and its single task
crew = Crew(
    tasks=[task1],
    agents=[financial_analyst],
    verbose=True,
)

# Start the run and show what the crew produced
result = crew.kickoff()
print(result)


