# Notebook to collect financial information

## Import libraries

In [1]:
from pathlib import Path

from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.document_loaders import WebBaseLoader
from langchain.document_loaders import TextLoader

from langchain_community.document_loaders import BSHTMLLoader
from langchain_community import embeddings
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain_community.document_loaders import PyMuPDFLoader

from langchain.vectorstores import FAISS
from langchain.vectorstores import DuckDB
from langchain.vectorstores import Chroma

from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.vectorstores import FAISS
from langchain.vectorstores import DuckDB
from langchain.vectorstores import Chroma

from langchain.tools import DuckDuckGoSearchRun
from langchain.tools import Tool
from langchain.docstore import InMemoryDocstore
from langchain.agents import Tool

import uuid
import time
import pymupdf
from crewai import Agent, Task, Crew
from crewai.task import TaskOutput



USER_AGENT environment variable not set, consider setting it to identify your requests.
* 'allow_population_by_field_name' has been renamed to 'populate_by_name'
* 'smart_union' has been removed


## Load and split one document

In [2]:
# Source PDFs (paths are relative to the notebook's working directory).
relative_path_sec_file = Path("data/sec-edgar-filings/O/10-K/realty_income.pdf")
relative_path_investor_pres = Path("data/sec-edgar-filings/O/10-K/investor-presentation-q4-2023.pdf")

# Alternative sources tried earlier, kept for reference:
#   BSHTMLLoader on data/sec-edgar-filings/ABR/10-K/0001628280-24-005456/primary-document.html
#   WebBaseLoader on https://otp.tools.investis.com/clients/us/realty_income_corporation/SEC/sec-show.aspx?Type=html&FilingId=17292849&Cik=0000726728

# Fail early with a readable message instead of an error raised deep inside the PDF loader.
for pdf_path in (relative_path_sec_file, relative_path_investor_pres):
    if not pdf_path.exists():
        raise FileNotFoundError(f"The file {pdf_path} does not exist.")

# Load both PDFs into one list of documents.
data = PyMuPDFLoader(relative_path_sec_file).load()
data.extend(PyMuPDFLoader(relative_path_investor_pres).load())

# Chunk size: context length of the embedding model multiplied by a
# defensively assumed token length of 3.
#chunk_size = 512 * 3  # mxbai-embed-large: context length = 512
chunk_size = 2048 * 3  # nomic-embed-text: context length = 2048
chunk_overlap = 500

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

final_splits = text_splitter.split_documents(data)

print(f"Total number of splits: {len(final_splits)}")

# Show split 11 as a sample, falling back to the last split when the input
# produced fewer chunks (a fixed index would raise IndexError there).
if final_splits:
    sample_index = min(11, len(final_splits) - 1)
    print("Sample split:")
    print(final_splits[sample_index])

Total number of splits: 257
Sample split:
page_content='Readers are cautioned not to place undue reliance on forward-looking statements. Forward-looking statements are not guarantees of future
plans and performance and speak only as of the date this annual report was filed with the SEC. Actual plans and operating results may differ
materially from what is expressed or forecasted in this annual report and forecasts made in the forward-looking statements discussed in this
annual report might not materialize. We do not undertake any obligation to update forward-looking statements that may be made to reflect
events or circumstances after the date these statements were made.
7' metadata={'source': 'data\\sec-edgar-filings\\O\\10-K\\realty_income.pdf', 'file_path': 'data\\sec-edgar-filings\\O\\10-K\\realty_income.pdf', 'page': 11, 'total_pages': 193, 'format': 'PDF 1.4', 'title': '0000726728-24-000047', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'subject': 'Form 10-K f

In [3]:
# Print split 11 again on its own (page content followed by its metadata).
print(final_splits[11])

page_content='Readers are cautioned not to place undue reliance on forward-looking statements. Forward-looking statements are not guarantees of future
plans and performance and speak only as of the date this annual report was filed with the SEC. Actual plans and operating results may differ
materially from what is expressed or forecasted in this annual report and forecasts made in the forward-looking statements discussed in this
annual report might not materialize. We do not undertake any obligation to update forward-looking statements that may be made to reflect
events or circumstances after the date these statements were made.
7' metadata={'source': 'data\\sec-edgar-filings\\O\\10-K\\realty_income.pdf', 'file_path': 'data\\sec-edgar-filings\\O\\10-K\\realty_income.pdf', 'page': 11, 'total_pages': 193, 'format': 'PDF 1.4', 'title': '0000726728-24-000047', 'author': 'EDGAR® Online LLC, a subsidiary of OTC Markets Group', 'subject': 'Form 10-K filed on 2024-02-21 for the period ending 2

## Create and store embeddings

In [4]:
# For a quick trial run, index only the first 30 splits:
#final_splits = final_splits[:30]

# Three parallel lists, one entry per split: text, metadata, and a fresh random id.
documents = [chunk.page_content for chunk in final_splits]
metadatas = [chunk.metadata for chunk in final_splits]
ids = [str(uuid.uuid4()) for _ in final_splits]


In [5]:
def create_batches(data, batch_size):
    """Yield consecutive slices of ``data`` holding at most ``batch_size`` items.

    Args:
        data: A sliceable sequence that supports ``len()`` (list, tuple, str, ...).
        batch_size: Maximum number of items per batch; must be positive.

    Yields:
        ``data[i:i + batch_size]`` for ``i = 0, batch_size, 2 * batch_size, ...``.
        The last batch may be shorter; empty input yields nothing.

    Raises:
        ValueError: If ``batch_size`` is not positive. Because this is a
            generator, the error surfaces on the first iteration.
    """
    # A negative step would make range() empty and silently drop every item.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for start in range(0, len(data), batch_size):
        yield data[start:start + batch_size]

In [6]:


# Embedding model served by the local Ollama instance.
# NOTE: this rebinds the name `embeddings`, shadowing the
# `langchain_community.embeddings` module imported in the first cell.
#embeddings = HuggingFaceEmbeddings()
embeddings = OllamaEmbeddings(
    base_url="http://localhost:11434",
    model="nomic-embed-text"
    #model="mxbai-embed-large"
)

# Rebuild Document objects from the parallel text / metadata lists.
doc_objects = [Document(page_content=doc, metadata=meta) for doc, meta in zip(documents, metadatas)]
#doc_objects = final_splits

max_batch_size = 10
document_batches = create_batches(doc_objects, max_batch_size)
id_batches = create_batches(ids, max_batch_size)

print('Start batch process')

# The index is created from the first real batch rather than from a
# placeholder text: a placeholder entry would stay in the store and could be
# returned by similarity searches.
db = None
document_count = 0
for doc_batch, id_batch in zip(document_batches, id_batches):
    if db is None:
        db = FAISS.from_documents(doc_batch, embedding=embeddings, ids=id_batch, normalize_L2=True)
    else:
        # Add the batch to FAISS
        db.add_documents(documents=doc_batch, ids=id_batch)

    document_count += len(doc_batch)
    print(f"Added {document_count} documents out of {len(doc_objects)} to the collection.")

    # Short pause between batches (kept from the original; presumably to go
    # easy on the local embedding server).
    time.sleep(0.1)

if db is None:
    raise ValueError("No document splits available, nothing to index.")

# Save the index locally
db.save_local("faiss_index")



Start batch process
Added 10 documents out of 257 to the collection.
Added 20 documents out of 257 to the collection.
Added 30 documents out of 257 to the collection.
Added 40 documents out of 257 to the collection.
Added 50 documents out of 257 to the collection.
Added 60 documents out of 257 to the collection.
Added 70 documents out of 257 to the collection.
Added 80 documents out of 257 to the collection.
Added 90 documents out of 257 to the collection.
Added 100 documents out of 257 to the collection.
Added 110 documents out of 257 to the collection.
Added 120 documents out of 257 to the collection.
Added 130 documents out of 257 to the collection.
Added 140 documents out of 257 to the collection.
Added 150 documents out of 257 to the collection.
Added 160 documents out of 257 to the collection.
Added 170 documents out of 257 to the collection.
Added 180 documents out of 257 to the collection.
Added 190 documents out of 257 to the collection.
Added 200 documents out of 257 to the c

#### To avoid embedding and storing everything again, load the saved index instead (convert this cell from Markdown to code to run it)
embeddings = OllamaEmbeddings(
    base_url="http://localhost:11434",
    # Must be the model that built "faiss_index" (nomic-embed-text in the
    # indexing cell); vectors from another model are not comparable with the
    # stored ones and typically differ in dimensionality.
    model="nomic-embed-text"
    #model="mxbai-embed-large"
)

db = FAISS.load_local(
    "faiss_index",
    embeddings,
    # Same L2 normalisation as when the index was built, so query vectors are
    # treated like the stored ones.
    normalize_L2=True,
    # NOTE(review): recent langchain-community releases refuse to unpickle the
    # docstore unless allow_dangerous_deserialization=True is passed here;
    # confirm against the installed version before adding it.
)

## Perform a similarity search + create prompt

In [7]:
# Probe text copied from the filing's signature page; searching for it should
# bring back that same chunk as the best match.
test_text = """By:
/s/RONALD L. MERRIMAN
 
Date: February 21, 2024
 
Ronald L. Merriman
 
 
 
Director
 
 
By:
/s/SUMIT ROY
 
Date: February 21, 2024
 
Sumit Roy
 
 
 
Director, President, Chief Executive Officer
 
(Principal Executive Officer)
 
 
By:
/s/JONATHAN PONG
Date: February 21, 2024
Jonathan Pong
Executive Vice President, Chief Financial Officer and Treasurer
(Principal Financial Officer)
By:
/s/SEAN P. NUGENT
 
Date: February 21, 2024
 
Sean P. Nugent
 
 
 
Senior Vice President, Controller, Principal Accounting Officer
 
(Principal Accounting Officer)
 
 
96"""
retrieved_info = ""
results = db.similarity_search_with_relevance_scores(test_text, k=2)
# Each hit is indexed as hit[0] for the document; print its text and a divider.
for hit in results:
    matched_text = hit[0].page_content
    print(matched_text + '\n')
    print('--------------------------------------------------')

By:
/s/RONALD L. MERRIMAN
 
Date: February 21, 2024
 
Ronald L. Merriman
 
 
 
Director
 
 
By:
/s/SUMIT ROY
 
Date: February 21, 2024
 
Sumit Roy
 
 
 
Director, President, Chief Executive Officer
 
(Principal Executive Officer)
 
 
By:
/s/JONATHAN PONG
Date: February 21, 2024
Jonathan Pong
Executive Vice President, Chief Financial Officer and Treasurer
(Principal Financial Officer)
By:
/s/SEAN P. NUGENT
 
Date: February 21, 2024
 
Sean P. Nugent
 
 
 
Senior Vice President, Controller, Principal Accounting Officer
 
(Principal Accounting Officer)
 
 
96

--------------------------------------------------
EXHIBIT 31.1
 
Certification of Chief Executive Officer
 
I, Sumit Roy, certify that:
 
1.          I have reviewed this annual report on Form 10-K of Realty Income Corporation for the year ended December 31, 2023;
 
2.          Based on my knowledge, this report does not contain any untrue statement of a material fact or omit to state a material fact necessary to
make the statements

In [8]:
# Retrieval queries, one similarity search each. Every entry needs its own
# trailing comma: adjacent string literals without one are silently
# concatenated by Python into a single query.
queries = [
    "company name",
    "legal entity",
    " the Company, we, our or us refer",
    "exact name of registrant as specified in its charter",
    "primary business operations industry sector",
    "regulated as",
    "Net income $",
    "risk factors",
    "business challenges",
    "uncertainties",
    "adverse events",
]

# Best match (k=1) for every query, flattened into one list in query order.
results = [
    hit
    for query in queries
    for hit in db.similarity_search_with_relevance_scores(query, k=1)
]



In [9]:
# Show every retrieved chunk, then join them (one trailing newline each) into
# the context string used by the prompt.
for hit in results:
    print(f'{hit[0].page_content}\n\n')
retrieved_info = "".join(f"{hit[0].page_content}\n" for hit in results)

Exhibit 4.88
DESCRIPTION OF SECURITIES
As of December 31, 2023, Realty Income Corporation, a Maryland corporation (“Realty Income,” “we,” “us,” and the
“Company”), had ten classes of securities registered under Section 12 of the Securities Exchange Act of 1934, as amended (the
“Exchange Act”): (i) our common stock, $0.01 par value per share (“common stock”); (ii) our 1.125% Notes due 2027 (the “July
2027 notes”); (iii) our 1.875% Notes due 2027 (the “January 2027 notes”); (iv) our 1.625% Notes due 2030 (the “October 2030
notes”); (v) our 4.875% Notes due 2030 (the “July 2030 Notes”); (vi) our 5.750% Notes due 2031 (the “2031 notes”); (vii) our
1.750% Notes due 2033 (the “2033 notes”); and (viii) our 5.125% Notes due 2034 (the “2034 notes”); (ix) our 6.000% Notes due
2039 (the “2039 notes”); and (x) our 2.500% Notes due 2042 (the “2042 notes”, together with the July 2027 notes, January 2027
notes, October 2030 notes, July 2030 notes, 2031 notes, 2033 notes, 2034 notes and 2039 notes, th

In [11]:
# Questions plus the required answer format; this text is interpolated into
# the RAG prompt and into the crew task descriptions further down.
required_items = """
1. What is the name of company this statement is about?
2. What is the industry in which the company in this statement is operating?
3. Net income?
4. Risks?

Format your response as follows:
- Name of the company: [Single line answer]
- Industry: [Single line answer]
- Net income: [Single number]
- Risks: [Bullet list of the top 3 risks, each 1 line]
"""

In [12]:
# Final RAG prompt: role instructions, then the retrieved excerpts
# (`retrieved_info`), then the questions and answer format (`required_items`).
enhanced_prompt = f"""You are a financial analyst tasked with extracting key information from a company's 10-K or 10-Q report. Focus solely on the provided excerpts to answer the following questions.

Here are the relevant excerpts from the report:
{retrieved_info}


Please provide information about the following:
{required_items}
If any requested information is not explicitly stated in the provided excerpts, respond with "Information not provided in the given context."

Base your answers strictly on the provided context. Keep your responses brief and focused.

For each answer, indicate your confidence level (High/Medium/Low) based on how explicitly the information is stated in the text.
"""

In [13]:
# Inspect the fully rendered prompt before sending it to the model.
print(enhanced_prompt)

You are a financial analyst tasked with extracting key information from a company's 10-K or 10-Q report. Focus solely on the provided excerpts to answer the following questions.

Here are the relevant excerpts from the report:
Exhibit 4.88
DESCRIPTION OF SECURITIES
As of December 31, 2023, Realty Income Corporation, a Maryland corporation (“Realty Income,” “we,” “us,” and the
“Company”), had ten classes of securities registered under Section 12 of the Securities Exchange Act of 1934, as amended (the
“Exchange Act”): (i) our common stock, $0.01 par value per share (“common stock”); (ii) our 1.125% Notes due 2027 (the “July
2027 notes”); (iii) our 1.875% Notes due 2027 (the “January 2027 notes”); (iv) our 1.625% Notes due 2030 (the “October 2030
notes”); (v) our 4.875% Notes due 2030 (the “July 2030 Notes”); (vi) our 5.750% Notes due 2031 (the “2031 notes”); (vii) our
1.750% Notes due 2033 (the “2033 notes”); and (viii) our 5.125% Notes due 2034 (the “2034 notes”); (ix) our 6.000% Notes 

## Asking the mighty LLM

In [14]:
# Ollama-served llama3.1 model; the same `llm` object is reused by the crew agents below.
# NOTE(review): enhanced_prompt embeds every retrieved chunk (each up to
# chunk_size characters) — confirm the full prompt fits the context window
# configured for this model.
llm = Ollama(model="llama3.1", temperature=0.5)

# Single-shot RAG answer; `answer` is later interpolated into task2's description.
answer = llm.invoke(enhanced_prompt)

In [15]:
# Show the model's raw answer to the RAG prompt.
print(answer)

Here are the requested answers:

1. **Realty Income Corporation**: (High)
2. **Real Estate Investment Trust (REIT)**: (Medium - inferred from the context, not explicitly stated)
3. **$272,083,100** (Low - this is a total revenue or income figure, but it's unclear if it refers specifically to net income)
4. 
* **Continued qualification as a real estate investment trust**: (High)
* **General domestic and foreign business, economic, or financial conditions**: (Medium)
* **Potential liability relating to environmental matters**: (Low)


Here are the answers:

1. **Realty Income Corporation** (Confidence: High)
2. **Real Estate Investment Trust (REIT)** (Confidence: Medium)
3. **$272,083,100** (Confidence: High)
4. 
* **Continued qualification as a real estate investment trust**
* **General domestic and foreign business, economic, or financial conditions**
* **Potential liability relating to environmental matters** (Confidence: Low)

Note that the provided text does not explicitly mention net income in the usual sense, but rather mentions "operating results" which is related to the overall performance of the company.

## Let the crew figure this out

In [18]:



# Web-search helper; its `.run` method backs the "DuckDuckGo Search" tool defined below.
duckduckgo_search = DuckDuckGoSearchRun()


def callback_function(output: TaskOutput):
    """Print a short completion notice for a finished task.

    Args:
        output: Result object of the finished task; its ``description`` and
            ``summary`` attributes are interpolated into the message.
    """
    notice = (
        "\n"
        "        Task completed!\n"
        f"        Task: {output.description}\n"
        f"        Output: {output.summary}\n"
        "    "
    )
    print(notice)

# Function to query the vector database
def query_vector_db(query: str) -> str:
    """Look up ``query`` in the vector store ``db`` and return the matching text.

    Args:
        query: Free-text search string.

    Returns:
        The page contents of the two most similar documents, joined by a
        single newline.
    """
    matches = db.similarity_search(query, k=2)
    page_texts = (match.page_content for match in matches)
    return "\n".join(page_texts)

# Tools handed to the analyst agent: web search and a lookup in the vector store.
tools = [
    Tool(name=tool_name, func=tool_func, description=tool_description)
    for tool_name, tool_func, tool_description in (
        (
            "DuckDuckGo Search",
            duckduckgo_search.run,
            "Useful for when you need to search the internet for current information.",
        ),
        (
            "Vector Database",
            query_vector_db,
            "Useful for retrieving specific information from the company's knowledge base.",
        ),
    )
]


# Financial Analyst agent: the only agent that receives `tools` (web search
# and vector-store lookup). It runs on the shared Ollama `llm`.
# NOTE(review): max_iter=50 presumably caps the agent's reasoning/tool-use
# iterations — confirm against the installed crewai version.
financial_analyst = Agent(
    role='Financial Analyst',
    goal='Provide accurate and insightful financial analysis based on given prompts',
    backstory="""You are an experienced financial analyst with a strong 
    background in reading financial statements and company related financial reports. 
    The correctness of your reports is key. If you get information wrong you'll receive salary cuts.
    You like your reports to be brief and to the point.""",
    verbose=True,
    allow_delegation=False,
    tools=tools,
    llm=llm,
    max_iter=50
    
)

# Prompt Expert agent: no tools; it only writes the prompt the analyst works from.
# The backstory is an f-string without placeholders, so it is used verbatim.
prompt_expert = Agent(
    role='Prompt Engineering Expert',
    goal='Create effective prompts for financial analysis tasks',
    backstory=f"""You are a skilled prompt engineer with a deep understanding 
    of language models and how to craft prompts that elicit the most useful 
    and accurate responses, particularly in the domain of finance. 
    """,
    verbose=True,
    allow_delegation=False,
    llm=llm
)

# Task 1 (prompt_expert): turn the RAG prompt into instructions for the analyst.
# The description embeds the full `enhanced_prompt` and `required_items`;
# `callback_function` is invoked with the task's output when it completes.
task1 = Task(
    description=f"""Using this RAG-generated prompt as a guide >>>{enhanced_prompt}<<<, create an 
    effective prompt for the financial analyst to provide                         
                        {required_items}
    of the company in question. You have already been given an regarding the company, but the information might come from an non-perfect RAG system.
    The financial analyst needs to be instructed to improve the given information. Your main task is to instruct the analyst to use the provided 
    tools when necessary. It is key for you to get the name of the company in question right, because when those are not correct,
    the financial analyst will work on the wrong company. So double check the company and the ticker symbol before you instruct the financial analyst. 
    Ask yourself if ticker and company name are alined.""",
    agent=prompt_expert,
    expected_output=f"""A well-crafted prompt for a financial analyst to provide
                        {required_items} 
                        for the company in question.
                        """,
    callback=callback_function
)

# Task 2 (financial_analyst): verify and complete the answers with the tools.
# The description embeds the earlier single-shot `answer` for cross-checking.
# NOTE(review): no explicit `context=[task1]` is passed; presumably the crew
# forwards task1's output to this task when tasks run in order — confirm.
task2 = Task(
    description=f"""Find the relevant information about the company in question with all the tools given to you. Based on the input from the previous task. 
                    Also look at {answer} to look at the information that was already gathered. Double check if it's correct.""",
    agent=financial_analyst,
    expected_output=f"""Please provide information about the following:
                        {required_items}

                        Base your answers strictly on the provided context and your websearch. Keep your responses brief and focused.""",
    callback=callback_function
)

# Assemble both agents and their tasks (prompt expert first, analyst second),
# run the crew, and show what it returns.
crew = Crew(
    verbose=True,
    tasks=[task1, task2],
    agents=[prompt_expert, financial_analyst],
)
result = crew.kickoff()
print(result)



[1m[95m [2024-08-14 15:29:15][DEBUG]: == Working Agent: Prompt Engineering Expert[00m
[1m[95m [2024-08-14 15:29:15][INFO]: == Starting Task: Using this RAG-generated prompt as a guide >>>You are a financial analyst tasked with extracting key information from a company's 10-K or 10-Q report. Focus solely on the provided excerpts to answer the following questions.

Here are the relevant excerpts from the report:
Exhibit 4.88
DESCRIPTION OF SECURITIES
As of December 31, 2023, Realty Income Corporation, a Maryland corporation (“Realty Income,” “we,” “us,” and the
“Company”), had ten classes of securities registered under Section 12 of the Securities Exchange Act of 1934, as amended (the
“Exchange Act”): (i) our common stock, $0.01 par value per share (“common stock”); (ii) our 1.125% Notes due 2027 (the “July
2027 notes”); (iii) our 1.875% Notes due 2027 (the “January 2027 notes”); (iv) our 1.625% Notes due 2030 (the “October 2030
notes”); (v) our 4.875% Notes due 2030 (the “July 2030