# Notebook to collect financial information

## Import libraries

In [15]:
from pathlib import Path

from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.document_loaders import TextLoader

from langchain_community import embeddings
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings


from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.schema import Document

import uuid
import time


## Load and split one document

In [2]:
relative_path = Path("data/sec-edgar-filings/ABR/10-K/0001628280-24-005456/primary-document.html")
file_path = Path.cwd() / relative_path
if not file_path.exists():
    raise FileNotFoundError(f"The file {file_path} does not exist.")


loader = UnstructuredHTMLLoader(str(file_path))
data = loader.load()


chunk_size = 512 * 3 # mxbai-embed-large context length = 512 defensively assumend token length = 3
#chunk_size = 2048 * 3 # nomic-embed-text context length = 2048 defensively assumend token length = 3
chunk_overlap = 0

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

final_splits = text_splitter.split_documents(data)

print(f"Total number of splits: {len(final_splits)}")
print("Sample split:")
print(final_splits[5])

Total number of splits: 235
Sample split:
page_content='changes in interest rates; the quality and size of the investment pipeline and the rate at which we can invest our cash; impairments in the value of the collateral underlying our loans and investments; inflation; changes in federal and state laws and regulations, including changes in tax laws; the availability and cost of capital for future investments; and competition. Readers are cautioned not to place undue reliance on any of these forward-looking statements, which reflect our views as of the date of this report. The factors noted above could cause our actual results to differ significantly from those contained in any forward-looking statement.' metadata={'source': 'C:\\Users\\big10\\ml_project\\FinRepReader\\data\\sec-edgar-filings\\ABR\\10-K\\0001628280-24-005456\\primary-document.html'}


## Create and store embeddings

documents = []
metadatas = []
ids = []

for split in final_splits:
    documents.append(split.page_content)
    metadatas.append(split.metadata)
    ids.append(str(uuid.uuid4()))  # Generate a unique ID for each split


In [None]:
def create_batches(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

In [8]:


# Initialize the embedding model
#embeddings = HuggingFaceEmbeddings()

embeddings = OllamaEmbeddings(
    base_url="http://localhost:11434",
    #model="nomic-embed-text"
    model="mxbai-embed-large"
)



db = FAISS.from_texts([""], embeddings)



doc_objects = [Document(page_content=doc, metadata=meta) for doc, meta in zip(documents, metadatas)]
#doc_objects = final_splits

max_batch_size = 10
document_batches = create_batches(doc_objects, max_batch_size)
id_batches = create_batches(ids, max_batch_size)

document_count = 0
for doc_batch in document_batches:
    id_batch = next(id_batches)
    
    # Add the batch to FAISS
    db.add_documents(documents=doc_batch, ids=id_batch)
    
    document_count += len(doc_batch)
    print(f"Added {document_count} documents out of {len(doc_objects)} to the collection.")
    
    time.sleep(0.1)  # Wait for 10 milliseconds

# Save the index locally
db.save_local("faiss_index")



Added 10 documents out of 235 to the collection.
Added 20 documents out of 235 to the collection.
Added 30 documents out of 235 to the collection.
Added 40 documents out of 235 to the collection.
Added 50 documents out of 235 to the collection.
Added 60 documents out of 235 to the collection.
Added 70 documents out of 235 to the collection.
Added 80 documents out of 235 to the collection.
Added 90 documents out of 235 to the collection.
Added 100 documents out of 235 to the collection.
Added 110 documents out of 235 to the collection.
Added 120 documents out of 235 to the collection.
Added 130 documents out of 235 to the collection.
Added 140 documents out of 235 to the collection.
Added 150 documents out of 235 to the collection.
Added 160 documents out of 235 to the collection.
Added 170 documents out of 235 to the collection.
Added 180 documents out of 235 to the collection.
Added 190 documents out of 235 to the collection.
Added 200 documents out of 235 to the collection.
Added 210

## Perform a similarity search + create prompt

In [9]:
queries = [
    "company name legal entity",
    "exact name of registrant as specified in its charter",
    "primary business operations industry sector",
    "regulated as"
    "CONSOLIDATED STATEMENTS income"
    "financial results net income revenue earnings",
    "risk factors business challenges uncertainties",
    
]

results = []
for query in queries:
    results.extend(db.similarity_search(query, k=4))



In [10]:
retrieved_info = ""
for doc in results:
    print(f'{doc.page_content}\n\n')
    retrieved_info += f"{doc}\n\n"

Exhibit # Description Form Exhibit # Filing Date  3.1 Articles of Incorporation of Arbor Realty Trust, Inc. S-11 3.1 11/13/2003 3.2 Articles of Amendment to Articles of Incorporation of Arbor Realty Trust, Inc. 10-Q 3.2 8/7/2007 3.3 Articles Supplementary of Arbor Realty Trust, Inc. S-11 3.2 11/13/2003 3.4 Articles Supplementary of 6.375% Series D Cumulative Redeemable Preferred Stock. 8-A 3.7 6/2/2021 3.5 Articles Supplementary of 6.25% Series E Cumulative Redeemable Preferred Stock. 8-A 3.5 8/11/2021 3.6 Articles Supplementary of 6.25% Series F Fixed-to-Floating Rate Cumulative Redeemable Preferred Stock. 8-A 3.6 10/12/2021 3.7 New Articles Supplementary classifying 3,565,000 shares of 6.25% Series F Cumulative Redeemable Preferred Stock 8-K 3.2 2/7/2022 3.8 Articles Supplementary designating Special Voting Preferred Stock. 8-K 3.1 7/15/2016 3.9 Amended and Restated Bylaws of Arbor Realty Trust, Inc. 10-K 3.9 2/17/2023 4.1 Form of Certificate for Common Stock. S-11/A 12/31/2003 4.2 S

In [11]:
enhanced_prompt = f"""You are a financial analyst tasked with extracting key information from a company's 10-K or 10-Q report. Focus solely on the provided excerpts to answer the following questions.

Here are the relevant excerpts from the report:
{retrieved_info}

Format your response as follows:
- Name of the company: [Single line answer]
- Industry: [Single line answer]
- Earnings: [Brief summary, max 2-3 lines]
- Risks: [Bullet list of top 3-5 risks, each 1 line]

For each answer, indicate your confidence level (High/Medium/Low) based on how explicitly the information is stated in the text.

If any requested information is not explicitly stated in the provided excerpts, respond with "Information not provided in the given context."

If the provided excerpts are insufficient to answer these questions comprehensively, state which specific additional sections of the report would be helpful.

Please provide information about the following:
1. Name of the company
2. Industry the company operates in
3. Earnings
4. Risks

Base your answers strictly on the provided context. Keep your responses brief and focused.
"""

In [12]:
print(enhanced_prompt)

You are a financial analyst tasked with extracting key information from a company's 10-K or 10-Q report. Focus solely on the provided excerpts to answer the following questions.

Here are the relevant excerpts from the report:
page_content='Exhibit # Description Form Exhibit # Filing Date  3.1 Articles of Incorporation of Arbor Realty Trust, Inc. S-11 3.1 11/13/2003 3.2 Articles of Amendment to Articles of Incorporation of Arbor Realty Trust, Inc. 10-Q 3.2 8/7/2007 3.3 Articles Supplementary of Arbor Realty Trust, Inc. S-11 3.2 11/13/2003 3.4 Articles Supplementary of 6.375% Series D Cumulative Redeemable Preferred Stock. 8-A 3.7 6/2/2021 3.5 Articles Supplementary of 6.25% Series E Cumulative Redeemable Preferred Stock. 8-A 3.5 8/11/2021 3.6 Articles Supplementary of 6.25% Series F Fixed-to-Floating Rate Cumulative Redeemable Preferred Stock. 8-A 3.6 10/12/2021 3.7 New Articles Supplementary classifying 3,565,000 shares of 6.25% Series F Cumulative Redeemable Preferred Stock 8-K 3.2 2

## Asking the mighty llm

In [17]:
llm = Ollama(model="llama3.1")

answer = llm.invoke(enhanced_prompt)

In [18]:
print(answer)

Based on the given excerpts, here are my responses:

**1. Name of the company:**
American Business Realty (ABR) - Confidence level: Medium

**2. Industry:**
Real Estate Investment Trust (REIT) - Confidence level: High

**3. Earnings:**
The company may experience increases in loan loss reserves and other impairments, encounter difficulty estimating loan loss reserves, and experience a decline in loan repayments if economic conditions deteriorate or they are unable to invest excess capital on acceptable terms. They also generate sufficient revenue from operations to pay their operating expenses and pay dividends to stockholders. - Confidence level: Medium

**4. Risks:**
* Volatility in our stock price
* Losses of key personnel with long-standing business relationships
* Adverse resolutions of lawsuits
* Future terrorist attacks, military conflict, or changes to laws and regulations (including environmental, social, and governance matters)
* COVID-19 pandemic causing severe disruptions to