In [2]:
from pathlib import Path

from langchain.document_loaders import UnstructuredHTMLLoader
from langchain_text_splitters import HTMLSectionSplitter
from langchain_text_splitters import HTMLHeaderTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_community import embeddings

from langchain_community.vectorstores import Chroma

from sentence_transformers import SentenceTransformer
import torch

import chromadb
from chromadb.utils import embedding_functions
from chromadb.db.base import UniqueConstraintError

import nomic
from nomic import embed

import uuid
from langchain.schema import Document

import time

## Load and split one document

In [3]:
relative_path = Path("data/sec-edgar-filings/ABR/10-K/0001628280-24-005456/primary-document.html")
file_path = Path.cwd() / relative_path
if not file_path.exists():
    raise FileNotFoundError(f"The file {file_path} does not exist.")

# Load the document
loader = UnstructuredHTMLLoader(str(file_path))
data = loader.load()


# Second split: RecursiveCharacterTextSplitter
chunk_size = 384
chunk_overlap = 0
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
final_splits = text_splitter.split_documents(data)
#final_splits = text_splitter.split_documents(header_documents)

print(f"Total number of splits: {len(final_splits)}")
print("Sample split:")
print(final_splits[5])

Total number of splits: 1006
Sample split:
page_content='Securities registered pursuant to Section 12(g) of the Act: None

Indicate by check mark if the registrant is a well-known seasoned issuer, as defined in Rule 405 of the Securities Act. þ No o

Indicate by check mark if the registrant is not required to file reports pursuant to Section 13 or Section 15(d) of the Act. Yes o þ' metadata={'source': 'C:\\Users\\big10\\ml_project\\FinRepReader\\data\\sec-edgar-filings\\ABR\\10-K\\0001628280-24-005456\\primary-document.html'}


In [3]:
#for i in range(len(final_splits)):
#    print(final_splits[i])

In [4]:
## Create and store embeddings

In [4]:
#collection_name = "sec_filings"
#collection = client.get_or_create_collection(name=collection_name, embedding_function=em)
#final_splits = final_splits[80:]
# Store splits in the database
documents = []
metadatas = []
ids = []

for split in final_splits:
    documents.append(split.page_content)
    metadatas.append(split.metadata)
    ids.append(str(uuid.uuid4()))  # Generate a unique ID for each split

# Add the documents to the collection
#collection.add(
#    documents=documents,
#    metadatas=metadatas,
#    ids=ids
#)



In [6]:
documents

['Type Asset Class Number Unpaid Principal Wtd. Avg. Pay Rate (1) Wtd. Avg. Remaining Months to Maturity (2) Bridge Loans Multifamily 316 $ 10,789,936 8.36 % 12.1 Single‑Family Rental 354 1,316,803 9.87 % 12.7 Land 7 118,595 0.13 % 0.3 Office 1 35,410 8.98 % 7.6    Retail 1 12,500 8.98 % 11.1 679 12,273,244 8.45 % 12.0 Mezzanine Loans Multifamily 46 232,104 8.77 % 60.3 Other 3',
 '16,353 3.35 % 4.5 49 248,457 8.41 % 56.6 Preferred Equity Multifamily 14 75,941 4.46 % 66.6 Other 3 9,800 — 10.7 17 85,741 3.95 % 60.3 Other Single‑Family Rental 2 7,564 9.84 % 13.9 Total 747 $ 12,615,006 8.42 % 13.2',
 '________________________________________',
 '(1)“Weighted Average Pay Rate” is a weighted average, based on each loan’s unpaid principal balance (“UPB”), of our interest rate required to be paid monthly as stated in the individual loan agreements. Certain loans and investments that require an additional rate of interest “accrual rate” to be paid at maturity are not included in the weighted av

In [7]:


#client = chromadb.PersistentClient(settings=Settings(chroma_db_impl="duckdb+parquet",
#                                    persist_directory="./data/chroma_new_db"
#                                ))

#client = chromadb.Client()

In [8]:

client = chromadb.PersistentClient(path="./data/chroma_new_db")

#client = chromadb.PersistentClient(path="./data/chroma_new_db", database='duckdb+parquet')
#from chromadb.config import Settings

#settings = Settings(
#    chroma_db_impl="duckdb+parquet",
#    persist_directory="./data/chroma_new_db") 
#client = chromadb.PersistentClient(settings=settings)

In [9]:




#embedding_function = embedding_functions.ollama_embedding_function.UserDefinedEmbeddingFunction(ollama_embedding_function)

ollama_ef = embedding_functions.OllamaEmbeddingFunction(
    url="http://localhost:11434/api/embeddings",
    model_name='mxbai-embed-large',
    #model_name="nomic-embed-text",
)


client.delete_collection("sec_filings")

#collection = client.get_or_create_collection(
#    "sec_filings",
#    embedding_function=ollama_ef
#)

collection = client.get_or_create_collection(
    "sec_filings"
)

#splits = ["This is my first text to embed",                    "This is my second document"]

#response = ollama.embeddings(model="nomic-embed-text", prompt=split)
#embedding = response["embedding"]



#embeddings = ollama_ef(splits)




In [10]:
max_batch_size = 10

# Function to create batches
def create_batches(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

In [11]:
document_batches = create_batches(documents, max_batch_size)
metadata_batches = create_batches(metadatas, max_batch_size)
id_batches = create_batches(ids, max_batch_size)

vectorstore = Chroma.from_texts(
    texts=documents,
    collection_name="rag-ollama",
    embedding=embeddings.OllamaEmbeddings(model='mxbai-embed-large'),
)

In [None]:
document_count = 0

for document_batch in document_batches:

    collection.add(
        documents=document_batch,
        metadatas=next(metadata_batches),
        ids=next(id_batches)
    )
    document_count += len(document_batch)
    print(f"Added {document_count} documents out of {len(document_batch)} to the collection.")
    

    time.sleep(3)  # Waits for 10 milliseconds

Added 10 documents out of 10 to the collection.
Added 20 documents out of 10 to the collection.
Added 30 documents out of 10 to the collection.
Added 40 documents out of 10 to the collection.
Added 50 documents out of 10 to the collection.
Added 60 documents out of 10 to the collection.
Added 70 documents out of 10 to the collection.
Added 80 documents out of 10 to the collection.
Added 90 documents out of 10 to the collection.


collection.add(
    #embeddings=embeddings,
    documents=documents,
    ids=ids
)

print(f"Added {len(documents)} documents to the collection.")



In [None]:
print('past add')

In [6]:
# user_question = "What are the earnings of then company mentioned in the report?"
user_question = """I need you to provide information about the following topics:
                   Name of the company:
                   Industry the company operates in:
                   Eanings:
                   Risks:
                """

In [None]:
results = collection.query(
    query_texts=[user_question],
    n_results=10
)

print(results)

In [None]:
retrieved_info = ""
for doc in results['documents'][0]:
    retrieved_info += f"{doc}\n\n"

# 3. Construct enhanced prompt
enhanced_prompt = f"""
You are a financial expert. You are given parts of the 10-k or 10-q of a company the final question is about.
Here is the part of the report I want you to base your answer on:

{retrieved_info}

Please base your output strictly on the context of the report. Keep your answers brief and focussed in markdown format. Have a bullet 
point for each requested variable.
{user_question}
"""

In [None]:
print(enhanced_prompt)

In [None]:
from langchain_community.llms import Ollama

llm = Ollama(model="llama3.1")

llm.invoke(enhanced_prompt)

**Name of the company**: Arbor Realty Trust, Inc.\n* **Industry the company operates in**:\n\t+ Structured Loan Origination and Investment Business\n\t+ Agency Loan Origination and Servicing Business\n\t+ Real estate investment and financing\n* **Earnings**:\n\t+ Not explicitly stated in the report. However, it can be inferred that the company generates revenue from its loan origination and servicing business.\n* **Risks**:\n\t+ Adverse changes in economic conditions and interest rates\n\t+ Quality and size of the investment pipeline\n\t+ Impairments in the value of collateral underlying loans and investments\n\t+ Changes in federal and state laws and regulations (including tax laws)\n\t+ Availability and cost of capital for future investment



In [None]:
horst

In [None]:
pip install nomic 

In [None]:
pip install faiss-gpu

In [None]:
from nomic import embed_text

In [None]:
import nomic

In [None]:
!pip install nomic-embed-text

In [1]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

# Initialize the embedding model
embeddings = HuggingFaceEmbeddings()

# Load and preprocess the documents
loader = TextLoader("data/sec-edgar-filings/ABR/10-K/0001628280-24-005456/primary-document.html")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)
print('1')
# Create the FAISS index and add documents
db = FAISS.from_documents(docs, embeddings)
print('2')
# Save the index locally
db.save_local("faiss_index")
print('3')
# Later, you can load the index
loaded_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)

# Perform a similarity search
query = "What industry does ABR operate in?"
docs = loaded_db.similarity_search(query)

# Print the results
for doc in docs:
    print(doc.page_content)

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


1


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
db.add_documents()

In [7]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.schema import Document
import time

# Initialize the embedding model
embeddings = HuggingFaceEmbeddings()

# Create an empty FAISS index
db = FAISS.from_texts([""], embeddings)

# Function to create batches
def create_batches(data, batch_size):
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]

# Assume we have these lists prepared
#documents = [...]  # List of document contents
#metadatas = [...]  # List of metadata dictionaries
#ids = [...]  # List of document IDs

# Create Document objects
doc_objects = [Document(page_content=doc, metadata=meta) for doc, meta in zip(documents, metadatas)]

max_batch_size = 20
document_batches = create_batches(doc_objects, max_batch_size)
id_batches = create_batches(ids, max_batch_size)

document_count = 0
for doc_batch in document_batches:
    id_batch = next(id_batches)
    
    # Add the batch to FAISS
    db.add_documents(documents=doc_batch, ids=id_batch)
    
    document_count += len(doc_batch)
    print(f"Added {document_count} documents out of {len(doc_objects)} to the collection.")
    
    time.sleep(1)  # Wait for 10 milliseconds

# Save the index locally
db.save_local("faiss_index")

# Perform a similarity search
query = """I need you to provide information about the following topics:
                   Name of the company:
                   Industry the company operates in:
                   Eanings:
                   Risks:
                """
results = db.similarity_search(query)

# Print the results
for doc in results:
    print(doc.page_content)

Added 20 documents out of 1006 to the collection.
Added 40 documents out of 1006 to the collection.
Added 60 documents out of 1006 to the collection.
Added 80 documents out of 1006 to the collection.
Added 100 documents out of 1006 to the collection.
Added 120 documents out of 1006 to the collection.
Added 140 documents out of 1006 to the collection.
Added 160 documents out of 1006 to the collection.
Added 180 documents out of 1006 to the collection.
Added 200 documents out of 1006 to the collection.
Added 220 documents out of 1006 to the collection.
Added 240 documents out of 1006 to the collection.
Added 260 documents out of 1006 to the collection.
Added 280 documents out of 1006 to the collection.
Added 300 documents out of 1006 to the collection.
Added 320 documents out of 1006 to the collection.
Added 340 documents out of 1006 to the collection.
Added 360 documents out of 1006 to the collection.
Added 380 documents out of 1006 to the collection.
Added 400 documents out of 1006 to 