In [1]:
from pathlib import Path

from langchain.document_loaders import UnstructuredHTMLLoader
from langchain_text_splitters import HTMLSectionSplitter
from langchain_text_splitters import HTMLHeaderTextSplitter
from langchain_text_splitters import RecursiveCharacterTextSplitter


from sentence_transformers import SentenceTransformer
import torch

import chromadb
from chromadb.utils import embedding_functions
from chromadb.db.base import UniqueConstraintError
import nomic
from nomic import embed

import uuid
from langchain.schema import Document



  from tqdm.autonotebook import tqdm, trange


## Load and split one document

In [2]:
relative_path = Path("data/sec-edgar-filings/ABR/10-K/0001628280-24-005456/primary-document.html")
file_path = Path.cwd() / relative_path
if not file_path.exists():
    raise FileNotFoundError(f"The file {file_path} does not exist.")

# Load the document
loader = UnstructuredHTMLLoader(str(file_path))
data = loader.load()

# First split: HTMLHeaderTextSplitter
header_splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2"), ("h3", "Header 3")],
)
html_header_splits = header_splitter.split_text(data[0].page_content)

# Convert splits to Documents with metadata
header_documents = []
for split in html_header_splits:
    metadata = {
        "source": str(file_path),
        "Header 1": split.metadata.get("Header 1", ""),
        "Header 2": split.metadata.get("Header 2", ""),
        "Header 3": split.metadata.get("Header 3", ""),
    }
    header_documents.append(Document(page_content=split.page_content, metadata=metadata))

# Second split: RecursiveCharacterTextSplitter
chunk_size = 2000
chunk_overlap = 200
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)
final_splits = text_splitter.split_documents(header_documents)

print(f"Total number of splits: {len(final_splits)}")
print("Sample split:")
print(final_splits[5])

Total number of splits: 155
Sample split:
page_content='loans, junior participating interests in first mortgages and preferred and direct equity. We also invest in real estate-related joint ventures and may directly acquire real property and invest in real estate-related notes and certain mortgage-related securities. Through our Agency Business, we originate, sell and service a range of multifamily finance products through the Federal National Mortgage Association (“Fannie Mae”) and the Federal Home Loan Mortgage Corporation (“Freddie Mac,” and together with Fannie Mae, the government-sponsored enterprises, or “GSEs”), the Government National Mortgage Association (“Ginnie Mae”), Federal Housing Authority (“FHA”) and the U.S. Department of Housing and Urban Development (together with Ginnie Mae and FHA, “HUD”). We retain the servicing rights and asset management responsibilities on substantially all loans we originate and sell under the GSE and HUD programs. We are an approved Fannie Ma

In [3]:
## Create and store embeddings

In [4]:
#collection_name = "sec_filings"
#collection = client.get_or_create_collection(name=collection_name, embedding_function=em)

# Store splits in the database
documents = []
#metadatas = []
ids = []

for split in final_splits:
    documents.append(split.page_content)
    #metadatas.append(split.metadata)
    ids.append(str(uuid.uuid4()))  # Generate a unique ID for each split

# Add the documents to the collection
#collection.add(
#    documents=documents,
#    metadatas=metadatas,
#    ids=ids
#)



In [None]:


client = chromadb.PersistentClient(path="/data/chromadb")

#embedding_function = embedding_functions.ollama_embedding_function.UserDefinedEmbeddingFunction(ollama_embedding_function)

ollama_ef = embedding_functions.OllamaEmbeddingFunction(
    url="http://localhost:11434/api/embeddings",
    model_name="nomic-embed-text",
)

collection = client.get_or_create_collection(
    "sec_filings",
    embedding_function=ollama_ef
)

#splits = ["This is my first text to embed",                    "This is my second document"]

#response = ollama.embeddings(model="nomic-embed-text", prompt=split)
#embedding = response["embedding"]



#embeddings = ollama_ef(splits)


collection.add(
    #embeddings=embeddings,
    documents=documents,
    ids=ids
)

print(f"Added {len(documents)} documents to the collection.")

results = collection.query(
    query_texts=["Earnings 2024"],
    n_results=1
)

print(results)

In [None]:
collection.get_model()