In [None]:
# Mount Google Drive at /content/drive (Colab only); the data and database
# paths configured in later cells live under this mount point.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
pip install -U langchain-community langchain_openai tiktoken chromadb

In [None]:
import os
import gc
from bs4 import BeautifulSoup
import re
from langchain.document_loaders import TextLoader
from langchain.schema import Document
import numpy as np
from concurrent.futures import ProcessPoolExecutor
import logging
from typing import List
import time
from langchain.text_splitter import RecursiveCharacterTextSplitter
import concurrent.futures
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import openai

In [None]:
# Root directory of the documents to load and embed (walked recursively by load()).
FILEPATH = '/content/drive/MyDrive/Colab Notebooks/financial_reports/'

In [None]:
class Document:
  """Minimal text container: a string payload plus a metadata dictionary.

  NOTE(review): this definition shadows the `Document` imported from
  `langchain.schema` earlier in the notebook -- confirm that is intended.
  """

  def __init__(self, page_content: str, metadata: dict = None):
    # A missing or empty metadata argument is replaced by a fresh empty dict.
    self.metadata = metadata or {}
    self.page_content = page_content

class HTMLTextLoader(TextLoader):
  """Turn the HTML <body> of one filing file into a single cleaned Document.

  Metadata is derived from the file's location, which is expected to look like
  ``<company_ticker>/<prefix>-<year>[-...]/<file>``.
  """

  # (compiled pattern, replacement) pairs applied in order by clean_text().
  # Adjust based on actual situation
  _CLEAN_PATTERNS = tuple(
      (re.compile(pattern), replacement)
      for pattern, replacement in (
          (r'(us-gaap|xbrli|srt|P\d{1,2}Y)', ''),  # literal tokens 'us-gaap', 'xbrli', 'srt' and P<1-2 digits>Y
          (r'\b\d{8,}\b', ''),                      # standalone digit runs of 8 or more
          (r'\b\d{2,4}[-/\.\d]*\b', ''),            # tokens starting with 2-4 digits, optionally followed by digits or - / .
          (r'\s+', ' '),                            # collapse every whitespace run into one space
          (r'[^a-zA-Z0-9\s.,!?\'"(){}-]', ''),      # drop any character outside the allowed set
      )
  )

  def __init__(self, file_path):
    """Remember `file_path` and derive its metadata.

    Args:
      file_path: path of the file this loader is responsible for.
    """
    # Run the base-class initialiser so state inherited from TextLoader is set up too.
    super().__init__(file_path)
    self.file_path = file_path
    self.metadata = self.extract_metadata(file_path)

  def extract_metadata(self, file_path):
    """Derive ticker and year from the two directory levels above `file_path`.

    Args:
      file_path: path of the file to describe.

    Returns:
      dict with keys 'file_path', 'company_ticker' and 'year'. 'year' is the
      second dash-separated field of the parent directory name, or 'unknown'
      (with a logged warning) when that name contains no dash.
    """
    # Adjust based on actual filing structure within the directory
    filing_dir = os.path.dirname(file_path)
    company_ticker = os.path.basename(os.path.dirname(filing_dir))
    name_parts = os.path.basename(filing_dir).split('-')
    if len(name_parts) > 1:
      year = name_parts[1]
    else:
      # Previously this raised IndexError; degrade gracefully instead.
      logging.warning(f"Cannot derive year from directory name of {file_path}")
      year = 'unknown'
    return {'file_path': file_path, 'company_ticker': company_ticker, 'year': year}

  def preprocess(self, content) -> List[Document]:
    """Extract and clean the visible text of an HTML document.

    Args:
      content: raw HTML text.

    Returns:
      A one-element list holding the cleaned Document, or an empty list when
      parsing fails or the markup has no <body> tag (both cases are logged).
    """
    try:
      soup = BeautifulSoup(content, 'html.parser')
      body = soup.find('body')
    except Exception as e:
      logging.error(f"Error parsing HTML for {self.file_path}: {e}")
      return []

    if body is None:
      logging.warning(f"No <body> tag found in {self.file_path}")
      return []

    # Removes scripts and styles
    for script_or_style in body(['script', 'style']):
      script_or_style.decompose()

    clean_text = body.get_text(separator=' ', strip=True)
    clean_text = self.clean_text(clean_text)

    # Hand out a copy so a caller mutating one Document's metadata cannot
    # affect the loader or Documents returned by other calls.
    return [Document(page_content=clean_text, metadata=dict(self.metadata))]

  def clean_text(self, text: str) -> str:
    """Apply the cleaning substitutions in order and strip surrounding whitespace.

    Args:
      text: text to clean.

    Returns:
      The cleaned text.
    """
    for pattern, replacement in self._CLEAN_PATTERNS:
      text = pattern.sub(replacement, text)

    return text.strip()


In [None]:
def load(filepath) -> List[tuple]:
  """Collect every 'full-submission.txt' found below `filepath`.

  The directory holding the file is expected to be named
  ``<prefix>-<year>[-...]`` and its parent directory is taken as the ticker.

  Args:
    filepath: root directory to walk recursively.

  Returns:
    Sorted list of ``(file_path, company_ticker, year)`` tuples. Directories
    whose name contains no dash are skipped with a logged warning instead of
    aborting the whole scan. A missing root yields an empty list.
  """
  # Adjust based on actual naming rules
  target_name = 'full-submission.txt'
  files = []
  # Adjust based on actual filing structure within the directory
  for root, dirs, files_in_dir in os.walk(filepath):
    if target_name not in files_in_dir:
      continue
    name_parts = os.path.basename(root).split('-')
    if len(name_parts) < 2:
      logging.warning(f"Skipping {root}: cannot derive year from directory name")
      continue
    company_ticker = os.path.basename(os.path.dirname(root))
    files.append((os.path.join(root, target_name), company_ticker, name_parts[1]))
  # os.walk order depends on the file system; sort so runs are reproducible.
  files.sort()
  return files

In [None]:
def process_file(file_info):
    """Read one filing from disk and convert it into cleaned Documents.

    Args:
      file_info: ``(file_path, company_ticker, year)`` tuple as produced by
        load(); only the path is used here, the loader derives its own metadata.

    Returns:
      The list returned by HTMLTextLoader.preprocess, or an empty list when
      the file cannot be read (the error is logged).
    """
    file_path, _company_ticker, _year = file_info
    loader = HTMLTextLoader(file_path)

    try:
      # errors='replace' keeps a single undecodable byte from raising and
      # thereby aborting the whole executor.map run in the caller.
      with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
        content = f.read()
    except OSError as e:
      logging.error(f"Error reading {file_path}: {e}")
      return []

    return loader.preprocess(content)

def load_all_docs(file_path) -> List[Document]:
  """Discover all filings under `file_path` and process them in worker processes.

  Args:
    file_path: root directory handed to load().

  Returns:
    One flat list with the Documents of every file, in the order the files
    were returned by load().
  """
  file_infos = load(file_path)

  # executor.map yields one result list per input file, in input order.
  with ProcessPoolExecutor() as pool:
    per_file_docs = list(pool.map(process_file, file_infos))

  flattened = []
  for docs in per_file_docs:
    flattened.extend(docs)
  return flattened

In [None]:
# Parse every filing under FILEPATH into cleaned Document objects (at most one per file).
all_docs = load_all_docs(FILEPATH)

In [None]:
# Chunking configuration: 1000-character chunks overlapping by 200 characters.
# NOTE(review): HTMLTextLoader.clean_text collapses every whitespace run into a
# single space, so the "\n\n" and "\n" separators presumably never match and
# splitting falls through to " " and "," -- confirm this is intended.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", " ", ","],
    add_start_index = True
)

# Split documents into chunks
split_documents = text_splitter.split_documents(all_docs)

In [None]:
# Setup API key for text embedding.
# Prefer a key that is already present in the environment so a real secret
# never has to be pasted into (and saved with) the notebook; the placeholder
# is only a fallback.
API_KEY = os.environ.get('OPENAI_API_KEY') or 'YOUR_API_KEY'
os.environ['OPENAI_API_KEY'] = API_KEY
if API_KEY == 'YOUR_API_KEY':
  logging.warning("OPENAI_API_KEY is not set; provide a real key before requesting embeddings.")

In [None]:
# Setup Chroma database path and text embedding model
# NOTE(review): unlike FILEPATH, this path has no 'MyDrive' segment under
# /content/drive -- verify the location exists and is writable on the mounted drive.
CHROMA_PATH = '/content/drive/FinScope3D/Unstructured_Data/chroma_db'
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

In [None]:
def batch_documents(documents, batch_size):
  """Split a sequence into consecutive slices of at most `batch_size` items.

  Args:
    documents: sliceable sequence (e.g. a list) to partition.
    batch_size: maximum number of items per batch; must be positive.

  Returns:
    List of slices in original order; the last one may be shorter. An empty
    input yields an empty list.

  Raises:
    ValueError: if `batch_size` is not positive. A negative size previously
      produced an empty result, silently dropping every document.
  """
  if batch_size <= 0:
    raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
  return [documents[start:start + batch_size] for start in range(0, len(documents), batch_size)]

In [None]:
# Number of chunks embedded and written to the vector store per batch.
# NOTE(review): the vector store may enforce its own per-call batch limit;
# verify 10000 is accepted by the installed chromadb version.
batch_size = 10000
batched_documents = batch_documents(split_documents, batch_size)

In [None]:
def process_and_persist_batch(batch, embeddings, persist_directory, batch_idx):
  """Embed one batch of documents and store it in a persistent Chroma database.

  Args:
    batch: documents to add to the store.
    embeddings: embedding model, handed to Chroma as its embedding function.
    persist_directory: directory of the Chroma database to open or create.
    batch_idx: zero-based batch index, used only for the progress message.
  """
  # Open the store at the directory the caller asked for; the previous code
  # ignored this parameter and always used the global CHROMA_PATH.
  db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

  # The store computes embeddings itself through `embedding_function` when
  # documents are added, so embedding the texts separately beforehand only
  # duplicated the embedding requests for every batch.
  db.add_documents(batch)
  db.persist()
  print(f"Batch {batch_idx + 1} processed and persisted.")