In [52]:
from sec_parser.processing_steps import AbstractProcessingStep
from sec_parser.processing_engine import AbstractSemanticElementParser
from sec_parser import Edgar10QParser

import bs4

from sec_parser.semantic_elements import (
    TextElement,
    TableElement,
    TitleElement,
    TopSectionTitle,
    NotYetClassifiedElement,
    ImageElement
)

class MyClassifier(AbstractProcessingStep):
    """Processing step that classifies elements by their HTML tag name.

    ``<b>`` elements are converted to ``TitleElement`` and ``<p>`` elements
    to ``TextElement``; every other element is passed through unchanged.
    Only the elements handed to ``_process`` are inspected: this step does
    not look inside an element (for example at ``inner_elements``).
    """

    # Maps an HTML tag name to the semantic element class it is converted to.
    _TAG_TO_ELEMENT_CLASS = {
        "b": TitleElement,
        "p": TextElement,
    }

    def __init__(self):
        super().__init__()
        # You can hold state in your processing steps
        self.processed_tags_count = 0

    # This method must be implemented when inheriting from AbstractProcessingStep
    def _process(self, elements):
        """Classify each element and return the resulting list.

        Args:
            elements: iterable of elements exposing ``html_tag.name``.

        Returns:
            A new list with one entry per input element, in the same order.

        Side effects:
            Increments ``self.processed_tags_count`` once per element (the
            counter accumulates across calls) and prints a summary line.
        """
        # Record this step as the origin of every element it creates, the
        # same way CompositeElementIdentificationStep names itself, instead
        # of leaving the origin empty.
        log_origin = self.__class__.__name__
        parsed = []
        for e in elements:
            self.processed_tags_count += 1
            element_cls = self._TAG_TO_ELEMENT_CLASS.get(e.html_tag.name)
            if element_cls is None:
                # Unrecognised tag: keep the element exactly as it came in.
                parsed.append(e)
            else:
                parsed.append(element_cls.create_from_element(e, log_origin))
        print(
            f"MyClassifier: Successfully processed {self.processed_tags_count} tags!\n"
        )
        return parsed

In [None]:
from sec_parser.semantic_elements import CompositeSemanticElement

class CompositeElementIdentificationStep(AbstractProcessingStep):
    """Processing step that turns every ``<div>`` element into a composite element."""

    def _process(self, elements):
        """Wrap ``<div>`` elements; pass every other element through untouched.

        For a ``<div>``, each child tag returned by ``html_tag.get_children()``
        is wrapped in a ``NotYetClassifiedElement`` and the collection is
        supplied as the composite's ``inner_elements``.

        Returns:
            A new list with one entry per input element, in the same order.
        """
        processed = []
        for element in elements:
            if element.html_tag.name != "div":
                processed.append(element)
                continue

            # NOTE(review): a <div> without children yields an empty list
            # here -- confirm CompositeSemanticElement accepts that.
            children = [
                NotYetClassifiedElement(child_tag)
                for child_tag in element.html_tag.get_children()
            ]
            composite = CompositeSemanticElement.create_from_element(
                element,
                inner_elements=children,
                log_origin="CompositeElementIdentificationStep",
            )
            processed.append(composite)
        return processed

def show(elements):
    """Print a one-line summary for each element.

    Every element must expose ``text``. The line consists of the element's
    string form followed by:

    * the number of inner elements, when the element has an
      ``inner_elements`` attribute;
    * otherwise the first 10 characters of its text plus ``...``, when the
      text is non-empty;
    * otherwise nothing.
    """
    for item in elements:
        # The text is read up front, even for elements that end up being
        # summarised by their inner-element count.
        preview = item.text[:10]
        if hasattr(item, "inner_elements"):
            suffix = f" (has {len(item.inner_elements)} elements inside)"
        elif preview:
            suffix = f" (text: {preview}...)"
        else:
            suffix = ""
        print(f"{item}{suffix}")


def get_steps():
    """Build the processing pipeline.

    Returns:
        A list of newly created step instances, in execution order:
        ``CompositeElementIdentificationStep`` followed by ``MyClassifier``.
    """
    step_classes = (CompositeElementIdentificationStep, MyClassifier)
    # Instantiate on every call so each caller gets its own step objects.
    return [step_cls() for step_cls in step_classes]



In [78]:
# Read the raw filing; the file only needs to stay open for the read itself.
with open('./Training_Filings/0001299709-20-000078.html', 'r', encoding='utf-8') as file:
    raw_filing = file.read()

# Keep only the markup inside the filing's <text> tag. `find` returns None
# when no such tag exists, so fail with a clear message instead of an
# AttributeError on None.
text_tag = bs4.BeautifulSoup(raw_filing, features='lxml').find('text')
if text_tag is None:
    raise ValueError("No <text> tag found in the filing; cannot extract its HTML body.")
html = text_tag.decode_contents()

In [80]:
# Build the parser around the custom pipeline: the `get_steps` function
# itself (not its result) is handed to the parser.
parser = Edgar10QParser(get_steps)
# Parse `html` with that parser.
elements = parser.parse(html)
# Print a one-line summary per resulting element.
show(elements)

MyClassifier: Successfully processed 1 tags!

[NotYetClassifiedElement<div>]


In [79]:
from sec_parser.processing_steps import AbstractElementwiseProcessingStep


class BetterClassifier(AbstractElementwiseProcessingStep):
    """Element-wise classifier driven by the HTML tag name.

    ``<b>`` becomes ``TitleElement``, ``<p>`` becomes ``TextElement`` and
    ``<img>`` becomes ``ImageElement``; any other element is returned as is.

    NOTE(review): presumably the base class calls ``_process_element`` for
    every element it visits -- confirm which elements that covers against
    the sec_parser documentation.
    """

    # Maps an HTML tag name to the semantic element class it is converted to.
    _TAG_TO_ELEMENT_CLASS = {
        "b": TitleElement,
        "p": TextElement,
        "img": ImageElement,
    }

    def _process_element(self, element, context):
        """Return the classified replacement for ``element``.

        Args:
            element: element exposing ``html_tag.name``.
            context: supplied by the caller; not used by this step.

        Returns:
            A new element of the mapped class, or ``element`` itself when
            its tag name has no mapping.
        """
        element_cls = self._TAG_TO_ELEMENT_CLASS.get(element.html_tag.name)
        if element_cls is None:
            return element
        # Name this step as the origin of the new element, the same way
        # CompositeElementIdentificationStep names itself, instead of
        # leaving the origin empty.
        return element_cls.create_from_element(element, self.__class__.__name__)



AttributeError: 'str' object has no attribute 'text'