In [1]:
import re
import typer
import pandas as pd
from typing import Dict
from pathlib import Path
import edgar

# Overview

* Explain EDGAR data, background on why it's useful for Kellogg
* CLI example (screenshot, open terminal and run against sample)
* API example (start up server, go to FastAPI docs page, provide example file)
* Code documentation (UML diagram, SQL tables for output)
* Internal code examples can pull directly from actual source
* Show grep on KLC

# ```edgar2data```: Extracting Information from EDGAR Documents

<br>
<br>
<br>

<center><img src="figures/library.png" width="33%" style='border:5px solid #000000'/></center>


# Example: Insider Trading Data: SEC Forms 3, 4, and 5

Forms 3, 4, and 5 are reports filed with the SEC by investors who may buy or sell shares in companies where they are deemed insiders. The SEC defines an insider as any officer or director of a publicly traded company, or any shareholder who owns more than 10% of its stock.

* https://www.sec.gov/files/forms-3-4-5.pdf
* https://www.sec.gov/Archives/edgar/data/1326190/000101297517000759/xslF345X03/edgar.xml
* https://whalewisdomalpha.com/form-4-insider-trading-analysis/
* https://www.sec.gov/Archives/edgar/data/1318605/000149473018000006/xslF345X03/edgardoc.xml

These filings are publicly available through the [SEC EDGAR website](https://www.sec.gov/edgar/search/).

For example, here is a [Form 4](https://www.sec.gov/Archives/edgar/data/1326190/000101297517000759/).

# ```edgar2data``` uses XML parsing and regular expressions (and, in the future, NLP) to extract information

<br>
<br>
<br>

<center><img src="figures/unstrctured-data-types.png" width="80%" style='border:5px solid #000000'/></center>

# <span style="color:purple">Scaling up to Multiple Documents</span>

<br>
<br>
<br>

<center><img src="figures/information_extraction.png" width="100%" style='border:5px solid #000000'/></center>

In [None]:
# Import `process` from the package's CLI module and print its help text
# (signature and docstring) inline in the notebook.
from edgar.cli import process
help(process)

In [None]:
import re
from typing import Dict

# Regexes for the plain-text header lines of a filing. Each pattern has exactly
# one capture group holding the raw field value; the extractor that consumes
# this table takes the first match and strips surrounding whitespace.
document_fields_header: Dict[str, re.Pattern] = {
    "accession": re.compile(r"^\s*ACCESSION NUMBER:(.+?)$", flags=re.DOTALL | re.MULTILINE),
    "sec_document": re.compile(r"<SEC-DOCUMENT>(.+?):", flags=re.DOTALL | re.MULTILINE),
    # This entry used to repeat the <SEC-DOCUMENT> pattern, so the "sec_header"
    # column was always a duplicate of "sec_document". It must match its own tag.
    "sec_header": re.compile(r"<SEC-HEADER>(.+?):", flags=re.DOTALL | re.MULTILINE),
    "acceptance_datetime": re.compile(r"<ACCEPTANCE-DATETIME>(.+?)$", flags=re.DOTALL | re.MULTILINE)
}
    
# Show each header field name (white on red) followed by the compiled pattern
# that extracts it (white on black).
for key in document_fields_header:
    val = document_fields_header[key]
    typer.secho(f"key: {key}", bg=typer.colors.RED, fg=typer.colors.WHITE)
    typer.secho(f"val: {val}", bg=typer.colors.BLACK, fg=typer.colors.WHITE)

In [4]:
def extract_doc_header_info(f: Path) -> Dict[str, str]:
    """Extract the plain-text header fields from one filing.

    Every pattern in ``document_fields_header`` is tried against the file's
    text. The capture group of the first match, stripped of surrounding
    whitespace, is stored under the field's name.

    Args:
        f: Path of the filing to read.

    Returns:
        A dict with ``"filename"`` (the file's base name) plus one entry per
        header field. A field whose pattern does not match is ``None``, and a
        warning naming the file and field is printed.
    """
    text = f.read_text()
    row_dict = {"filename": f.name}
    for field, pat in document_fields_header.items():
        # Only the first occurrence is wanted. search() stops there, whereas
        # findall() would scan the whole filing and build a list of every match.
        # Each pattern is expected to define exactly one capture group.
        hit = pat.search(text)
        if hit is None:
            row_dict[field] = None
            typer.secho(f"WARNING: {f} does not contain {field}", fg=typer.colors.RED)
        else:
            row_dict[field] = hit.group(1).strip()
    return row_dict

# Print the function object's repr (white on black).
typer.secho(f"{extract_doc_header_info}", fg=typer.colors.WHITE, bg=typer.colors.BLACK)

[37m[40m<function extract_doc_header_info at 0x7fcf252de3b0>[0m


In [None]:
from pathlib import Path
import pandas as pd

# Run the header extractor over every .txt filing in the sample directory and
# collect one row per file.
row_dicts = []
in_dir = Path("../tests/data/form-4/sample/2020")
# glob() yields files in no guaranteed order. Sorting keeps the row order, and
# therefore the preview of the frame, identical on every run and filesystem.
for f in sorted(in_dir.glob("*.txt")):
    typer.secho(f"processing file: {f.name}", fg=typer.colors.WHITE, bg=typer.colors.BLACK)
    row_dicts.append(extract_doc_header_info(f))

# With no rows the frame has no "filename" column and set_index() would fail
# with an opaque KeyError. Report the likely cause (a wrong working directory
# or path) instead.
if not row_dicts:
    raise FileNotFoundError(f"no .txt filings found in {in_dir.resolve()}")

header_df = pd.DataFrame(row_dicts)
header_df = header_df.set_index("filename")

In [None]:
# Preview up to the first 20 rows of the extracted header fields.
header_df.head(20)

# <span style="color:purple">Working with XML</span>

https://www.xmlviewer.org/

In [None]:
from typing import Dict

# Output column name -> element path handed to ElementTree's find() on the
# root of a filing's XML document.
document_fields: Dict[str, str] = {
    # Elements directly under the root share their name with the output column.
    **{
        tag: tag
        for tag in ("schemaVersion", "documentType", "periodOfReport", "notSubjectToSection16")
    },
    # Issuer details are nested one level down, inside <issuer>.
    **{
        tag: f"issuer/{tag}"
        for tag in ("issuerCik", "issuerName", "issuerTradingSymbol")
    },
}
    
# Show each output column name (white on red) followed by the XML path it is
# read from (white on black).
for key in document_fields:
    val = document_fields[key]
    typer.secho(f"key: {key}", bg=typer.colors.RED, fg=typer.colors.WHITE)
    typer.secho(f"val: {val}", bg=typer.colors.BLACK, fg=typer.colors.WHITE)

In [12]:
from typing import Dict
import xml.etree.ElementTree as ET
import re

def extract_doc_xml_info(f: Path) -> Dict[str, str]:
    """Extract the fields listed in ``document_fields`` from a filing's XML.

    The filing is plain text with an XML document embedded between ``<XML>``
    and ``</XML>``. The first such block is parsed, and each path in
    ``document_fields`` is looked up relative to the XML root.

    Args:
        f: Path of the filing to read.

    Returns:
        A dict with ``"filename"`` (the file's base name) plus one entry per
        key of ``document_fields``. A present element contributes its stripped
        text, or ``""`` when the element is empty. A missing element, or a
        filing with no ``<XML>`` block at all, leaves the entry as ``None``
        and prints a warning.

    Raises:
        xml.etree.ElementTree.ParseError: If the embedded block is not
            well-formed XML.
    """
    row_dict = {"filename": f.name}
    # Pre-fill every field so each row has the same keys, even on early return.
    row_dict.update(dict.fromkeys(document_fields))

    # Read the file once. The original read it twice and discarded one copy.
    text = f.read_text()

    # The quantifier is non-greedy on purpose: a greedy match would run from the
    # first <XML> to the LAST </XML>, gluing several embedded documents into one
    # string that ElementTree cannot parse.
    xml_match = re.search(r"<XML>(.+?)</XML>", text, flags=re.DOTALL)
    if xml_match is None:
        typer.secho(f"WARNING: {f.name} does not contain an <XML> section", bg=typer.colors.BLACK, fg=typer.colors.WHITE)
        return row_dict

    # strip() so that a leading XML declaration starts at offset 0.
    root = ET.fromstring(xml_match.group(1).strip())

    # Look up each field by its path relative to the root element.
    for field, pat in document_fields.items():
        node = root.find(pat)
        if node is None:
            typer.secho(f"WARNING: {f.name} does not contain {field}", bg=typer.colors.BLACK, fg=typer.colors.WHITE)
        else:
            # An empty element has text None; treat it as an empty string
            # instead of failing on None.strip().
            row_dict[field] = (node.text or "").strip()

    return row_dict

In [None]:
import typer
import pandas as pd
from pathlib import Path

# Run the XML extractor over every .txt filing in the sample directory and
# collect one row per file.
row_dicts = []
in_dir = Path("../tests/data/form-4/sample/2020")
# glob() yields files in no guaranteed order. Sorting keeps the row order, and
# therefore the preview of the frame, identical on every run and filesystem.
for f in sorted(in_dir.glob("*.txt")):
    row_dicts.append(extract_doc_xml_info(f))

# With no rows the frame has no "filename" column and set_index() would fail
# with an opaque KeyError. Report the likely cause (a wrong working directory
# or path) instead.
if not row_dicts:
    raise FileNotFoundError(f"no .txt filings found in {in_dir.resolve()}")

xml_df = pd.DataFrame(row_dicts)
xml_df = xml_df.set_index("filename")

In [None]:
# Preview up to the first 20 rows of the fields extracted from the XML.
xml_df.head(20)

In [None]:
# Join the header fields and the XML fields into one frame. Both frames are
# indexed by filename, and DataFrame.join defaults to a left join on the index,
# so every row of header_df is kept and the XML columns are matched by file name.
df = header_df.join(xml_df)
df.head(10)