In [135]:
# Dependencies for the whole notebook.
from bs4 import BeautifulSoup
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is a deprecated path.
from IPython.display import display, HTML
import re
import sys

# Raise the interpreter's recursion limit for these large documents
# (presumably needed because parsing/serialising the deeply nested markup
# recurses past the default limit -- confirm before lowering it).
sys.setrecursionlimit(10000)

In [136]:
# Local paths of the SEC submission text files used in this notebook.
# The EDGAR inline-viewer URL of each filing is noted above its path.

# https://www.sec.gov/ix?doc=/Archives/edgar/data/821189/000082118920000010/a2019123110-k.htm
EOG_10K = './data/0000821189-20-000010.txt'

# https://www.sec.gov/ix?doc=/Archives/edgar/data/101778/000010177820000023/mro-20191231x10k2019.htm
MRO_10K = './data/0000101778-20-000023.txt'

# https://www.sec.gov/ix?doc=/Archives/edgar/data/34088/000003408820000016/xom10k2019.htm
XOM_10K = './data/0000034088-20-000016.txt'

# Read the selected submission as raw bytes and hand it to BeautifulSoup,
# using the lxml parser, to build the tree every later cell works on.
with open(XOM_10K, 'rb') as filing_file:
    soup = BeautifulSoup(filing_file.read(), 'lxml')

In [92]:
# Locate the first <sec-header> element of the submission. Leaving `header`
# as the cell's final expression makes the notebook show it as the output.
header = soup.find(name='sec-header')
header

<sec-header>0000034088-20-000016.hdr.sgml : 20200226
<acceptance-datetime>20200226161519
ACCESSION NUMBER:		0000034088-20-000016
CONFORMED SUBMISSION TYPE:	10-K
PUBLIC DOCUMENT COUNT:		138
CONFORMED PERIOD OF REPORT:	20191231
FILED AS OF DATE:		20200226
DATE AS OF CHANGE:		20200226

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			EXXON MOBIL CORP
		CENTRAL INDEX KEY:			0000034088
		STANDARD INDUSTRIAL CLASSIFICATION:	PETROLEUM REFINING [2911]
		IRS NUMBER:				135409005
		STATE OF INCORPORATION:			NJ
		FISCAL YEAR END:			1231

	FILING VALUES:
		FORM TYPE:		10-K
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-02256
		FILM NUMBER:		20655849

	BUSINESS ADDRESS:	
		STREET 1:		5959 LAS COLINAS BLVD
		CITY:			IRVING
		STATE:			TX
		ZIP:			75039-2298
		BUSINESS PHONE:		9729406000

	MAIL ADDRESS:	
		STREET 1:		5959 LAS COLINAS BLVD
		CITY:			IRVING
		STATE:			TX
		ZIP:			75039-2298

	FORMER COMPANY:	
		FORMER CONFORMED NAME:	EXXON CORP
		DATE OF NAME CHANGE:	19920703

	FORMER COMPANY:	
		FORMER

In [96]:
# Collect every <document> element in the submission and report the count.
doc_soup = soup.find_all('document')
n_documents = len(doc_soup)
print("No. Documents:", n_documents)

No. Documents: 137


In [95]:
# Scan every <document> element and keep the one whose type is exactly "10-K".
# On success `doc` holds: 'id' (the type string), 'filename', 'html' (the whole
# <document> element) and 'text' (its <text> element).
doc = {}

for d in doc_soup:

    # Read the text sitting directly inside <type>. `string=` is the current
    # name of the filter argument that older bs4 code spells `text=`.
    type_tag = d.type
    if type_tag is None:
        # Without a <type> the document cannot be classified; skip it.
        continue
    type_node = type_tag.find(string=True, recursive=False)
    doc_type = type_node.strip() if type_node is not None else ""

    # Same for <filename>; a missing tag yields an empty name instead of an
    # AttributeError.
    filename_tag = d.filename
    filename_node = (
        filename_tag.find(string=True, recursive=False)
        if filename_tag is not None
        else None
    )
    doc_filename = filename_node.strip() if filename_node is not None else ""

    # Extension = text after the LAST dot; empty when the name has no dot.
    # (Indexing split(".")[1] fails on dot-less names and picks the wrong
    # piece for names containing several dots.)
    doc_extension = doc_filename.rsplit(".", 1)[-1] if "." in doc_filename else ""

    if doc_type == "10-K":

        print("Found 10-K Document:")

        # Keep references to the whole element and to its <text> child.
        # Nothing is detached from `soup` (no .extract()).
        doc_html = d
        doc_text = d.find('text')

        # store some metadata about the document
        doc['id'] = doc_type
        doc['filename'] = doc_filename
        doc['html'] = doc_html
        doc['text'] = doc_text

        print(doc['id'])
        print(doc['filename'])

# Fail here with a clear message rather than with a KeyError on doc['text']
# in a later cell.
if not doc:
    raise ValueError("No document of type '10-K' was found in this submission.")

Found 10-K Document:
10-K
xom10k2019.htm


In [100]:
# Split the 10-K body into "pages" at its <hr> (thematic break) elements and
# store the resulting list of HTML strings in doc['pages'].

# Find all the page breaks, then serialise each one: the split below operates
# on the string form of the document, so the delimiters must be strings too.
page_break_tags = doc['text'].find_all('hr')
page_breaks = [str(pb) for pb in page_break_tags]

# prep the document text for splitting, this means converting it to a string.
doc_string = str(doc['text'])

if page_breaks:

    # Build the delimiter pattern from the DISTINCT break strings only, so
    # identical markup is not repeated once per break in the alternation.
    # Longer strings go first so that, if one break's markup were a prefix of
    # another's, the longer one is still matched in full.
    unique_breaks = sorted(dict.fromkeys(page_breaks), key=len, reverse=True)
    delimiter = '|'.join(map(re.escape, unique_breaks))

    # split the document along each thematic break.
    split_string = re.split(delimiter, doc_string)

else:

    # No thematic breaks: the whole document is a single page.
    split_string = [doc_string]

# store the pages; always a non-empty list of strings.
doc['pages'] = split_string

In [102]:
# Report how many pages the split produced.
n_pages = len(doc['pages'])
print("No. Pages:", n_pages)

No. Pages: 131


In [139]:
# 1-based number of the page to render (1 = the text before the first break).
page_no = 1

# Validate up front so an out-of-range page number gives a readable message.
doc_pages = doc['pages']
if not 1 <= page_no <= len(doc_pages):
    raise IndexError(
        "page_no must be between 1 and {}, got {}".format(len(doc_pages), page_no)
    )

# doc['pages'] is 0-indexed, so page N lives at index N - 1. (Indexing with
# page_no + 1 skipped two pages ahead and failed on single-page documents.)
display(HTML(doc_pages[page_no - 1]))