# Learning to scrape SEC.gov

I'm going to follow along with this video series from SigmaCoding on YouTube: https://youtu.be/TxUmufNnIaA

In [1]:
# import libraries
import re
import requests
import unicodedata
from bs4 import BeautifulSoup

### Define functions to be used later:

In [2]:
def restore_windows_1252_characters(restore_string):
    
    # this replaces the C1 characters in the Unicode string s
    # it it ultiamtely text normalization
    
    def to_windows_1252(match):
        try:
            return bytes([ord(match.group(0))]).decode('windows-1252')
        except UnicodeDecodeError:
            # no character at the corresponding code point: remove it
            return ''
        
    return re.sub(r'[\u0080-\u0099]', to_windows_1252, restore_string)

#### Grab the document content:

In [3]:
# define URL to the html_text file
new_html_text = r'https://www.sec.gov/Archives/edgar/data/1166036/000110465904027382/0001104659-04-027382.txt'

# https://www.sec.gov/Archives/edgar/data/1166036/000110465904027382/0001104659-04-027382.txt
# https://www.sec.gov/Archives/edgar/data/50863/000119312521009142/0001193125-21-009142.txt
# https://www.sec.gov/Archives/edgar/data/21344/000155278121000188/0001552781-21-000188.txt
################### 8-K example links ^^^^

# grab the response
response = requests.get(new_html_text)

# parse the response
soup = BeautifulSoup(response.content, 'lxml')

#### Define master dictionary for all filings:

In [4]:
# define a new dictionary that will house all our filings
master_filings_dict = {}

#define unique key for each filing
#for now, each number is identical to its above .txt slug
accession_number = '000110465904027382/0001104659-04-027382'

# add the key to the dictionary and add a new 'level'(?)
master_filings_dict[accession_number] = {}
master_filings_dict[accession_number]['sec_header_content'] = {}
master_filings_dict[accession_number]['filing_documents'] = None

#### Examining the SEC-Header Tag

In [5]:
# grab the sec-header document
sec_header_tag = soup.find('sec-header')
'''# check out it:
print(sec_header_tag.get_text())
# and please recomment, thank you!
'''
#store the sec header content inside the dictionary
master_filings_dict[accession_number]['sec_header_content']['sec_header_code'] = sec_header_tag

display(sec_header_tag)

<sec-header>0001104659-04-027382.hdr.sgml : 20040913
<acceptance-datetime>20040913074905
ACCESSION NUMBER:		0001104659-04-027382
CONFORMED SUBMISSION TYPE:	8-K/A
PUBLIC DOCUMENT COUNT:		7
CONFORMED PERIOD OF REPORT:	20040730
ITEM INFORMATION:		Completion of Acquisition or Disposition of Assets
ITEM INFORMATION:		Financial Statements and Exhibits
FILED AS OF DATE:		20040913
DATE AS OF CHANGE:		20040913

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			MARKWEST ENERGY PARTNERS L P
		CENTRAL INDEX KEY:			0001166036
		STANDARD INDUSTRIAL CLASSIFICATION:	CRUDE PETROLEUM &amp; NATURAL GAS [1311]
		IRS NUMBER:				270005456
		FISCAL YEAR END:			1231

	FILING VALUES:
		FORM TYPE:		8-K/A
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-31239
		FILM NUMBER:		041026639

	BUSINESS ADDRESS:	
		STREET 1:		155 INVERNESS DR WEST
		STREET 2:		STE 200
		CITY:			ENGLEWOOD
		STATE:			CO
		ZIP:			80112
		BUSINESS PHONE:		303-925-9275

	MAIL ADDRESS:	
		STREET 1:		155 INVERNESS DR WEST
		STREET 2:		STE 200
		C

#### Parsing the documents

In [6]:
# initalize the dictionary that will house all of our documents
master_document_dict = {}

# find all the documents in the filing.
for filing_document in soup.find_all('document'):
    
    # define the document type, found under the <type> tag, this will serve as our key for the dictionary.
    document_id = filing_document.type.find(text=True, recursive=False).strip()
    
    # here are the other parts if you want them.
    document_sequence = filing_document.sequence.find(text=True, recursive=False).strip()
    document_filename = filing_document.filename.find(text=True, recursive=False).strip()
    document_description = filing_document.description.find(text=True, recursive=False).strip()
    
    # initalize our document dictionary
    master_document_dict[document_id] = {}
    
    # add the different parts, we parsed up above.
    master_document_dict[document_id]['document_sequence'] = document_sequence
    master_document_dict[document_id]['document_filename'] = document_filename
    master_document_dict[document_id]['document_description'] = document_description
    
    # store the document itself, this portion extracts the HTML code. We will have to reparse it later.
    master_document_dict[document_id]['document_code'] = filing_document.extract()
    
    
    # grab the text portion of the document, this will be used to split the document into pages.
    filing_doc_text = filing_document.find('text').extract()

    
    # find all the thematic breaks, these help define page numbers and page breaks.
    all_thematic_breaks = filing_doc_text.find_all('hr',{'width':'100%'})
    
    
    '''
        THE FOLLOWING CODE IS OPTIONAL:
        -------------------------------
        
        This portion will demonstrate how to parse the page number from each "page". Now I would only do this if you
        want the ACTUAL page number on the document, if you don't need it then forget about it and just wait till the
        next seciton.
        
        Additionally, some of the documents appear not to have page numbers when they should so there is no guarantee
        that all the documents will be nice and organized.
    
    '''
    
    
    
    # grab all the page numbers, first one is usually blank
    all_page_numbers = [thematic_break.parent.parent.previous_sibling.previous_sibling.get_text(strip=True) 
                        for thematic_break in all_thematic_breaks]
    
    
    '''
    
        If the above list comprehension doesn't make sense to you, here is how it would look as a regular loop.
    
        # define a list to house all the page numbers
        all_page_numbers = []

        # loop throuhg all the thematic breaks.
        for thematic_break in all thematic_breaks:

           # this would grab the page number tag.
           page_number = thematic_break.parent.parent.previous_sibling.previous_sibling

           # this would grab the page number text
           page_number = page_number.get_text(strip=True)
           
           # store it in the list.
           all_page_numbers.append(page_number)

    '''
    
    # determine the number of pages, will be used for the upcoming if conditions.
    length_of_page_numbers = len(all_page_numbers)
    
    # as long as there are numbers to change then proceed.
    if length_of_page_numbers > 0:
        
        # grab the last number
        previous_number = all_page_numbers[-1]
        
        # initalize a new list
        all_page_numbers_cleaned = []
        
        # loop through the old list in reverse order.
        for number in reversed(all_page_numbers):
            
            # if it's blank proceed to cleaning.
            if number == '':
                
                # the tricky part, there are three scenarios.

                # the previous one we looped was 0 or 1.
                if previous_number == '1' or previous_number == '0':
                    
                    # in this case, it means this is a "new section", so restart at 0.
                    all_page_numbers_cleaned.append(str(0))
                    
                    # reset the page number and the previous number.
                    length_of_page_numbers = length_of_page_numbers - 1
                    previous_number = '0'
                
                # the previous one we looped it wasn't either of those.
                else:
                    
                    # if it was blank, take the current length, subtract 1, and add it to the list.
                    all_page_numbers_cleaned.append(str(length_of_page_numbers - 1))
                    
                    # reset the page number and the previous number.
                    length_of_page_numbers = length_of_page_numbers - 1
                    previous_number = number

            else:
                
                # add the number to the list.
                all_page_numbers_cleaned.append(number)
                
                # reset the page number and the previous number.
                length_of_page_numbers = length_of_page_numbers - 1
                previous_number = number
    else:
        
        # make sure that it has a page number even if there are none, just have it equal 0
        all_page_numbers_cleaned = ['0']
    
    # have the page numbers be the cleaned ones, in reversed order.
    all_page_numbers = list(reversed(all_page_numbers_cleaned))
    
    # store the page_numbers
    master_document_dict[document_id]['page_numbers'] = all_page_numbers
    
    
    '''
        -------------------------------
          THE OPTIONAL CODE HAS ENDED
        -------------------------------
    
        This next portion of code is really what made this all possible. Up above you saw I grabbed all the thematic
        breaks from our document because they sever as natural page breaks. Without those thematic breaks I'm not sure
        if this would be such an easy process. It's not to say we couldn't break it into pages, but I would bet the code
        would be more complex.
    
    '''
    
    
    # convert all thematic breaks to a string so it can be used for parsing
    all_thematic_breaks = [str(thematic_break) for thematic_break in all_thematic_breaks]
    
    # prep the document text for splitting, this means converting it to a string.
    filing_doc_string = str(filing_doc_text)

    
    # handle the case where there are thematic breaks.
    if len(all_thematic_breaks) > 0:
    
        # define the regex delimiter pattern, this would just be all of our thematic breaks.
        regex_delimiter_pattern = '|'.join(map(re.escape, all_thematic_breaks))

        # split the document along each thematic break.
        split_filing_string = re.split(regex_delimiter_pattern, filing_doc_string)

        # store the document itself
        master_document_dict[document_id]['pages_code'] = split_filing_string

    # handle the case where there are no thematic breaks.
    elif len(all_thematic_breaks) == 0:

        # handles so it will display correctly.
        split_filing_string = all_thematic_breaks
        
        # store the document as is, since there are no thematic breaks. In other words, no splitting.
        master_document_dict[document_id]['pages_code'] = [filing_doc_string]
    

    # display some information to the user.
    print('-'*80)
    print('The document {} was parsed.'.format(document_id))
    print('There was {} page(s) found.'.format(len(all_page_numbers)))
    print('There was {} thematic breaks(s) found.'.format(len(all_thematic_breaks)))
    

# store the documents in the master_filing_dictionary.
master_filings_dict[accession_number]['filing_documents'] = master_document_dict

print('-'*80)
print('All the documents for filing {} were parsed and stored.'.format(accession_number))

--------------------------------------------------------------------------------
The document 8-K/A was parsed.
There was 31 page(s) found.
There was 31 thematic breaks(s) found.
--------------------------------------------------------------------------------
The document EX-2.1 was parsed.
There was 37 page(s) found.
There was 37 thematic breaks(s) found.
--------------------------------------------------------------------------------
The document EX-4.1 was parsed.
There was 35 page(s) found.
There was 35 thematic breaks(s) found.
--------------------------------------------------------------------------------
The document EX-4.2 was parsed.
There was 20 page(s) found.
There was 20 thematic breaks(s) found.
--------------------------------------------------------------------------------
The document EX-23.1 was parsed.
There was 1 page(s) found.
There was 0 thematic breaks(s) found.
--------------------------------------------------------------------------------
The document EX-99.1 

In [7]:
# initialize master document dictionary
master_document_dict = {}

# loop through each doc in filing
for filing_document in soup.find_all('document'):
    
    #define document id
    document_id = filing_document.type.find(text=True, recursive = False).strip()
    
    ## try it out:
    #print(document_id)
    ## and please recomment, thank you!

    # document sequence
    document_sequence = filing_document.sequence.find(text = True, recursive = False).strip()
    
    # document filename
    document_filename = filing_document.filename.find(text = True, recursive = False).strip()
    
    # document description
    document_description = filing_document.description.find(text = True, recursive = False).strip()
    
    #insert the key
    master_document_dict[document_id] = {}
    
    # add different parts of the document
    master_document_dict[document_id]['document_sequence'] = document_sequence
    master_document_dict[document_id]['document_filename'] = document_filename
    master_document_dict[document_id]['document_description'] = document_description 

## check it out:    
#master_document_dict
## and recomment please, thank you!

    # add the document itself
    master_document_dict[document_id]['document_code'] = filing_document.extract()
    
    # get all text in document
    filing_doc_text = filing_document.find('text').extract()
    
    # get all the thematic breaks
    all_thematic_breaks = filing_doc_text.find_all('hr',{'width':'100%'})
    
    # convert all the breaks into a string vist list comprehension
    all_thematic_breaks = [str(thematic_break) for thematic_break in all_thematic_breaks]
    
    #prep document for being split
    filing_doc_string = str(filing_doc_text)
    
    if len(all_thematic_breaks) > 0:
        # creates our string pattern
        regex_delimited_pattern = '|'.join(map(re.escape,all_thematic_breaks))
        #split the document along thematic breaks
        split_filing_string = re.split(regex_delimited_pattern, filing_doc_string)
        #store document in dictionary
        master_document_dict[document_id]['pages_code'] = split_filing_string
        
    elif len(all_thematic_breaks) == 0:
        #store document inside a dictionary
        master_document_dict[document_id]['pages_code'] = [filing_doc_string]
        
    #display info to the user
    print('-'*80)
    print('The document {} was parsed'.format(document_id))
    print('There was {} thematic break(s) found.'.format(len(all_thematic_breaks)))
    
# store document in the master filing dictionary
master_filings_dict[accession_number]['filing_documents'] = master_document_dict

print('-'*80)
print('All the documents for filing {} were parsed and stored.'.format(accession_number))

--------------------------------------------------------------------------------
All the documents for filing 000110465904027382/0001104659-04-027382 were parsed and stored.


#### Normalizing text

In [8]:
#first grab the documents
filing_documents = master_filings_dict[accession_number]['filing_documents']

#loop through each document
for document_id in filing_documents:
    #grab all documents
    document_pages = filing_documents[document_id]['pages_code']
    #page length
    pages_length = len(document_pages)
    #initialize some dictionaries
    repaired_pages = {}
    normalized_text = {}
    
    for index, page in enumerate(document_pages):
        #pass through parser to repair it
        page_soup = BeautifulSoup(page, 'html5')
        #grab test from each page
        page_text = page_soup.html.body.get_text(' ', strip = True)
        #normalize the text
        page_text_norm = restore_windows_1252_characters(unicodedata.normalize('NFKD', page_text))
        page_text_norm = page_text_norm.replace('  ',' ').replace('\n',' ')
        
        ## check it out
        #print(page_text_norm)
        ## and please recomment, thank you!
        
        #define our page number
        page_number = index + 1
        #add normalized text to dictionary
        normalized_text[page_number] = page_text_norm
        #add repaired html code to dictionary
        repaired_pages[page_number] = page_soup
        # add normalized text dict to master filing dictionary
        filing_documents[document_id]['page_normalized_text'] = normalized_text
        filing_documents[document_id]['pages_code'] = repaired_pages
        
        # add page numbers we generate
        gen_page_numbers = list(repaired_pages.keys())
        
        # add normalized text dictionary to the master filing dictionary
        filing_documents[document_id]['pages_numbers_generated'] = gen_page_numbers
        
        # display a status to the user.
        print('All the pages from document {} have been normalized.'.format(document_id))

In [9]:
# example
# master_filings_dict[accession_number]['filing_documents']['EX-10.1']

#### Defining search words

In [10]:
search_dict = {
    
    # these could possibly be words that help us find pages that discuss financial statements.
    'financial_words':['liability', 'asset'],
    
    # these could possible be words that help us find sections that discuss administration topics.
    'admin_words':['administration', 'government']
}

This is not my code:

In [11]:
filing_documents = master_filings_dict[accession_number]['filing_documents']

# loop through each document
for document_id in filing_documents:
    
    
    ####################################
    # THIS WILL HANDLE THE WORD SEARCH #
    ####################################
    
    
    # let's grab the normalized text in this example, since it's cleaned and easier to search
    normalized_text_dict = filing_documents[document_id]['page_normalized_text']  
            
    # initalize a dictionary to store all the tables we find.
    matching_words_dict = {}
    
    # define the number of pages
    page_length = len(normalized_text_dict)
    
    # loop through all the text
    for page_num in normalized_text_dict:
        
        # grab the actual text
        normalized_page_text = normalized_text_dict[page_num]
        
        # each page is going to be checked, so let's have another dictionary that'll house each pages result.
        matching_words_dict[page_num] = {}
        
        # loop through each word list in the search dictionary.
        for search_list in search_dict:
            
            # grab the list of words.
            list_of_words = search_dict[search_list]
            
            # lets see if any of the words are found
            matching_words = [word for word in list_of_words if word in normalized_page_text]
            
            '''
                Again, I know list comprehension might be hard to understand so I'll show you what the loop
                looks like.
                
                # initalize a list of matching words.
                matching_words = []
                
                # loop through the list of words.
                for word in list_of_words:
                
                    # check to see if it's in the text
                    if word in normalized_page_text:
                        
                        # if it is then add it to the list.
                        matching_words.append(word)
            '''
            
            # each page will have a set of results, list of words
            matching_words_dict[page_num][search_list] = {}
            
            # let's add the list of words we search to the matching words dictionary first.
            matching_words_dict[page_num][search_list]['list_of_words'] = list_of_words
            
            # next let's add the list of matchings words to the matching words dictionary.
            matching_words_dict[page_num][search_list]['matches'] = matching_words
            
        
        # display a status to the user.
        print('Page {} of {} from document {} has been searched.'.format(page_num, page_length, document_id))
    
    
    # display a status to the user.
    print('-'*80)    
    print('All the pages from document {} have been searched.'.format(document_id))    
    
    
    ####################################
    # THIS WILL HANDLE THE LINK SEARCH #
    ####################################
    
    
    # let's grab the all pages code.
    pages_dict = filing_documents[document_id]['pages_code']  
            
    # initalize a dictionary to store all the anchors we find.
    link_anchor_dict = {}
    
    # loop through each page
    for page_num in pages_dict:
        
        # grab the actual text
        page_code = pages_dict[page_num]
        
        # find all the anchors in the page, that have the attribute 'name'
        anchors_found = page_code.find_all('a',{'name':True})
        
        # number of anchors found
        num_found = len(anchors_found)
        
        # each page is going to be checked, so let's have another dictionary that'll house all the anchors found.
        link_anchor_dict[page_num]= {(anchor_id + 1): anchor for anchor_id, anchor in enumerate(anchors_found)}        
    
        # display a status to the user.
        print('Page {} of {} from document {} contained {} anchors with names.'.format(page_num, 
                                                                                       page_length, 
                                                                                       document_id, 
                                                                                       num_found))
    
    # display a status to the user.  
    print('All the pages from document {} have been scraped for anchors with names.'.format(document_id)) 
    print('-'*80)  
    
    
    #####################################
    # THIS WILL HANDLE THE TABLE SEARCH #
    #####################################
    
         
    # let's grab the all pages code.
    pages_dict = filing_documents[document_id]['pages_code']  
            
    # initalize a dictionary to store matching words.
    tables_dict = {}
    
    # loop through each page
    for page_num in pages_dict:
        
        # grab the actual text
        page_code = pages_dict[page_num]
        
        # find all the tables
        tables_found = page_code.find_all('table')
        
        # number of tables found
        num_found = len(tables_found)
        
        # each page is going to be checked, so let's have another dictionary that'll house all the tables found.
        tables_dict[page_num] = {(table_id + 1): table for table_id, table in enumerate(tables_found)}        
    
        # display a status to the user.
        print('Page {} of {} from document {} contained {} tables.'.format(page_num, page_length, document_id, num_found))
    
    # display a status to the user.  
    print('All the pages from document {} have been scraped for tables.'.format(document_id)) 
    print('-'*80)    
    
        
    # let's add the matching words dict to the document.
    filing_documents[document_id]['word_search'] = matching_words_dict  
    
    # let's add the matching tables dict to the document.
    filing_documents[document_id]['table_search'] = tables_dict
    
    # let's add the matching anchors dict to the document.
    filing_documents[document_id]['anchor_search'] = link_anchor_dict

My code gives errors:

In [None]:
# first grab all documents
filing_documents = master_filings_dict[accession_number]['filing_documents']

#loop through each document
for document_id in filing_documents:
    
    #grab normalized text
    normalized_text_dict = filing_documents[document_id]['page_normalized_text']   
    
    #initialize a dictionary of our results
    matching_words_dict = {}
    
    # define number of pages
    page_length = len(normalized_text_dict)
    #loop through all the text
    for page_num in normalized_text_dict:
        #grab the actual text
        normalized_page_text = normalized_text_dict[page_num]
        #each page will have a set of results
        matching_words_dict[page_num] = {}
        
        # loop through each list in our search dictionary
        for search_list in search_dict:
            #grab list of words
            list_of_words = search_dict[search_list]
            # do any of the words match?
            matching_words = [word for word in list_of_words if word in normalized_page_text]
            
        # each page will have a set of results, list of words
        matching_words_dict[page_num][search_list]['list_of_words'] = list_of_words
        
        # each page will have a set of results, list of words
        matching_words_dict[page_num][search_list]['matches'] = matching_words
    
    filing_document[document_id]['word_search'] = matching_words_dict

In [13]:
def scrape_table_dictionary(table_dictionary):
    
    # initalize a new dicitonary that'll house all your results
    new_table_dictionary = {}
    
    if len(table_dictionary) != 0:

        # loop through the dictionary
        for table_id in table_dictionary:

            # grab the table
            table_html = table_dictionary[table_id]
            
            # grab all the rows.
            table_rows = table_html.find_all('tr')
            
            # parse the table, first loop through the rows, then each element, and then parse each element.
            parsed_table = [
                [element.get_text(strip=True) for element in row.find_all('td')]
                for row in table_rows
            ]
            
            # keep the original just to be safe.
            new_table_dictionary[table_id]['original_table'] = table_html
            
            # add the new parsed table.
            new_table_dictionary[table_id]['parsed_table'] = parsed_table
            
            # here some additional steps you can take to clean up the data - Removing '$'.
            parsed_table_cleaned = [
                [element for element in row if element != '$']
                for row in parsed_table
            ]
            
            # here some additional steps you can take to clean up the data - Removing Blanks.
            parsed_table_cleaned = [
                [element for element in row if element != None]
                for row in parsed_table_cleaned
            ]

            
    else:
        
        # if there are no tables then just have the id equal NONE
        new_table_dictionary[1]['original_table'] = None
        new_table_dictionary[1]['parsed_table'] = None
        
    return new_table_dictionary