Testing from this site: 

https://github.com/areed1192/sigma_coding_youtube/blob/master/python/python-finance/sec-web-scraping/Web%20Scraping%20SEC%20-%20Parsing%20SEC%20Documents%20-%20New%20Filings.ipynb

In [1]:
# import our libraries
import re
import requests
import unicodedata
from bs4 import BeautifulSoup

In [2]:
def restore_windows_1252_characters(restore_string):
    """
        Replace C1 control characters in the Unicode string s by the
        characters at the corresponding code points in Windows-1252,
        where possible.
    """

    def to_windows_1252(match):
        try:
            return bytes([ord(match.group(0))]).decode('windows-1252')
        except UnicodeDecodeError:
            # No character at the corresponding code point: remove it.
            return ''
        
    return re.sub(r'[\u0080-\u0099]', to_windows_1252, restore_string)

In [3]:
new_html_text = r"https://www.sec.gov/Archives/edgar/data/1166036/000110465904027382/0001104659-04-027382.txt"

# grab the response
response = requests.get(new_html_text)

# pass it through the parser, in this case let's just use lxml because the tags seem to follow xml.
soup = BeautifulSoup(response.content, 'lxml')

In [4]:
# define a dictionary that will house all filings.
master_filings_dict = {}

# let's use the accession number as the key. This 
accession_number = '0001104659-04-027382'

# add a new level to our master_filing_dict, this will also be a dictionary.
master_filings_dict[accession_number] = {}

# this dictionary will contain two keys, the sec header content, and a documents key.
master_filings_dict[accession_number]['sec_header_content'] = {}
master_filings_dict[accession_number]['filing_documents'] = None

In [5]:
# grab the sec-header tag, so we can store it in the master filing dictionary.
sec_header_tag = soup.find('sec-header')

# store the tag in the dictionary just as is.
master_filings_dict[accession_number]['sec_header_content']['sec_header_code'] = sec_header_tag

# display the sec header tag, so you can see how it looks.
display(sec_header_tag)

<sec-header>0001104659-04-027382.hdr.sgml : 20040913
<acceptance-datetime>20040913074905
ACCESSION NUMBER:		0001104659-04-027382
CONFORMED SUBMISSION TYPE:	8-K/A
PUBLIC DOCUMENT COUNT:		7
CONFORMED PERIOD OF REPORT:	20040730
ITEM INFORMATION:		Completion of Acquisition or Disposition of Assets
ITEM INFORMATION:		Financial Statements and Exhibits
FILED AS OF DATE:		20040913
DATE AS OF CHANGE:		20040913

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			MARKWEST ENERGY PARTNERS L P
		CENTRAL INDEX KEY:			0001166036
		STANDARD INDUSTRIAL CLASSIFICATION:	CRUDE PETROLEUM &amp; NATURAL GAS [1311]
		IRS NUMBER:				270005456
		FISCAL YEAR END:			1231

	FILING VALUES:
		FORM TYPE:		8-K/A
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	001-31239
		FILM NUMBER:		041026639

	BUSINESS ADDRESS:	
		STREET 1:		155 INVERNESS DR WEST
		STREET 2:		STE 200
		CITY:			ENGLEWOOD
		STATE:			CO
		ZIP:			80112
		BUSINESS PHONE:		303-925-9275

	MAIL ADDRESS:	
		STREET 1:		155 INVERNESS DR WEST
		STREET 2:		STE 200
		C

In [7]:
master_document_dict = {}

# find all the documents in the filing.
for filing_document in soup.find_all('document'):
    
    # define the document type, found under the <type> tag, this will serve as our key for the dictionary.
    document_id = filing_document.type.find(text=True, recursive=False).strip()
    
    # here are the other parts if you want them.
    document_sequence = filing_document.sequence.find(text=True, recursive=False).strip()
    document_filename = filing_document.filename.find(text=True, recursive=False).strip()
    document_description = filing_document.description.find(text=True, recursive=False).strip()
    
    # initalize our document dictionary
    master_document_dict[document_id] = {}
    
    # add the different parts, we parsed up above.
    master_document_dict[document_id]['document_sequence'] = document_sequence
    master_document_dict[document_id]['document_filename'] = document_filename
    master_document_dict[document_id]['document_description'] = document_description
    
    # store the document itself, this portion extracts the HTML code. We will have to reparse it later.
    master_document_dict[document_id]['document_code'] = filing_document.extract()
    
    
    # grab the text portion of the document, this will be used to split the document into pages.
    filing_doc_text = filing_document.find('text').extract()

    
    # find all the thematic breaks, these help define page numbers and page breaks.
    all_thematic_breaks = filing_doc_text.find_all('hr',{'width':'100%'})
    
    
    ###########################
    ### Optional part ommitted
    ###########################
    
    # convert all thematic breaks to a string so it can be used for parsing
    all_thematic_breaks = [str(thematic_break) for thematic_break in all_thematic_breaks]
    
    # prep the document text for splitting, this means converting it to a string.
    filing_doc_string = str(filing_doc_text)

    
    # handle the case where there are thematic breaks.
    if len(all_thematic_breaks) > 0:
    
        # define the regex delimiter pattern, this would just be all of our thematic breaks.
        regex_delimiter_pattern = '|'.join(map(re.escape, all_thematic_breaks))

        # split the document along each thematic break.
        split_filing_string = re.split(regex_delimiter_pattern, filing_doc_string)

        # store the document itself
        master_document_dict[document_id]['pages_code'] = split_filing_string

    # handle the case where there are no thematic breaks.
    elif len(all_thematic_breaks) == 0:

        # handles so it will display correctly.
        split_filing_string = all_thematic_breaks
        
        # store the document as is, since there are no thematic breaks. In other words, no splitting.
        master_document_dict[document_id]['pages_code'] = [filing_doc_string]
    

    # display some information to the user.
    print('-'*80)
    print('The document {} was parsed.'.format(document_id))
    print('There was {} thematic breaks(s) found.'.format(len(all_thematic_breaks)))
    

# store the documents in the master_filing_dictionary.
master_filings_dict[accession_number]['filing_documents'] = master_document_dict

print('-'*80)
print('All the documents for filing {} were parsed and stored.'.format(accession_number))
    
    
    

--------------------------------------------------------------------------------
The document EX-2.1 was parsed.
There was 37 thematic breaks(s) found.
--------------------------------------------------------------------------------
The document EX-4.1 was parsed.
There was 35 thematic breaks(s) found.
--------------------------------------------------------------------------------
The document EX-4.2 was parsed.
There was 20 thematic breaks(s) found.
--------------------------------------------------------------------------------
The document EX-23.1 was parsed.
There was 0 thematic breaks(s) found.
--------------------------------------------------------------------------------
The document EX-99.1 was parsed.
There was 235 thematic breaks(s) found.
--------------------------------------------------------------------------------
The document EX-99.2 was parsed.
There was 13 thematic breaks(s) found.
--------------------------------------------------------------------------------
All 

In [9]:
filing_documents = master_filings_dict[accession_number]['filing_documents']


# loop through each document
for document_id in filing_documents:
    
    # display some info to give status updates.
    print('-'*80)
    print('Pulling document {} for text normilzation.'.format(document_id))
    
    # grab all the pages for that document
    document_pages = filing_documents[document_id]['pages_code']
    
    # page length
    pages_length = len(filing_documents[document_id]['pages_code'])
    
    # initalize a dictionary that'll house our repaired html code for each page.
    repaired_pages = {}
    
    # initalize a dictionary that'll house all the normalized text.
    normalized_text = {}

    # loop through each page in that document.
    for index, page in enumerate(document_pages):
        
        # pass it through the parser. NOTE I AM USING THE HTML5 PARSER. YOU MUST USE THIS TO FIX BROKEN TAGS.
        page_soup = BeautifulSoup(page,'html5')
        
        # grab all the text, notice I go to the BODY tag to do this
        page_text = page_soup.html.body.get_text(' ',strip = True)
        
        # normalize the text, remove messy characters. Additionally, restore missing window characters.
        page_text_norm = restore_windows_1252_characters(unicodedata.normalize('NFKD', page_text)) 
        
        # Additional cleaning steps, removing double spaces, and new line breaks.
        page_text_norm = page_text_norm.replace('  ', ' ').replace('\n',' ')
                
        #########################
        ## Optional part ommitted
        #########################
        
    # add the normalized text back to the document dictionary
    filing_documents[document_id]['pages_normalized_text'] = normalized_text
    
    # add the repaired html code back to the document dictionary
    filing_documents[document_id]['pages_code'] = repaired_pages
    
    # define the generated page numbers
    gen_page_numbers = list(repaired_pages.keys())
    
    # add the page numbers we have.
    filing_documents[document_id]['pages_numbers_generated'] = gen_page_numbers    
    
    # display a status to the user.
    print('All the pages from document {} have been normalized.'.format(document_id))

--------------------------------------------------------------------------------
Pulling document EX-2.1 for text normilzation.
All the pages from document EX-2.1 have been normalized.
--------------------------------------------------------------------------------
Pulling document EX-4.1 for text normilzation.
All the pages from document EX-4.1 have been normalized.
--------------------------------------------------------------------------------
Pulling document EX-4.2 for text normilzation.
All the pages from document EX-4.2 have been normalized.
--------------------------------------------------------------------------------
Pulling document EX-23.1 for text normilzation.
All the pages from document EX-23.1 have been normalized.
--------------------------------------------------------------------------------
Pulling document EX-99.1 for text normilzation.
All the pages from document EX-99.1 have been normalized.
-----------------------------------------------------------------------

In [10]:
search_dict = {
    
    # these could possibly be words that help us find pages that discuss financial statements.
    'financial_words':['liability', 'asset'],
    
    # these could possible be words that help us find sections that discuss administration topics.
    'admin_words':['administration', 'government']
}

In [11]:
# first grab all the documents
filing_documents = master_filings_dict[accession_number]['filing_documents']

# loop through each document
for document_id in filing_documents:
    
    
    ####################################
    # THIS WILL HANDLE THE WORD SEARCH #
    ####################################
    
    
    # let's grab the normalized text in this example, since it's cleaned and easier to search
    normalized_text_dict = filing_documents[document_id]['pages_normalized_text']  
            
    # initalize a dictionary to store all the tables we find.
    matching_words_dict = {}
    
    # define the number of pages
    page_length = len(normalized_text_dict)
    
    # loop through all the text
    for page_num in normalized_text_dict:
        
        # grab the actual text
        normalized_page_text = normalized_text_dict[page_num]
        
        # each page is going to be checked, so let's have another dictionary that'll house each pages result.
        matching_words_dict[page_num] = {}
        
        # loop through each word list in the search dictionary.
        for search_list in search_dict:
            
            # grab the list of words.
            list_of_words = search_dict[search_list]
            
            # lets see if any of the words are found
            matching_words = [word for word in list_of_words if word in normalized_page_text]
            
            '''
                Again, I know list comprehension might be hard to understand so I'll show you what the loop
                looks like.
                
                # initalize a list of matching words.
                matching_words = []
                
                # loop through the list of words.
                for word in list_of_words:
                
                    # check to see if it's in the text
                    if word in normalized_page_text:
                        
                        # if it is then add it to the list.
                        matching_words.append(word)
            '''
            
            # each page will have a set of results, list of words
            matching_words_dict[page_num][search_list] = {}
            
            # let's add the list of words we search to the matching words dictionary first.
            matching_words_dict[page_num][search_list]['list_of_words'] = list_of_words
            
            # next let's add the list of matchings words to the matching words dictionary.
            matching_words_dict[page_num][search_list]['matches'] = matching_words
            
        
        # display a status to the user.
        print('Page {} of {} from document {} has been searched.'.format(page_num, page_length, document_id))
    
    
    # display a status to the user.
    print('-'*80)    
    print('All the pages from document {} have been searched.'.format(document_id))    
    
    
    ####################################
    # THIS WILL HANDLE THE LINK SEARCH #
    ####################################
    
    
    # let's grab the all pages code.
    pages_dict = filing_documents[document_id]['pages_code']  
            
    # initalize a dictionary to store all the anchors we find.
    link_anchor_dict = {}
    
    # loop through each page
    for page_num in pages_dict:
        
        # grab the actual text
        page_code = pages_dict[page_num]
        
        # find all the anchors in the page, that have the attribute 'name'
        anchors_found = page_code.find_all('a',{'name':True})
        
        # number of anchors found
        num_found = len(anchors_found)
        
        # each page is going to be checked, so let's have another dictionary that'll house all the anchors found.
        link_anchor_dict[page_num]= {(anchor_id + 1): anchor for anchor_id, anchor in enumerate(anchors_found)}        
    
        # display a status to the user.
        print('Page {} of {} from document {} contained {} anchors with names.'.format(page_num, 
                                                                                       page_length, 
                                                                                       document_id, 
                                                                                       num_found))
    
    # display a status to the user.  
    print('All the pages from document {} have been scraped for anchors with names.'.format(document_id)) 
    print('-'*80)  
    
    
    #####################################
    # THIS WILL HANDLE THE TABLE SEARCH #
    #####################################
    
         
    # let's grab the all pages code.
    pages_dict = filing_documents[document_id]['pages_code']  
            
    # initalize a dictionary to store matching words.
    tables_dict = {}
    
    # loop through each page
    for page_num in pages_dict:
        
        # grab the actual text
        page_code = pages_dict[page_num]
        
        # find all the tables
        tables_found = page_code.find_all('table')
        
        # number of tables found
        num_found = len(tables_found)
        
        # each page is going to be checked, so let's have another dictionary that'll house all the tables found.
        tables_dict[page_num] = {(table_id + 1): table for table_id, table in enumerate(tables_found)}        
    
        # display a status to the user.
        print('Page {} of {} from document {} contained {} tables.'.format(page_num, page_length, document_id, num_found))
    
    # display a status to the user.  
    print('All the pages from document {} have been scraped for tables.'.format(document_id)) 
    print('-'*80)    
    
        
    # let's add the matching words dict to the document.
    filing_documents[document_id]['word_search'] = matching_words_dict  
    
    # let's add the matching tables dict to the document.
    filing_documents[document_id]['table_search'] = tables_dict
    
    # let's add the matching anchors dict to the document.
    filing_documents[document_id]['anchor_search'] = link_anchor_dict

--------------------------------------------------------------------------------
All the pages from document EX-2.1 have been searched.
All the pages from document EX-2.1 have been scraped for anchors with names.
--------------------------------------------------------------------------------
All the pages from document EX-2.1 have been scraped for tables.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
All the pages from document EX-4.1 have been searched.
All the pages from document EX-4.1 have been scraped for anchors with names.
--------------------------------------------------------------------------------
All the pages from document EX-4.1 have been scraped for tables.
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
All the pages from document EX-4.2 have

In [13]:
def scrape_table_dictionary(table_dictionary):
    
    # initalize a new dicitonary that'll house all your results
    new_table_dictionary = {}
    
    if len(table_dictionary) != 0:

        # loop through the dictionary
        for table_id in table_dictionary:

            # grab the table
            table_html = table_dictionary[table_id]
            
            # grab all the rows.
            table_rows = table_html.find_all('tr')
            
            # parse the table, first loop through the rows, then each element, and then parse each element.
            parsed_table = [
                [element.get_text(strip=True) for element in row.find_all('td')]
                for row in table_rows
            ]
            
            # keep the original just to be safe.
            new_table_dictionary[table_id]['original_table'] = table_html
            
            # add the new parsed table.
            new_table_dictionary[table_id]['parsed_table'] = parsed_table
            
            # here some additional steps you can take to clean up the data - Removing '$'.
            parsed_table_cleaned = [
                [element for element in row if element != '$']
                for row in parsed_table
            ]
            
            # here some additional steps you can take to clean up the data - Removing Blanks.
            parsed_table_cleaned = [
                [element for element in row if element != None]
                for row in parsed_table_cleaned
            ]

            
    else:
        
        # if there are no tables then just have the id equal NONE
        new_table_dictionary[1]['original_table'] = None
        new_table_dictionary[1]['parsed_table'] = None
        
    return new_table_dictionary

In [14]:
def search_for_centered_headers(tag):

    # easy way to end early is check if the 'align' keet is in attributes.
    if 'align' not in tag.attrs:
        return
    
    # define the criteria.
    criteria1 = tag.name == 'p'                # I want the tag to be name of 'p'
    criteria2 = tag.parent.name != 'td'        # I want the parent tag NOT to be named 'td'
    criteria3 = tag['align'] == 'center'       # I want the 'align' attribute to be labeled 'center'.
    
    # if it matches all the criteria then return the text.
    if criteria1 and criteria2 and criteria3:         
        return tag.get_text(strip = True)

In [15]:
def search_for_bolded_tags(tag):
    
    # define the criteria.
    criteria1 = tag.name == 'b'                # I want the tag to be name of 'p'
    criteria2 = tag.parent.name != 'td'        # I want the parent tag NOT to be named 'td'
    
    # if it matches all the criteria then return the text.
    if criteria1 and criteria2:         
        return tag.get_text(strip = True).replace('\n',' ')

In [16]:
filing_documents = master_filings_dict[accession_number]['filing_documents']

# loop through each document
for document_id in filing_documents:   
    
    # let's grab the all pages code.
    pages_dict = filing_documents[document_id]['pages_code']  
            
    # initalize a dictionary to store all the anchors we find.
    centered_headers_dict = {}
    
    # loop through each page
    for page_num in pages_dict:
        
        # grab the actual text
        page_code = pages_dict[page_num]
        
        # find all the anchors in the page, that have the attribute 'name'
        centered_headers_found = page_code.find_all(search_for_centered_headers)
        
        # number of anchors found
        num_found = len(centered_headers_found)
   
        # display a status to the user.
        print('Page {} of {} from document {} contained {} centered headers.'.format(page_num, 
                                                                                     page_length, 
                                                                                     document_id, 
                                                                                     num_found))
    
    # display a status to the user.  
    print('All the pages from document {} have been scraped for centered headers.'.format(document_id)) 
    print('-'*80)

All the pages from document EX-2.1 have been scraped for centered headers.
--------------------------------------------------------------------------------
All the pages from document EX-4.1 have been scraped for centered headers.
--------------------------------------------------------------------------------
All the pages from document EX-4.2 have been scraped for centered headers.
--------------------------------------------------------------------------------
All the pages from document EX-23.1 have been scraped for centered headers.
--------------------------------------------------------------------------------
All the pages from document EX-99.1 have been scraped for centered headers.
--------------------------------------------------------------------------------
All the pages from document EX-99.2 have been scraped for centered headers.
--------------------------------------------------------------------------------
