# Learning to scrape SEC.gov - XBRL documents

I'm going to follow along with this video series from SigmaCoding on YouTube: https://youtu.be/dJymnTL3hgc

This is the second follow-along coding exercise I've done for SEC web scraping.

In [2]:
import csv 
import pprint
import pathlib
import collections
import xml.etree.ElementTree as ET

In [3]:
# Working directory: the 'fb10Q' folder inside the current working directory.
sec_directory = pathlib.Path.cwd() / 'fb10Q'

In [5]:
# Absolute paths to the documents inside sec_directory: the .htm filing plus the
# _cal, _lab and _def XML files that accompany it.
file_htm = (sec_directory / 'fb-09302019x10q.htm').resolve()
file_cal = (sec_directory / 'fb-20190930_cal.xml').resolve()
file_lab = (sec_directory / 'fb-20190930_lab.xml').resolve()
file_def = (sec_directory / 'fb-20190930_def.xml').resolve()

In [6]:
# sanity check: show the resolved absolute path of the .htm filing document
print(file_htm)

C:\Users\sfrea\Desktop\Python Files\SEC testing\fb10Q\fb-09302019x10q.htm


In [7]:
# Containers that will receive the parsed data:
#   storage_lists  -> a list
#   storage_values -> a dictionary
#   storage_gaap   -> a dictionary
storage_lists, storage_values, storage_gaap = [], {}, {}

In [8]:
# Named tuple describing one file to parse, so each piece can be accessed by name:
# the file's path, the namespace-qualified root tag, and a short label.
FilingTuple = collections.namedtuple(
    typename='FilingTuple',
    field_names=[
        'file_path',
        'namespace_root',
        'namespace_label',
    ],
)

In [9]:
# Describe every file we want to parse as a FilingTuple: where the file lives,
# the namespace-qualified tag to search for, and a short label for the file type.
files_list = [
    FilingTuple(
        file_path=file_cal,
        namespace_root=r'{http://www.xbrl.org/2003/linkbase}calculationLink',
        namespace_label='calculation',
    ),
    FilingTuple(
        file_path=file_def,
        namespace_root=r'{http://www.xbrl.org/2003/linkbase}definitionLink',
        namespace_label='definition',
    ),
    FilingTuple(
        file_path=file_lab,
        namespace_root=r'{http://www.xbrl.org/2003/linkbase}labelLink',
        namespace_label='label',
    ),
]

print(files_list)

[FilingTuple(file_path=WindowsPath('C:/Users/sfrea/Desktop/Python Files/SEC testing/fb10Q/fb-20190930_cal.xml'), namespace_root='{http://www.xbrl.org/2003/linkbase}calculationLink', namespace_label='calculation'), FilingTuple(file_path=WindowsPath('C:/Users/sfrea/Desktop/Python Files/SEC testing/fb10Q/fb-20190930_def.xml'), namespace_root='{http://www.xbrl.org/2003/linkbase}definitionLink', namespace_label='definition'), FilingTuple(file_path=WindowsPath('C:/Users/sfrea/Desktop/Python Files/SEC testing/fb10Q/fb-20190930_lab.xml'), namespace_root='{http://www.xbrl.org/2003/linkbase}labelLink', namespace_label='label')]


In [10]:
# Two groups of labels: the ones we don't want (avoids) and the ones we do (parse).
avoids = [
    'linkbase',
    'roleRef',
]
parse = [
    'label',
    'labelLink',
    'labelArc',
    'loc',
    'definitionLink',
    'definitionArc',
    'calculationArc',
]

In [11]:
# Two sets to store the keys. Sets are used instead of lists for efficiency,
# and because they never hold duplicate entries.
lab_list, cal_list = set(), set()

In [13]:
# Walk every file in files_list and print each node found under its link elements.
for file in files_list:
    # Load the XML document, then collect the elements matching this file's qualified tag.
    tree = ET.parse(file.file_path)
    elements = tree.findall(file.namespace_root)

    # Element.iter() yields the element itself first, followed by all of its
    # descendants, so one print call emits the whole subtree, one node per line.
    for element in elements:
        print(*element.iter(), sep='\n')

<Element '{http://www.xbrl.org/2003/linkbase}calculationLink' at 0x0000021F4DFB0DB0>
<Element '{http://www.xbrl.org/2003/linkbase}calculationLink' at 0x0000021F4DFB0E00>
<Element '{http://www.xbrl.org/2003/linkbase}loc' at 0x0000021F4DFB0EF0>
<Element '{http://www.xbrl.org/2003/linkbase}loc' at 0x0000021F4DFB0F40>
<Element '{http://www.xbrl.org/2003/linkbase}calculationArc' at 0x0000021F4DFA6090>
<Element '{http://www.xbrl.org/2003/linkbase}loc' at 0x0000021F4DFA60E0>
<Element '{http://www.xbrl.org/2003/linkbase}calculationArc' at 0x0000021F4DFA6130>
<Element '{http://www.xbrl.org/2003/linkbase}calculationLink' at 0x0000021F4DFA6180>
<Element '{http://www.xbrl.org/2003/linkbase}loc' at 0x0000021F4DFA61D0>
<Element '{http://www.xbrl.org/2003/linkbase}loc' at 0x0000021F4DFA6220>
<Element '{http://www.xbrl.org/2003/linkbase}calculationArc' at 0x0000021F4DFA6270>
<Element '{http://www.xbrl.org/2003/linkbase}loc' at 0x0000021F4DFA62C0>
<Element '{http://www.xbrl.org/2003/linkbase}calculatio