In [None]:
import re

from bs4 import BeautifulSoup

## Opening ONE file

Given a path to the html file, we want to get a string of text we can search.

**This sounds like a function! Input: filepath. Output: string.**

_Note: This works a little differently from the example in 4.4.4.4 of the website, because the "txt" files used there are different from the HTML files that `sec_edgar_downloader` downloads for us._

In [None]:
def extracto(fname):
    '''
    LDSF's little worker function to help with getting the text from the 
    filing-details.html files produced by sec_edgar_downloader.

    Parameters
    ----------
    fname : str or path-like
        Path to one HTML filing. The file is read as UTF-8.

    Returns
    -------
    str
        The text of the document with hidden <div> blocks removed, lowercased,
        every non-word character replaced by a space, and runs of whitespace
        collapsed to a single space (no leading/trailing whitespace).
    '''

    # when you open a file in python, the file is added to the RAM working memory
    # so you want to close the file when you are done with it (otherwise you will have
    # all 500 files open at the same time!)

    # "with" automatically closes the file once the block of code inside it ends,
    # which reduces RAM/memory requirements
    with open(fname, encoding="utf-8") as report_file:
        html = report_file.read()
        # now it automatically closes the 10-K html

    # BeautifulSoup is one good way to deal with the structure of this file:
    # it parses the html (here with the lxml HTML parser) and
    # creates an object called soup that we can search and edit
    soup = BeautifulSoup(html, 'lxml')

    # this for-loop isn't on the website, but the txt file doesn't have all the xbrl data.
    # Looking at the HTML manually, the first <div> is a large (hidden!) chunk of xbrl data,
    # so let's delete every hidden <div>.
    # The style is matched with a regex rather than the exact string 'display:none'
    # so that equivalent spellings ("display: none", "display:none;", "DISPLAY:NONE")
    # are removed as well instead of leaking into the text.
    hidden_style = re.compile(r'display\s*:\s*none', re.IGNORECASE)
    for div in soup.find_all("div", {'style': hidden_style}):
        div.decompose()

    # now, we get the text, using the good ideas from 4.4
    lower = soup.text.lower()
    no_punc = re.sub(r'\W', ' ', lower)              # punctuation/symbols -> spaces
    cleaned = re.sub(r'\s+', ' ', no_punc).strip()   # collapse whitespace runs

    return cleaned

In [None]:
# Try the function on a single filing first: one 10-K under the TSLA folder.
# The path is relative, so it is resolved from the notebook's working directory.
fname = "10k_files/sec-edgar-filings/TSLA/10-K/0001564590-20-004475/filing-details.html"

# `text` holds the cleaned, lowercased string returned by extracto()
text = extracto(fname)


In [None]:
# len(text)
# len(re.findall(' ',text))+1   # wordcount based on how extracto works is # of spaces + 1
# text[:1000]
# text[-1000:]