In [1]:
import pandas as pd
import re
import requests
import unicodedata
from bs4 import BeautifulSoup


from helpers_v3 import *

import warnings
warnings.simplefilter("ignore", UserWarning)

In [2]:
# Read the company -> CIK lookup table, supplying the two column names explicitly.
cik_df = pd.read_csv('data/cik_df.csv', names=['company', 'cik'])
# Left-pad every CIK with zeros to a width of ten characters (values become strings).
cik_df['cik'] = cik_df['cik'].map(lambda cik: f'{cik:0>10}')
cik_df.head()

# [str(cik) for cik in cik_df['cik'][:5]]

Unnamed: 0,company,cik
0,aapl,320193
1,msft,789019
2,amzn,1018724
3,goog,1652044
4,fb,1326801


Inputs:

List of companies (as CIK numbers),

List of years,

List of form types (e.g. 10-Q)


## Set up master DF

In [3]:
## main function
# might need to add a column titled: parser_used. In case there are problems with html.parser and lxml

# Start from an empty frame that only fixes the column layout for the filings.
master_columns = ['filename', 'cik', 'filing_type', 'date', 'raw_doctext']
master_df = pd.DataFrame(columns=master_columns)
master_df.head()

Unnamed: 0,filename,cik,filing_type,date,raw_doctext


In [4]:
# NOTE(review): despite the variable name, the slice keeps only the first THREE CIKs.
first_five_companies = list(map(str, cik_df['cik'][:3]))
# get_data_from_edgar is not defined in this notebook (presumably it comes from the
# helpers_v3 star import); arguments are passed positionally: CIKs, years, form
# types, and the empty master frame.
df = get_data_from_edgar(
    first_five_companies,
    ['2019', '2020'],
    ['10-Q'],
    master_df,
)



https://www.sec.gov/cgi-bin/srch-edgar?text=0000320193+10-Q&first=2019&last=2020
https://www.sec.gov/cgi-bin/srch-edgar?text=0000789019+10-Q&first=2019&last=2020
we're having problems. Here is what we're dealing with:
filing_location: https://www.sec.gov//Archives/edgar/data/789019/000156459019001392/0001564590-19-001392.txt 
date: 01/30/2019 
cik 0000789019
looks like the problem got fixed
https://www.sec.gov/cgi-bin/srch-edgar?text=0001018724+10-Q&first=2019&last=2020


In [5]:
# Run get_table_of_contents on every raw_doctext value and keep the result in 'toc'.
df['toc'] = df['raw_doctext'].apply(get_table_of_contents)
df

Unnamed: 0,filename,cik,filing_type,date,raw_doctext,toc
0,a10-qq320196292019.htm,320193,10-Q,07/31/2019,"[\n, [10-Q\n, [1\n, <filename>a10-qq3201962920...","[financial statements, management’s discussion..."
1,a10-qq220193302019.htm,320193,10-Q,05/01/2019,"[\n, [10-Q\n, [1\n, <filename>a10-qq2201933020...","[financial statements, management’s discussion..."
2,a10-qq1201912292018.htm,320193,10-Q,01/30/2019,"[\n, [10-Q\n, [1\n, <filename>a10-qq1201912292...","[financial statements, management’s discussion..."
3,aapl-20200627.htm,320193,10-Q,07/31/2020,"[\n, [10-Q\n, [1\n, <filename>aapl-20200627.ht...","[financial statements, management’s discussion..."
4,a10-qq220203282020.htm,320193,10-Q,05/01/2020,"[\n, [10-Q\n, [1\n, <filename>a10-qq2202032820...","[financial statements, management’s discussion..."
5,a10-qq1202012282019.htm,320193,10-Q,01/29/2020,"[\n, [10-Q\n, [1\n, <filename>a10-qq1202012282...","[financial statements, management’s discussion..."
6,msft-10q_20200930.htm,789019,10-Q,10/27/2020,"[\n, [10-Q\n, [1\n, <filename>msft-10q_2020093...","[financial statements, management’s discussion..."
7,msft-10q_20200331.htm,789019,10-Q,04/29/2020,"[\n, [10-Q\n, [1\n, <filename>msft-10q_2020033...","[financial statements, management’s discussion..."
8,msft-10q_20191231.htm,789019,10-Q,01/29/2020,"[\n, [10-Q\n, [1\n, <filename>msft-10q_2019123...","[financial statements, management’s discussion..."
9,msft-10q_20190930.htm,789019,10-Q,10/23/2019,"[\n, [10-Q\n, [1\n, <filename>msft-10q_2019093...","[financial statements, management’s discussion..."


In [27]:
import re

def seperate_by_toc(df):
    '''
    Return the pieces produced by split_text_block(df), minus the first one.

    The first piece is dropped because it is the text that comes before the
    first delimiter; nothing is written back to ``df`` here.
    '''
    return split_text_block(df)[1:]
    # for section in df['toc']:



# (tag name, inline-style marker) pairs tried in order when locating a section
# heading. A marker of None means "any tag with that name" (the plain <b> case).
_HEADING_SELECTORS = (
    ('b', None),
    ('span', 'font-weight:bold'),
    ('font', 'font-weight:bold'),
    ('span', 'font-weight:700'),
    ('p', 'font-weight:bold'),
)


def _find_heading_tags(doc_text, part):
    '''Return the tags matching ``part`` for the first selector that hits, else [].'''
    for tag_name, style_marker in _HEADING_SELECTORS:
        matches = doc_text.find_all(
            lambda tag: tag.name == tag_name
            and (style_marker is None or style_marker in str(tag))
            and part in tag.text.lower()
        )
        if matches:
            return matches
    return []


def split_text_block(df):
    '''
    Split one filing's markup into pieces at its table-of-contents headings.

    Parameters
    ----------
    df : mapping-like row (e.g. one DataFrame row)
        Must provide ``df['raw_doctext']`` -- an object exposing
        ``find_all(callable)`` and ``str()`` (presumably a BeautifulSoup
        document; confirm against the caller) -- and ``df['toc']``, an iterable
        of lower-case heading strings.

    Returns
    -------
    list of str
        ``str(raw_doctext)`` split on the markup of each heading tag that was
        found. Element 0 is the text before the first heading. Headings that
        cannot be located are reported via ``print`` and contribute no
        delimiter, so the result is NOT padded to ``len(toc) + 1``. When no
        heading at all is found the whole document comes back as one element.
    '''
    doc_text = df['raw_doctext']
    parts_to_split = df['toc']

    # markup of the tag chosen for each heading; used verbatim as a split delimiter
    delimiters = []
    for part in parts_to_split:
        matches = _find_heading_tags(doc_text, part)
        if not matches:
            print("we can't find a match for ", part)
            # Previously an empty list was recorded here and later stringified to
            # '[]', which turned every literal "[]" in the document into a delimiter.
            continue

        # 'other information' uses the last matching tag; every other heading the first
        chosen = matches[-1] if part == 'other information' else matches[0]
        delimiters.append(str(chosen))

    # prep the document text for splitting - convert to string
    doc_string = str(doc_text)
    if not delimiters:
        # An empty pattern would make re.split cut between every character.
        return [doc_string]

    # define the regex delimiter pattern: any one of the heading markups, escaped
    regex_delimiter_pattern = '|'.join(map(re.escape, delimiters))

    # split doc on each break
    return re.split(regex_delimiter_pattern, doc_string)
      
    

# bug here- item getting split

In [28]:
# NOTE(review): this star import runs after seperate_by_toc / split_text_block are
# defined in the cell above; if eda_v1 exports the same names they are silently
# replaced here -- confirm which definitions are meant to be used.
from eda_v1 import *

# Split every filing row into its sections (one list of strings per row).
df['split_doctext'] = df.apply(seperate_by_toc, axis=1)
# subset_df['split_doctext'] = seperate_by_toc(subset_df)
# .copy() gives new_df its own data, so the column assignment performed on it in a
# later cell does not raise SettingWithCopyWarning or depend on copy-vs-view rules.
new_df = df[['filename', 'cik', 'filing_type', 'date', 'split_doctext', 'toc']].copy()

new_df.head()

Unnamed: 0,filename,cik,filing_type,date,split_doctext,toc
0,a10-qq320196292019.htm,320193,10-Q,07/31/2019,"[</div><div><a id=""sA78FA309548257DD8AFEBBABDD...","[financial statements, management’s discussion..."
1,a10-qq220193302019.htm,320193,10-Q,05/01/2019,"[</div><div><a id=""s64BD8FD0492850BD996F138E5F...","[financial statements, management’s discussion..."
2,a10-qq1201912292018.htm,320193,10-Q,01/30/2019,"[</div><div><a name=""sF95D3DA660AB545C98339FC4...","[financial statements, management’s discussion..."
3,aapl-20200627.htm,320193,10-Q,07/31/2020,"[</div><div id=""i89708425362d489695803554d8765...","[financial statements, management’s discussion..."
4,a10-qq220203282020.htm,320193,10-Q,05/01/2020,"[</div><div><a id=""s9C1FDF4346FF5F8DA25D1744E1...","[financial statements, management’s discussion..."


In [30]:
def add_toc_to_column(toc, split_doctext):
    '''
    Pair each toc entry with the section at the same position and store that
    section under a column named after the entry, then return the frame.

    NOTE(review): ``df`` is neither a parameter nor a local -- the assignment
    targets whatever ``df`` is bound at module level when this runs.
    '''
    paired = zip(toc, split_doctext)
    for column_name, section in paired:
        df[column_name] = section
    return df


# toc_column_df = new_df.apply(lambda row: add_toc_to_column(row['toc'], row['split_doctext']), axis=1)
# Column names come from the FIRST row's toc and are applied to every row.
toc_columns = list(new_df.iloc[0]['toc'])
# One column per list position of split_doctext; shorter lists are padded with missing values.
section_frame = pd.DataFrame(new_df['split_doctext'].tolist(), index=new_df.index)
new_df[toc_columns] = section_frame


# df = add_toc_to_column(test_df)

In [31]:
new_df

Unnamed: 0,filename,cik,filing_type,date,split_doctext,toc,financial statements,management’s discussion and analysis of financial condition and results of operations,quantitative and qualitative disclosures about market risk,controls and procedures,legal proceedings,risk factors,unregistered sales of equity securities and use of proceeds,defaults upon senior securities,mine safety disclosures,other information
0,a10-qq320196292019.htm,320193,10-Q,07/31/2019,"[</div><div><a id=""sA78FA309548257DD8AFEBBABDD...","[financial statements, management’s discussion...","</div><div><a id=""sA78FA309548257DD8AFEBBABDD1...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh..."
1,a10-qq220193302019.htm,320193,10-Q,05/01/2019,"[</div><div><a id=""s64BD8FD0492850BD996F138E5F...","[financial statements, management’s discussion...","</div><div><a id=""s64BD8FD0492850BD996F138E5F6...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh..."
2,a10-qq1201912292018.htm,320193,10-Q,01/30/2019,"[</div><div><a name=""sF95D3DA660AB545C98339FC4...","[financial statements, management’s discussion...","</div><div><a name=""sF95D3DA660AB545C98339FC4E...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh..."
3,aapl-20200627.htm,320193,10-Q,07/31/2020,"[</div><div id=""i89708425362d489695803554d8765...","[financial statements, management’s discussion...","</div><div id=""i89708425362d489695803554d87653...","</div><div style=""text-align:justify;margin-to...","</div><div style=""text-align:justify;margin-to...","</div><div style=""text-align:justify;margin-to...","</div><div style=""text-align:justify;margin-to...","</div><div style=""text-align:justify;margin-to...","</div><div style=""text-align:justify;margin-to...","</div><div style=""text-align:justify;margin-to...","</div><div style=""text-align:justify;margin-to...","</div><div style=""text-align:justify;margin-to..."
4,a10-qq220203282020.htm,320193,10-Q,05/01/2020,"[</div><div><a id=""s9C1FDF4346FF5F8DA25D1744E1...","[financial statements, management’s discussion...","</div><div><a id=""s9C1FDF4346FF5F8DA25D1744E1E...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh..."
5,a10-qq1202012282019.htm,320193,10-Q,01/29/2020,"[</div><div><a id=""s2039AF097F715C19B243C5BCAB...","[financial statements, management’s discussion...","</div><div><a id=""s2039AF097F715C19B243C5BCAB6...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh...","</div></td></tr></table><div style=""line-heigh..."
6,msft-10q_20200930.htm,789019,10-Q,10/27/2020,"[\n<p id=""INCOME_STATEMENTS"" style=""text-align...","[financial statements, management’s discussion...","\n<p id=""INCOME_STATEMENTS"" style=""text-align:...","\n<p style=""text-align:center;margin-bottom:0p...","\n<p style=""text-align:center;margin-bottom:0p...","\n<p style=""margin-bottom:0pt;margin-top:9pt;t...","\n<p style=""margin-bottom:0pt;margin-top:9pt;t...","\n<p style=""margin-bottom:0pt;margin-top:9pt;t...","\n<p style=""text-align:center;margin-bottom:0p...",,,
7,msft-10q_20200331.htm,789019,10-Q,04/29/2020,"[\n<p id=""INCOME_STATEMENTS"" style=""text-align...","[financial statements, management’s discussion...","\n<p id=""INCOME_STATEMENTS"" style=""text-align:...","\n<p style=""text-align:center;margin-bottom:0p...","\n<p style=""text-align:center;margin-bottom:0p...","\n<p style=""margin-bottom:0pt;margin-top:9pt;t...","\n<p style=""margin-bottom:0pt;margin-top:9pt;t...","\n<p style=""margin-bottom:0pt;margin-top:9pt;t...","\n<p style=""text-align:center;margin-bottom:0p...",,,
8,msft-10q_20191231.htm,789019,10-Q,01/29/2020,"[\n<p id=""INCOME_STATEMENTS"" style=""text-align...","[financial statements, management’s discussion...","\n<p id=""INCOME_STATEMENTS"" style=""text-align:...","\n<p style=""text-align:center;margin-bottom:0p...","\n<p style=""text-align:center;margin-bottom:0p...","\n<p style=""margin-bottom:0pt;margin-top:9pt;t...","\n<p style=""margin-bottom:0pt;margin-top:9pt;t...","\n<p style=""margin-bottom:0pt;margin-top:9pt;t...","\n<p style=""text-align:center;margin-bottom:0p...",,,
9,msft-10q_20190930.htm,789019,10-Q,10/23/2019,"[\n<p id=""INCOME_STATEMENTS"" style=""text-align...","[financial statements, management’s discussion...","\n<p id=""INCOME_STATEMENTS"" style=""text-align:...","\n<p style=""text-align:center;margin-bottom:0p...","\n<p style=""text-align:center;margin-bottom:0p...","\n<p style=""margin-bottom:0pt;margin-top:9pt;t...","\n<p style=""margin-bottom:0pt;margin-top:9pt;t...","\n<p style=""margin-bottom:0pt;margin-top:9pt;t...","\n<p style=""text-align:center;margin-bottom:0p...",,,


In [25]:
# Parse one 'risk factors' cell as HTML and show only its text content.
# NOTE(review): row label 14 does not appear in the displayed `new_df` output above
# (labels 0-9), and this cell's execution count (25) is lower than the cells that
# build `new_df` (27-31) -- re-check the label after a fresh Restart & Run All.
# df['risk factors'][2]
soup = BeautifulSoup(new_df['risk factors'][14], 'html.parser')
soup.text
# new_df['risk factors'][0]

'Please carefully consider the following discussion of significant factors, events, and uncertainties that make an investment in our securities risky. The events and consequences discussed in these risk factors could, in circumstances we may or may not be able to accurately predict, recognize, or control, have a material adverse effect on our business, growth, reputation, prospects, financial condition, operating results (including components of our financial results), cash flows, liquidity, and stock price. These risk factors do not identify all risks that we face; our operations could also be affected by factors, events, or uncertainties that are not presently known to us or that we currently do not consider to present significant risks to our operations. In addition to the effects of the COVID-19 pandemic and resulting global disruptions on our business and operations discussed in Item 2 of Part I, “Management’s Discussion and Analysis of Financial Condition and Results of Operation

In [None]:
# Parse one 'risk factors' cell as HTML and show only its text content.
# NOTE(review): the visible cells assign the per-section columns on `new_df`, not on
# `df`; confirm that `df` actually has a 'risk factors' column before running this.
soup = BeautifulSoup(df['risk factors'][2], 'html.parser')
soup.text

In [None]:
df['legal proceedings'][0]

In [None]:
m_df.head()

In [None]:
# Print the length of each row's 'split_doctext' value.
# NOTE(review): `m_df` is not assigned in any visible cell -- confirm where it comes from.
for t in m_df['split_doctext']:
    print(len(t))

In [None]:
# Take element [2] of the 'split_doctext' value at label 2, apply Unicode NFKD
# normalization, then parse it as HTML and show only the text content.
# NOTE(review): `m_df` is not assigned in any visible cell -- confirm where it comes from.
htmlStr = m_df['split_doctext'][2][2]
htmlStr = unicodedata.normalize('NFKD', htmlStr)

soup = BeautifulSoup(htmlStr, 'html.parser')
soup.text

In [None]:
# Print `m_df` as a plain-text table via the third-party `tabulate` package.
# NOTE(review): `m_df` is not assigned in any visible cell, and this import sits
# outside the notebook's top import cell -- confirm both before relying on this cell.
from tabulate import tabulate
# print(tabulate(df, headers='keys', tablefmt='psql'))
print(tabulate(m_df))