## Can go to this page: https://www.sec.gov/cgi-bin/srch-edgar

### Use a list of CIK numbers for relevant companies and iterate through 10-K and 10-Q docs

In [1]:
import pandas as pd
import re
import requests
import unicodedata
from bs4 import BeautifulSoup

from helpers import *

## get df of 11,871 largest companies

In [2]:
# Load the company / CIK lookup table from disk, labelling the columns
# 'Company' and 'CIK number'.
cik_df = pd.read_csv('data/cik_df.csv', names=['Company', 'CIK number'])
# Preview the first 10 rows.
cik_df.head(10)

### Note: CIK numbers need to be zero-padded to 10 digits (e.g. 320193 -> 0000320193)

Unnamed: 0,Company,CIK number
0,aapl,320193
1,msft,789019
2,amzn,1018724
3,goog,1652044
4,fb,1326801
5,tcehy,1293451
6,tsla,1318605
7,baba,1577552
8,brk-a,1067983
9,tsm,1046179


## Identify table of contents elements

In [3]:
def go_to_filing_doc_location(cik, years, doc_list):
    '''
    searches EDGAR for a company's filings and collects the link behind every '[text]' anchor

    inputs:
    cik - company cik number, as a string (it is concatenated into the search url)
    years - pair of year strings; years[0] is sent as 'first' and years[1] as 'last'
    doc_list - list of doc types to search for (e.g. '10-K', '10-Q')

    returns:
    list of link strings, in doc_list order, covering ALL doc types in doc_list.
    Empty list when doc_list is empty or no '[text]' anchors are found.
    '''
    # accumulate across every doc type; the list must live outside the loop so
    # that results for earlier doc types are not thrown away
    txt_links = []

    for doc in doc_list:
        path = ('https://www.sec.gov/cgi-bin/srch-edgar?text=' + cik + '+' + doc
                + '&first=' + years[0] + '&last=' + years[1])
        print(path)

        # get response
        response = requests.get(path)

        # parse response
        soup = BeautifulSoup(response.content, 'lxml')

        # each filing can be found in the href marked: '[text]'
        # NOTE: the 'https://www.sec.gov/' prefix is kept exactly as before so the
        # links handed to downstream parsing are unchanged
        for a in soup.find_all('a', href=True, text='[text]'):
            txt_links.append('https://www.sec.gov/' + a['href'])

    # return only after every doc type has been searched
    return txt_links
        



In [4]:
## TODO: need a way to update master_dict and keep it current


def get_document_text(cik_list, years, doc_list, save_path, master_dict):
    '''
    accesses edgar database and collects text from 10-K and/or 10-Q filings for
    specified companies and years

    inputs:
    cik_list - list of company cik numbers to extract data for
    years - years to extract data from; forwarded unchanged to go_to_filing_doc_location
    doc_list - list of doc types to extract
    save_path - location to save data (not used anywhere in this function body)
    master_dict - dict that the parsed filings are added to

    returns:
    master_dict, as returned by the last get_text call (or the dict passed in,
    untouched, when no filing links were processed)
    '''
    for company_cik in cik_list:
        # every link returned here points at one filing for this company
        filing_links = go_to_filing_doc_location(company_cik, years, doc_list)

        ## for each linked filing: parse it, split it by section, then clean the text
        for filing_link in filing_links:
            parsed = parse_filing(filing_link, company_cik, master_dict)
            # only elements 1 and 2 of parse_filing's result are used here; they are
            # the two keys below the cik level of master_dict
            filing_type = parsed[1]
            filing_key = parsed[2]

            master_dict = split_doc_by_section(master_dict, company_cik, filing_type, filing_key)

            ## extract actual text from each toc element for the filing
            master_dict = get_text(master_dict, company_cik, filing_type, filing_key)

    # layout read/written by the calls above:
    #   master_dict[cik][filing_type][filing_key]['table_of_contents'][toc_element] -> text
    return master_dict


In [5]:
# split_doc_by_section needs to take in the path to table of contents
# def split_doc_by_section(master_dict, cik, filing_type, date):


def _find_section_heading(doc_text, part):
    '''
    returns the first bold-looking tag in doc_text whose lower-cased text contains part,
    or None when no such tag exists. Tried in order: <b>, <span> with 'font-weight:bold',
    <span> with 'font-weight:700'.
    '''
    # find the table of contents element if it is within a <b> tag
    heading = doc_text.find(lambda tag: tag.name == 'b' and part in tag.text.lower())

    ## look for font weight=bold
    if heading is None:
        print('no <b> tags found. Checking for font-weight:bold span tags')
        heading = doc_text.find(lambda tag: tag.name == 'span' and ('font-weight:bold' in str(tag)) and (part in tag.text.lower()))

    if heading is None:
        print('no font-weight:bold tags found. Checking for font-weight:700 span tags\n')
        heading = doc_text.find(lambda tag: tag.name == 'span' and ('font-weight:700' in str(tag)) and (part in tag.text.lower()))

    return heading


def split_doc_by_section(master_dict, cik, filing_type, date):
    '''
    splits one filing's <text> element into one raw markup chunk per table of contents entry

    inputs:
    master_dict - nested dict; master_dict[cik][filing_type][date] must hold
                  'document_code' (parsed document) and 'table_of_contents' (dict to fill)
    cik - first-level key into master_dict
    filing_type - second-level key into master_dict
    date - third-level key into master_dict

    returns:
    master_dict, with ['table_of_contents'][section] set for every section returned by
    get_table_of_contents. A section whose heading could not be located (or that has no
    text after it) is stored as '' instead of shifting the other sections out of place.

    side effect: the <text> element is extract()-ed, i.e. removed from 'document_code'.
    '''
    filing = master_dict[cik][filing_type][date]

    # read the table of contents BEFORE the <text> element is detached below
    parts_to_split = get_table_of_contents(filing['document_code'])

    doc_text = filing['document_code'].find('text').extract()

    # keep only the headings that were actually found; a missing heading used to be
    # stringified to 'None' and used as a delimiter, splitting the document on the
    # literal word "None" and misaligning sections and chunks
    located = []
    for part in parts_to_split:
        heading = _find_section_heading(doc_text, part)
        if heading is not None:
            located.append((part, str(heading)))

    table_of_contents = filing['table_of_contents']

    # every section gets a key, in table-of-contents order
    for part in parts_to_split:
        table_of_contents[part] = ''

    if located:
        # define the regex delimiter pattern from the heading markup
        regex_delimiter_pattern = '|'.join(re.escape(markup) for _, markup in located)

        # split doc on each heading; chunk 0 is whatever precedes the first heading
        split_doc_string = re.split(regex_delimiter_pattern, str(doc_text))

        # NOTE(review): like the original, this assumes headings occur in table of
        # contents order and each heading's markup occurs once - confirm on real filings
        for (part, _), section_markup in zip(located, split_doc_string[1:]):
            table_of_contents[part] = section_markup

    return master_dict

# updated_master_dict = split_doc_by_section(updated_master_dict)
# doctext, parts = split_doc_by_section(updated_master_dict)

In [6]:
# def get_text(filing_doc_code_dict):
def get_text(master_dict, cik, id, date):
    '''
    replaces every table of contents entry of one filing with its cleaned-up text

    inputs:
    master_dict - nested dict holding master_dict[cik][id][date]['table_of_contents']
    cik - first-level key into master_dict
    id - second-level key into master_dict
    date - third-level key into master_dict

    returns:
    master_dict, with each table of contents value passed through
    bsoup_extract_from_string, decode_text and remove_extra_spaces, in that order
    '''
    sections = master_dict[cik][id][date]['table_of_contents']

    # only existing keys are reassigned, so the dict does not change size mid-loop
    for heading in sections:
        extracted = bsoup_extract_from_string(sections[heading])
        cleaned = decode_text(extracted)
        cleaned = remove_extra_spaces(cleaned)
        sections[heading] = cleaned

    return master_dict

In [7]:
# Run the pipeline for a single company (CIK 0000320193), 10-Q filings only,
# with 2020 as the first and 2021 as the last search year, starting from an
# empty master dict; save_path is passed as None.
updated_master_dict = get_document_text(['0000320193'], ['2020', '2021'], ['10-Q'], None, {})
## get_document_text -> returns the populated master dict

https://www.sec.gov/cgi-bin/srch-edgar?text=0000320193+10-Q&first=2020&last=2021
new company
from parse_filing. document_id:  10-Q
doc_id:  10-Q 
doc_filename:  aapl-20201226.htm
cik:  0000320193 
filing_type:  10-Q 
date:  aapl-20201226.htm
no <b> tags found. Checking for font-weight:bold span tags
no font-weight:bold tags found. Checking for font-weight:700 span tags

no <b> tags found. Checking for font-weight:bold span tags
no font-weight:bold tags found. Checking for font-weight:700 span tags

no <b> tags found. Checking for font-weight:bold span tags
no font-weight:bold tags found. Checking for font-weight:700 span tags

no <b> tags found. Checking for font-weight:bold span tags
no font-weight:bold tags found. Checking for font-weight:700 span tags

no <b> tags found. Checking for font-weight:bold span tags
no font-weight:bold tags found. Checking for font-weight:700 span tags

no <b> tags found. Checking for font-weight:bold span tags
no font-weight:bold tags found. Checking for

In [8]:
updated_master_dict.keys()

dict_keys(['0000320193'])

In [9]:
updated_master_dict['0000320193'].keys()

dict_keys(['10-K', '10-Q'])

In [10]:
updated_master_dict['0000320193']['10-Q'].keys()

dict_keys(['aapl-20201226.htm', 'aapl-20200627.htm', 'a10-qq220203282020.htm', 'a10-qq1202012282019.htm'])

In [11]:
updated_master_dict['0000320193']['10-Q']['a10-qq220203282020.htm'].keys()

dict_keys(['header', 'document_code', 'table_of_contents'])

In [12]:
updated_master_dict['0000320193']['10-Q']['a10-qq220203282020.htm']['table_of_contents'].keys()

dict_keys(['financial statements', 'management’s discussion and analysis of financial condition and results of operations', 'quantitative and qualitative disclosures about market risk', 'controls and procedures', 'legal proceedings', 'risk factors', 'unregistered sales of equity securities and use of proceeds', 'defaults upon senior securities', 'mine safety disclosures', 'other information', 'exhibits'])

In [13]:
updated_master_dict['0000320193']['10-Q']['a10-qq220203282020.htm']['table_of_contents']['financial statements']

'Apple Inc. CONDENSED CONSOLIDATED STATEMENTS OF OPERATIONS (Unaudited) (In millions, except number of shares which are reflected in thousands and per share amounts) Three Months Ended Six Months Ended March 28, 2020 March 30, 2019 March 28, 2020 March 30, 2019 Net sales: Products $ 44,965 $ 46,565 $ 124,069 $ 120,000 Services 13,348 11,450 26,063 22,325 Total net sales 58,313 58,015 150,132 142,325 Cost of sales: Products 31,321 32,047 83,396 80,285 Services 4,622 4,147 9,149 8,188 Total cost of sales 35,943 36,194 92,545 88,473 Gross margin 22,370 21,821 57,587 53,852 Operating expenses: Research and development 4,565 3,948 9,016 7,850 Selling, general and administrative 4,952 4,458 10,149 9,241 Total operating expenses 9,517 8,406 19,165 17,091 Operating income 12,853 13,415 38,422 36,761 Other income/(expense), net 282 378 631 938 Income before provision for income taxes 13,135 13,793 39,053 37,699 Provision for income taxes 1,886 2,232 5,568 6,173 Net income $ 11,249 $ 11,561 $ 33