## Filings can be searched on this page: https://www.sec.gov/cgi-bin/srch-edgar

### Use a list of CIK numbers for relevant companies and iterate through 10-K and 10-Q docs

In [1]:
import pandas as pd
import re
import requests
import unicodedata
from bs4 import BeautifulSoup

from helpers import *

## get df of 11,871 largest companies

In [2]:
# Load the company -> CIK lookup table; `names` supplies the column labels.
# NOTE(review): the CIKs are displayed unpadded (e.g. 320193), while the EDGAR query
# further down is called with the zero-padded 10-character form ('0000320193') —
# pad them before passing them on.
cik_df = pd.read_csv('data/cik_df.csv', names=['Company', 'CIK number'])
# preview the first 10 rows
cik_df.head(10)

### CIK numbers need to be left-padded with 0's to 10 digits

Unnamed: 0,Company,CIK number
0,aapl,320193
1,msft,789019
2,amzn,1018724
3,goog,1652044
4,fb,1326801
5,tcehy,1293451
6,tsla,1318605
7,baba,1577552
8,brk-a,1067983
9,tsm,1046179


## Identify table of contents elements

In [3]:
def go_to_filing_doc_location(cik, years, doc_list):
    '''
    searches EDGAR for a company's filings and collects the links to their text versions

    one search request is issued per doc type in doc_list, and the '[text]' links
    from every result page are gathered into a single list

    inputs:
    cik - company cik number as a string (it is concatenated into the query url)
    years - two-element sequence of year strings: (first year, last year)
    doc_list - list of doc types to search for, e.g. ['10-K', '10-Q']

    returns:
    list of 'https://www.sec.gov/...' links in doc_list order (empty list if nothing matched)
    '''
    # accumulate across every doc type: returning from inside the loop would
    # silently drop every doc type after the first one
    txt_links = []

    for doc in doc_list:
        path = 'https://www.sec.gov/cgi-bin/srch-edgar?text=' + cik + '+' + doc + '&first=' + years[0] + '&last=' + years[1]
        print(path)

        # get response
        response = requests.get(path)

        # parse response
        soup = BeautifulSoup(response.content, 'lxml')

        # each filing can be found in the href marked: '[text]'
        for a in soup.find_all('a', href=True, text='[text]'):
            txt_links.append('https://www.sec.gov/' + a['href'])

    return txt_links
        



In [4]:
## need a way to update and keep current master dict


def get_document_text(cik_list, years, doc_list, save_path, master_dict):
    '''
    accesses edgar database and downloads text from 10-K and/or 10-Q filings for specified companies and years

    inputs:
    cik_list - list of company cik numbers to extract data
    years - years to extract data from. Tuple for now, range of years inclusive
    doc_list - list of doc types to extract
    save_path - location to save data (currently unused: nothing is written to disk here)
    master_dict - dict that accumulates the results, indexed as master_dict[cik][doc type][filing]

    returns:
    master_dict, with the section text of every successfully parsed filing filled in
    '''

    ## for each company that we want to extract data for:
    for cik in cik_list:
        ## txt_links is a list of all the .txt links for 10-Q or 10-K filings for this specific company
        txt_links = go_to_filing_doc_location(cik, years, doc_list)

        ## for each linked filing, parse it, split it into sections and clean up each section
        for link in txt_links:
            # NOTE(review): the first element of the result is ignored, so this relies on
            # parse_filing updating master_dict in place - confirm against helpers
            tple = parse_filing(link, cik, master_dict)

            # a filing that could not be parsed does not yield the expected
            # (_, doc id, filing name) triple; report it and move on to the next link
            try:
                doc_id, filing_name = tple[1], tple[2]
            except (TypeError, IndexError, KeyError):
                print('damn we having problems. sorry man')
                continue
            print(filing_name)

            ## cut the filing into its table of contents sections
            master_dict = split_doc_by_section(master_dict, cik, doc_id, filing_name)

            ## extract actual text from each toc element for the filing
            ## (done exactly once: a second pass would re-parse already cleaned text)
            master_dict = get_text(master_dict, cik, doc_id, filing_name)

    return master_dict


In [5]:
def _find_section_heading(doc_text, part):
    '''
    returns the first bold-looking tag whose lower-cased text contains part, or None

    tries <b> tags first, then <span> tags whose markup contains
    'font-weight:bold', then <span> tags whose markup contains 'font-weight:700'
    '''
    heading = doc_text.find(lambda tag: tag.name == 'b' and part in tag.text.lower())

    ## look for font weight=bold
    if heading is None:
        print('no <b> tags found. Checking for font-weight:bold span tags')
        heading = doc_text.find(lambda tag: tag.name == 'span' and ('font-weight:bold' in str(tag)) and (part in tag.text.lower()))

    if heading is None:
        print('no font-weight:bold tags found. Checking for font-weight:700 span tags\n')
        heading = doc_text.find(lambda tag: tag.name == 'span' and ('font-weight:700' in str(tag)) and (part in tag.text.lower()))

    return heading


def split_doc_by_section(master_dict, cik, filing_type, date):
    '''
    cuts a filing's markup into one chunk per table of contents element

    the <text> element is extracted from the stored document code (which removes it
    from that document), its tables are removed, and the heading tag of every table
    of contents element is located. Each section receives the markup between the end
    of its own heading and the start of the next located heading (or the end of the
    document).

    inputs:
    master_dict - nested results dict, indexed as master_dict[cik][filing_type][date]
    cik - company cik number used as the first-level key
    filing_type - second-level key of the filing
    date - third-level key of the filing

    returns:
    master_dict, with ['table_of_contents'][section] set for every table of contents
    element; a section whose heading could not be located is stored as ''
    '''
    filing = master_dict[cik][filing_type][date]

    # parts_to_split is a list of table of contents elements
    parts_to_split = get_table_of_contents(filing['document_code'])

    doc_text = filing['document_code'].find('text').extract()
    doc_text = remove_tables(doc_text)

    # prep the document text for splitting - convert to string
    doc_string = str(doc_text)

    # record where each located heading starts and ends inside the document string;
    # headings that cannot be located are left out instead of being turned into the
    # literal delimiter 'None'
    heading_spans = {}
    for part in parts_to_split:
        heading = _find_section_heading(doc_text, part)
        if heading is None:
            continue
        heading_string = str(heading)
        start = doc_string.find(heading_string)
        if start != -1:
            heading_spans[part] = (start, start + len(heading_string))

    # every located heading start is a section boundary
    boundaries = sorted({start for start, _ in heading_spans.values()})

    # store parts in master dict: slicing by offsets keeps each section attached to
    # its own heading even when headings are missing, repeated or out of order
    for section in parts_to_split:
        if section not in heading_spans:
            filing['table_of_contents'][section] = ''
            continue
        start, end = heading_spans[section]
        next_start = next((b for b in boundaries if b > start), len(doc_string))
        filing['table_of_contents'][section] = doc_string[end:next_start]

    return master_dict

# updated_master_dict = split_doc_by_section(updated_master_dict)
# doctext, parts = split_doc_by_section(updated_master_dict)

In [6]:
# def get_text(filing_doc_code_dict):
def get_text(master_dict, cik, id, date):
    '''
    replaces each stored table of contents section with its cleaned-up text

    every value under master_dict[cik][id][date]['table_of_contents'] is passed
    through bsoup_extract_from_string, decode_text and remove_extra_spaces, in
    that order, and written back under the same key

    returns:
    master_dict (the same object, updated in place)
    '''
    sections = master_dict[cik][id][date]['table_of_contents']

    for section_name in sections:
        sections[section_name] = remove_extra_spaces(
            decode_text(bsoup_extract_from_string(sections[section_name]))
        )

    return master_dict



In [11]:
def remove_tables(soup):
    '''
    removes every <table> element from a parsed document, in place

    prints the type of soup and the length of its string form before the
    removal, and the length again afterwards

    returns:
    soup (the same object that was passed in)
    '''
    length_before = len(str(soup))
    print('soup type: ', type(soup), '\nlength of soup: ', length_before)

    for table_tag in soup.find_all('table'):
        table_tag.decompose()

    length_after = len(str(soup))
    print('length of soup after decompose: ', length_after)

    return soup
    
    


In [12]:
# Trial run: pull the 10-Q filings of CIK 0000320193 for 2020-2021 into a fresh master dict
## get_document_text -> go_to_filing_doc_location -> parse_filing -> split_doc_by_section -> get_text
updated_master_dict = get_document_text(
    cik_list=['0000320193'],
    years=['2020', '2021'],
    doc_list=['10-Q'],
    save_path=None,
    master_dict={},
)

https://www.sec.gov/cgi-bin/srch-edgar?text=0000320193+10-Q&first=2020&last=2021
new company
from parse_filing. document_id:  10-Q
aapl-20210327.htm
soup type:  <class 'bs4.element.Tag'> 
length of soup:  1689583
length of soup after decompose:  403894
no <b> tags found. Checking for font-weight:bold span tags
no font-weight:bold tags found. Checking for font-weight:700 span tags

no <b> tags found. Checking for font-weight:bold span tags
no font-weight:bold tags found. Checking for font-weight:700 span tags

no <b> tags found. Checking for font-weight:bold span tags
no font-weight:bold tags found. Checking for font-weight:700 span tags

no <b> tags found. Checking for font-weight:bold span tags
no font-weight:bold tags found. Checking for font-weight:700 span tags

no <b> tags found. Checking for font-weight:bold span tags
no font-weight:bold tags found. Checking for font-weight:700 span tags

no <b> tags found. Checking for font-weight:bold span tags
no font-weight:bold tags found. C

In [None]:
# first level of the master dict: indexed by CIK
updated_master_dict.keys()

In [None]:
# second level: indexed by filing type
updated_master_dict['0000320193'].keys()

In [None]:
# third level: one entry per parsed 10-Q filing
updated_master_dict['0000320193']['10-Q'].keys()

In [None]:
# NOTE(review): the filing key below is hard-coded; it has to be one of the keys
# listed by the previous cell for the run being inspected
updated_master_dict['0000320193']['10-Q']['a10-qq220203282020.htm'].keys()

In [None]:
# names of the table of contents sections stored for that filing
updated_master_dict['0000320193']['10-Q']['a10-qq220203282020.htm']['table_of_contents'].keys()

In [None]:
# text stored for one section of that filing
updated_master_dict['0000320193']['10-Q']['a10-qq220203282020.htm']['table_of_contents']['financial statements']

In [None]:
import numpy as np
import pandas as pd
# here is your column that you want to sort
your_column_to_sort = ['a121ad', 'b13sd', 'c12ds', 'd115fs']
# convert to a dataframe with a single 'dna' column
# (built from the list defined above; the name `a` is not defined at this point)
df = pd.DataFrame(your_column_to_sort, columns=['dna'])
df

In [None]:
# pull the integer embedded in each label:
## re.search returns the first match of the pattern within the string
## r'\d+' matches one or more consecutive digits
## .group() returns the text of that match, which int() then converts
dna_ints = [int(re.search(r'\d+', dna).group()) for dna in df['dna']]
# add a new column to your dataframe
df['dna_ints'] = dna_ints

In [None]:
# then sort your dataframe by your integer column
sorted_df = df.sort_values(by='dna_ints')
sorted_df

In [None]:
# print every element of the list except 3
a = [1,2,3,4]
for i in a:
    if i != 3:
        print(i)
    