In [102]:
import time

import requests
import urllib
from bs4 import BeautifulSoup
import pandas as pd

In [103]:
def make_url(base_url, comp):
    url = base_url

    #add each component to base url
    for c in comp:
        url ='{}/{}'.format(url, c)

    return url

base_url = r"https://www.sec.gov"
json_format = 'index.json'

In [104]:
#request url and decode
url = r"https://www.sec.gov/Archives/edgar/data/0000050863/000005086322000007/index.json"
headers = {'user-agent': 'sample text'}
content = requests.get(url,headers=headers).json()

for file in content['directory']['item']:

    if file['name'] == 'FilingSummary.xml':
        xml_summary =base_url + content['directory']['name'] + '/' +file['name']

        print('_' * 100)
        print('File Name: ' + file['name'])
        print('File Path: ' + xml_summary)

____________________________________________________________________________________________________
File Name: FilingSummary.xml
File Path: https://www.sec.gov/Archives/edgar/data/50863/000005086322000007/FilingSummary.xml


In [105]:
# define a new base url that represents the filing folder. This will come in handy when we need to download the reports.
base_url = xml_summary.replace('FilingSummary.xml', '')

# request and parse the content
content = requests.get(xml_summary, headers=headers).content
soup = BeautifulSoup(content, 'lxml')
#print(soup)

# find the 'myreports' tag because this contains all the individual reports submitted.
reports = soup.find('myreports')

# I want a list to store all the individual components of the report, so create the master list.
master_reports = []

#loop through each report in the 'myreports' tag but avoid the last one as this will cause an error.
for report in reports.find_all('report')[:-1]:

    # let's create a dictionary to store all the different parts we need.
    report_dict = {}
    report_dict['name_short'] = report.shortname.text
    report_dict['name_long'] = report.longname.text
    report_dict['position'] = report.position.text
    report_dict['category'] = report.menucategory.text
    report_dict['url'] = base_url + report.htmlfilename.text

    # append the dictionary to the master list.
    master_reports.append(report_dict)

    # print the info to the user.
    print('-'*100)
    print(base_url + report.htmlfilename.text)
    print(report.longname.text)
    print(report.shortname.text)
    print(report.menucategory.text)
    print(report.position.text)
print(master_reports[1])

----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/50863/000005086322000007/R1.htm
0001001 - Document - Cover Page
Cover Page
Cover
1
----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/50863/000005086322000007/R2.htm
0002002 - Document - Audit Information
Audit Information
Notes
2
----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/50863/000005086322000007/R3.htm
1001003 - Statement - Consolidated Statements of Income
Consolidated Statements of Income
Uncategorized
3
----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/50863/000005086322000007/R4.htm
1002004 - Statement - Consolidated Statements of Comprehensive Income
Consolidated Sta

In [106]:
# create the list to hold the statement urls
statements_url = []

for report_dict in master_reports:

    # define the statements we want to look for.
    item1 = r"Consolidated Balance Sheets"
    item2 = r"Consolidated Statements of Income"
    item3 = r"Consolidated Statements of Cash Flows"
    item4 = r"Consolidated Statements of Stockholder's (Deficit) Equity"

    # store them in a list.
    report_list = [item1, item2, item3, item4]

    # if the short name can be found in the report list.
    if report_dict['name_short'] in report_list:

        # print some info and store it in the statements url.
        print('-'*100)
        print(report_dict['name_short'])
        print(report_dict['url'])

        statements_url.append(report_dict['url'])

----------------------------------------------------------------------------------------------------
Consolidated Statements of Income
https://www.sec.gov/Archives/edgar/data/50863/000005086322000007/R3.htm
----------------------------------------------------------------------------------------------------
Consolidated Balance Sheets
https://www.sec.gov/Archives/edgar/data/50863/000005086322000007/R5.htm
----------------------------------------------------------------------------------------------------
Consolidated Statements of Cash Flows
https://www.sec.gov/Archives/edgar/data/50863/000005086322000007/R7.htm


In [107]:
# let's assume we want all the statements in a single data set.
statements_data = []

# loop through each statement url
for statement in statements_url:

    # define a dictionary that will store the different parts of the statement.
    statement_data = {}
    statement_data['headers'] = []
    statement_data['sections'] = []
    statement_data['data'] = []
    
    # request the statement file content
    content = requests.get(statement,headers=headers).content
    report_soup = BeautifulSoup(content, 'html')

    # find all the rows, figure out what type of row it is, parse the elements, and store in the statement file list.
    for index, row in enumerate(report_soup.table.find_all('tr')):
        
        # first let's get all the elements.
        cols = row.find_all('td')
        
        # if it's a regular row and not a section or a table header
        if (len(row.find_all('th')) == 0 and len(row.find_all('strong')) == 0): 
            reg_row = [ele.text.strip() for ele in cols]
            statement_data['data'].append(reg_row)
            
        # if it's a regular row and a section but not a table header
        elif (len(row.find_all('th')) == 0 and len(row.find_all('strong')) != 0):
            sec_row = cols[0].text.strip()
            statement_data['sections'].append(sec_row)
            
        # finally if it's not any of those it must be a header
        elif (len(row.find_all('th')) != 0):            
            hed_row = [ele.text.strip() for ele in row.find_all('th')]
            statement_data['headers'].append(hed_row)
            
        else:            
            print('We encountered an error.')

    # append it to the master list.
    statements_data.append(statement_data)  

In [108]:
# Grab the proper components
balance_header =  statements_data[1]['headers']
balance_hed = [item[1:] for item in balance_header]
balance_data = statements_data[1]['data']

# Put the data in a DataFrame
balance_df = pd.DataFrame(balance_data)

# Display
print('-'*100)
print('Before Reindexing')
print('-'*100)
display(balance_df.head())

# Define the Index column, rename it, and we need to make sure to drop the old column once we reindex.
balance_df.index = balance_df[0]
balance_df.index.name = 'Category'
balance_df = balance_df.drop(0, axis = 1)

# Display
print('-'*100)
print('Before Regex')
print('-'*100)
display(balance_df.head())

# Get rid of the '$', '(', ')', and convert the '' to NaNs.
balance_df = balance_df.replace('[\$,)]','', regex=True )\
                     .replace( '[(]','-', regex=True)\
                     .replace( '', 'NaN', regex=True)

# Display
print('-'*100)
print('Before type conversion')
print('-'*100)
display(balance_df.head())
balance_df.head()

# everything is a string, so let's convert all the data to a float.
balance_df = balance_df.astype(float)

# Change the column headers
balance_df.columns = balance_hed

# Display
print('-'*100)
print('Final Product')
print('-'*100)

# show the df
balance_df

# drop the data in a CSV file if need be.
# income_df.to_csv('income_state.csv')

----------------------------------------------------------------------------------------------------
Before Reindexing
----------------------------------------------------------------------------------------------------


Unnamed: 0,0,1,2
0,"Cash, Cash Equivalents, Restricted Cash and Re...","$ 4,827","$ 5,865"
1,Short-term investments,2103,2292
2,Trading assets,21483,15738
3,"Accounts receivable, net of allowance for doub...",9457,6782
4,Inventories,10776,8427


----------------------------------------------------------------------------------------------------
Before Regex
----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,1,2
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
"Cash, Cash Equivalents, Restricted Cash and Restricted Cash Equivalents","$ 4,827","$ 5,865"
Short-term investments,2103,2292
Trading assets,21483,15738
"Accounts receivable, net of allowance for doubtful accounts",9457,6782
Inventories,10776,8427


----------------------------------------------------------------------------------------------------
Before type conversion
----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,1,2
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
"Cash, Cash Equivalents, Restricted Cash and Restricted Cash Equivalents",4827,5865
Short-term investments,2103,2292
Trading assets,21483,15738
"Accounts receivable, net of allowance for doubtful accounts",9457,6782
Inventories,10776,8427


----------------------------------------------------------------------------------------------------
Final Product
----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,"Dec. 25, 2021","Dec. 26, 2020"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
"Cash, Cash Equivalents, Restricted Cash and Restricted Cash Equivalents",4827.0,5865.0
Short-term investments,2103.0,2292.0
Trading assets,21483.0,15738.0
"Accounts receivable, net of allowance for doubtful accounts",9457.0,6782.0
Inventories,10776.0,8427.0
Assets held for sale,6942.0,5400.0
Other current assets,2130.0,2745.0
Total current assets,57718.0,47249.0
"Property, plant and equipment, net",63245.0,56584.0
Equity investments,6298.0,5152.0


In [109]:
# Grab the proper components
income_header =  statements_data[0]['headers'][1]
income_data = statements_data[0]['data']

# Put the data in a DataFrame
income_df = pd.DataFrame(income_data)

# Display
print('-'*100)
print('Before Reindexing')
print('-'*100)
display(income_df.head())

# Define the Index column, rename it, and we need to make sure to drop the old column once we reindex.
income_df.index = income_df[0]
income_df.index.name = 'Category'
income_df = income_df.drop(0, axis = 1)

# Display
print('-'*100)
print('Before Regex')
print('-'*100)
display(income_df.head())

# Get rid of the '$', '(', ')', and convert the '' to NaNs.
income_df = income_df.replace('[\$,)]','', regex=True )\
                     .replace( '[(]','-', regex=True)\
                     .replace( '', 'NaN', regex=True)

# Display
print('-'*100)
print('Before type conversion')
print('-'*100)
display(income_df.head())

# everything is a string, so let's convert all the data to a float.
income_df = income_df.astype(float)

# Change the column headers
income_df.columns = income_header

# Display
print('-'*100)
print('Final Product')
print('-'*100)

# show the df
income_df

# drop the data in a CSV file if need be.
# income_df.to_csv('income_state.csv')

----------------------------------------------------------------------------------------------------
Before Reindexing
----------------------------------------------------------------------------------------------------


Unnamed: 0,0,1,2,3
0,Net revenue,"$ 79,024","$ 77,867","$ 71,965"
1,Cost of sales,35209,34255,29825
2,Gross margin,43815,43612,42140
3,Research and development,15190,13556,13362
4,"Marketing, general and administrative",6543,6180,6350


----------------------------------------------------------------------------------------------------
Before Regex
----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,1,2,3
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Net revenue,"$ 79,024","$ 77,867","$ 71,965"
Cost of sales,35209,34255,29825
Gross margin,43815,43612,42140
Research and development,15190,13556,13362
"Marketing, general and administrative",6543,6180,6350


----------------------------------------------------------------------------------------------------
Before type conversion
----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,1,2,3
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Net revenue,79024,77867,71965
Cost of sales,35209,34255,29825
Gross margin,43815,43612,42140
Research and development,15190,13556,13362
"Marketing, general and administrative",6543,6180,6350


----------------------------------------------------------------------------------------------------
Final Product
----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,"Dec. 25, 2021","Dec. 26, 2020","Dec. 28, 2019"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Net revenue,79024.0,77867.0,71965.0
Cost of sales,35209.0,34255.0,29825.0
Gross margin,43815.0,43612.0,42140.0
Research and development,15190.0,13556.0,13362.0
"Marketing, general and administrative",6543.0,6180.0,6350.0
Restructuring and other charges,2626.0,198.0,393.0
Operating expenses,24359.0,19934.0,20105.0
Operating income,19456.0,23678.0,22035.0
"Gains (losses) on equity investments, net",2729.0,1904.0,1539.0
"Interest and other, net",-482.0,-504.0,484.0


In [110]:
# Grab the proper components
cf_header =  statements_data[2]['headers'][1]
cf_data = statements_data[2]['data']

# Put the data in a DataFrame
cf_df = pd.DataFrame(cf_data)

# Display
print('-'*100)
print('Before Reindexing')
print('-'*100)
display(income_df.head())

# Define the Index column, rename it, and we need to make sure to drop the old column once we reindex.
cf_df.index = cf_df[0]
cf_df.index.name = 'Category'
cf_df = cf_df.drop(0, axis = 1)

# Display
print('-'*100)
print('Before Regex')
print('-'*100)
display(cf_df.head())

# Get rid of the '$', '(', ')', and convert the '' to NaNs.
cf_df = cf_df.replace('[\$,)]','', regex=True )\
                     .replace( '[(]','-', regex=True)\
                     .replace( '', 'NaN', regex=True)

# Display
print('-'*100)
print('Before type conversion')
print('-'*100)
display(cf_df.head())

# everything is a string, so let's convert all the data to a float.
cf_df = cf_df.astype(float)

# Change the column headers
cf_df.columns = cf_header

# Display
print('-'*100)
print('Final Product')
print('-'*100)

# show the df
cf_df

# drop the data in a CSV file if need be.
# income_df.to_csv('income_state.csv')

----------------------------------------------------------------------------------------------------
Before Reindexing
----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,"Dec. 25, 2021","Dec. 26, 2020","Dec. 28, 2019"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Net revenue,79024.0,77867.0,71965.0
Cost of sales,35209.0,34255.0,29825.0
Gross margin,43815.0,43612.0,42140.0
Research and development,15190.0,13556.0,13362.0
"Marketing, general and administrative",6543.0,6180.0,6350.0


----------------------------------------------------------------------------------------------------
Before Regex
----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,1,2,3
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Cash and cash equivalents, beginning of period","$ 5,865","$ 4,194","$ 3,019"
Net income,19868,20899,21048
Depreciation,9953,10482,9204
Share-based compensation,2036,1854,1705
Restructuring and other charges,2626,198,393


----------------------------------------------------------------------------------------------------
Before type conversion
----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,1,2,3
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Cash and cash equivalents, beginning of period",5865,4194,3019
Net income,19868,20899,21048
Depreciation,9953,10482,9204
Share-based compensation,2036,1854,1705
Restructuring and other charges,2626,198,393


----------------------------------------------------------------------------------------------------
Final Product
----------------------------------------------------------------------------------------------------


Unnamed: 0_level_0,"Dec. 25, 2021","Dec. 26, 2020","Dec. 28, 2019"
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Cash and cash equivalents, beginning of period",5865.0,4194.0,3019.0
Net income,19868.0,20899.0,21048.0
Depreciation,9953.0,10482.0,9204.0
Share-based compensation,2036.0,1854.0,1705.0
Restructuring and other charges,2626.0,198.0,393.0
Amortization of intangibles,1839.0,1757.0,1622.0
"(Gains) losses on equity investments, net",-1458.0,-1757.0,-892.0
Accounts receivable,-2674.0,883.0,-935.0
Inventories,-2339.0,-687.0,-1481.0
Accounts payable,1190.0,405.0,696.0
