# SEC Finance Data Analysis

Our group chose to research the SEC financial records of at least ten companies over a period of five years. For each of those companies, we plan to extract 15 to 17 key data elements. We will design our database in a way that allows for expansion, if necessary.

SEC Form **10-Q** is a comprehensive report of financial performance that must be submitted quarterly by all public companies to the Securities and Exchange Commission (SEC).

SEC Form **10-K** is a comprehensive report filed annually by a publicly-traded company about its financial performance and is required by the U.S. Securities and Exchange Commission (SEC). 

SEC Form **8-K** is the “current report” companies must file with the SEC to announce major events that shareholders should know about.


In [424]:
import requests
import pandas as pd

In [425]:
# The SEC API requires callers to identify themselves; the email ID entered
# here is sent as the User-Agent header on every request.
# Surrounding whitespace (e.g. from a copy/paste) is stripped so it does not
# end up inside the HTTP header value.

email_id = input("what is your email id? [SEC requires you to provide your email ID to pull data from SEC website]").strip()


what is your email id? [The email will be used to pull data from SEC website]ramkumarpj@gmail.com


In [426]:

# Endpoint prefix of the SEC Data API "company facts" service;
# the per-company file name is appended to it when a request is made
base_url = "https://data.sec.gov/api/xbrl/companyfacts/"

# Headers sent with every request so the SEC Data API returns a proper response:
# the caller's email ID as User-Agent, plus an explicit Host
headers = {}
headers['User-Agent'] = email_id
headers['Host'] = 'data.sec.gov'


In [427]:
# CIK (Central Index Key): the unique key that identifies a company in the SEC database.
# These are the companies under analysis.
cik_list = [
    '808362',
    '1652044',
    '1637459',
]

# Data elements (tag names) to be explored for each CIK, grouped by theme.
# The groups exist only for readability; the combined order below is what matters.
_revenue_tags = [
    'Revenues',
    'SalesRevenueGoodsNet',
    'SalesRevenueServicesNet',
    'RevenueFromContractWithCustomerIncludingAssessedTax',
]

_income_tags = [
    'GrossProfit',
    'OperatingIncomeLoss',
    'NetIncomeLoss',
]

_expense_and_other_tags = [
    'ResearchAndDevelopmentExpense',
    'SellingAndMarketingExpense',
    'ShareBasedCompensation',
    'Depreciation',
    'AllocatedShareBasedCompensationExpense',
    'CostsAndExpenses',
    'GeneralAndAdministrativeExpense',
    'InterestExpense',
    'LeaseAndRentalExpense',
    'MarketingAndAdvertisingExpense',
    'OtherAccruedLiabilitiesCurrent',
]

_entity_tags = [
    'EntityCommonStockSharesOutstanding',
    'EntityPublicFloat',
]

data_elements = [
    *_revenue_tags,
    *_income_tags,
    *_expense_and_other_tags,
    *_entity_tags,
]




In [428]:

# Function to extract financial data from a dictionary of items (us-gaap or dei)
# Parameter - tenQ_tenK_list keeps finance data for each CIK
# Parameter - form_data_elements stores unique forms associated with each data element
# Parameter - fp_elements keeps unique filing periods recorded for each data element
# Parameter - items dictionary (us-gaap or dei)
# Return    - None

#{
#'key': ''
#'units': ''
#'10Q' : nbr of 10Qs
#'10K' : nbr of 10Ks
#}

def extract_data(tenQ_tenK_list, form_data_elements, fp_elements, items, elements=None):
    """Collect filing counts, form types and filing periods for the tracked data elements.

    Args:
        tenQ_tenK_list: List that receives one dict per (data element, unit)
            pair with the keys 'key', 'units', '10Qs', '10Ks', '8Ks',
            '10QAs' and '10KAs'. Mutated in place.
        form_data_elements: Dict mapping a data element name to the set of
            form types seen for it. Mutated in place.
        fp_elements: Dict mapping a data element name to the set of non-empty
            filing periods seen for it. Mutated in place.
        items: Dict of facts (us-gaap or dei) keyed by data element name; each
            value is read as ``value['units'][unit]`` -> list of fact dicts.
        elements: Optional iterable of data element names to extract. When
            omitted, the module-level ``data_elements`` list is used.

    Returns:
        None. All results are written into the three accumulator arguments.
    """
    from collections import Counter

    wanted = frozenset(data_elements if elements is None else elements)

    for key, item in items.items():
        if key not in wanted:
            continue

        # Collected across ALL units of this element; they must be real sets
        # (not tuples) so a later call can .update() the stored value.
        forms = set()
        fp = set()

        for unit, fin_list in item.get('units', {}).items():
            # .get() keeps a fact without a 'form' entry from raising KeyError
            form_counts = Counter(fact.get('form') for fact in fin_list)

            forms.update(fact['form'] for fact in fin_list if 'form' in fact)
            # Skip empty/missing periods so only real period labels are kept
            fp.update(fact['fp'] for fact in fin_list if fact.get('fp'))

            tenQ_tenK_list.append({
                'key': key,
                'units': unit,
                '10Qs': form_counts['10-Q'],
                '10Ks': form_counts['10-K'],
                '8Ks': form_counts['8-K'],
                '10QAs': form_counts['10-Q/A'],
                '10KAs': form_counts['10-K/A'],
            })

        form_data_elements.setdefault(key, set()).update(forms)
        fp_elements.setdefault(key, set()).update(fp)
            

In [429]:
    
# List that holds finance data for each CIK:
# one dict per company with its cik, name and per-element filing counts
finance_data_analysis = []

# Dict to keep track of the unique forms (10-Q, 10-K etc) associated with each data element
all_forms = {}

# Dict to keep track of the unique filing periods associated with each data element
all_fp = {}


# Iterate through all the CIKs whose data needs to be extracted
for cik in cik_list:

    # Create the URL to retrieve data for this CIK (zero-padded to 10 digits)
    url = base_url + f'CIK{cik.zfill(10)}.json'

    print(url)

    # Fetch the data from the SEC Data API. The timeout keeps a stalled
    # connection from hanging the notebook, and raise_for_status() reports an
    # HTTP error directly instead of failing later while decoding the body.
    http_response = requests.get(url, headers=headers, timeout=30)
    http_response.raise_for_status()
    response = http_response.json()

    print(f"received data for company- {response['entityName']}, cik = {response['cik']}")

    # A company may lack one of the fact sections; treat a missing one as empty
    facts = response.get('facts', {})

    # Get DEI items from response
    dei = facts.get('dei', {})

    # Get US-GAAP items from response
    us_gaap = facts.get('us-gaap', {})

    # List to keep count of 10-Q, 10-K etc for this company
    tenQ_tenK_list = []

    # Extract data for us_gaap
    extract_data(tenQ_tenK_list, all_forms, all_fp, us_gaap)

    # Extract data for dei
    extract_data(tenQ_tenK_list, all_forms, all_fp, dei)

    # Record the results for this company
    finance_data_analysis.append(
    {
        'cik' : response['cik'],
        'company' : response['entityName'],
        'data_elements_analysis' : tenQ_tenK_list
    })
    


https://data.sec.gov/api/xbrl/companyfacts/CIK0000808362.json
received data for company- Baker Hughes Holdings LLC, cik = 808362
https://data.sec.gov/api/xbrl/companyfacts/CIK0001652044.json
received data for company- Alphabet Inc., cik = 1652044
https://data.sec.gov/api/xbrl/companyfacts/CIK0001637459.json
received data for company- Kraft Heinz Co, cik = 1637459


In [430]:
# Company lookup table: one row per CIK with the company name.
# The nested per-element counts are not needed here, so that column is dropped.
company_df = pd.DataFrame(finance_data_analysis).drop(columns='data_elements_analysis')
company_df

Unnamed: 0,cik,company
0,808362,Baker Hughes Holdings LLC
1,1652044,Alphabet Inc.
2,1637459,Kraft Heinz Co


In [431]:
# Build one record per data element holding, for every CIK, the number of
# 10-Q, 10-K, 8-K, 10-Q/A and 10-K/A entries found for that element.
# Entries are named "<cik>_<form>"; a company without the element gets zeros.

# (column suffix, key of the matching count in each extracted record)
form_count_fields = (
    ('_10Q', '10Qs'),
    ('_10K', '10Ks'),
    ('_8K', '8Ks'),
    ('_10QA', '10QAs'),
    ('_10KA', '10KAs'),
)

data_elements_analysis = []

for element in data_elements:
    element_analysis = {'data_element': element}

    for fin_data in finance_data_analysis:
        cik = str(fin_data['cik'])
        matching = [
            record
            for record in fin_data['data_elements_analysis']
            if record['key'] == element
        ]

        if matching:
            # When several records match (e.g. more than one unit), the last one wins
            latest = matching[-1]
            for suffix, field in form_count_fields:
                element_analysis[cik + suffix] = latest[field]
        elif cik + '_10Q' not in element_analysis:
            for suffix, _ in form_count_fields:
                element_analysis[cik + suffix] = 0

    data_elements_analysis.append(element_analysis)
                


In [432]:
# Table of form counts: one row per data element (used as the index),
# one column per "<cik>_<form>" entry; any gaps are filled with 0
form_count_df = (
    pd.DataFrame(data_elements_analysis)
    .set_index('data_element')
    .fillna(0)
)

In [433]:
# Print the CIK/company table first so the "<cik>_<form>" column names
# of the form-count table can be matched to a company
print(company_df)
# Last expression of the cell, so the notebook displays the form-count table
form_count_df

       cik                    company
0   808362  Baker Hughes Holdings LLC
1  1652044              Alphabet Inc.
2  1637459             Kraft Heinz Co


Unnamed: 0_level_0,808362_10Q,808362_10K,808362_8K,808362_10QA,808362_10KA,1652044_10Q,1652044_10K,1652044_8K,1652044_10QA,1652044_10KA,1637459_10Q,1637459_10K,1637459_8K,1637459_10QA,1637459_10KA
data_element,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Revenues,0,0,0,0,0,42,15,3,0,0,0,0,11,0,0
SalesRevenueGoodsNet,90,27,0,0,6,0,0,0,0,0,34,30,0,6,0
SalesRevenueServicesNet,90,27,0,0,6,0,0,0,0,0,0,0,0,0,0
RevenueFromContractWithCustomerIncludingAssessedTax,8,0,0,0,0,0,0,0,0,0,54,46,11,0,0
GrossProfit,0,100,0,0,20,0,0,0,0,0,88,76,22,6,0
OperatingIncomeLoss,148,42,0,0,6,84,27,3,0,0,88,36,6,6,0
NetIncomeLoss,130,119,0,0,22,85,27,3,0,0,88,52,14,6,0
ResearchAndDevelopmentExpense,88,42,0,0,6,84,27,3,0,0,0,24,6,0,0
SellingAndMarketingExpense,0,0,0,0,0,84,27,3,0,0,0,0,0,0,0
ShareBasedCompensation,32,24,0,0,6,58,27,3,0,0,54,30,6,4,0


In [434]:
# Align the forms recorded for each data element to a fixed column order
# Insert "-" for missing values

unique_forms = ['8-K', '10-Q/A', '10-Q', '10-K/A', '10-K']

for key in all_forms:

    # Drop "-" placeholders so re-running this cell gives the same result
    present = set(all_forms[key]) - {'-'}

    # One slot per expected form, in the fixed order; membership tests
    # cannot run past the end of a short list the way positional indexing can
    aligned = [form if form in present else '-' for form in unique_forms]

    # Form types outside unique_forms are kept after the fixed columns
    # instead of shifting the alignment
    extras = sorted(present.difference(unique_forms), reverse=True)

    all_forms[key] = aligned + extras
            
        


In [435]:
# Create DataFrame to show the various forms filed for each data element:
# one row per data element (the dictionary keys become the row labels),
# one column per position in that element's list of forms

forms_df = pd.DataFrame.from_dict(all_forms, orient='index')

# Last expression of the cell, so the notebook displays the table
forms_df

Unnamed: 0,0,1,2,3,4
AllocatedShareBasedCompensationExpense,8-K,-,10-Q,10-K/A,10-K
CostsAndExpenses,8-K,-,10-Q,10-K/A,10-K
Depreciation,8-K,-,10-Q,10-K/A,10-K
GrossProfit,8-K,10-Q/A,10-Q,10-K/A,10-K
InterestExpense,8-K,10-Q/A,10-Q,10-K/A,10-K
LeaseAndRentalExpense,8-K,-,-,10-K/A,10-K
NetIncomeLoss,8-K,10-Q/A,10-Q,10-K/A,10-K
OperatingIncomeLoss,8-K,10-Q/A,10-Q,10-K/A,10-K
OtherAccruedLiabilitiesCurrent,-,-,10-Q,10-K/A,10-K
ResearchAndDevelopmentExpense,8-K,-,10-Q,10-K/A,10-K


In [436]:
# Align the filing periods recorded for each data element to a fixed column order
# Insert "-" for missing values

unique_fps = ['FY', 'Q1', 'Q2', 'Q3' ]

for key in all_fp:

    # Drop "-" placeholders so re-running this cell gives the same result
    present = set(all_fp[key]) - {'-'}

    # One slot per expected filing period, in the fixed order
    aligned = [fp if fp in present else '-' for fp in unique_fps]

    # Period labels outside unique_fps are kept after the fixed columns
    # instead of shifting the alignment
    extras = sorted(present.difference(unique_fps))

    all_fp[key] = aligned + extras

In [437]:
# Create DataFrame to show the filing periods for each data element:
# one row per data element (the dictionary keys become the row labels),
# one column per position in that element's list of filing periods

fp_df = pd.DataFrame.from_dict(all_fp, orient='index')

# Last expression of the cell, so the notebook displays the table
fp_df

Unnamed: 0,0,1,2,3
AllocatedShareBasedCompensationExpense,FY,Q1,Q2,Q3
CostsAndExpenses,FY,Q1,Q2,Q3
Depreciation,FY,-,Q2,Q3
GrossProfit,FY,Q1,Q2,Q3
InterestExpense,FY,Q1,Q2,Q3
LeaseAndRentalExpense,FY,-,-,-
NetIncomeLoss,FY,Q1,Q2,Q3
OperatingIncomeLoss,FY,Q1,Q2,Q3
OtherAccruedLiabilitiesCurrent,FY,Q1,Q2,Q3
ResearchAndDevelopmentExpense,FY,Q1,Q2,Q3
