In [1]:
import requests
import pandas as pd
from datetime import datetime
import pprint
import json

pp = pprint.PrettyPrinter(indent=4)

In [2]:
# Base URL to retreive company facts from SEC Data API
base_url = "https://data.sec.gov/api/xbrl/companyfacts/"

# Headers to be set to receive appropriate respnse from SEC Data API
headers = {
    'User-Agent' : 'ramkumarpj@gmail.com',
    'Host' : 'data.sec.gov'
}


In [3]:
# CIK - Central Index Key - Unique key that identifies a company in SEC Database

# List of CIKs under analysis

cik_list = ['808362', '1652044', '1637459' ]

# Data elements to be explored for each CIK

data_elements = [ 'Revenues',
                      'SalesRevenueGoodsNet',
                      'SalesRevenueServicesNet',
                      'RevenueFromContractWithCustomerIncludingAssessedTax',
                      'GrossProfit',
                      'OperatingIncomeLoss',
                      'NetIncomeLoss',
                      'ResearchAndDevelopmentExpense',
                      'SellingAndMarketingExpense',
                      'ShareBasedCompensation',
                      'Depreciation',
                      'AllocatedShareBasedCompensationExpense',
                      'CostsAndExpenses',
                      'GeneralAndAdministrativeExpense',
                      'InterestExpense',
                      'LeaseAndRentalExpense',
                      'MarketingAndAdvertisingExpense',
                      'OtherAccruedLiabilitiesCurrent',
                      'EntityCommonStockSharesOutstanding',
                      'EntityPublicFloat']





In [4]:
def isQuarterlyOrAnnualFiling(start, end):

    start_date = datetime.strptime(start, '%Y-%m-%d')
    end_date = datetime.strptime(end, '%Y-%m-%d')
    
    return end_date.month - start_date.month <3 or end_date.month - start_date.month == 11

def isQuarterlyFiling(start, end):

    start_date = datetime.strptime(start, '%Y-%m-%d')
    end_date = datetime.strptime(end, '%Y-%m-%d')
    
    return end_date.month - start_date.month <3 

def isAnnualFiling(start, end):

    start_date = datetime.strptime(start, '%Y-%m-%d')
    end_date = datetime.strptime(end, '%Y-%m-%d')
    
    return end_date.month - start_date.month == 11
    

In [5]:
def getQuarterInFiling(filing):
    
    if 'start' in filing:
        if isQuarterlyFiling(filing['start'], filing['end']):
            if filing['fp'][1] != 'Y' :
                return int(filing['fp'][1])
            else :
                return datetime.strptime(filing['end'], '%Y-%m-%d').month/3
        elif isAnnualFiling(filing['start'], filing['end']):
            return 0
    else:
        if filing['fp'][1] == 'Y':
            return 0
        else:
            return int(filing['fp'][1])
    

In [6]:
def getYearInFiling(filing):
    
    end_date = datetime.strptime(filing['end'], '%Y-%m-%d') 
    
    return end_date.year

In [7]:
def extractData(tenQ_tenK_filings_list, items, key):
    
         
    if key in items.keys():
        i = 1
        for key2 in items[key]['units'].keys():

            fin_list = items[key]['units'][key2]
            tenQ_tenK_filings = [i for i in fin_list if i['form'] == '10-Q' or i['form'] == '10-K']

            #print(f"{i}. {key} {key2} 10Qs- {tenQCount}, 10Ks - {tenKCount}")

            i+=1

            tenQ_tenK_filings_list.append({
                'key' : key,
                'units' : key2,
                'filings' : tenQ_tenK_filings
            })
            

In [8]:
def transformData(tenQ_tenK_filings_list, key):
    
    if len(tenQ_tenK_filings_list) > 0 :
        filings = tenQ_tenK_filings_list[0]['filings']
    
        print(f' key={key},  Length of the list {len(filings)}')

    
        tenQ_tenK_filings_indexed = {}

        # Build a dictionary with a key using start and end fields 
        for filing in filings:
            #print(filing)
            if 'start' not in filing:
                start = '-'
            else:
                start = filing['start']
            
            index = start + ':' + filing['end']
            
    
            if index in tenQ_tenK_filings_indexed :
                tenQ_tenK_filings_indexed[index].append(filing)
            else :
                tenQ_tenK_filings_indexed[index] = [filing]
                
        # Identify multiple filings for same period
        tenQ_tenK_multiple_filings = [filing for filing in tenQ_tenK_filings_indexed.values() 
                                      if len(filing) > 1]
        
        # Identify single filings for same period
        tenQ_tenK_single_filings = [filing[0] for filing in tenQ_tenK_filings_indexed.values() 
                                    if len(filing) < 2]
        
        # Sort multiple filings in descending order of filed date 
        # Append the latest filing to single filings list
        for filing in tenQ_tenK_multiple_filings:
            filing.sort(key = lambda x: 
                        datetime.strptime(x['filed'], '%Y-%m-%d'), reverse = True)
            tenQ_tenK_single_filings.append(filing[0])
 
        # Sort single filings in ascending order of end date
        tenQ_tenK_single_filings.sort(key = lambda x: datetime.strptime(x['end'], '%Y-%m-%d'))
        
        #pp.pprint(tenQ_tenK_single_filings)
        
        print(f' key={key},  Length of the list single filings {len(tenQ_tenK_single_filings)}')
        
        # Filter single filings to keep only quatery and annual filings (eliminate 6 months, 9 months filings)
        tenQ_tenK_single_filings_qtr_annnual_filtered = [filing for filing 
                                                          in tenQ_tenK_single_filings 
                                                          if 'start' in filing and isQuarterlyOrAnnualFiling(
                                                              filing['start'], filing['end'])]
        
        print(f' key={key},  Length of the list after filtering {len(tenQ_tenK_single_filings_qtr_annnual_filtered)}')
        
        if len(tenQ_tenK_single_filings_qtr_annnual_filtered) == 0:
            tenQ_tenK_single_filings_qtr_annnual_filtered = tenQ_tenK_single_filings
        
        print(f' key={key},  Length of the list final {len(tenQ_tenK_single_filings_qtr_annnual_filtered)}')

        #pp.pprint(tenQ_tenK_single_filings_qtr_annnual_filtered) 
        
        return tenQ_tenK_single_filings_qtr_annnual_filtered
        

In [9]:
# Create List of dictionaries to be imported into MongoDB Finance Collection

# { 
# 'cik' : 4949494,
# 'dataType' : 'Revenues',
# 'value' : 4949494949,
# 'qtr' : 1
# 'year' : 2018
# }
    
def getFinanceData(tenQ_tenK_filings_transformed, cik, key):
    
    finance_records = [ {'cik' : cik,
                        'dataType' : key,
                        'value' : filing['val'],
                        'qtr' : getQuarterInFiling(filing),
                        'year' : getYearInFiling(filing)
                        } for filing in tenQ_tenK_filings_transformed]

    print(f' cik={cik}, key={key},  Length of the finance record {len(finance_records)}')
    
    #pp.pprint(finance_records)
    
    return finance_records
    
    

In [10]:
def saveDataToFile(data, file_name):
    
    with open(file_name,'w') as fi:
        fi.write(json.dumps(data, indent=4))
    
    print(f"Completed writing to file {file_name}")

In [11]:

company_data = []
finance_data = []

for cik in cik_list:
    
    # Create the URL to retrieve data for specific CIK
    url = base_url + f'CIK{cik.zfill(10)}.json'

    print(url)
    
    # Fetch the data from SEC Data API
    response = requests.get(url, headers=headers).json()

    print(f"received data for company- {response['entityName']}, cik = {response['cik']}")
    company_data.append({'cik' : response['cik'],
                        'compnanyName' : response['entityName']})
    # Get DEI Items from response
    dei = response['facts']['dei']

    # Get US-GAAP Items from response
    us_gaap = response['facts']['us-gaap']
    
    for key in data_elements: 
        tenQ_tenK_filings_list = []
        extractData(tenQ_tenK_filings_list, us_gaap, key)
        extractData(tenQ_tenK_filings_list, dei, key)
        tenQ_tenK_filings_transformed = []
        tenQ_tenK_filings_transformed = transformData(tenQ_tenK_filings_list, key)
        if tenQ_tenK_filings_transformed is not None:
            finance_data.extend(getFinanceData(tenQ_tenK_filings_transformed, cik, key))
        

print(f"Length of Finance Data = {len(finance_data)}")
saveDataToFile(finance_data, "../../data/output/finance_data.json") 
saveDataToFile(company_data, "../../data/output/company_data.json") 


https://data.sec.gov/api/xbrl/companyfacts/CIK0000808362.json
received data for company- Baker Hughes Holdings LLC, cik = 808362
 key=SalesRevenueGoodsNet,  Length of the list 117
 key=SalesRevenueGoodsNet,  Length of the list single filings 61
 key=SalesRevenueGoodsNet,  Length of the list after filtering 41
 key=SalesRevenueGoodsNet,  Length of the list final 41
 cik=808362, key=SalesRevenueGoodsNet,  Length of the finance record 41
 key=SalesRevenueServicesNet,  Length of the list 117
 key=SalesRevenueServicesNet,  Length of the list single filings 61
 key=SalesRevenueServicesNet,  Length of the list after filtering 41
 key=SalesRevenueServicesNet,  Length of the list final 41
 cik=808362, key=SalesRevenueServicesNet,  Length of the finance record 41
 key=RevenueFromContractWithCustomerIncludingAssessedTax,  Length of the list 8
 key=RevenueFromContractWithCustomerIncludingAssessedTax,  Length of the list single filings 8
 key=RevenueFromContractWithCustomerIncludingAssessedTax,  Le

received data for company- Kraft Heinz Co, cik = 1637459
 key=Revenues,  Length of the list 0
 key=Revenues,  Length of the list single filings 0
 key=Revenues,  Length of the list after filtering 0
 key=Revenues,  Length of the list final 0
 cik=1637459, key=Revenues,  Length of the finance record 0
 key=SalesRevenueGoodsNet,  Length of the list 64
 key=SalesRevenueGoodsNet,  Length of the list single filings 31
 key=SalesRevenueGoodsNet,  Length of the list after filtering 17
 key=SalesRevenueGoodsNet,  Length of the list final 17
 cik=1637459, key=SalesRevenueGoodsNet,  Length of the finance record 17
 key=RevenueFromContractWithCustomerIncludingAssessedTax,  Length of the list 100
 key=RevenueFromContractWithCustomerIncludingAssessedTax,  Length of the list single filings 47
 key=RevenueFromContractWithCustomerIncludingAssessedTax,  Length of the list after filtering 28
 key=RevenueFromContractWithCustomerIncludingAssessedTax,  Length of the list final 28
 cik=1637459, key=RevenueF