Libraries

In [3]:
import os
import json
import requests
from datetime import datetime
from ratelimit import limits, sleep_and_retry
from bs4 import BeautifulSoup
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

SIC Code Map

In [4]:
def get_sic_list():
    """Scrape the SEC's SIC code table into a lookup dictionary.

    Returns:
        dict: maps the text of each row's first cell (the SIC code) to
        ``{'office': <second cell>, 'industry': <third cell>}``. Cell text is
        whitespace-stripped so the keys can be matched against other sources.

    Raises:
        requests.HTTPError: if the SEC page answers with an error status.
        ValueError: if the page contains no ``<table>`` element.
    """
    url = 'https://www.sec.gov/corpfin/division-of-corporation-finance-standard-industrial-classification-sic-code-list'
    response = requests.get(url, headers={'User-Agent': 'Simeon Kolev simeon_kolev@outlook.com'}, timeout=30)
    # Fail loudly on an error page instead of trying to scrape it.
    response.raise_for_status()
    table = BeautifulSoup(response.content, 'html.parser').find('table')
    if table is None:
        raise ValueError('SIC code table not found in the SEC response')
    sic_map = {}
    # The first row is skipped, as in the original code (presumably the header row).
    for row in table.find_all('tr')[1:]:
        cells = [cell.get_text(strip=True) for cell in row.find_all('td')]
        if len(cells) < 3:
            # Spacer or malformed rows carry no code/office/industry triple.
            continue
        sic_map[cells[0]] = {'office': cells[1], 'industry': cells[2]}
    return sic_map

# Fetch the SIC table once; sic_list is later passed to get_cik_information and dumped to sic_list.json.
sic_list = get_sic_list()

Existing CIKs

In [6]:
def get_cik_list():
    """Download the SEC ticker/exchange file and return every CIK as a string.

    Returns:
        list[str]: ``str()`` of the first element of each row under the
        ``data`` key; an empty list when that key is missing or null.

    Raises:
        requests.HTTPError: if the SEC endpoint answers with an error status.
    """
    url = 'https://www.sec.gov/files/company_tickers_exchange.json'
    response = requests.get(url, headers={'User-Agent': 'Simeon Kolev simeon_kolev@outlook.com'}, timeout=30)
    response.raise_for_status()
    # `or []` keeps a missing/null 'data' key from raising a TypeError.
    rows = response.json().get('data') or []
    return [str(row[0]) for row in rows]

# Fetch all CIKs up front; the later cells iterate over cik_list to query per-company endpoints.
cik_list = get_cik_list()

### Labels & Descriptions
#### Uses API to extract labels and descriptions of accounting metrics, run time ~ 1 hour
##### Code run schedule ~ annually

In [8]:
def get_labels_descriptions(cik_list):
    """Collect the label -> description pairs of every XBRL metric, one CIK at a time.

    Args:
        cik_list: iterable of CIK strings; each is zero-padded to 10 digits
            to build the companyfacts URL.

    Returns:
        dict: maps each metric's ``label`` to its ``description``. A label seen
        for several companies keeps the description read last.

    CIKs whose request or payload fails are reported and skipped (best effort).
    """
    labels_descriptions = {}
    headers = {'User-Agent': 'Simeon Kolev simeon_kolev@outlook.com'}
    with tqdm(total=len(cik_list), desc='Processing', unit='URL') as pbar:
        for cik in cik_list:
            try:
                url = f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik.zfill(10)}.json'
                response = requests.get(url, headers=headers, timeout=30)
                response.raise_for_status()
                facts = response.json().get('facts') or {}
                for metrics in facts.values():
                    for details in metrics.values():
                        labels_descriptions[details.get('label')] = details.get('description')
            # `Exception` (not a bare except) so a kernel interrupt still stops this long loop.
            except Exception as e:
                print(f"Error fetching data for CIK {cik}: {e}")
            pbar.update(1)
    return labels_descriptions

# Serial run: one companyfacts request per entry of cik_list.
labels_descriptions = get_labels_descriptions(cik_list)

Processing:   1%|          | 60/10188 [00:31<1:43:34,  1.63URL/s]

In [None]:
RATE_LIMIT = 10
ONE_SECOND = 1


def get_labels_descriptions(cik_list):
    """Collect the label -> description pairs of every XBRL metric using worker threads.

    Each HTTP request goes through a limiter of ``RATE_LIMIT`` calls per
    ``ONE_SECOND``.

    Args:
        cik_list: iterable of CIK strings; each is zero-padded to 10 digits
            to build the companyfacts URL.

    Returns:
        dict: maps each metric's ``label`` to its ``description``.

    CIKs whose request or payload fails are reported and skipped (best effort).
    """
    headers = {'User-Agent': 'Simeon Kolev simeon_kolev@outlook.com'}

    # The limiter has to wrap the per-request call. Decorating
    # get_labels_descriptions itself only throttles that single outer call.
    @sleep_and_retry
    @limits(calls=RATE_LIMIT, period=ONE_SECOND)
    def throttled_get(url):
        return requests.get(url, headers=headers, timeout=30)

    def fetch_labels_descriptions(cik):
        found = {}
        try:
            url = f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik.zfill(10)}.json'
            response = throttled_get(url)
            response.raise_for_status()
            facts = response.json().get('facts') or {}
            for metrics in facts.values():
                for details in metrics.values():
                    found[details.get('label')] = details.get('description')
        except Exception as e:
            print(f"Error fetching data for CIK {cik}: {e}")
        return found

    labels_descriptions = {}
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = {executor.submit(fetch_labels_descriptions, cik): cik for cik in cik_list}
        for future in tqdm(as_completed(futures), total=len(futures), desc='Processing', unit='URL'):
            # Merge in the calling thread so workers never write to a shared dict.
            labels_descriptions.update(future.result())

    return labels_descriptions

# Runs the threaded get_labels_descriptions defined in this cell and rebinds labels_descriptions.
labels_descriptions = get_labels_descriptions(cik_list)

### Submissions by CIK
#### Uses SEC API to extract descriptive values for the company, run time ~ 35 minutes
##### Code run schedule ~ quarterly

In [None]:
def get_cik_information(cik_list, sic_list):
    """Fetch descriptive company fields from the SEC submissions endpoint, one CIK at a time.

    Args:
        cik_list: iterable of CIK strings; each is zero-padded to 10 digits
            to build the submissions URL.
        sic_list: dict mapping a SIC code to ``{'office': ..., 'industry': ...}``.

    Returns:
        dict: maps each CIK to a record with the keys name, ticker, exchange,
        sic, office, industry, ein, category, tax_state and fiscal_year_end.
        A missing ticker, exchange or SIC mapping is stored as ``None`` rather
        than dropping the whole company.

    CIKs whose request or payload fails are reported and left out.
    """
    headers = {'User-Agent': 'Simeon Kolev simeon_kolev@outlook.com'}
    filings = {}
    with tqdm(total=len(cik_list), desc='Processing', unit='URL') as pbar:
        for cik in cik_list:
            try:
                url = f'https://data.sec.gov/submissions/CIK{cik.zfill(10)}.json'
                response = requests.get(url, headers=headers, timeout=30)
                response.raise_for_status()
                js = response.json()
                sic = js.get('sic')
                sic_entry = sic_list.get(sic) or {}
                name = js.get('name')
                tickers = js.get('tickers') or []
                exchanges = js.get('exchanges') or []
                # The first exchange may itself be null, so guard before upper().
                exchange = exchanges[0] if exchanges else None
                filings[cik] = {
                    'name': name.upper() if name else name,
                    'ticker': tickers[0] if tickers else None,
                    'exchange': exchange.upper() if exchange else exchange,
                    'sic': sic,
                    'office': sic_entry.get('office'),
                    'industry': sic_entry.get('industry'),
                    'ein': js.get('ein'),
                    'category': js.get('category'),
                    'tax_state': js.get('stateOfIncorporation'),
                    'fiscal_year_end': js.get('fiscalYearEnd')}
            # `Exception` (not a bare except) so a kernel interrupt still stops the loop.
            except Exception as e:
                print(f"Error fetching data for CIK {cik}: {e}")
            pbar.update(1)
    return filings

# Serial run: one submissions request per entry of cik_list; sic_list supplies office/industry.
cik_information = get_cik_information(cik_list, sic_list)

In [None]:
def get_cik_information(cik_list, sic_list, max_workers=1):
    """Fetch descriptive company fields from the SEC submissions endpoint via a thread pool.

    Args:
        cik_list: iterable of CIK strings; each is zero-padded to 10 digits
            to build the submissions URL.
        sic_list: dict mapping a SIC code to ``{'office': ..., 'industry': ...}``.
        max_workers: number of worker threads (default 1, the value the
            notebook used before this became a parameter).

    Returns:
        dict: maps each CIK to a record with the keys name, ticker, exchange,
        sic, office, industry, ein, category, tax_state and fiscal_year_end.
        A missing ticker, exchange or SIC mapping is stored as ``None`` rather
        than dropping the whole company.

    CIKs whose request or payload fails are reported and left out.
    """
    headers = {'User-Agent': 'Simeon Kolev simeon_kolev@outlook.com'}

    def fetch_cik_data(cik):
        """Return the record for one CIK, or None when it could not be fetched."""
        try:
            url = f'https://data.sec.gov/submissions/CIK{cik.zfill(10)}.json'
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            js = response.json()
            sic = js.get('sic')
            sic_entry = sic_list.get(sic) or {}
            name = js.get('name')
            tickers = js.get('tickers') or []
            exchanges = js.get('exchanges') or []
            # The first exchange may itself be null, so guard before upper().
            exchange = exchanges[0] if exchanges else None
            return {
                'name': name.upper() if name else name,
                'ticker': tickers[0] if tickers else None,
                'exchange': exchange.upper() if exchange else exchange,
                'sic': sic,
                'office': sic_entry.get('office'),
                'industry': sic_entry.get('industry'),
                'ein': js.get('ein'),
                'category': js.get('category'),
                'tax_state': js.get('stateOfIncorporation'),
                'fiscal_year_end': js.get('fiscalYearEnd')
            }
        except Exception as e:
            print(f"Error fetching data for CIK {cik}: {e}")
            return None

    filings = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(fetch_cik_data, cik): cik for cik in cik_list}
        for future in tqdm(as_completed(futures), total=len(futures), desc='Processing', unit='URL'):
            record = future.result()
            if record is not None:
                # Results are merged here, in the calling thread, not inside the workers.
                filings[futures[future]] = record

    return filings

# Runs the threaded get_cik_information defined in this cell and rebinds cik_information.
cik_information = get_cik_information(cik_list, sic_list)

### Company Facts by CIK
#### Uses bulk data folder to extract accounting metrics by cik, run time ~ XX minutes
##### Code run schedule ~ quarterly
##### Update Company Facts folder ~ quarterly

In [None]:
# This code requires the bulk data content from the companyfacts zip file to be extracted
def folder_companyfacts(folder_path):
    """Build a per-company metric dictionary from extracted companyfacts JSON files.

    Args:
        folder_path: directory whose entries are each loaded as one JSON file.

    Returns:
        dict: maps ``str(js['cik'])`` to ``{metric: values}``, where ``values``
        is the first entry of that metric's ``units`` mapping. Metrics with an
        empty ``units`` mapping are skipped.

    Files that cannot be read or parsed are reported and skipped, so they can
    no longer be stored under the previous file's CIK.
    """
    filings_database = {}
    files = os.listdir(folder_path)
    with tqdm(total=len(files), desc='Processing Files', unit='file') as pbar:
        for file_name in files:
            try:
                # `with` closes the handle even when json.load raises.
                with open(os.path.join(folder_path, file_name), 'r', encoding='utf-8') as handle:
                    js = json.load(handle)
                filings = {}
                # Register before filling so partially parsed facts are still kept.
                filings_database[str(js.get('cik'))] = filings
                for metrics in (js.get('facts') or {}).values():
                    for metric, details in metrics.items():
                        units = details.get('units') or {}
                        if units:
                            filings[metric] = next(iter(units.values()))
            except Exception as e:
                print(f"Error processing file {file_name}: {e}")
            pbar.update(1)
    return filings_database
#company_facts = folder_companyfacts(r"C:\EDGAR\companyfacts")

# This code requires the correct URL to access each CIK's SEC filings
def api_company_facts(cik_list):
    """Download every company's XBRL facts from the SEC API, one CIK at a time.

    Args:
        cik_list: iterable of CIK strings; each is zero-padded to 10 digits
            to build the companyfacts URL.

    Returns:
        dict: maps each CIK to ``{metric: values}``, where ``values`` is the
        first entry of that metric's ``units`` mapping. A CIK whose request or
        payload fails maps to an empty dict.
    """
    company_facts = {}
    headers = {'User-Agent': 'Simeon Kolev simeon_kolev@outlook.com'}
    with tqdm(total=len(cik_list), desc='Processing', unit='URL') as pbar:
        for cik in cik_list:
            # Reset before the request: otherwise a failed CIK would be stored
            # with the previous company's metrics (or raise NameError on the first one).
            filings = {}
            try:
                url = f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik.zfill(10)}.json'
                response = requests.get(url, headers=headers, timeout=30)
                response.raise_for_status()
                facts = response.json().get('facts') or {}
                for metrics in facts.values():
                    for metric, details in metrics.items():
                        units = details.get('units') or {}
                        if units:
                            filings[metric] = next(iter(units.values()))
            except Exception as e:
                print(f"Error fetching data for CIK {cik}: {e}")
            company_facts[cik] = filings
            pbar.update(1)
    return company_facts

# Uses the API variant; the folder_companyfacts call above is left commented out.
company_facts = api_company_facts(cik_list)

In [None]:
def api_company_facts(cik_list):
    """Download every company's XBRL facts from the SEC API using 10 worker threads.

    Args:
        cik_list: iterable of CIK strings; each is zero-padded to 10 digits
            to build the companyfacts URL.

    Returns:
        dict: maps each CIK to ``{metric: values}``, where ``values`` is the
        first entry of that metric's ``units`` mapping. Metrics without units
        are skipped; a CIK whose download fails keeps whatever was parsed
        before the error (usually an empty dict).
    """

    def fetch_company_facts(cik):
        """Return ``(cik, metrics)`` for one company; errors are printed, not raised."""
        collected = {}
        try:
            endpoint = f'https://data.sec.gov/api/xbrl/companyfacts/CIK{cik.zfill(10)}.json'
            payload = requests.get(endpoint, headers={'User-Agent': 'Simeon Kolev simeon_kolev@outlook.com'}).json()
            for taxonomy in payload.get('facts', {}):
                concepts = payload.get('facts').get(taxonomy, {})
                for concept in concepts:
                    unit_map = concepts.get(concept).get('units', {})
                    if not unit_map:
                        continue
                    collected[concept] = next(iter(unit_map.values()))
        except Exception as e:
            print(f"Error fetching data for CIK {cik}: {e}")
        return cik, collected

    company_facts = {}
    with ThreadPoolExecutor(max_workers=10) as pool:
        pending = {pool.submit(fetch_company_facts, cik): cik for cik in cik_list}
        progress = tqdm(as_completed(pending), total=len(pending), desc='Processing', unit='URL')
        for finished in progress:
            done_cik, metrics = finished.result()
            company_facts[done_cik] = metrics
    return company_facts

# Runs the threaded api_company_facts defined in this cell and rebinds company_facts.
company_facts = api_company_facts(cik_list)

### Update Company Facts
#### Uses SEC Daily-Index to check CIKs requiring updates, run time ~ 1 second
##### Code run schedule ~ daily
##### Update URL with FY & FP ~ annually, quarterly

In [None]:
def update_filings(date=None):
    """Return the CIKs that filed a 10-Q or 10-K on a given day, per the SEC daily index.

    Args:
        date: ``datetime`` of the index to read; defaults to today. The year
            and quarter folder of the URL are derived from this date.

    Returns:
        set[str]: CIKs (first pipe-separated field) of every index line whose
        third field is ``10-Q`` or ``10-K``. Empty when the index cannot be
        fetched; the error is printed.
    """
    day = date or datetime.today()
    stamp = day.strftime('%Y%m%d')
    quarter = (day.month - 1) // 3 + 1
    url = f'https://www.sec.gov/Archives/edgar/daily-index/{day.year}/QTR{quarter}/master.{stamp}.idx'
    desired_forms = ('10-Q', '10-K')
    # Created before the request so a failed download still returns a set.
    ciks = set()
    try:
        response = requests.get(url, headers={'User-Agent': 'Simeon Kolev simeon_kolev@outlook.com'}, timeout=30)
        response.raise_for_status()
    except Exception as e:
        print(f"Error fetching daily index {url}: {e}")
        return ciks
    for line in response.text.splitlines():
        fields = line.split('|')
        # Header, separator and blank lines have too few fields or a
        # non-matching form, so no fixed header offset is needed.
        if len(fields) >= 3 and fields[2].strip() in desired_forms:
            ciks.add(fields[0].strip())
    return ciks

# Bare expression: the notebook displays the returned set as this cell's output.
update_filings()

Writing to JSON 

In [None]:
# Raw string: '\E' in a normal literal is an invalid escape sequence, which
# newer Python versions warn about. The runtime value is unchanged.
OUTPUT_DIR = r'C:\EDGAR'
# Make sure the target folder exists before the first open() call.
os.makedirs(OUTPUT_DIR, exist_ok=True)

with open(os.path.join(OUTPUT_DIR, 'sic_list.json'), 'w') as file:
    json.dump(sic_list, file, indent=0)

with open(os.path.join(OUTPUT_DIR, 'cik_information.json'), 'w') as file:
    json.dump(cik_information, file, indent=0)

with open(os.path.join(OUTPUT_DIR, 'labels_descriptions.json'), 'w') as file:
    json.dump(labels_descriptions, file, indent=0)

# with open(os.path.join(OUTPUT_DIR, 'company_facts.json'), 'w') as file:
#     json.dump(company_facts, file, indent=0)

### Form 4 Filings by CIK
#### Uses SEC Full-Index to import submissions for each existing CIK, run time ~ XXX
##### Code run schedule ~ once
##### However, requires a daily update to avoid redundant computations

In [None]:
def get_form_4_directories(start_year=2024, quarters=('QTR2',)):
    """List the archive paths of all Form 4 filings found in the SEC full index.

    Args:
        start_year: first year to scan; scanning runs through the current year.
        quarters: quarter folder names to scan within each year. The defaults
            reproduce the previously hard-coded 2024 / QTR2 selection.

    Returns:
        list[str]: the fifth pipe-separated field of every index line whose
        third field is ``4``, in index order.

    A quarter whose index cannot be fetched is reported and skipped.
    """
    directories = []  # named so the builtin `list` is no longer shadowed
    current_year = datetime.now().year
    for year in range(start_year, current_year + 1):
        for quarter in quarters:
            url = f'https://www.sec.gov/Archives/edgar/full-index/{year}/{quarter}/master.idx'
            try:
                response = requests.get(url, headers={'User-Agent': 'Simeon Kolev simeon_kolev@outlook.com'}, timeout=30)
                response.raise_for_status()
            except Exception as e:
                print(f"Error fetching {url}: {e}")
                continue
            for entry in response.text.splitlines():
                data = entry.split('|')
                # Header, separator and malformed lines are skipped one by one
                # instead of aborting the rest of the quarter.
                if len(data) >= 5 and data[2].strip() == '4':
                    directories.append(data[4].strip())
    return directories

# Paths collected here are relative to https://www.sec.gov/Archives/ (see get_form_4_submissions).
form_4_directories = get_form_4_directories()

In [None]:
# One empty bucket per known issuer CIK; get_form_4_submissions fills each bucket, keyed by reporting-owner CIK.
cik_inside_trades = {cik: {} for cik in cik_list}

def true_false(value):
    """Return True when ``value`` is '1' or any casing of 'true', otherwise False."""
    return value.lower() in ('1', 'true')

def _form4_text(node, *tags):
    """Follow ``tags`` downward from ``node`` and return the final element's text, or None if any is missing."""
    for tag in tags:
        if node is None:
            return None
        node = node.find(tag)
    return None if node is None else node.text


def _form4_flag(node, tag):
    """Return the boolean value of an optional flag element; a missing element counts as False."""
    text = _form4_text(node, tag)
    return text is not None and true_false(text.strip())


def get_form_4_submissions(directory):
    """Download one filing and append its parsed contents to ``cik_inside_trades``.

    Args:
        directory: archive path appended to ``https://www.sec.gov/Archives/``.

    Returns:
        None. Results are stored under
        ``cik_inside_trades[issuer_cik][owner_cik]``; filings whose issuer is
        not a key of ``cik_inside_trades`` are ignored.

    Elements that are absent from a filing are recorded as ``None`` (flags as
    ``False``) instead of discarding the whole filing. Any other error is
    printed and the filing is skipped without touching the shared dictionary.
    """
    try:
        url = f'https://www.sec.gov/Archives/{directory}'
        response = requests.get(url, headers={'User-Agent': 'Simeon Kolev simeon_kolev@outlook.com'}, timeout=30)
        response.raise_for_status()
        content = BeautifulSoup(response.text, 'xml')

        issuer_cik = _form4_text(content, 'issuerCik')
        owner_cik = _form4_text(content, 'rptOwnerCik')
        if issuer_cik is None or owner_cik is None:
            print(f"Error processing {directory}: missing issuerCik or rptOwnerCik")
            return
        # Leading zeros are removed so the value matches the keys of cik_inside_trades.
        issuer_cik = issuer_cik.strip().lstrip('0')
        owner_cik = owner_cik.strip().lstrip('0')

        if issuer_cik not in cik_inside_trades:
            return

        # Parse everything into local lists first; the shared dictionary is
        # only modified once the whole filing has been read.
        parsed = {
            'reporting': [{
                'date': _form4_text(content, 'periodOfReport'),
                'name': _form4_text(content, 'rptOwnerName'),
                'officer': _form4_flag(content, 'isOfficer'),
                # The officer title is free text, so it is stored as-is
                # rather than being coerced to a boolean.
                'title': _form4_text(content, 'officerTitle'),
                'director': _form4_flag(content, 'isDirector'),
                '10% owner': _form4_flag(content, 'isTenPercentOwner'),
                'other': _form4_text(content, 'isOther'),
                'aff': _form4_text(content, 'aff10b5One')}],
            'non_derivative_transactions': [{
                'date': _form4_text(entry, 'transactionDate', 'value'),
                'security': _form4_text(entry, 'securityTitle', 'value'),
                'transaction': _form4_text(entry, 'transactionCode'),
                'shares': _form4_text(entry, 'transactionShares', 'value'),
                'price': _form4_text(entry, 'transactionPricePerShare', 'value'),
                'shares_owned': _form4_text(entry, 'sharesOwnedFollowingTransaction', 'value'),
                'ownership': _form4_text(entry, 'directOrIndirectOwnership', 'value'),
                'ownership_nature': _form4_text(entry, 'natureOfOwnership', 'value'),
                'code': _form4_text(entry, 'transactionAcquiredDisposedCode', 'value'),
                'equity_swap': _form4_text(entry, 'equitySwapInvolved')}
                for entry in content.find_all('nonDerivativeTransaction')],
            'non_derivative_holdings': [{
                'security': _form4_text(entry, 'securityTitle', 'value'),
                'shares_owned': _form4_text(entry, 'sharesOwnedFollowingTransaction', 'value'),
                'ownership': _form4_text(entry, 'directOrIndirectOwnership', 'value')}
                for entry in content.find_all('nonDerivativeHolding')],
            'derivative_holdings': [{
                'date': _form4_text(entry, 'exerciseDate'),
                'security': _form4_text(entry, 'securityTitle', 'value'),
                'price': _form4_text(entry, 'conversionOrExercisePrice', 'value'),
                'expiration_date': _form4_text(entry, 'expirationDate')}
                for entry in content.find_all('derivativeHolding')],
            'underlying_securities': [{
                'security': _form4_text(entry, 'underlyingSecurityTitle', 'value'),
                'shares': _form4_text(entry, 'underlyingSecurityShares', 'value'),
                'shares_owned': _form4_text(entry, 'sharesOwnedFollowingTransaction', 'value'),
                'ownership': _form4_text(entry, 'directOrIndirectOwnership', 'value')}
                for entry in content.find_all('underlyingSecurity')],
            'securities': [{
                'date': _form4_text(entry, 'saleDate', 'value'),
                'security': _form4_text(entry, 'securitiesClassTitle', 'value'),
                'amount_sold': _form4_text(entry, 'amountOfSecuritiesSold', 'value'),
                'gross_proceeds': _form4_text(entry, 'grossProceeds', 'value'),
                'sold_past_3_months': _form4_text(entry, 'securitiesSoldInPast3Months', 'value')}
                for entry in content.find_all('sellerDetails')],
        }

        # A single setdefault call replaces the separate membership check and
        # assignment, so concurrent workers handling the same owner do not
        # replace each other's freshly created record (relies on CPython's
        # atomic dict operations - confirm if another interpreter is used).
        owner_record = cik_inside_trades[issuer_cik].setdefault(owner_cik, {
            'reporting': [],
            'non_derivative_transactions': [],
            'non_derivative_holdings': [],
            'derivative_holdings': [],
            'underlying_securities': [],
            'securities': []})
        for section, rows in parsed.items():
            owner_record[section].extend(rows)

    except Exception as e:
        print(f"Error processing {directory}: {e}")

# Fan the downloads out over 10 worker threads; each task writes into cik_inside_trades itself.
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(get_form_4_submissions, directory) for directory in form_4_directories]
    for future in tqdm(as_completed(futures), total=len(futures), desc='Processing', unit='URL'):
        # The worker has no return value; result() is called to wait for each task.
        future.result()