## Imports

In [1]:
# To read company tickers file json files
import json
# To access DataFrame
import pandas as pd
# To read files
import csv

## Constants

In [2]:
# Root directory of the data files read by this notebook
DATA_PATH = 'data'

# Supported form types
FORM_10K = '10-K'
FORM_10Q = '10-Q'

# Form we are interested in; must be one of the form constants defined above
FORM = FORM_10K

# Submissions (SUB) fields we are interested in
SUB_FIELDS = [
    'adsh', 'cik', 'name', 'sic', 'countryba', 'fye',
    'form', 'period', 'fy', 'fp', 'filed', 'accepted',
]

# Numbers (NUM) fields we are interested in
NUM_FIELDS = [
    'adsh', 'tag', 'version', 'ddate', 'qtrs', 'uom', 'value',
]

# Symbols we are interested in (other candidates: 'MSFT', 'AMZN', 'TSLA', 'WMT')
SYMBOLS = ['GOOG', 'NVDA', 'ADBE']

## Get symbol, cik mapping
### Reference: __[Access Companies SEC Filings Using Python](https://medium.datadriveninvestor.com/access-companies-sec-filings-using-python-760e6075d3ad)__
### json file used in this method -> https://www.sec.gov/files/company_tickers.json

In [3]:
def get_company_tickers() -> pd.DataFrame:
    '''
    Returns a DataFrame of company records (CIK, title) indexed by ticker symbol

    The data is read from `{DATA_PATH}/company_tickers.json`, which is expected to be a
    JSON object whose values are flat records containing at least a `ticker` key.

    Returns:
    pd.DataFrame: a DataFrame indexed by ticker; an empty DataFrame is returned (after
    printing a message) if the file is missing, is not valid JSON or cannot be processed
    '''
    # Specify the full path to load JSON data
    file_name = f'{DATA_PATH}/company_tickers.json'

    try:
        # JSON text is UTF-8; do not depend on the platform default encoding
        with open(file_name, 'r', encoding='utf-8') as file:
            company_records = json.load(file)
        # One row per record; the outer JSON keys are only positional labels and are dropped
        return pd.DataFrame(list(company_records.values())).set_index('ticker')
    except FileNotFoundError:
        print(f"File '{file_name}' not found.")
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON data: {e}")
    except Exception as e:
        # Deliberate best effort: report the problem and fall through to the empty result
        print(f"An error occurred: {e}")
    return pd.DataFrame()

In [4]:
# Get ticker, cik mappings
# Loaded once here; later cells look up CIKs for the SYMBOLS in this frame.
tickers_cik = get_company_tickers()
# Bare name as the last expression so the notebook displays the frame
tickers_cik

Unnamed: 0_level_0,cik_str,title
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
MSFT,789019,MICROSOFT CORP
AAPL,320193,Apple Inc.
GOOGL,1652044,Alphabet Inc.
AMZN,1018724,AMAZON COM INC
NVDA,1045810,NVIDIA CORP
...,...,...
IHICF,1567526,IHI Corporation/ADR
FUPPF,1534043,Fuchs Petrolub SE/ADR
FUPEY,1534043,Fuchs Petrolub SE/ADR
AZTGY,1921332,AZTECH GLOBAL LTD./ADR


## Maps CIKs to tickers in SYMBOLS

In [5]:
# CIK -> ticker lookup, restricted to the tickers listed in SYMBOLS
# (a KeyError is raised if a symbol is missing from tickers_cik)
cik_ticker_dict = {
    tickers_cik.loc[symbol, 'cik_str']: symbol
    for symbol in SYMBOLS
}
cik_ticker_dict

{1652044: 'GOOG', 1045810: 'NVDA', 796343: 'ADBE'}

## Find CIK for Ticker

In [6]:
def get_cik(ticker:str) -> 'int | None':
    '''
    Returns the CIK associated with the stock symbol

    Parameters:
    ticker (str): the ticker symbol for a stock

    Returns:
    int: CIK associated with the given ticker symbol, as stored in the `cik_str` column of
    tickers_cik (an integer, not a zero-padded string), or None if the ticker is not part of
    the SYMBOLS constant or is missing from tickers_cik
    '''
    if ticker not in SYMBOLS or ticker not in tickers_cik.index:
        return None
    # Single .loc lookup with row and column labels instead of chained indexing
    return tickers_cik.loc[ticker, 'cik_str']

## Load Submissions file

In [8]:
def load_sub(year:int, qtr:int) -> pd.DataFrame:
    '''
    Returns the submissions as a DataFrame object

    Only rows whose CIK is a key of cik_ticker_dict and whose form equals the FORM constant
    are kept, and only the SUB_FIELDS columns are retained. A custom `dataset` column holding
    '<year>q<qtr>' is added.

    Parameters:
    year (int): the year of the submission
    qtr (int): the quarter, valid values are 1 to 4

    Returns:
    pd.DataFrame: submissions for given year and quarter as a DataFrame; it has the
    SUB_FIELDS columns plus `dataset` even when no row matches.

    Raises:
    FileNotFoundError: if `{DATA_PATH}/{year}q{qtr}/sub.txt` does not exist
    '''
    # Holds dictionaries to add to the DF
    records = []

    # Construct the filename
    fname = f'{DATA_PATH}/{year}q{qtr}/sub.txt'
    # Open file
    with open(fname, 'r', encoding='utf8') as file_obj:
        # Save the heading as a list
        heading = next(file_obj).split()
        # Locate the CIK column from the heading instead of assuming its position
        cik_pos = heading.index('cik')

        # Create reader object
        reader_obj = csv.reader(file_obj, delimiter='\t')

        # Iterate over each row in the csv file
        for row in reader_obj:
            # Blank or truncated lines have no CIK to test; skip them
            if len(row) <= cik_pos: continue
            # Cheap positional check first, to avoid building row_dict for rows we discard
            if int(row[cik_pos]) not in cik_ticker_dict: continue
            # zip heading and row to create a dictionary
            row_dict = dict(zip(heading, row))
            # Only interested in forms specified in FORM constant
            if row_dict['form'] != FORM: continue
            # Only interested in SUB_FIELDS
            records.append({field: row_dict[field] for field in SUB_FIELDS})
    # Create a DF from a list of records; explicit columns keep the schema when nothing matched
    df = pd.DataFrame(records, columns=SUB_FIELDS)
    # Custom field - adds the dataset name
    df['dataset'] = f'{year}q{qtr}'
    return df

## Create SUB DataFrame

In [9]:
# One DataFrame per (year, quarter), in chronological order
frames = [
    load_sub(year, qtr)
    for year in (2022, 2023)
    for qtr in range(1, 5)
]

# Join all the resulting DFs
sub_df = pd.concat(frames)
sub_df

Unnamed: 0,adsh,cik,name,sic,countryba,fye,form,period,fy,fp,filed,accepted,dataset
0,0001045810-22-000036,1045810,NVIDIA CORP,3674,US,131,10-K,20220131,2021,FY,20220318,2022-03-17 20:34:00.0,2022q1
1,0001652044-22-000019,1652044,ALPHABET INC.,7370,US,1231,10-K,20211231,2021,FY,20220202,2022-02-01 21:08:00.0,2022q1
2,0000796343-22-000032,796343,ADOBE INC.,7372,US,1130,10-K,20211130,2021,FY,20220121,2022-01-21 16:04:00.0,2022q1
0,0001045810-23-000017,1045810,NVIDIA CORP,3674,US,131,10-K,20230131,2022,FY,20230224,2023-02-24 17:24:00.0,2023q1
1,0001652044-23-000016,1652044,ALPHABET INC.,7370,US,1231,10-K,20221231,2022,FY,20230203,2023-02-02 21:24:00.0,2023q1
2,0000796343-23-000007,796343,ADOBE INC.,7372,US,1130,10-K,20221130,2022,FY,20230117,2023-01-17 16:34:00.0,2023q1


## Clean up SUB DataFrame

In [10]:
# Use adsh as the index
# Guarded so the cell can be re-run: once adsh is the index it is no longer a column,
# and an unconditional set_index('adsh') would raise a KeyError
if sub_df.index.name != 'adsh':
    sub_df = sub_df.set_index('adsh')

# Comply with the field format as specified in the readme.htm in the download

# CIK and sic are numeric fields
for key in ['cik','sic']:
    sub_df[key] = pd.to_numeric(sub_df[key], errors='coerce')

# Convert to date time format
for key in ['period','filed','accepted']:
    sub_df[key] = pd.to_datetime(sub_df[key])
sub_df

Unnamed: 0_level_0,cik,name,sic,countryba,fye,form,period,fy,fp,filed,accepted,dataset
adsh,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0001045810-22-000036,1045810,NVIDIA CORP,3674,US,131,10-K,2022-01-31,2021,FY,2022-03-18,2022-03-17 20:34:00,2022q1
0001652044-22-000019,1652044,ALPHABET INC.,7370,US,1231,10-K,2021-12-31,2021,FY,2022-02-02,2022-02-01 21:08:00,2022q1
0000796343-22-000032,796343,ADOBE INC.,7372,US,1130,10-K,2021-11-30,2021,FY,2022-01-21,2022-01-21 16:04:00,2022q1
0001045810-23-000017,1045810,NVIDIA CORP,3674,US,131,10-K,2023-01-31,2022,FY,2023-02-24,2023-02-24 17:24:00,2023q1
0001652044-23-000016,1652044,ALPHABET INC.,7370,US,1231,10-K,2022-12-31,2022,FY,2023-02-03,2023-02-02 21:24:00,2023q1
0000796343-23-000007,796343,ADOBE INC.,7372,US,1130,10-K,2022-11-30,2022,FY,2023-01-17,2023-01-17 16:34:00,2023q1


## Load NUM file

In [11]:
def load_num(year:int, qtr:int) -> pd.DataFrame:
    '''
    Returns the numbers as a DataFrame object

    Only rows whose accession number (adsh) is present in the index of sub_df are kept, and
    only the NUM_FIELDS columns are retained. Two custom columns are added: `fy`, copied from
    the matching sub_df row, and `dataset`, holding '<year>q<qtr>'.

    Parameters:
    year (int): the year for the numbers
    qtr (int): the quarter, valid values are 1 to 4

    Returns:
    pd.DataFrame: numbers for given year and quarter as DataFrame; it has the NUM_FIELDS
    columns plus `fy` and `dataset` even when no row matches.

    Raises:
    FileNotFoundError: if `{DATA_PATH}/{year}q{qtr}/num.txt` does not exist
    '''
    # Holds dictionaries to add to the DF
    records = []

    # Construct the filename
    fname = f'{DATA_PATH}/{year}q{qtr}/num.txt'

    # adsh -> fy lookup built once, so the per-row work is a plain dict access
    # rather than a DataFrame .loc lookup for every line of the file
    fy_by_adsh = sub_df['fy'].to_dict()

    # Open file
    with open(fname, 'r', encoding='utf8') as file_obj:
        # Save the heading as a list
        heading = next(file_obj).split()
        # Locate the adsh column from the heading instead of assuming its position
        adsh_pos = heading.index('adsh')

        # Create reader object
        reader_obj = csv.reader(file_obj, delimiter='\t')

        # Iterate over each row in the csv file
        for row in reader_obj:
            # Blank or truncated lines have no adsh to test; skip them
            if len(row) <= adsh_pos: continue
            # Only interested in rows with accession numbers (adsh) belonging to our submissions
            adsh = row[adsh_pos]
            if adsh not in fy_by_adsh: continue
            # zip heading and row to create a dictionary
            row_dict = dict(zip(heading, row))
            # Only interested in NUM_FIELDS
            filtered_dict = {field: row_dict[field] for field in NUM_FIELDS}
            # Add a column to specify the fy for a number field - custom field
            filtered_dict['fy'] = fy_by_adsh[adsh]
            records.append(filtered_dict)
    # Create a DF from a list of records; explicit columns keep the schema when nothing matched
    df = pd.DataFrame(records, columns=NUM_FIELDS + ['fy'])
    # Custom field - adds the dataset name
    df['dataset'] = f'{year}q{qtr}'
    return df

## Create NUM DataFrame

In [12]:
# One DataFrame per (year, quarter), in chronological order
frames = [
    load_num(year, qtr)
    for year in (2022, 2023)
    for qtr in range(1, 5)
]

# Join all the resulting DFs
num_df = pd.concat(frames)
num_df

Unnamed: 0,adsh,tag,version,ddate,qtrs,uom,value,fy,dataset
0,0001045810-22-000036,AccountsPayableCurrent,us-gaap/2021,20220131,0,USD,1783000000.0000,2021,2022q1
1,0001045810-22-000036,AccountsPayableCurrent,us-gaap/2021,20210131,0,USD,1149000000.0000,2021,2022q1
2,0001652044-22-000019,AccountsPayableCurrent,us-gaap/2021,20211231,0,USD,6037000000.0000,2021,2022q1
3,0001652044-22-000019,AccountsPayableCurrent,us-gaap/2021,20201231,0,USD,5589000000.0000,2021,2022q1
4,0000796343-22-000032,AccountsPayableCurrent,us-gaap/2021,20211130,0,USD,312000000.0000,2021,2022q1
...,...,...,...,...,...,...,...,...,...
1874,0001045810-23-000017,PurchasesOfPropertyAndEquipmentAndIntangibleAs...,0001045810-23-000017,20230131,4,USD,1833000000.0000,2022,2023q1
1875,0001045810-23-000017,PurchasesOfPropertyAndEquipmentAndIntangibleAs...,0001045810-23-000017,20220131,4,USD,976000000.0000,2022,2023q1
1876,0001045810-23-000017,PurchasesOfPropertyAndEquipmentAndIntangibleAs...,0001045810-23-000017,20210131,4,USD,1128000000.0000,2022,2023q1
1877,0001045810-23-000017,SharebasedCompensationArrangementbySharebasedP...,0001045810-23-000017,20230131,0,USD,158.3500,2022,2023q1


## Clean up NUM DataFrame

In [13]:
# Comply with the field format as specified in the readme.htm in the download

# Quarters and value are numeric fields; unparseable entries become NaN
num_df['qtrs'] = pd.to_numeric(num_df['qtrs'], errors='coerce')
num_df['value'] = pd.to_numeric(num_df['value'], errors='coerce')

# ddate is a date field
num_df['ddate'] = pd.to_datetime(num_df['ddate'])

num_df

Unnamed: 0,adsh,tag,version,ddate,qtrs,uom,value,fy,dataset
0,0001045810-22-000036,AccountsPayableCurrent,us-gaap/2021,2022-01-31,0,USD,1.783000e+09,2021,2022q1
1,0001045810-22-000036,AccountsPayableCurrent,us-gaap/2021,2021-01-31,0,USD,1.149000e+09,2021,2022q1
2,0001652044-22-000019,AccountsPayableCurrent,us-gaap/2021,2021-12-31,0,USD,6.037000e+09,2021,2022q1
3,0001652044-22-000019,AccountsPayableCurrent,us-gaap/2021,2020-12-31,0,USD,5.589000e+09,2021,2022q1
4,0000796343-22-000032,AccountsPayableCurrent,us-gaap/2021,2021-11-30,0,USD,3.120000e+08,2021,2022q1
...,...,...,...,...,...,...,...,...,...
1874,0001045810-23-000017,PurchasesOfPropertyAndEquipmentAndIntangibleAs...,0001045810-23-000017,2023-01-31,4,USD,1.833000e+09,2022,2023q1
1875,0001045810-23-000017,PurchasesOfPropertyAndEquipmentAndIntangibleAs...,0001045810-23-000017,2022-01-31,4,USD,9.760000e+08,2022,2023q1
1876,0001045810-23-000017,PurchasesOfPropertyAndEquipmentAndIntangibleAs...,0001045810-23-000017,2021-01-31,4,USD,1.128000e+09,2022,2023q1
1877,0001045810-23-000017,SharebasedCompensationArrangementbySharebasedP...,0001045810-23-000017,2023-01-31,0,USD,1.583500e+02,2022,2023q1


## Get Numbers for a Symbol, Tag and Year

In [14]:
def get_tag_values(symbol:str, tag:str, year:str=None) -> dict:
    '''
    Returns the values for given tag belonging to a symbol

    For every distinct `ddate` one value is reported, taken from the row with the highest
    `dataset` name. Only rows with qtrs 0, or 4 for 10-K / 1 for 10-Q (per the FORM constant),
    are considered.

    Parameters:
    symbol (str): ticker symbol, must exist in the SYMBOLS constant
    tag (str): unique identifier (name) for a tag in a specific taxonomy release
    year (str): fiscal year (compared against the `fy` column) or all the years if not specified

    Returns:
    dict: with following keys: 'symbol', the tag name (list of dicts with 'fiscalDateEnding',
    'reportedDate' and 'value') and 'cik' (zero padded to 10 digits); only 'symbol' and an
    empty tag list are returned if the symbol does not exist in the SYMBOLS constant
    '''
    # Dictionary to return
    result = {'symbol': symbol, tag: []}

    # Get the CIK for given symbol
    cik = get_cik(symbol)

    # Return the skeleton dictionary if symbol is not found.
    # The check is done before any str() conversion: str(None) is the truthy string 'None'.
    if cik is None: return result

    # Pad with zero for CIK - as per specification
    result['cik'] = str(cik).zfill(10)

    # Only interested in qtr 1 and 0 for 10-Q and 4 and 0 for 10-K
    qtr = 1 if FORM == FORM_10Q else 4

    # Accession numbers of this company's submissions, selected through the cik column.
    # Matching the CIK as a substring of adsh is unreliable: the accession prefix identifies
    # the submitter, which may be a filing agent rather than the company.
    symbol_adsh = sub_df.index[pd.to_numeric(sub_df['cik'], errors='coerce') == int(cik)]

    mask = (
        num_df['adsh'].isin(symbol_adsh)
        & (num_df['tag'] == tag)
        & num_df['qtrs'].isin([qtr, 0])
    )
    if year is not None:
        mask &= (num_df['fy'] == year)
    # Positional boolean mask: num_df keeps the repeating per-quarter row labels from concat
    df = num_df.loc[mask.to_numpy()]

    if df.empty: return result

    # Loop through using unique dates, only take the value from the latest data set
    for date in sorted(df['ddate'].unique()):
        same_date = df.loc[(df['ddate'] == date).to_numpy()]
        # Only take the first item
        record = same_date.sort_values(by=['dataset'], ascending=False).iloc[0].to_dict()
        tag_item = {
            'fiscalDateEnding': record['ddate'].strftime('%Y-%m-%d'),
            'reportedDate': sub_df.loc[record['adsh'], 'filed'].strftime('%Y-%m-%d'),
            'value': record['value']
        }
        result[tag].append(tag_item)
    return result

In [33]:
# Example call; the commented-out lines are alternative calls kept for quick experiments
# (the optional third argument restricts the result to one fiscal year)
# get_tag_values('NVDA','EarningsPerShareBasic', '2021')
get_tag_values('GOOG','EarningsPerShareBasic')
# get_tag_values('NVDA','LiabilitiesCurrent')#, '2022')
# get_tag_values('NVDA','EarningsPerShareBasic')

{'symbol': 'GOOG',
 'EarningsPerShareBasic': [{'fiscalDateEnding': '2019-12-31',
   'reportedDate': '2022-02-02',
   'value': 49.59},
  {'fiscalDateEnding': '2020-12-31',
   'reportedDate': '2023-02-03',
   'value': 2.96},
  {'fiscalDateEnding': '2021-12-31',
   'reportedDate': '2023-02-03',
   'value': 5.69},
  {'fiscalDateEnding': '2022-12-31',
   'reportedDate': '2023-02-03',
   'value': 4.59}],
 'cik': '0001652044'}

## Utility method to get Submissions for a Symbol and Year

In [16]:
def get_subs_for_symbol(symbol:str, year:str=None) -> pd.DataFrame:
    '''
    Looks up the rows of sub_df that belong to a symbol

    Parameters:
    symbol (str): ticker symbol, must exist in SYMBOLS constant
    year (str): fiscal year to match against the `fy` column; all years when omitted

    Returns:
    pd.DataFrame: the submissions for the given symbol (and year, if given); an empty DF is
    returned when get_cik finds no CIK for the symbol
    '''
    company_cik = get_cik(symbol)

    # Unknown symbol -> nothing to select
    if not company_cik:
        return pd.DataFrame()

    # Build the filter expression, adding the fiscal year condition only when requested
    criteria = 'cik == @company_cik'
    if year is not None:
        criteria += ' and fy == @year'
    return sub_df.query(criteria)

In [36]:
# Example: all loaded submissions for NVDA (no year filter)
get_subs_for_symbol('NVDA')
# Alternative call with a year filter, kept for quick experiments
# get_subs_for_symbol('GOOG', '2023')

Unnamed: 0_level_0,cik,name,sic,countryba,fye,form,period,fy,fp,filed,accepted,dataset
adsh,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0001045810-22-000036,1045810,NVIDIA CORP,3674,US,131,10-K,2022-01-31,2021,FY,2022-03-18,2022-03-17 20:34:00,2022q1
0001045810-23-000017,1045810,NVIDIA CORP,3674,US,131,10-K,2023-01-31,2022,FY,2023-02-24,2023-02-24 17:24:00,2023q1


## Utility method to get Numbers for a Symbol, Tag and Year

In [18]:
def get_nums_for_tag(symbol:str, tag:str, year:str=None) -> pd.DataFrame:
    '''
    Returns the numbers for a symbol, tag and year (optional)

    Parameters:
    symbol (str): ticker symbol, must exist in SYMBOLS constant
    tag (str): text searched for inside the tag name with `str.contains`, so a partial
    name such as 'PerShare' matches every tag containing it
    year (str): fiscal year (compared against the `fy` column) or all the years if not specified

    Returns:
    pd.DataFrame: a DF with numbers for given symbol, tag and year (optional); empty DF is
    returned if the symbol is not found in the SYMBOLS constant
    '''
    # Get the CIK for given symbol
    cik = get_cik(symbol)

    # Return empty data frame if symbol is not found
    if cik is None: return pd.DataFrame()

    # Accession numbers of this company's submissions, selected through the cik column.
    # Matching the CIK as a substring of adsh is unreliable: the accession prefix identifies
    # the submitter, which may be a filing agent rather than the company.
    symbol_adsh = sub_df.index[pd.to_numeric(sub_df['cik'], errors='coerce') == int(cik)]

    mask = num_df['adsh'].isin(symbol_adsh) & num_df['tag'].str.contains(tag, na=False)
    if year is not None:
        mask &= (num_df['fy'] == year)
    # Positional boolean mask: num_df keeps the repeating per-quarter row labels from concat
    return num_df.loc[mask.to_numpy()]

In [39]:
# Example: 'PerShare' is matched anywhere inside the tag name, so several tags are returned
get_nums_for_tag('GOOG', 'PerShare')

Unnamed: 0,adsh,tag,version,ddate,qtrs,uom,value,fy,dataset
193,0001652044-22-000019,CommonStockParOrStatedValuePerShare,us-gaap/2021,2020-12-31,0,USD,0.001,2021,2022q1
194,0001652044-22-000019,CommonStockParOrStatedValuePerShare,us-gaap/2021,2021-12-31,0,USD,0.001,2021,2022q1
540,0001652044-22-000019,EarningsPerShareBasic,us-gaap/2021,2019-12-31,4,USD,49.59,2021,2022q1
541,0001652044-22-000019,EarningsPerShareBasic,us-gaap/2021,2020-12-31,4,USD,59.15,2021,2022q1
542,0001652044-22-000019,EarningsPerShareBasic,us-gaap/2021,2021-12-31,4,USD,113.88,2021,2022q1
552,0001652044-22-000019,EarningsPerShareDiluted,us-gaap/2021,2019-12-31,4,USD,49.16,2021,2022q1
553,0001652044-22-000019,EarningsPerShareDiluted,us-gaap/2021,2020-12-31,4,USD,58.61,2021,2022q1
554,0001652044-22-000019,EarningsPerShareDiluted,us-gaap/2021,2021-12-31,4,USD,112.2,2021,2022q1
1279,0001652044-22-000019,PreferredStockParOrStatedValuePerShare,us-gaap/2021,2020-12-31,0,USD,0.001,2021,2022q1
1280,0001652044-22-000019,PreferredStockParOrStatedValuePerShare,us-gaap/2021,2021-12-31,0,USD,0.001,2021,2022q1
