In [None]:
from bs4 import BeautifulSoup
from datetime import datetime, date, timedelta
import pandas as pd
import numpy as np
import requests
import sys
from tiingo import TiingoClient

# Tiingo client settings. 'api_key' is left blank here and must be filled
# in with a Tiingo API token; 'session' is the client's session option
# (see the tiingo-python docs).
config = {
    'session': True,
    'api_key': "",
}
client = TiingoClient(config)

In [None]:
# Build the ticker -> CIK lookup from the SEC's tab-separated ticker file.
# The file is read without a header row, so the columns are labelled 0 and 1:
# column 0 becomes the dictionary key (ticker), column 1 the value (CIK).
ticker_cik = pd.read_csv(
    "https://www.sec.gov/include/ticker.txt",
    sep="\t",
    header=None,
)
cik_dict = ticker_cik.set_index(0)[1].to_dict()

# Function to establish the filing index pages
def get_index_url(symbol, date):
    """Locate the index page of the most recent 10-Q or 10-K for a symbol.

    Both the 10-Q and the 10-K listing pages are queried on EDGAR and the
    first exactly-matching row of each is considered; whichever of the two
    has the later filing date wins.

    Args:
        symbol: Ticker used as a key into the module-level ``cik_dict``.
        date: Value substituted into the ``dateb`` query parameter of the
            EDGAR listing URL. A filing whose listed date text equals this
            value is skipped so that only information filed before that
            day is used. NOTE(review): the comparison is a plain string
            comparison against the 'YYYY-MM-DD' text in the table, so it
            only has an effect when ``date`` is a string in that format.

    Returns:
        Tuple ``(index_url, filing_date, filing_type)`` where ``index_url``
        is the absolute URL of the filing index page, ``filing_date`` is a
        ``datetime.date`` and ``filing_type`` is ``'10-Q'`` or ``'10-K'``.

    Raises:
        KeyError: If ``symbol`` is not present in ``cik_dict``.
        requests.HTTPError: If EDGAR answers with an error status.
        ValueError: If no qualifying filing dated after 2000-01-01 is found.
    """
    # Get cik number for given symbol.
    cik = cik_dict[symbol]

    base_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type={}&dateb={}&owner=include&count=100"

    # Initialize filing_date to an arbitrarily early date so any real filing
    # compares as more recent. The URL and type start as None so that "no
    # filing found" can be detected instead of hitting an unbound local.
    filing_date = datetime.strptime('2000-01-01', '%Y-%m-%d').date()
    index_url = None
    filing_type = None

    # Our goal is to find most recently filed corporate info -
    # could be in a 10-Q or 10-K so we check both.
    for filing in ['10-Q', '10-K']:
        # NOTE(review): SEC's fair-access guidance asks clients to declare a
        # User-Agent; requests without one may be rejected - confirm and add
        # a header if needed.
        filing_page_resp = requests.get(base_url.format(cik, filing, date), timeout=30)
        # Fail loudly on an error status rather than parsing an error page.
        filing_page_resp.raise_for_status()

        # Parse the HTML and find the table of filings by class.
        soup = BeautifulSoup(filing_page_resp.text, 'html.parser')
        table = soup.find('table', class_='tableFile2')
        if table is None:
            # No filings table on this page; try the other filing type.
            continue

        # Iterate over table rows, getting cells by tag. We stop at the first
        # row whose type matches exactly (the exact match matters because
        # '10-K/A' amendments appear in a '10-K' search but aren't useful)
        # and whose date differs from the input date.
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) > 3 and cells[0].text == filing and cells[3].text != date:
                # Keep this filing only if it is more recent than what we
                # already hold from the other filing type's page.
                check_date = datetime.strptime(cells[3].text, '%Y-%m-%d').date()
                if check_date > filing_date:
                    index_url = 'https://www.sec.gov' + cells[1].a['href']
                    filing_date = check_date
                    filing_type = filing
                break

    if index_url is None:
        raise ValueError(
            "No 10-Q or 10-K filing found for symbol {!r} (CIK {}) with date {!r}".format(symbol, cik, date)
        )

    return index_url, filing_date, filing_type
           

# Function to establish relevant xbrl links.
# All we need to do is pull index page for most recent company filing
# and find the 'XBRL INSTANCE DOCUMENT'.
def get_xbrl_url(index_url):
    """Return the URL of the XBRL instance document listed on a filing index page.

    Args:
        index_url: Absolute URL of an EDGAR filing index page.

    Returns:
        Absolute URL (``https://www.sec.gov`` + the row's link) of the row in
        the 'Data Files' table whose description contains
        'XBRL INSTANCE DOCUMENT'. If several rows match, the last one is used.

    Raises:
        requests.HTTPError: If the index page request returns an error status.
        ValueError: If the page has no 'Data Files' table or no matching row.
    """
    # Fetch the index page; fail loudly on an error status.
    index_resp = requests.get(index_url, timeout=30)
    index_resp.raise_for_status()

    # Parse the page and find the table of interest by its summary attribute.
    soup = BeautifulSoup(index_resp.text, "html.parser")
    table = soup.find('table', summary='Data Files')
    if table is None:
        raise ValueError("No 'Data Files' table found at {}".format(index_url))

    # Iterate over table rows looking for the XBRL instance doc, saving the
    # link when we find it. The substring test also matches rows described
    # as 'EXTRACTED XBRL INSTANCE DOCUMENT'.
    xbrl_link = None
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) > 3 and 'XBRL INSTANCE DOCUMENT' in cells[1].text:
            xbrl_link = 'https://www.sec.gov' + cells[2].a['href']

    if xbrl_link is None:
        raise ValueError("No XBRL instance document listed at {}".format(index_url))

    return xbrl_link


# Get most recent values for our desired tags within the xbrl file.
def get_tag_values(xbrl_link):
    """Extract equity and shares-outstanding figures from an XBRL instance file.

    Args:
        xbrl_link: URL of the XBRL instance document to download and parse.

    Returns:
        Tuple ``(period_end, equity, equity_date, shares, shares_date)``:
        ``period_end`` is the end-date text of the context referenced by the
        'dei:documentfiscalperiodfocus' tag; ``equity`` is an int (or None
        when no usable equity tag exists) with ``equity_date`` the
        ``datetime.date`` of its context (or None); ``shares`` is an int and
        ``shares_date`` the ``datetime.date`` of its context.

    Raises:
        requests.HTTPError: If the download returns an error status.
        AttributeError: If the fiscal-period tag or a referenced context is
            missing from the document.
        ValueError: If a tag value or date cannot be parsed, or if the
            document has no 'dei:entitycommonstocksharesoutstanding' tag.
    """
    # Download the XBRL text; fail loudly on an error status.
    xbrl_resp = requests.get(xbrl_link, timeout=30)
    xbrl_resp.raise_for_status()

    # This time, our 'soup' is 'lxml' format instead of 'html'
    soup = BeautifulSoup(xbrl_resp.text, 'lxml')

    # At some stage, companies swapped from manual submission
    # of their 'XBRL INSTANCE DOCUMENT' to iXBRL submission, resulting
    # in 'EXTRACTED XBRL INSTANCE DOCUMENT's. The only relevant change
    # this causes is that a tag's 'xbrli:context' became 'context' and
    # the 'xbrli:enddate' became 'enddate'. We'll set these conditional
    # on which version the file is using, as identified by whether we
    # find an 'xbrli:context' tag in the file.
    xbrli_string = 'xbrli:' if soup.find('xbrli:context') else ''
    context_tag = xbrli_string + 'context'
    enddate_tag = xbrli_string + 'enddate'
    instant_tag = xbrli_string + 'instant'

    # Here, we'll pull the filing's reference period end
    # (only for the sake of additional detail)
    ref_period = soup.find('dei:documentfiscalperiodfocus')
    date_id = ref_period.get('contextref')
    period_context = soup.find(context_tag, {'id': date_id})
    period_end = period_context.find(enddate_tag).text

    # Now we get into pulling our equity values. Some key notes:

    # 1. We are only interested in these tags if they are true values,
    # not 'dimensions' of those values. We check this by ensuring
    # the tag's context does NOT contain an 'explicitmember' field.

    # 2. We only want to keep the most recent value (found in the tag's
    # context), because filings can list values for past filings as well.

    # 3. We are interested in equity not including minority interest.
    # 'stockholdersequity' =
    # 'stockholdersequityincludingportionattributabletononcontrollinginterest'
    # less 'minorityinterest'. If we are missing 'stockholdersequity', we can
    # therefore calculate it with the other two figures. In some cases, companies will
    # list only 'stockholdersequityincludingportionattributabletononcontrollinginterest'
    # without the other two figures. In these cases, it appears that the company does
    # not have any 'minorityinterest' and we can use this value as our equity figure.

    # For more details, see:
    # - https://xbrl.us/data-rule/dqc_0004pr/
    # - https://xbrl.us/guidance/specific-non-controlling-interest-elements/

    equity_key = "us-gaap:stockholdersequity"
    equity_incl_key = "us-gaap:stockholdersequityincludingportionattributabletononcontrollinginterest"
    minority_key = "us-gaap:minorityinterest"

    # Here, we'll get the filing's most recent values for each tag and save to dict.
    equity_dict = {}
    for equity_tag in (equity_key, equity_incl_key, minority_key):
        tag_dict = {"date": None,
                    "value": None}

        # Arbitrarily early starting point for the "most recent" search.
        equity_date = datetime.strptime('2000-01-01', '%Y-%m-%d').date()

        for t in soup.find_all(equity_tag):
            # Get the tag's context for additional detail.
            ref_id = t.get('contextref')
            context = soup.find(context_tag, {'id': ref_id})

            # Pull the value's date reference from its context.
            equity_date_obs = datetime.strptime(context.find(instant_tag).text, '%Y-%m-%d').date()

            # If the value is at least as recent as the saved value AND NOT a
            # 'dimension', we update the reference date and our saved value.
            if equity_date_obs >= equity_date and not context.find('xbrldi:explicitmember'):
                equity_date = equity_date_obs
                tag_dict = {"date": equity_date,
                            "value": int(float(t.text))}

        # Coming out of the above loop, we now have the 'value' of
        # the most recent non-'dimension' tag and its 'date'. We save
        # these under the tag name for reference.
        equity_dict[equity_tag] = tag_dict

    excl = equity_dict[equity_key]
    incl = equity_dict[equity_incl_key]
    minority = equity_dict[minority_key]

    # To go alongside the logic in point (3.), prefer 'stockholdersequity'
    # unless the figure including minority interest is STRICTLY more recent.
    # (With equal dates - the normal case - the direct figure is used; a
    # non-strict comparison here would discard it and, when no
    # 'minorityinterest' tag is present, report equity including NCI.)
    incl_is_newer = (
        excl['value'] is not None
        and incl['value'] is not None
        and incl['date'] > excl['date']
    )

    # So, with our equity_dict, we can save a single equity value
    # using the logic described in point (3.) above.
    if excl['value'] is not None and not incl_is_newer:
        equity = excl['value']
        equity_date = excl['date']
    elif incl['value'] is None:
        # Neither total is available, so there is nothing to derive equity
        # from (avoids subtracting from None when only 'minorityinterest'
        # happens to be tagged).
        equity = None
        equity_date = None
    else:
        equity = incl['value']
        if minority['value'] is not None:
            equity -= minority['value']
        equity_date = incl['date']

    # Now we'll look to establish the shares outstanding figure.
    # This is a bit simpler. According to the XBRL rules, each
    # class of stock should have one and only one value in the
    # document tagged 'entitycommonstocksharesoutstanding'. We
    # can run through these tags and if we find a 'dimension'-less
    # value we save and stop (this is the total of shares outstanding),
    # and if we don't then we sum the 'dimension'-al values. We do
    # not blindly sum all tags, to handle the case where a
    # 'dimension'-less sum AND class-wise aspects are included
    # (I don't think this should happen, but again I've found no
    # explicit rule against it).

    # For additional detail, see:
    # - https://www.sec.gov/structureddata/edgarvalidationerrors

    shares = 0
    shares_date = None
    for t in soup.find_all("dei:entitycommonstocksharesoutstanding"):
        ref_id = t.get('contextref')
        context = soup.find(context_tag, {'id': ref_id})
        is_total = not context.find('xbrldi:explicitmember')

        value = int(float(t.text))
        shares_date = datetime.strptime(context.find(instant_tag).text, '%Y-%m-%d').date()

        # If we find a 'dimension'-less value, this contains the total
        # shares so we can save it (replacing any partial sum) and stop.
        if is_total:
            shares = value
            break

        # If we only find 'dimension'-al values, we'll sum them.
        shares += value

    if shares_date is None:
        raise ValueError(
            "No 'dei:entitycommonstocksharesoutstanding' tag found in {}".format(xbrl_link)
        )

    # Of note - we save the date of the shares observation for
    # later market cap calculations. And we save equity_date
    # for completeness - Though this seems unnecessary because
    # it appears to always be the same as period_end.
    return period_end, equity, equity_date, shares, shares_date


# Iterate over symbols and end dates to bring together
# our sample_inputs (market cap is computed here from the
# shares count and a Tiingo closing price).
def get_sample_inputs(symbols, dates, max_lookback_days=10):
    """Build one row of filing-derived inputs per (symbol, date) pair.

    For each pair this resolves the latest filing index page, its XBRL
    instance document and the equity / shares values in it, then prices the
    shares with a Tiingo closing price to obtain a market cap.

    Args:
        symbols: Iterable of tickers (keys of the module-level ``cik_dict``).
        dates: Iterable of cut-off dates passed on to ``get_index_url``.
        max_lookback_days: How many calendar days before the shares
            observation date to try when looking for a closing price.

    Returns:
        DataFrame with columns symbol, cik, end_date, filing_date,
        period_end, filing_type, total_equity, equity_date,
        shares_outstanding, shares_date, market_cap, index_url, xbrl_url and
        download_datetime. Every row keeps index label 0. An empty,
        column-less DataFrame is returned when there are no pairs.

    Raises:
        RuntimeError: If no closing price is found on the shares date or
            within ``max_lookback_days`` days before it.
    """
    names = ['symbol', 'cik', 'end_date', 'filing_date', 'period_end', 'filing_type', 'total_equity', 'equity_date', 'shares_outstanding', 'shares_date', 'market_cap', 'index_url', 'xbrl_url', 'download_datetime']

    # Collect single-row frames and concatenate once at the end;
    # DataFrame.append is gone from current pandas and re-copied the
    # accumulated frame on every call.
    frames = []

    for symbol in symbols:
        for end_date in dates:
            # Get index url for the filing we want w/ filing information
            index_url, filing_date, filing_type = get_index_url(symbol, end_date)
            # Get xbrl url from index page 'index_url'
            xbrl_url = get_xbrl_url(index_url)
            # Get tag values from the XBRL instance file 'xbrl_url'
            period_end, equity, equity_date, shares, shares_date = get_tag_values(xbrl_url)

            # Get share price on the shares observation date or, failing
            # that, the closest earlier day with pricing data.
            # Using Tiingo API (requires session config as described at:
            # https://tiingo-python.readthedocs.io/en/latest/readme.html#further-docs).
            # A lookup for a date without pricing data fails, so on failure
            # we go back a day and try again - but only a bounded number of
            # times, so a bad symbol or API key cannot loop indefinitely.
            last_error = None
            for days_back in range(max_lookback_days + 1):
                price_date = shares_date - timedelta(days=days_back)
                try:
                    # .iloc[0] is explicitly positional; plain [0] on a
                    # date-indexed result depends on a deprecated fallback.
                    share_price = client.get_dataframe(symbol,
                                                       metric_name='close',
                                                       startDate=price_date,
                                                       endDate=price_date).iloc[0]
                except Exception as err:
                    last_error = err
                    continue
                break
            else:
                raise RuntimeError(
                    "No closing price found for {!r} within {} days up to {}".format(
                        symbol, max_lookback_days, shares_date)
                ) from last_error

            # Calculate market cap on the date of the shares observation.
            market_cap = shares * share_price

            # Bring values together into a single-row frame.
            row_values = [symbol, cik_dict[symbol], end_date, filing_date, period_end, filing_type, equity, equity_date, shares, shares_date, market_cap, index_url, xbrl_url, datetime.now()]
            frames.append(pd.DataFrame([row_values], columns=names))

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)

In [None]:
def get_sample_pricing(symbols, date_range):
    """Collect daily Tiingo metrics for the symbols in long format.

    Args:
        symbols: Ticker or tickers handed straight to ``client.get_dataframe``.
        date_range: Sequence whose first element is the start date and whose
            second element is the end date of the request.

    Returns:
        DataFrame keyed by 'symbol' and 'date' with one column for each of
        'close', 'volume', 'divCash' and 'splitFactor'.
    """
    # Start and end of the Tiingo request window.
    first_day = date_range[0]
    last_day = date_range[1]

    # Seed frame holding only the merge keys; each metric is outer-merged
    # onto it in turn.
    combined = pd.DataFrame(columns=['symbol', 'date'])

    # Tiingo returns either every metric for one symbol or one metric for
    # many symbols, so we request one metric at a time for all symbols
    # (requires session config as described at:
    # https://tiingo-python.readthedocs.io/en/latest/readme.html#further-docs).
    for metric_name in ('close', 'volume', 'divCash', 'splitFactor'):
        by_symbol = client.get_dataframe(symbols,
                                         metric_name=metric_name,
                                         startDate=first_day,
                                         endDate=last_day)

        # Move the date index into a column; it comes out named 'index'
        # when the index is untitled, so normalise that to 'date'.
        flat = by_symbol.reset_index()
        flat = flat.rename(columns={'index': 'date'})

        # Melt to one observation per (date, symbol) and name the value
        # column after the metric.
        long_form = flat.melt(id_vars="date")
        long_form = long_form.rename(columns={'variable': 'symbol', 'value': metric_name})

        # Keep only the calendar date part of the timestamps.
        long_form['date'] = long_form['date'].dt.date

        # Fold this metric into the accumulated frame.
        combined = combined.merge(long_form, on=['symbol', 'date'], how='outer')

    return combined

In [None]:
# Simple function to build the datasets.
def build_data(symbols, date_range, end_dates):
    """Build the filing-inputs frame and the daily pricing frame.

    Args:
        symbols: List of stock tickers.
        date_range: List of two items, the start date and end date for the
            daily data.
        end_dates: List of dates prior to which attribute data is scraped
            from XBRL.

    Returns:
        Tuple ``(sample_inputs, sample_pricing)``; ``sample_inputs`` is
        restricted to the columns symbol, cik, total_equity, market_cap and
        end_date.
    """
    kept_columns = ['symbol', 'cik', 'total_equity', 'market_cap', 'end_date']

    # Filing-derived inputs first, trimmed to the columns of interest.
    sample_inputs = get_sample_inputs(symbols, end_dates)
    sample_inputs = sample_inputs[kept_columns]

    # Then the daily pricing series over the requested window.
    sample_pricing = get_sample_pricing(symbols, date_range)

    return sample_inputs, sample_pricing
    
# sample_inputs, sample_pricing = build_data(['aapl', 'msft'], ['2020-06-01', '2020-06-30'], ['2020-06-01'])