In [1]:
# General packages
import requests
import json
import pandas as pd
import numpy as np
import time
import random
import re
import os
import sys
from ast import literal_eval

# Custom modules
from modules import sec_helper as sec

# Web scraping
from bs4 import BeautifulSoup as bs4


In [51]:

# OVERVIEW: functions get CIK codes for a company name and lookup
# ...associated SEC filings - form D.

# imports for the program live in the first cell
# third-party packages imported there: requests, pandas, numpy, BeautifulSoup (bs4)


# Base URL of the SEC cgi-bin services; get_cik() prepends it (plus '/')
# to the link scraped from the CIK lookup results.
base_sec = r'https://www.sec.gov/cgi-bin'

# get_cik()

def get_cik(company_name):
    """
    Overview:

    Looks up the CIK number for a company name on the SEC CIK lookup page

    Params:

    company_name --> company name to search for

    Returns:

    cik -- cik value reported by get_atags() ('none' when nothing was found)
    link -- base_sec + '/' + the scraped link, or 'none' when nothing was found

    """
    # endpoint and query-string parameters of the lookup request
    lookup_url = r'https://www.sec.gov/cgi-bin/cik_lookup'
    query = {'company': f'{company_name}'}

    lookup_tables = get_tables(lookup_url, params=query)
    cik, href = get_atags(lookup_tables)

    # 'none' is the sentinel for "no result": pass it through untouched
    if href == 'none':
        return cik, 'none'

    return cik, base_sec + '/' + href






def get_atags(tables):
    """
    Overview:
    Pulls the first CIK and its link out of the CIK lookup results

    Params:
    tables --> results of bs4.find_all('table'); only the first table is read

    Returns:
    (cik, link) -- stripped text and href of the first <a> tag found in the
    first child of the table, or ('none', 'none') when there is nothing to read

    """
    # No table at all: report "no result" instead of raising IndexError
    if not tables:
        return 'none', 'none'

    # only one table, grab first
    first_table = tables[0]

    # collect the <a> tags of every child of the table
    anchors_per_child = [child.find_all('a') for child in first_table]

    # a table without children has no tags to offer either
    if not anchors_per_child:
        return 'none', 'none'

    # the first child holds the relevant tags list
    anchors = anchors_per_child[0]

    # Condition kept from the original: a result is only reported when more
    # than one <a> tag is present.
    if len(anchors) > 1:
        first_anchor = anchors[0]
        return first_anchor.text.strip(), first_anchor['href']

    return 'none', 'none'
    


def get_tables(endpoint, params, timeout=30):
    """
    Overview:
    Requests a page and extracts all tables from its html, returns a list

    Params:
    endpoint --> url the GET request is sent to
    params --> dict of query-string parameters for the request
    timeout --> seconds to wait for the server before requests gives up
                (default 30)

    Returns:
    list with every <table> tag found in the response

    """
    # Without a timeout, requests may wait indefinitely on a silent server
    response = requests.get(endpoint, params=params, timeout=timeout)

    # initiate bs object
    soup = bs4(response.text, 'lxml')

    return soup.find_all('table')

def get_edgar_tables(cik, filing_type='10-K', timeout=30):
    """
    Overview:
    Extracts the filing tables from the sec cik page, returns a list

    Params:
    cik --> cik number
    filing_type --> form type passed as the 'type' filter of the request
                    (default '10-K', the value that used to be hard-coded)
    timeout --> seconds to wait for the server before requests gives up
                (default 30)

    Returns:
    list of <table> tags carrying the css class 'tableFile2'

    """
    # url endpoint for browsing sec edgar
    edgar_endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"

    # f string cik num from parameters
    params_edgar = {'action':'getcompany',
                    'CIK':f'{cik}',
                    'type':filing_type,
                    'dateb':'',
                    'owner':'exclude',
                    'start':'',
                    'output':'',
                    'count':'100'
                   }

    # Without a timeout, requests may wait indefinitely on a silent server
    response = requests.get(url=edgar_endpoint, params=params_edgar, timeout=timeout)

    # initiate bs object
    soup = bs4(response.text, 'lxml')

    return soup.find_all('table', class_='tableFile2')




def check_formd(ftype_list):
    """
    Overview:
    Takes a list of file types (or a single file type string) and checks
    for Form D types ('D' or 'D/A')

    Params:
    ftype_list --> list of file type strings, or one file type string

    Returns:
    'Yes' when a Form D type is present, otherwise 'No' (also 'No' for any
    other kind of input)

    """
    form_d_types = ('D', 'D/A')

    # The original test `f == 'D' or 'D/A'` was always true because the
    # non-empty string 'D/A' is truthy; membership is the intended check.
    if isinstance(ftype_list, list):
        return 'Yes' if any(f in form_d_types for f in ftype_list) else 'No'

    if isinstance(ftype_list, str):
        return 'Yes' if ftype_list in form_d_types else 'No'

    return 'No'

        
def likely_private(co, filings_df):
    """
    Checks row in dataframe for a company likely to be private

    Params:
    co --> index label of the company's row in filings_df
    filings_df --> dataframe with 'cik' and 'f_type' columns

    Returns 'Yes' or 'No':
    'Yes' when the cik is 'none' or f_type contains neither '10-Q' nor '10-K',
    'No' otherwise

    """
    # Only reads are performed, so the dataframe is used directly instead of
    # being copied in full on every call.
    if filings_df.at[co, 'cik'] == 'none':
        return 'Yes'

    filing_types = filings_df.at[co, 'f_type']
    files_periodic_reports = '10-Q' in filing_types or '10-K' in filing_types

    return 'No' if files_periodic_reports else 'Yes'
        
        
        

def get_filings(cik):
    """
    Overview:
    Takes in CIK, returns filings data

    Params:
    cik --> cik number

    Returns:
    file_types, file_dates, file_num, acc_nums -- four parallel lists with
    one entry per filing row of the first table from get_edgar_tables()

    Raises:
    IndexError when get_edgar_tables() returns no table

    """

    # run function, get data table
    tables = get_edgar_tables(cik)

    # master storage for data per each filing
    file_types = []
    file_dates = []
    file_num = []
    acc_nums = []

    for row in tables[0].find_all('tr'):
        cells = row.find_all('td')

        # Cells 0, 2, 3 and 4 are read below, so a usable row needs at least
        # five <td> cells (the old `> 1` check let shorter rows raise
        # IndexError); rows without enough cells are skipped.
        if len(cells) < 5:
            continue

        file_types.append(cells[0].text.strip())
        file_dates.append(cells[3].text.strip())
        file_num.append(cells[4].text.strip())

        # keep the 20 characters that follow the 'Acc-no: ' marker
        description = cells[2].text.strip()
        after_marker = description.partition('Acc-no: ')[2]
        acc_nums.append(after_marker[:20])

    return file_types, file_dates, file_num, acc_nums
    

# Get CIK Numbers

In [48]:
# list storage initialization
cik_col = []
link_col = []

# List of company names to search
co_list = ['Microsoft Corp']
# Initialize dictionary: one entry per company, filled in by the loop below
co_dict = {co: {'cik': [], 'cik_link': []} for co in co_list}

# loop through company names
for co in co_dict:

    # sleep timer for each loop - randomized
    time.sleep(random.randint(2,4))

    try:
        # get_cik --> start with company name
        cik, cik_link = get_cik(co)
    except Exception as err:
        # Best effort: report what went wrong and move on to the next company.
        # Exception (instead of a bare except) lets Ctrl-C stop the loop.
        print('error at ',co, 'pass for now', repr(err))
        continue

    # store results
    co_dict[co]['cik'] = cik
    co_dict[co]['cik_link'] = cik_link
        

 Get Request Parameters --> see .py file in 'Modules' folder to modify

* action: (required) By default should be set to getcompany.
* CIK: (required) Is the CIK number of the company you are searching.
* type: (optional) Allows filtering the type of form. For example, if set to 10-K, only the 10-K filings are returned.
* dateb: (optional) Will only return the filings before a given date. The format is as follows YYYYMMDD
* owner: (required) Is set to exclude by default and specifies ownership. You may also set it to include and only.
* start: (optional) Is the starting index of the results. For example, if I have 100 results but want to start at 45 of 100, I would pass 45.
* state: (optional) The company's state.
* filenum: (optional) The filing number.
* sic: (optional) The company's SIC (Standard Industrial Classification) identifier
* output: (optional) Defines returned data structure as either xml (atom) or normal html.
* count: (optional) The number of results you want to see with your request, the max is 100 and if not set it will default to 40.

# Get filings data

In [49]:
file_features = ['f_type', 'f_date', 'f_num', 'acc_num']

for co in co_dict:

    # sleep timer for each loop - randomized
    time.sleep(random.randint(2,5))

    if co_dict[co]['cik'] == 'none':
        # no cik --> nothing to look up, mark every feature as 'none'
        for feat in file_features:
            co_dict[co][feat] = 'none'
        continue

    try:
        # get_filings --> uses cik number
        filing_res = get_filings(co_dict[co]['cik'])
    except Exception as err:
        # Best effort: report what went wrong and move on to the next company.
        # Exception (instead of a bare except) lets Ctrl-C stop the loop.
        print('error here at ', co, 'pass for now', repr(err))
        continue

    # get_filings returns its lists in the same order as file_features
    for feat, values in zip(file_features, filing_res):
        co_dict[co][feat] = values
        

In [50]:
# Build one row per company: orient='index' turns the keys of co_dict into the index
filings_df = pd.DataFrame.from_dict(co_dict, orient ='index')
# NOTE(review): the saved output of this cell also shows a 'likely_private'
# column that no visible cell adds to filings_df — confirm where it is computed.
filings_df.head()

Unnamed: 0,cik,cik_link,f_type,f_date,f_num,acc_num,likely_private
Microsoft Corp,789019,https://www.sec.gov/cgi-bin/browse-edgar?actio...,"[10-K, 10-K, 10-K, 10-K, 10-K, 10-K, 10-K, 10-...","[2019-08-01, 2018-08-03, 2017-08-02, 2016-07-2...","[001-3784519992755, 001-3784518990758, 001-378...","[0001564590-19-027952, 0001564590-18-019062, 0...",No


In [54]:
# Row label of the company to inspect
company = 'Microsoft Corp'

cik = filings_df.at[company, 'cik']
# Show all 10-K filing dates
filings_df.at[company, 'f_date']
# Retrieve most recent 10-K filing number
filings_df.at[company, 'f_num'][0]
# Retrieve SEC Accession No. (with dashes) and its dash-free form
acc = filings_df.at[company, 'acc_num'][0]
acc_raw = ''.join(acc.split('-'))

In [55]:
# Template of a filing landing page: cik / accession number without dashes / accession number
sec_data_base = 'https://www.sec.gov/Archives/edgar/data/{}/{}/{}-index.htm'
# Fill in the template, then request the landing page
sec_data_url = sec_data_base.format(cik, acc_raw, acc)
sec_data_resp = requests.get(sec_data_url)

# Filing Data

For each filing there are several associated documents including .htm / .xml / .txt files 

In [60]:
# URL of the landing-page response, as reported by requests
filing_url = sec_data_resp.url
filing_url

'https://www.sec.gov/Archives/edgar/data/0000789019/000156459019027952/0001564590-19-027952-index.htm'

# Get Data From Filing

In [95]:
# Link to the 10-K document itself (hard-coded)
tenk_link = 'https://www.sec.gov/Archives/edgar/data/789019/000156459019027952/msft-10k_20190630.htm'

# Request the document and keep the html text of the response
tenk_resp = requests.get(tenk_link).text
# Parse the html with the lxml parser
soup = bs4(tenk_resp, 'lxml')
# Collect every <table> element of the document
tenk_tables = soup.find_all('table')

# Builds a list of dataframes (Tables) from 10-K

In [211]:
# One dataframe per table: every <tr> becomes a row made of the stripped text
# of its non-empty <td> cells; rows left without any text are dropped.
dfs = []
for table in tenk_tables:
    parsed_rows = [
        [cell.text.strip() for cell in tr.find_all('td') if cell.text.strip()]
        for tr in table.find_all('tr')
    ]
    dfs.append(pd.DataFrame([row for row in parsed_rows if row]))

# Isolate interest tables by table contents

In [212]:
# Keep every table that mentions 'cloud' or 'Intelligent Cloud' in any cell.
# Each table is appended at most once; the previous version appended it once
# per matching cell, which duplicated tables with several matches.
cloud = []
for df in dfs:
    # all cell values of the table, column by column
    cells = (d for col in df for d in df[col])

    # only string cells are searched (the original skipped falsy cells with `if d`)
    if any(isinstance(d, str) and ('cloud' in d or 'Intelligent Cloud' in d)
           for d in cells):
        cloud.append(df)

In [213]:
# Print every selected table between blank lines, followed by its position in `cloud`
for index, df in enumerate(cloud):
    print("", df, "", sep="\n")
    print('TABLE INDEX: ',index)


   0                                                  1
0  •  Build the intelligent cloud and intelligent ed...

TABLE INDEX:  0

   0                                                  1
0  •  Building and running cloud-based services in w...

TABLE INDEX:  1

   0                                                  1
0  •  Using Windows to fuel our cloud business and M...

TABLE INDEX:  2

   0                                                  1
0  •  Dynamics business solutions, including Dynamic...

TABLE INDEX:  3

   0                                                  1
0  •  Server products and cloud services, including ...

TABLE INDEX:  4

   0                                                  1
0  •  Windows, including Windows OEM licensing (“Win...

TABLE INDEX:  5

   0                                                  1
0  •  Gaming, including Xbox hardware and Xbox softw...

TABLE INDEX:  6

   0                                                  1
0  •  Cloud and AI, focuses on ma