In [1]:
import edgar
import os, time
from pathlib2 import Path
import pandas as pd
import numpy as np
import requests
from tqdm import tqdm
import ProjectDirectory as directory
pd.options.mode.chained_assignment = None

## generate df with all companies and URLs

In [2]:
project_dir = directory.get_project_dir()

In [3]:
os.chdir(os.path.join(project_dir, 'sec-filings-index'))

In [4]:
# filing_year = 2013   # uncomment to run, choose year to get all edgar filings from
# edgar.download_index(os.getcwd(), filing_year)

In [5]:
# Get list of all DFs 
table_list = []

for i in os.listdir():
    if i.endswith('.tsv'):
        table_list.append(pd.read_csv(i, sep='|', header=None, encoding='latin-1', parse_dates=[3], dtype={0: int}))

In [6]:
# append all dfs into a single df
df = pd.DataFrame(columns=[0,1,2,3,4,5])   # downloaded file has 6 columns

for i in range(len(table_list)):
        df = pd.concat([df, table_list[i]], ignore_index=True, axis=0)

df.columns= ['cik', 'company_name', 'filing_type', 'filing_date', 'url', 'url2']

## Check if dataframe correctly generated

In [7]:
count_list = []
for i in range(len(table_list)):
    count_list.append(len(table_list[i]))

if df.shape[0] == sum(count_list):
    print('df tallies with individual files. Total rows = {}'.format(df.shape[0]))
else:
    print('ERROR. df does not tally!!')

df tallies with individual files. Total rows = 6061186


## Get CIK df

In [8]:
# cik_ticker_list.csv contains cik tickets of companies
df_cik = pd.read_csv(os.path.join(project_dir, 'data', 'cik_ticker_list.csv'))

In [9]:
def company_name_search(df, company_name_list):
    for company in company_name_list:
        df_company = df[df['Name'].str.contains(company, case=False)]
        print('*' * 50)
        print('SEARCH TERM: {}'.format(company))
        print('RESULTS:')
        for i in df_company['Name'].tolist():
            for j in df_company['CIK'].tolist():
                print(i, j)
        print('*' * 50)
        

In [10]:
def get_cik_from_company_name(df, company_name_list):
    cik_list = []
    if company_name_list:
        for company in company_name_list:
            cik_series = df[df['Name'].str.contains(company, case=False)]['CIK']
            cik_list.append(cik_series.values[0])
    else:
        cik_list = df['CIK'].tolist()        
    return cik_list

In [11]:
def get_company_name_from_cik(df, cik_list):
    company_list = []
    for cik in cik_list:
        company_series = df[df['CIK'] == cik]
        company_list.append(company_series.values[0])
    return company_list

In [12]:
companies_list = ['apple inc', 'tesla', 'netflix', 
                  'amazon com inc', 'microsoft', 'facebook']

In [13]:
company_name_search(df_cik, companies_list)

**************************************************
SEARCH TERM: apple inc
RESULTS:
Apple Inc 320193
**************************************************
**************************************************
SEARCH TERM: tesla
RESULTS:
Tesla Motors Inc 1318605
**************************************************
**************************************************
SEARCH TERM: netflix
RESULTS:
Netflix Inc 1065280
**************************************************
**************************************************
SEARCH TERM: amazon com inc
RESULTS:
Amazon Com Inc 1018724
**************************************************
**************************************************
SEARCH TERM: microsoft
RESULTS:
Microsoft Corp 789019
**************************************************
**************************************************
SEARCH TERM: facebook
RESULTS:
Facebook Inc 1326801
**************************************************


In [14]:
cik_list = get_cik_from_company_name(df_cik, companies_list)

## download data

In [15]:
def download_filings(cik_num_list, from_date='2014-01-01'):
    """Function to filter the appropriate filings and download them in the folder"""
    
    project_dir = directory.get_project_dir()
    
    # filter df with company CIK,filing type (10-K and 10-Q) and date  
    df_filtered = df [(df['cik'].isin(cik_num_list)) & 
                      ((df['filing_type']=='10-K') | (df['filing_type'] == '10-Q')) & 
                      (df['filing_date'] > from_date)]
    
    company_names = df_filtered['company_name'].unique().tolist()
    
    # check if folders for each company already exists    
    sec_filings_dir = os.path.join(project_dir, 'sec-filings-downloaded')  # dir to download SEC filingsa
    os.chdir(sec_filings_dir)

    for company in company_names:
        company_dir = os.path.join(sec_filings_dir, company)

        if not os.path.exists(company_dir):
            os.makedirs(company_dir)
            print('\n created dir: {}'.format(company))
        else:
            print('\n{} directory exists'.format(company))
            
        os.chdir(company_dir)
        
        # create company specific df to iterate over    
        df_filtered_co = df_filtered[df_filtered['company_name'] == company]  # get df with the company only
        df_filtered_co['filing_date'] = df_filtered_co['filing_date'].astype(str)   # convert to 'object' to name file
        
        for i in range(len(df_filtered_co)):
            url_prefix = 'https://www.sec.gov/Archives/'
            row = df_filtered_co.iloc[i,:]
            url = url_prefix + row['url']
            response = requests.get(url, stream=True, timeout=30)
            
            filing_name = row['filing_date'] + str('_') + row['filing_type']
            if os.path.isfile(filing_name):
                print('{} file already exists'.format(filing_name))
            else:
                print('Downloading: {}'.format(filing_name))
                with open('{}'.format(filing_name), 'wb') as handle:
                    for data in tqdm(response.iter_content()):
                        handle.write(data)    

### ↓ Automated download of filings. If the filing exists in the directory, the download will skip and move on the the next filing

In [16]:
download_filings(cik_list)


 created dir: AMAZON COM INC
Downloading: 2018-07-27_10-Q


522285it [00:07, 74024.13it/s] 


Downloading: 2019-02-01_10-K


865310it [00:13, 63133.86it/s]


Downloading: 2018-02-02_10-K


873172it [00:13, 62623.72it/s]


Downloading: 2018-10-26_10-Q


518300it [00:07, 72032.12it/s] 


Downloading: 2017-07-28_10-Q


453499it [00:07, 57157.90it/s]


Downloading: 2018-04-27_10-Q


460618it [00:08, 56023.05it/s]


Downloading: 2017-10-27_10-Q


491193it [00:07, 67391.08it/s] 


Downloading: 2017-04-28_10-Q


399867it [00:06, 59215.67it/s]


Downloading: 2016-10-28_10-Q


415983it [00:07, 57423.25it/s]


Downloading: 2017-02-10_10-K


859045it [00:06, 126351.99it/s]


Downloading: 2016-07-29_10-Q


469063it [00:07, 66457.95it/s] 


Downloading: 2016-04-29_10-Q


373531it [00:06, 60640.20it/s]


Downloading: 2015-07-24_10-Q


424298it [00:07, 55684.12it/s]


Downloading: 2015-10-23_10-Q


442250it [00:07, 58901.67it/s]


Downloading: 2016-01-29_10-K


833440it [00:13, 63309.62it/s]


Downloading: 2015-04-24_10-Q


550148it [00:07, 71170.40it/s] 


Downloading: 2015-01-30_10-K


1198972it [00:09, 124421.88it/s]


Downloading: 2014-07-25_10-Q


524747it [00:09, 55052.92it/s]


Downloading: 2014-10-24_10-Q


592072it [00:09, 65161.06it/s] 


Downloading: 2014-04-25_10-Q


482948it [00:07, 65407.81it/s] 


Downloading: 2014-01-31_10-K


1171883it [00:07, 157999.15it/s]



NETFLIX INC directory exists
2018-07-18_10-Q file already exists
2019-01-29_10-K file already exists
2018-01-29_10-K file already exists
2018-10-18_10-Q file already exists
2017-07-19_10-Q file already exists
2018-04-18_10-Q file already exists
2017-10-18_10-Q file already exists
2017-04-20_10-Q file already exists
2016-10-20_10-Q file already exists
2017-01-27_10-K file already exists
2016-07-19_10-Q file already exists
2016-04-20_10-Q file already exists
2015-07-17_10-Q file already exists
2015-10-16_10-Q file already exists
2016-01-28_10-K file already exists
2015-04-17_10-Q file already exists
2015-01-29_10-K file already exists
2014-07-22_10-Q file already exists
2014-10-20_10-Q file already exists
2014-04-23_10-Q file already exists
2014-02-03_10-K file already exists

 created dir: Tesla, Inc.
Downloading: 2018-08-06_10-Q


739051it [00:05, 128828.86it/s]


Downloading: 2019-02-19_10-K


1431520it [00:12, 116837.79it/s]


Downloading: 2018-02-23_10-K


1222974it [00:08, 138920.34it/s]


Downloading: 2018-11-02_10-Q


791422it [00:06, 126613.59it/s]


Downloading: 2017-08-04_10-Q


1002978it [00:07, 137193.85it/s]


Downloading: 2018-05-07_10-Q


712897it [00:05, 123406.29it/s]


Downloading: 2017-11-03_10-Q


902673it [00:06, 129842.70it/s]


Downloading: 2017-05-10_10-Q


791409it [00:05, 141505.02it/s]


Downloading: 2017-03-01_10-K


1345040it [00:09, 135240.01it/s]



 created dir: Facebook Inc
Downloading: 2018-07-26_10-Q


1265135it [00:07, 163485.87it/s]


Downloading: 2019-01-31_10-K


934541it [00:06, 153114.45it/s]


Downloading: 2018-02-01_10-K


919255it [00:05, 158865.67it/s]


Downloading: 2018-10-31_10-Q


850367it [00:05, 166026.77it/s]


Downloading: 2017-07-27_10-Q


612192it [00:04, 150269.47it/s]


Downloading: 2018-04-26_10-Q


703131it [00:04, 160693.66it/s]


Downloading: 2017-11-02_10-Q


598843it [00:03, 159675.18it/s]


Downloading: 2017-05-04_10-Q


891168it [00:05, 151986.06it/s]


Downloading: 2016-11-03_10-Q


976432it [00:06, 150933.63it/s]


Downloading: 2017-02-03_10-K


1203800it [00:08, 146701.32it/s]


Downloading: 2016-07-28_10-Q


987303it [00:06, 156030.26it/s]


Downloading: 2016-04-28_10-Q


818405it [00:05, 162896.91it/s]


Downloading: 2015-07-31_10-Q


853926it [00:05, 162820.29it/s]


Downloading: 2015-11-05_10-Q


859319it [00:05, 143713.95it/s]


Downloading: 2016-01-28_10-K


1188042it [00:07, 164827.51it/s]


Downloading: 2015-04-23_10-Q


988920it [00:06, 158733.83it/s]


Downloading: 2015-01-29_10-K


1525008it [00:10, 150124.83it/s]


Downloading: 2014-07-24_10-Q


977300it [00:06, 160754.49it/s]


Downloading: 2014-10-30_10-Q


1102806it [00:08, 137776.31it/s]


Downloading: 2014-04-25_10-Q


1011366it [00:06, 152956.53it/s]


Downloading: 2014-01-31_10-K


1411802it [00:09, 151508.20it/s]



 created dir: APPLE INC
Downloading: 2018-08-01_10-Q


579011it [00:04, 138894.65it/s]


Downloading: 2019-01-30_10-Q


555729it [00:04, 137280.15it/s]


Downloading: 2018-02-02_10-Q


552909it [00:03, 142511.44it/s]


Downloading: 2018-11-05_10-K


817963it [00:06, 135252.98it/s]


Downloading: 2017-08-02_10-Q


577102it [00:03, 144858.37it/s]


Downloading: 2018-05-02_10-Q


602721it [00:04, 136841.38it/s]


Downloading: 2017-11-03_10-K


902322it [00:06, 137559.75it/s]


Downloading: 2017-05-03_10-Q


571823it [00:04, 129958.77it/s]


Downloading: 2016-10-26_10-K


869929it [00:06, 134622.18it/s]


Downloading: 2017-02-01_10-Q


522241it [00:03, 143855.22it/s]


Downloading: 2016-07-27_10-Q


582511it [00:04, 139240.39it/s]


Downloading: 2016-04-27_10-Q


467082it [00:03, 142324.90it/s]


Downloading: 2015-07-22_10-Q


492832it [00:03, 147776.89it/s]


Downloading: 2015-10-28_10-K


762290it [00:05, 129848.75it/s]


Downloading: 2016-01-27_10-Q


419674it [00:03, 110388.36it/s]


Downloading: 2015-04-28_10-Q


728495it [00:05, 142810.08it/s]


Downloading: 2015-01-28_10-Q


679590it [00:04, 141693.27it/s]


Downloading: 2014-07-23_10-Q


748849it [00:06, 119955.93it/s]


Downloading: 2014-10-27_10-K


1070367it [00:07, 134889.47it/s]


Downloading: 2014-04-24_10-Q


677744it [00:04, 150826.31it/s]


Downloading: 2014-01-28_10-Q


627902it [00:04, 141430.59it/s]



 created dir: MICROSOFT CORP
Downloading: 2018-08-03_10-K


1161696it [00:10, 114189.40it/s]


Downloading: 2019-01-30_10-Q


756504it [00:05, 137507.11it/s]


Downloading: 2018-01-31_10-Q


897484it [00:06, 129967.97it/s]


Downloading: 2018-10-24_10-Q


660264it [00:05, 129531.26it/s]


Downloading: 2017-08-02_10-K


1152190it [00:08, 129313.08it/s]


Downloading: 2018-04-26_10-Q


908404it [00:08, 109923.29it/s]


Downloading: 2017-10-26_10-Q


797025it [00:06, 132025.62it/s]


Downloading: 2017-04-27_10-Q


796338it [00:06, 125615.24it/s]


Downloading: 2016-10-20_10-Q


615008it [00:04, 134082.47it/s]


Downloading: 2017-01-26_10-Q


763929it [00:05, 128215.45it/s]


Downloading: 2016-07-28_10-K


951680it [00:07, 125750.17it/s]


Downloading: 2016-04-21_10-Q


629392it [00:05, 114709.95it/s]


Downloading: 2015-07-31_10-K


995771it [00:07, 141255.40it/s]


Downloading: 2015-10-22_10-Q


604206it [00:04, 145890.92it/s]


Downloading: 2016-01-28_10-Q


605545it [00:04, 143696.15it/s]


Downloading: 2015-04-23_10-Q


1090206it [00:08, 136207.94it/s]


Downloading: 2015-01-26_10-Q


1080739it [00:09, 120024.01it/s]


Downloading: 2014-07-31_10-K


1534078it [00:10, 142486.87it/s]


Downloading: 2014-10-23_10-Q


961254it [00:06, 142536.66it/s]


Downloading: 2014-04-24_10-Q


999815it [00:07, 141616.29it/s]


Downloading: 2014-01-23_10-Q


1004909it [00:06, 146469.83it/s]



 created dir: TESLA MOTORS INC
Downloading: 2016-11-02_10-Q


571612it [00:04, 128993.79it/s]


Downloading: 2016-08-05_10-Q


413523it [00:02, 141236.94it/s]


Downloading: 2016-05-10_10-Q


443288it [00:03, 138374.25it/s]


Downloading: 2015-08-07_10-Q


1214446it [00:07, 169209.36it/s]


Downloading: 2015-11-05_10-Q


392761it [00:02, 135034.42it/s]


Downloading: 2016-02-24_10-K


846493it [00:06, 130898.45it/s]


Downloading: 2015-05-11_10-Q


507146it [00:04, 103469.95it/s]


Downloading: 2015-02-26_10-K


1111509it [00:07, 139289.63it/s]


Downloading: 2014-08-08_10-Q


503637it [00:03, 143044.35it/s]


Downloading: 2014-11-07_10-Q


502885it [00:03, 141578.46it/s]


Downloading: 2014-05-09_10-Q


519382it [00:03, 149754.04it/s]


Downloading: 2014-02-26_10-K


977196it [00:06, 146324.26it/s]
