In [1]:
import pandas as pd
import numpy as np
import os
from urllib.request import urlopen # to grab a webpage 
from tqdm.notebook import tqdm     # progress bar on loops
from time import sleep             # so we are nice to the server

In [2]:
def make_folder_if_needed(path):
    '''
    Create the directory `path`, including any missing intermediate
    folders, if it does not exist yet. Does nothing if it already exists.

    Parameters
    ----------
    path : str
        Directory to create. Must end with a forward slash, e.g. './input/'.

    Raises
    ------
    AssertionError
        If `path` is empty or does not end with '/'.
    '''
    # endswith (rather than path[-1]) so that an empty string fails this same
    # assertion instead of raising IndexError
    assert path.endswith('/'), "path must end with a forward slash"
    # exist_ok=True makes the call idempotent without a separate exists()
    # check, so nothing can create the folder between "check" and "create"
    os.makedirs(path, exist_ok=True)
    return

## Download the Compustat-EDGAR merged data

This dataset has 2007 and 2008 data for each of 169 firms. The 2007 observations have variables to help find the corresponding 10-K on EDGAR. 

In [3]:
# Download the input file once and cache it locally; later runs just load the cached copy.

input_dir = './input/' # if the input dir doesn't exist, make it
make_folder_if_needed(input_dir)

ccm_path = input_dir + "ccm_and_edgarinfo.dta"
if not os.path.exists(ccm_path):
    url = 'https://github.com/LeDataSciFi/lectures-spr2020/blob/master/assignment_data/2007_inv_and_tech.dta?raw=true'
    ccm = pd.read_stata(url)
    # write_index=False: to_stata writes the DataFrame index as an extra
    # "index" column by default, so the cached copy would otherwise load back
    # with a column that the freshly downloaded frame does not have.
    # NOTE: a cache file written before this change still contains that
    # column; delete ./input/ccm_and_edgarinfo.dta once to refresh it.
    ccm.to_stata(ccm_path, write_index=False)
else:
    ccm = pd.read_stata(ccm_path)
    

## Loop over and download filings

Download each file to `<pwd>/edgar_filings/cik_<#>/raw/<fname>`. For example: 
- gvkey 1410 is CIK 771497, has a 2007-12-21 10-K with FName = "edgar/data/771497/0000950134-07-026080.txt".
- we will store this in `<pwd>/edgar_filings/cik_771497/raw/0000950134-07-026080.txt`
- The file name and path do not indicate the form type, the gvkey, or the filing date; that information is in the filing dataset (here called "ccm").

The following function will do the download for a given row in the dataset.

In [4]:
def DL_filing(df_row, user_agent=None, pause=3):
    '''
    Download one EDGAR filing and store it locally, unless it is already there.

    The filing is fetched from https://www.sec.gov/Archives/<FName> and saved
    to ./edgar_filings/cik_<CIK>/raw/<file>, where <file> is the last
    path component of FName. If that file already exists nothing is
    downloaded, so an interrupted run can be resumed without repeating work.

    Parameters
    ----------
    df_row : pandas.Series or dict-like
        Must provide 'CIK' (convertible with int()) and 'FName' (a string
        such as 'edgar/data/<cik>/<file>').
    user_agent : str, optional
        If given, sent as the User-Agent request header. The default (None)
        sends the request exactly as before, with urllib's default agent.
        NOTE(review): SEC asks automated clients to identify themselves and
        may reject the default agent - confirm and pass a value if so.
    pause : int or float, optional
        Seconds to sleep after a successful download (default 3), to be nice
        to the server.

    Returns
    -------
    None
        A failed download is reported with a printed message, not raised.
    '''
    from urllib.request import Request  # only needed when a User-Agent is set

    # where are we putting and naming the file

    path_dir = './edgar_filings/cik_' + str(int(df_row['CIK'])) + '/raw/'
    filename = df_row['FName'].split('/')[-1] # 'FName' is edgar/data/<cik>/<File> Just keep file
    destination = path_dir + filename

    # nothing to do if the file was downloaded on an earlier run

    if os.path.exists(destination):
        return

    url = 'https://www.sec.gov/Archives/' + df_row['FName']
    request = Request(url, headers={'User-Agent': user_agent}) if user_agent else url

    try:
        # Read the whole body before touching the disk: a failed or partial
        # transfer must not leave an empty file behind, because the exists()
        # check above would then skip this filing forever.
        # The with-block also closes the connection.
        with urlopen(request) as response:
            content = response.read()
    except Exception:
        # `except Exception` (not a bare except) lets KeyboardInterrupt
        # through, so stopping the download loop by hand actually stops it.
        print("Failed on: ",filename) # might interfere with the tqdm progress bar
        #TODO log failure
        return

    make_folder_if_needed(path_dir)
    with open(destination, 'wb') as f:
        f.write(content)
    #TODO log success
    sleep(pause) # be nice to server

    return


Now do the downloading:

In [5]:
# you can stop partway through this cell and resume without repeating the steps

for index, row in tqdm(ccm.iterrows(), total=len(ccm)):

    # DL_filing() only works when CIK and FName are valid...
    # pd.notna handles None as well as NaN (np.isnan raises TypeError on
    # non-float values), and the isinstance check skips a missing FName that
    # was loaded as NaN instead of '' (it would crash on .split in DL_filing)
    if pd.notna(row['CIK']) and isinstance(row['FName'], str) and row['FName'] != '':

        DL_filing(row)


HBox(children=(FloatProgress(value=0.0, max=338.0), HTML(value='')))


