##### Import libraries 

In [2]:
import pandas as pd
import io
import gzip
import requests
import datetime
import os

##### Sample input 

In [3]:
# Sample request payload: one {'date': 'YYYYMMDD'} record per day to fetch.
input_ = [
    {'date': day}
    for day in (
        '20200102',
        '20200103',
        '20200104',
        '20200105',
        '20200106',
        '20200107',
        '20200108',
        '20200109',
    )
]

##### Input processing  

In [4]:
def process_input(input_):
    """Collect the 'date' value of every record in ``input_``.

    Order is preserved. A record without a 'date' key contributes ``None``
    (``dict.get`` semantics), so the result always has one entry per record.
    """
    return [record.get('date') for record in input_]

In [5]:
# Pull the date strings out of the sample records; each one names a daily index file.
file_names = process_input(input_)

In [6]:
# Sanity check: show the dates that will be requested.
print(file_names)

['20200102', '20200103', '20200104', '20200105', '20200106', '20200107', '20200108', '20200109']


##### Generate download url 

In [7]:
def gen_download_link(date_string, format_='idx'):
    """Build the EDGAR daily-index URL for one date.

    ``date_string`` is read positionally: characters 0-3 are the year and
    characters 4-5 the two-digit month (e.g. '20200102'). ``format_`` is the
    file extension appended after ``master.<date_string>``.

    Raises ``KeyError`` when the month slice is not one of '01'..'12'.
    """
    # Two-digit month -> quarter directory ('01'-'03' -> 'QTR1', ... '10'-'12' -> 'QTR4').
    quarter_by_month = {
        '{:02d}'.format(month_no): 'QTR{}'.format((month_no + 2) // 3)
        for month_no in range(1, 13)
    }

    year, month = date_string[:4], date_string[4:6]
    quarter = quarter_by_month[month]

    return 'https://www.sec.gov/Archives/edgar/daily-index/{}/{}/{}.{}'.format(
        year, quarter, 'master.' + date_string, format_
    )

##### Get response from given url 

In [8]:
def get_url_response( url_link, timeout=30, headers=None ):
    """Issue an HTTP GET for ``url_link`` and return the ``requests`` response.

    Parameters
    ----------
    url_link : str
        Address to fetch.
    timeout : float or (connect, read) tuple, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` waits indefinitely, which would hang the whole download
        loop on a single stalled connection.
    headers : dict, optional
        Extra request headers, e.g. ``{'User-Agent': 'Name contact@example.com'}``.
        NOTE(review): SEC's EDGAR access guidance asks automated clients to
        declare a User-Agent — confirm and pass one from the caller.

    Returns
    -------
    requests.Response
        Returned for every HTTP status; non-200 responses are not raised here.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failure or when ``timeout`` elapses.
    """
    response = requests.request( "GET", url_link, headers=headers, timeout=timeout )
    return response

##### Save download data  

In [None]:
def save_to_local( response , file_name, format_ = 'idx'):
    """Write a successful HTTP response body to ``RD_LABEL/US_SEC/INDEX/``.

    Parameters
    ----------
    response : object with ``status_code`` and ``content`` attributes
        Typically the value returned by ``get_url_response``.
    file_name : str
        Base name of the output file (the notebook passes the date string).
    format_ : str, optional
        File extension, without the dot.

    Returns
    -------
    bool
        ``True`` when the body was written (status 200), ``False`` otherwise.
    """
    folder_path = 'RD_LABEL/US_SEC/INDEX/'
    target_path = folder_path + '{}.{}'.format(file_name, format_)

    # Anything without a status code (e.g. None) is not a usable response.
    status_code = getattr(response, 'status_code', None)
    if status_code is None:
        print("Invalid Response")
        return False

    if status_code != 200:
        # log warning
        print( "Save Failed - " , file_name)
        return False

    # The output folder does not exist on a fresh checkout; create it on demand.
    os.makedirs(folder_path, exist_ok=True)

    # save raw data
    with open(target_path, 'wb') as f:
        f.write(response.content)

    # log success
    print("Save Success - ",file_name )
    return True

##### Main download function

In [10]:
def download_sec_edgar_data(file_names):
    """Download and save the daily index for every date in ``file_names``.

    For each date string the URL is built, fetched and handed to
    ``save_to_local``; the URL and the response object are printed as
    progress output.

    Returns
    -------
    bool
        The save status of the LAST date processed only (earlier failures are
        visible in the printed log, not in the return value). ``False`` when
        ``file_names`` is empty.
    """
    # Pre-set so an empty input returns False instead of raising UnboundLocalError.
    status = False
    for file_name in file_names:
        url = gen_download_link( file_name )
        print(url)
        response = get_url_response( url )
        print(response)
        status = save_to_local( response, file_name )
    return status

In [11]:
# Fetch every requested daily index; `result` holds the save status of the last file only.
result = download_sec_edgar_data(file_names)

https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/master.20200102.idx
<Response [200]>
Save Success -  20200102
https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/master.20200103.idx
<Response [200]>
Save Success -  20200103
https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/master.20200104.idx
<Response [403]>
Save Failed -  20200104
https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/master.20200105.idx
<Response [403]>
Save Failed -  20200105
https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/master.20200106.idx
<Response [200]>
Save Success -  20200106
https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/master.20200107.idx
<Response [200]>
Save Success -  20200107
https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/master.20200108.idx
<Response [200]>
Save Success -  20200108
https://www.sec.gov/Archives/edgar/daily-index/2020/QTR1/master.20200109.idx
<Response [200]>
Save Success -  20200109


##### Processing raw data 

In [12]:
# Open one downloaded daily index in text mode; the handle is passed to strip_header below.
# NOTE(review): no explicit encoding is given (platform default applies) and this cell
# does not close the handle.
file = open("RD_LABEL/US_SEC/INDEX/20200102.idx", 'r')

def strip_header(data):
    """Skip the preamble of an index file and parse the rest as a table.

    Lines are consumed from ``data`` up to and including the first line that,
    once stripped, consists solely of '-' characters. Everything after that
    separator is read as '|'-delimited rows with no header row, so the
    resulting columns are labelled 0..N-1.

    Parameters
    ----------
    data : text file-like object
        Must provide ``readline()``; it is advanced past the separator.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the end of ``data`` is reached without finding a separator line.
    """
    while True:
        raw_line = data.readline()
        if not raw_line:
            # readline() returns an empty string at EOF; without this check the
            # search would spin forever on a file that has no separator line.
            raise ValueError("no '-' separator line found before end of data")
        if set(raw_line.strip()) == {'-'}:
            break

    df = pd.read_csv(data, header=None, sep='|')

    return df

In [13]:
# Parse the opened index file, then release the handle: the DataFrame is fully
# loaded once strip_header returns, so the file is no longer needed.
try:
    data= strip_header(file)
finally:
    file.close()

In [14]:
# Plain-text dump of the parsed frame (pandas truncates the middle rows).
print( data )

            0                     1         2         3  \
0     1000228      HENRY SCHEIN INC         4  20200102   
1     1000275  ROYAL BANK OF CANADA     424B2  20200102   
2     1000275  ROYAL BANK OF CANADA       FWP  20200102   
3     1000275  ROYAL BANK OF CANADA       FWP  20200102   
4     1000275  ROYAL BANK OF CANADA       FWP  20200102   
...       ...                   ...       ...       ...   
4436    98246          TIFFANY & CO         4  20200102   
4437    98246          TIFFANY & CO         4  20200102   
4438    98246          TIFFANY & CO         4  20200102   
4439    98246          TIFFANY & CO         4  20200102   
4440    98338               TSR INC  SC 13D/A  20200102   

                                                4  
0     edgar/data/1000228/0001209191-20-000388.txt  
1     edgar/data/1000275/0001140361-20-000013.txt  
2     edgar/data/1000275/0001140361-20-000015.txt  
3     edgar/data/1000275/0001140361-20-000019.txt  
4     edgar/data/1000275/000114

In [16]:
# First rows of the parsed frame; columns are still the positional labels 0-4.
data.head()

Unnamed: 0,0,1,2,3,4
0,1000228,HENRY SCHEIN INC,4,20200102,edgar/data/1000228/0001209191-20-000388.txt
1,1000275,ROYAL BANK OF CANADA,424B2,20200102,edgar/data/1000275/0001140361-20-000013.txt
2,1000275,ROYAL BANK OF CANADA,FWP,20200102,edgar/data/1000275/0001140361-20-000015.txt
3,1000275,ROYAL BANK OF CANADA,FWP,20200102,edgar/data/1000275/0001140361-20-000019.txt
4,1000275,ROYAL BANK OF CANADA,FWP,20200102,edgar/data/1000275/0001140361-20-000021.txt


##### Data munging (rename columns)

In [18]:
# Positional column label (0-4) -> descriptive column name.
column_name_update_map = dict(enumerate(
    ['CIK', 'Company Name', 'Form Type', 'Date Filed', 'File name']
))
# rename() returns a new frame; `data` keeps its positional labels.
dataframe = data.rename(columns=column_name_update_map)

In [19]:
# Confirm the rename took effect on the first ten rows.
dataframe.head(10)

Unnamed: 0,CIK,Company Name,Form Type,Date Filed,File name
0,1000228,HENRY SCHEIN INC,4,20200102,edgar/data/1000228/0001209191-20-000388.txt
1,1000275,ROYAL BANK OF CANADA,424B2,20200102,edgar/data/1000275/0001140361-20-000013.txt
2,1000275,ROYAL BANK OF CANADA,FWP,20200102,edgar/data/1000275/0001140361-20-000015.txt
3,1000275,ROYAL BANK OF CANADA,FWP,20200102,edgar/data/1000275/0001140361-20-000019.txt
4,1000275,ROYAL BANK OF CANADA,FWP,20200102,edgar/data/1000275/0001140361-20-000021.txt
5,1000275,ROYAL BANK OF CANADA,FWP,20200102,edgar/data/1000275/0001140361-20-000036.txt
6,1001316,"TG THERAPEUTICS, INC.",SC 13G,20200102,edgar/data/1001316/0001172661-20-000001.txt
7,1002047,"NetApp, Inc.",4,20200102,edgar/data/1002047/0001614094-20-000001.txt
8,1003078,MSC INDUSTRIAL DIRECT CO INC,4,20200102,edgar/data/1003078/0001003078-20-000002.txt
9,1003839,MATTHEW 25 FUND,497AD,20200102,edgar/data/1003839/0001162044-20-000001.txt


In [20]:
# Column dtypes and non-null counts.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4441 entries, 0 to 4440
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   CIK           4441 non-null   int64 
 1   Company Name  4441 non-null   object
 2   Form Type     4441 non-null   object
 3   Date Filed    4441 non-null   int64 
 4   File name     4441 non-null   object
dtypes: int64(2), object(3)
memory usage: 173.6+ KB


In [21]:
# (rows, columns) of the renamed frame.
dataframe.shape

(4441, 5)

In [22]:
# Example lookup: every filing row whose CIK equals 1000228.
dataframe[dataframe['CIK'].eq(1000228)]

Unnamed: 0,CIK,Company Name,Form Type,Date Filed,File name
0,1000228,HENRY SCHEIN INC,4,20200102,edgar/data/1000228/0001209191-20-000388.txt


##### Export data 

In [24]:
# Write the renamed frame to data.csv in the working directory, without the row index.
dataframe.to_csv('data.csv', index=False)  

In [25]:
import json
# One dict per row, keyed by column name, ready for JSON serialisation.
d = dataframe.to_dict(orient='records')

In [26]:
# Serialise the row records and write them to data.json in the working directory.
with open('data.json', 'w') as outfile:
    outfile.write(json.dumps(d))