In [4]:
# Build one (url, year, quarter) triple for every quarterly crawler.idx file
# archived in EDGAR, from start_year (earliest: 1993) up to and including the
# quarter that contains today's date.
import datetime

start_year = 2009
current_year = datetime.date.today().year
# Months 1-3 map to quarter 1, 4-6 to quarter 2, and so on.
current_quarter = (datetime.date.today().month - 1) // 3 + 1

quarters = ['QTR1', 'QTR2', 'QTR3', 'QTR4']
years = list(range(start_year, current_year))

# All four quarters of each completed year come first ...
history = []
for y in years:
    for q in quarters:
        history.append((y, q))
# ... followed by the quarters of the current year that have begun so far.
history.extend((current_year, 'QTR%d' % n) for n in range(1, current_quarter + 1))

# The trailing digit of the 'QTRn' label is stored as the integer quarter.
urls = sorted(
    ('https://www.sec.gov/Archives/edgar/full-index/%d/%s/crawler.idx' % (year, label), year, int(label[-1]))
    for year, label in history
)
 
# Download index files and write content into SQLite
#
# For every (url, year, quarter) triple in `urls`, fetch the quarterly
# crawler.idx file, cut each fixed-width row into its five columns using the
# offsets of the column titles in the header line, and bulk-insert the rows
# (plus year and quarter) into table `idx` of edgar_htm_idx.db.
# The table is dropped and recreated on every run.
import sqlite3
import requests

# SEC's fair-access guidance asks automated clients to identify themselves.
# NOTE(review): placeholder value - replace with your own name and contact
# e-mail before running.
headers = {'User-Agent': 'Sample Company Name AdminContact@example.com'}

con = sqlite3.connect('edgar_htm_idx.db')
try:
    # Using the connection as a context manager commits once everything has
    # been inserted and rolls back the pending inserts if any step raises.
    with con:
        cur = con.cursor()
        cur.execute('DROP TABLE IF EXISTS idx')
        cur.execute('CREATE TABLE idx (conm TEXT, type TEXT, cik TEXT, date TEXT, path TEXT, year INT, quarter INT)')

        with requests.Session() as session:
            session.headers.update(headers)
            for url in urls:
                # A timeout keeps a stalled connection from hanging the run;
                # raise_for_status stops an HTTP error page from being parsed
                # as if it were index data.
                response = session.get(url[0], timeout=60)
                response.raise_for_status()
                lines = response.text.splitlines()
                if len(lines) < 9:
                    raise ValueError('index file too short: %s' % url[0])

                # Line 8 of the file (index 7) carries the column titles; the
                # position of each title is the start offset of that column.
                header = lines[7]
                nameloc = header.find('Company Name')
                typeloc = header.find('Form Type')
                cikloc = header.find('CIK')
                dateloc = header.find('Date Filed')
                urlloc = header.find('URL')
                # str.find returns -1 for a missing title, which would
                # silently mis-slice every row, so fail loudly instead.
                if min(nameloc, typeloc, cikloc, dateloc, urlloc) < 0:
                    raise ValueError('unexpected header layout in %s' % url[0])

                # Data rows start at index 9 (index 8 is skipped, as in the
                # original slicing); blank lines are ignored so they do not
                # become all-empty rows.
                records = [
                    (
                        line[:typeloc].strip(),
                        line[typeloc:cikloc].strip(),
                        line[cikloc:dateloc].strip(),
                        line[dateloc:urlloc].strip(),
                        line[urlloc:].strip(),
                        url[1],
                        url[2],
                    )
                    for line in lines[9:]
                    if line.strip()
                ]
                cur.executemany('INSERT INTO idx VALUES (?, ?, ?, ?, ?, ?, ?)', records)
                print(url, 'downloaded and wrote to SQLite')
finally:
    # Always release the database file handle, even after a failed download.
    con.close()

('https://www.sec.gov/Archives/edgar/full-index/2009/QTR1/crawler.idx', 2009, 1) downloaded and wrote to SQLite
('https://www.sec.gov/Archives/edgar/full-index/2009/QTR2/crawler.idx', 2009, 2) downloaded and wrote to SQLite
('https://www.sec.gov/Archives/edgar/full-index/2009/QTR3/crawler.idx', 2009, 3) downloaded and wrote to SQLite
('https://www.sec.gov/Archives/edgar/full-index/2009/QTR4/crawler.idx', 2009, 4) downloaded and wrote to SQLite
('https://www.sec.gov/Archives/edgar/full-index/2010/QTR1/crawler.idx', 2010, 1) downloaded and wrote to SQLite
('https://www.sec.gov/Archives/edgar/full-index/2010/QTR2/crawler.idx', 2010, 2) downloaded and wrote to SQLite
('https://www.sec.gov/Archives/edgar/full-index/2010/QTR3/crawler.idx', 2010, 3) downloaded and wrote to SQLite
('https://www.sec.gov/Archives/edgar/full-index/2010/QTR4/crawler.idx', 2010, 4) downloaded and wrote to SQLite
('https://www.sec.gov/Archives/edgar/full-index/2011/QTR1/crawler.idx', 2011, 1) downloaded and wrote to