In [10]:
#get list of filings - specify time period
import requests
import pandas as pd

sec_url = 'https://www.sec.gov/Archives/'
years = list(range(2015,2020))
quarters = ['QTR1', 'QTR2', 'QTR3', 'QTR4']

past5yr_reports = pd.DataFrame()

for year in years:
    for quarter in quarters:
        download = requests.get(f'https://www.sec.gov/Archives/edgar/full-index/{year}/{quarter}/master.idx').content
        download = download.decode("utf-8", errors='ignore').split('\n')
        df = pd.Series(download[11:])
        df = df.str.split('|', expand=True)
        df = df.dropna()
        past5yr_reports = past5yr_reports.append(df)
        print(str(year) + "-" + str(quarter) + " complete")
        
col = download[9].split('|')
past5yr_reports.columns = col

past5yr_reports['CIK'] = past5yr_reports['CIK'].astype(int)

past5yr_reports.head()

2015-QTR1 complete
2015-QTR2 complete
2015-QTR3 complete
2015-QTR4 complete
2016-QTR1 complete
2016-QTR2 complete
2016-QTR3 complete
2016-QTR4 complete
2017-QTR1 complete
2017-QTR2 complete
2017-QTR3 complete
2017-QTR4 complete
2018-QTR1 complete
2018-QTR2 complete
2018-QTR3 complete
2018-QTR4 complete
2019-QTR1 complete
2019-QTR2 complete
2019-QTR3 complete
2019-QTR4 complete


Unnamed: 0,CIK,Company Name,Form Type,Date Filed,Filename
0,1000032,BINCH JAMES G,4,2015-03-03,edgar/data/1000032/0001209191-15-021425.txt
1,1000045,NICHOLAS FINANCIAL INC,10-Q,2015-02-09,edgar/data/1000045/0001193125-15-038970.txt
2,1000045,NICHOLAS FINANCIAL INC,8-K,2015-02-04,edgar/data/1000045/0001193125-15-033076.txt
3,1000045,NICHOLAS FINANCIAL INC,CORRESP,2015-02-18,edgar/data/1000045/0001193125-15-052939.txt
4,1000045,NICHOLAS FINANCIAL INC,CORRESP,2015-02-27,edgar/data/1000045/0001193125-15-070110.txt


In [11]:
past5yr_reports.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4821291 entries, 0 to 206456
Data columns (total 5 columns):
CIK             int64
Company Name    object
Form Type       object
Date Filed      object
Filename        object
dtypes: int64(1), object(4)
memory usage: 220.7+ MB


In [12]:
#get list of annual reports

a_reports = past5yr_reports[past5yr_reports['Form Type'] == '10-K'].reset_index(drop=True)
a_reports.head()

Unnamed: 0,CIK,Company Name,Form Type,Date Filed,Filename
0,1000180,SANDISK CORP,10-K,2015-02-10,edgar/data/1000180/0001000180-15-000013.txt
1,1000209,MEDALLION FINANCIAL CORP,10-K,2015-03-11,edgar/data/1000209/0001193125-15-087622.txt
2,1000228,HENRY SCHEIN INC,10-K,2015-02-11,edgar/data/1000228/0001000228-15-000007.txt
3,1000229,CORE LABORATORIES N V,10-K,2015-02-17,edgar/data/1000229/0001000229-15-000035.txt
4,1000232,KENTUCKY BANCSHARES INC /KY/,10-K,2015-03-20,edgar/data/1000232/0001104659-15-021260.txt


In [13]:
a_reports.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36837 entries, 0 to 36836
Data columns (total 5 columns):
CIK             36837 non-null int64
Company Name    36837 non-null object
Form Type       36837 non-null object
Date Filed      36837 non-null object
Filename        36837 non-null object
dtypes: int64(1), object(4)
memory usage: 1.4+ MB


In [14]:
#load S&P 500 list

sp = pd.read_csv("sp_list.csv")

In [15]:
#only get reports for s&p 500 companies

sp_reports = a_reports[a_reports['CIK'].isin(sp['CIK'])].reset_index(drop=True)
sp_reports['URL'] = ""
sp_reports.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2400 entries, 0 to 2399
Data columns (total 6 columns):
CIK             2400 non-null int64
Company Name    2400 non-null object
Form Type       2400 non-null object
Date Filed      2400 non-null object
Filename        2400 non-null object
URL             2400 non-null object
dtypes: int64(1), object(5)
memory usage: 112.6+ KB


In [16]:
sp_reports.head()

Unnamed: 0,CIK,Company Name,Form Type,Date Filed,Filename,URL
0,1000228,HENRY SCHEIN INC,10-K,2015-02-11,edgar/data/1000228/0001000228-15-000007.txt,
1,1000697,WATERS CORP /DE/,10-K,2015-02-27,edgar/data/1000697/0001193125-15-067900.txt,
2,1001082,DISH Network CORP,10-K,2015-02-23,edgar/data/1001082/0001104659-15-012777.txt,
3,1002910,AMEREN CORP,10-K,2015-03-02,edgar/data/1002910/0001002910-15-000075.txt,
4,100517,"United Continental Holdings, Inc.",10-K,2015-02-20,edgar/data/100517/0001193125-15-056493.txt,


In [17]:
#Get URL to reports
import re

def get_url(filename):
    sec_url = 'https://www.sec.gov/Archives/'
    txt_site = sec_url + filename
    txt_data = requests.get(txt_site).content.decode("utf-8") 
    htm = re.search('<FILENAME>(.*)\n', txt_data).group(1)
    url = sec_url + filename.replace("-","")[:-4] + "/" + htm
    return url

In [19]:
b = sp_reports

final = pd.DataFrame()

import math
import time

start = time.process_time()

for i in range(1, math.ceil(len(b)/10)):
    a = b[(i-1)*10:i*10]
    a['URL'] = a.apply (lambda row: get_url(row['Filename']), axis=1)
    final = final.append(a)
    print("%d complete - %.2f secs" %(i, time.process_time() - start))
    
remaining_rows = (len(b) % 10)*-1
c = b[remaining_rows:]
c['URL'] = c.apply (lambda row: get_url(row['Filename']), axis=1)
final = final.append(c)

final.to_csv('save1.csv')
final.tail()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


1 complete - 4.24 secs
2 complete - 8.82 secs
3 complete - 12.85 secs
4 complete - 16.48 secs
5 complete - 19.63 secs
6 complete - 23.61 secs
7 complete - 28.75 secs
8 complete - 35.03 secs
9 complete - 40.18 secs
10 complete - 44.49 secs
11 complete - 49.06 secs
12 complete - 53.55 secs
13 complete - 58.85 secs
14 complete - 63.40 secs
15 complete - 67.97 secs
16 complete - 71.90 secs
17 complete - 77.85 secs
18 complete - 81.40 secs
19 complete - 85.61 secs
20 complete - 89.69 secs
21 complete - 95.27 secs
22 complete - 104.14 secs
23 complete - 109.37 secs
24 complete - 113.96 secs
25 complete - 122.12 secs
26 complete - 129.80 secs
27 complete - 135.63 secs
28 complete - 140.10 secs
29 complete - 146.09 secs
30 complete - 150.47 secs
31 complete - 155.98 secs
32 complete - 159.62 secs
33 complete - 164.46 secs
34 complete - 169.41 secs
35 complete - 174.17 secs
36 complete - 178.78 secs
37 complete - 186.58 secs
38 complete - 190.67 secs
39 complete - 193.77 secs
40 complete - 197.

Unnamed: 0,CIK,Company Name,Form Type,Date Filed,Filename,URL
2395,859737,HOLOGIC INC,10-K,2019-11-27,edgar/data/859737/0000859737-19-000026.txt,
2396,866787,AUTOZONE INC,10-K,2019-10-28,edgar/data/866787/0001193125-19-276201.txt,
2397,882184,HORTON D R INC /DE/,10-K,2019-11-25,edgar/data/882184/0000882184-19-000147.txt,
2398,883241,SYNOPSYS INC,10-K,2019-12-20,edgar/data/883241/0000883241-19-000019.txt,
2399,909832,COSTCO WHOLESALE CORP /NEW,10-K,2019-10-11,edgar/data/909832/0000909832-19-000019.txt,


In [59]:
final.tail()

Unnamed: 0.1,Unnamed: 0,CIK,Company Name,Form Type,Date Filed,Filename,URL
2395,2385,52988,JACOBS ENGINEERING GROUP INC /DE/,10-K,2019-11-25,edgar/data/52988/0000052988-19-000069.txt,https://www.sec.gov/Archives/edgar/data/52988/...
2396,2386,6281,ANALOG DEVICES INC,10-K,2019-11-26,edgar/data/6281/0000006281-19-000144.txt,https://www.sec.gov/Archives/edgar/data/6281/0...
2397,2387,6951,APPLIED MATERIALS INC /DE,10-K,2019-12-13,edgar/data/6951/0000006951-19-000046.txt,https://www.sec.gov/Archives/edgar/data/6951/0...
2398,2388,711404,"COOPER COMPANIES, INC.",10-K,2019-12-20,edgar/data/711404/0000711404-19-000051.txt,https://www.sec.gov/Archives/edgar/data/711404...
2399,2389,720005,RAYMOND JAMES FINANCIAL INC,10-K,2019-11-26,edgar/data/720005/0000720005-19-000086.txt,https://www.sec.gov/Archives/edgar/data/720005...


In [84]:
final[final['Company Name'].str.contains('Facebook')]['URL'].to_list()

['https://www.sec.gov/Archives/edgar/data/1326801/000132680115000006/fb-12312014x10k.htm',
 'https://www.sec.gov/Archives/edgar/data/1326801/000132680116000043/fb-12312015x10k.htm',
 'https://www.sec.gov/Archives/edgar/data/1326801/000132680117000007/fb-12312016x10k.htm',
 'https://www.sec.gov/Archives/edgar/data/1326801/000132680118000009/fb-12312017x10k.htm',
 'https://www.sec.gov/Archives/edgar/data/1326801/000132680119000009/fb-12312018x10k.htm']