### 本篇代码用于爬取项目的文本数据——SEC文件

In [1]:
from bs4 import BeautifulSoup
import datetime
import unicodedata
import requests
import pandas as pd
import numpy as np
from time import sleep
import math
#from config import Config
import dateutil.relativedelta
import pandas_market_calendars as mcal
import os
import io
import re
from tqdm import tqdm
import gc
import ast
from pandas.core.frame import DataFrame

#### 搜集标普500的基本数据——ticker, cik

In [18]:
wiki_url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
#用pandas的read_html可以直接把页面中的表格读出来
#header=0表示表格第一行为header
#index_col=0表示第一列数据是index
cik_df = pd.read_html(wiki_url,header=0,index_col=0)[0]
cik_df['GICS Sector'] = cik_df['GICS Sector'].astype("category")
cik_df['GICS Sub Industry'] = cik_df['GICS Sector'].astype("category")
cik_df.head()

Unnamed: 0_level_0,Security,SEC filings,GICS Sector,GICS Sub Industry,Headquarters Location,Date first added,CIK,Founded
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MMM,3M Company,reports,Industrials,Industrials,"St. Paul, Minnesota",,66740,1902
ABT,Abbott Laboratories,reports,Health Care,Health Care,"North Chicago, Illinois",1964-03-31,1800,1888
ABBV,AbbVie Inc.,reports,Health Care,Health Care,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
ABMD,ABIOMED Inc,reports,Health Care,Health Care,"Danvers, Massachusetts",2018-05-31,815094,1981
ACN,Accenture plc,reports,Information Technology,Information Technology,"Dublin, Ireland",2011-07-06,1467373,1989


In [19]:
cik_df.to_pickle('Pickles\cik.pkl')

#### 定义函数——得到SEC文本链接

In [28]:
def get_doc_links(cik,ticker):
        try:
            base_url = "https://www.sec.gov/cgi-bin/browse-edgar"
            inputted_cik = cik
            payload = {
                "action" : "getcompany",
                "CIK" : inputted_cik,
                "type" : "8-K",
                "output":"xml",
                "dateb" : "20190401", #取2019年4月之前的数据
            }
            sec_response = requests.get(url=base_url,params=payload)
            soup = BeautifulSoup(sec_response.text,'lxml')
            url_list = soup.findAll('filinghref') #得到链接
            
            html_list = []
            # 转换链接格式
            for link in url_list:
                link = link.string
                if link.split(".")[len(link.split("."))-1] == 'htm': # if the url is ended with htm
                    txtlink = link + "l" # add L and change to html
                    html_list.append(txtlink)
                    # now we have the webpages for sec files

            doc_list = []
            doc_name_list = []
            
            # 得到文本链接
            for k in range(len(html_list)):
                txt_doc = html_list[k].replace("-index.html",".txt")
                doc_name = txt_doc.split("/")[-1]
                doc_list.append(txt_doc)
                doc_name_list.append(doc_name)
                
                # Create dataframe of CIK, doc name, and txt link
            df = pd.DataFrame(
                {
                "cik" : [cik]*len(html_list),
                "ticker" : [ticker]*len(html_list),
                "txt_link" : doc_list,
                "doc_name": doc_name_list
                }
            )
        except requests.exceptions.ConnectionError:
                sleep(.1)
        return df

#### 得到s&p500每只股票的SEC文本链接

In [None]:
df_list = [] #生成一个列表装所有股票的SEC数据
company_list = cik_df['CIK'].to_dict()#生成一个ticker-cik的字典
for (ticker,cik) in tqdm(company_list.items()):
    df_list.append(get_doc_links(cik,ticker))#每个股票返回一个data frame
doc_links_df = pd.concat(df_list,axis=0)#将所有df纵向合并。axis=1为横向合并
#先将ticker设为索引，用以合并行业变量
#恢复整体索引，消除设置的index
doc_links_df = doc_links_df.set_index("ticker").join(cik_df['GICS Sector']).join(cik_df['GICS Sub Industry']).reset_index().rename(columns={"index":"ticker"})
doc_links_df.head()

In [83]:
len(doc_links_df)

19876

In [84]:
doc_links_df.to_pickle('Pickles\doc_links_df.pkl')

#### 从500中得到前100股票的文本链接

In [3]:
top100 = pd.read_csv('top100.csv')#导入前100股票的cik

In [5]:
all_links = pd.read_pickle('Pickles\doc_links_df.pkl')#导入前500股票所有的文本链接

In [11]:
all_links.head()

Unnamed: 0,ticker,cik,txt_link,doc_name,GICS Sector,GICS Sub Industry
0,A,1090872.0,https://www.sec.gov/Archives/edgar/data/109087...,0001564590-19-009011.txt,Health Care,Health Care
1,A,1090872.0,https://www.sec.gov/Archives/edgar/data/109087...,0001193805-19-000279.txt,Health Care,Health Care
2,A,1090872.0,https://www.sec.gov/Archives/edgar/data/109087...,0001090872-19-000004.txt,Health Care,Health Care
3,A,1090872.0,https://www.sec.gov/Archives/edgar/data/109087...,0001090872-19-000002.txt,Health Care,Health Care
4,A,1090872.0,https://www.sec.gov/Archives/edgar/data/109087...,0001564590-19-000714.txt,Health Care,Health Care


In [12]:
all_links = all_links.set_index('ticker')

In [13]:
all_links.index

Index(['A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A',
       ...
       'ZTS', 'ZTS', 'ZTS', 'ZTS', 'ZTS', 'ZTS', 'ZTS', 'ZTS', 'ZTS', 'ZTS'],
      dtype='object', name='ticker', length=19876)

In [17]:
top100_links =[]#得到前100的股票数据
for ticker in top100['ticker']:
    top100_links.append(all_links.xs(ticker))

In [19]:
df = pd.concat(top100_links)
df.head()

Unnamed: 0_level_0,cik,txt_link,doc_name,GICS Sector,GICS Sub Industry
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MSFT,789019.0,https://www.sec.gov/Archives/edgar/data/789019...,0001193125-19-022553.txt,Information Technology,Information Technology
MSFT,789019.0,https://www.sec.gov/Archives/edgar/data/789019...,0001193125-18-337951.txt,Information Technology,Information Technology
MSFT,789019.0,https://www.sec.gov/Archives/edgar/data/789019...,0001193125-18-306365.txt,Information Technology,Information Technology
MSFT,789019.0,https://www.sec.gov/Archives/edgar/data/789019...,0001193125-18-277602.txt,Information Technology,Information Technology
MSFT,789019.0,https://www.sec.gov/Archives/edgar/data/789019...,0001193125-18-221458.txt,Information Technology,Information Technology


In [20]:
len(df)

3924

In [23]:
df.to_csv('top100_links.csv')

#### 定义一系列函数从链接中爬取文本信息——文本，发布日，项目

定义函数：从文本链接中得到文本内容和发布日期

In [2]:
    def extract_text(link):
        try:
            r = requests.get(link,timeout=(5,5)) #获取网页,#加上超时参数，对付不响应
            #Parse 8-K document，用beautifulsoup解析
            filing = BeautifulSoup(r.content,"html5lib",from_encoding="ascii")
            #Extract datetime
            try:
                submission_dt = filing.find("acceptance-datetime").string[:14] #获取年月日时分秒
            except AttributeError:
                    # Flag docs with missing data as May 1 2019 10AM
                submission_dt = "20190501100000"
            #转换时间格式，原来就是一串数字，转换后可以识别年月日时分秒
            submission_dt = datetime.datetime.strptime(submission_dt,"%Y%m%d%H%M%S")
            #Extract HTML sections,去除表格，得到简单的文件形式
            for section in filing.findAll("html"):
                try:
                    #Remove tables
                    for table in section("table"):
                        table.decompose()
                    #Convert to unicode
                    section = unicodedata.normalize("NFKD",section.text)
                    section = section.replace("\t"," ").replace("\n"," ").replace("/s"," ").replace("\'","'")            
                except AttributeError:
                    section = str(section.encode('utf-8'))
            filing = "".join((section))
        except requests.exceptions.ConnectionError:
                sleep(10)
        sleep(.1)

        return filing, submission_dt


定义函数：从文本中抓取项目

In [3]:
    def extract_item_no(document):
        pattern = re.compile("Item+ +\d+[\:,\.]+\d+\d")#生成一个正则pattern方便多次使用
        item_list = re.findall(pattern,document)
        return item_list

定义函数：从文本中得到文件名

In [4]:
def get_doc_name(document):
    return document[:24]

#### 开始抓取文本

为了防止死机，分块抓取：  
爬取1-400完成  
400-800完成  
800-1200完成  
1200-1500完成  
1500-2000完成  
2000-2500完成  
2500-  
所有数据爬取完成

In [5]:
df = pd.read_csv('top100_links.csv')#导入所有链接

In [46]:
df3000_4000 = df.iloc[3001:]#取出一部分链接

In [47]:
df3000_4000.head()

Unnamed: 0,ticker,cik,txt_link,doc_name,GICS Sector,GICS Sub Industry
3001,GS,886982.0,https://www.sec.gov/Archives/edgar/data/886982...,0001193125-17-178841.txt,Financials,Financials
3002,GS,886982.0,https://www.sec.gov/Archives/edgar/data/886982...,0001193125-17-148346.txt,Financials,Financials
3003,GS,886982.0,https://www.sec.gov/Archives/edgar/data/886982...,0001193125-17-139624.txt,Financials,Financials
3004,BMY,14272.0,https://www.sec.gov/Archives/edgar/data/14272/...,0001140361-19-005591.txt,Health Care,Health Care
3005,BMY,14272.0,https://www.sec.gov/Archives/edgar/data/14272/...,0001140361-19-004566.txt,Health Care,Health Care


In [48]:
txt_date = []#进行抓取，如果网页解析失败则放弃该文件print failed
for link in tqdm(df3000_4000['txt_link']):
    try:
        txt_date.append(extract_text(link))
    except:
        print(link,'failed')       
gc.collect()        

  1%|▌                                                                               | 7/923 [00:22<1:08:11,  4.47s/it]

https://www.sec.gov/Archives/edgar/data/14272/000114036119001470/0001140361-19-001470.txt failed


 54%|██████████████████████████████████████████▉                                     | 496/923 [42:54<51:34,  7.25s/it]

https://www.sec.gov/Archives/edgar/data/713676/000071367618000059/0000713676-18-000059.txt failed


 55%|███████████████████████████████████████████▎                                  | 512/923 [50:11<4:15:15, 37.26s/it]

https://www.sec.gov/Archives/edgar/data/713676/000119312517283876/0001193125-17-283876.txt failed


 56%|███████████████████████████████████████████▍                                  | 514/923 [51:15<3:35:49, 31.66s/it]

https://www.sec.gov/Archives/edgar/data/713676/000119312517228257/0001193125-17-228257.txt failed


 56%|███████████████████████████████████████████▊                                  | 518/923 [51:30<1:04:26,  9.55s/it]

https://www.sec.gov/Archives/edgar/data/713676/000119312517155982/0001193125-17-155982.txt failed


 56%|███████████████████████████████████████████▉                                  | 520/923 [52:07<1:23:50, 12.48s/it]

https://www.sec.gov/Archives/edgar/data/713676/000119312517136399/0001193125-17-136399.txt failed


 56%|████████████████████████████████████████████                                  | 521/923 [52:21<1:26:05, 12.85s/it]

https://www.sec.gov/Archives/edgar/data/713676/000119312517122738/0001193125-17-122738.txt failed


 57%|████████████████████████████████████████████                                  | 522/923 [52:44<1:46:19, 15.91s/it]

https://www.sec.gov/Archives/edgar/data/713676/000119312517122062/0001193125-17-122062.txt failed


 71%|███████████████████████████████████████████████████████                       | 652/923 [1:03:59<11:34,  2.56s/it]

https://www.sec.gov/Archives/edgar/data/885725/000110465918069709/0001104659-18-069709.txt failed


 78%|████████████████████████████████████████████████████████████▊                 | 719/923 [1:34:34<31:05,  9.14s/it]

https://www.sec.gov/Archives/edgar/data/92122/000009212217000040/0000092122-17-000040.txt failed


 83%|███████████████████████████████████████████████████████████████▎            | 769/923 [1:45:51<1:22:11, 32.03s/it]

https://www.sec.gov/Archives/edgar/data/1364742/000156459018008028/0001564590-18-008028.txt failed


 84%|███████████████████████████████████████████████████████████████▉            | 776/923 [1:49:04<1:19:12, 32.33s/it]

https://www.sec.gov/Archives/edgar/data/1364742/000095012317006151/0000950123-17-006151.txt failed


100%|██████████████████████████████████████████████████████████████████████████████| 923/923 [2:00:06<00:00,  2.84s/it]


40523

In [49]:
txt_date = DataFrame(txt_date)#先转化为df

In [50]:
txt_date = txt_date.rename(columns={0:'text',1:'release_date'})#将两列数据命名

In [51]:
txt_date.head()

Unnamed: 0,text,release_date
0,0001193125-17-178841.txt : 20170523 0001193125...,2017-05-23 08:06:03
1,0001193125-17-148346.txt : 20170428 0001193125...,2017-04-28 16:31:59
2,0001193125-17-139624.txt : 20170426 0001193125...,2017-04-26 16:40:02
3,0001140361-19-005591.txt : 20190326 0001140361...,2019-03-26 07:05:07
4,0001140361-19-004566.txt : 20190308 0001140361...,2019-03-08 07:05:56


In [52]:
len(txt_date) #检查本次提取了多少个文件

911

提取文件名

In [53]:
txt_date['doc_name'] = txt_date['text'].apply(get_doc_name) #增加一列文件名方便合并

提取文件项目

In [54]:
txt_date['items'] = txt_date['text'].map(extract_item_no) #提取items

In [55]:
txt_date.head() #检查两个新变量

Unnamed: 0,text,release_date,doc_name,items
0,0001193125-17-178841.txt : 20170523 0001193125...,2017-05-23 08:06:03,0001193125-17-178841.txt,[Item 8.01]
1,0001193125-17-148346.txt : 20170428 0001193125...,2017-04-28 16:31:59,0001193125-17-148346.txt,[Item 5.07]
2,0001193125-17-139624.txt : 20170426 0001193125...,2017-04-26 16:40:02,0001193125-17-139624.txt,[]
3,0001140361-19-005591.txt : 20190326 0001140361...,2019-03-26 07:05:07,0001140361-19-005591.txt,[]
4,0001140361-19-004566.txt : 20190308 0001140361...,2019-03-08 07:05:56,0001140361-19-004566.txt,[Item 9.01]


将文件数据和原数据合并

In [56]:
df3000_4000 = pd.merge(txt_date,df3000_4000) #合并

In [57]:
df3000_4000.head()

Unnamed: 0,text,release_date,doc_name,items,ticker,cik,txt_link,GICS Sector,GICS Sub Industry
0,0001193125-17-178841.txt : 20170523 0001193125...,2017-05-23 08:06:03,0001193125-17-178841.txt,[Item 8.01],GS,886982.0,https://www.sec.gov/Archives/edgar/data/886982...,Financials,Financials
1,0001193125-17-148346.txt : 20170428 0001193125...,2017-04-28 16:31:59,0001193125-17-148346.txt,[Item 5.07],GS,886982.0,https://www.sec.gov/Archives/edgar/data/886982...,Financials,Financials
2,0001193125-17-139624.txt : 20170426 0001193125...,2017-04-26 16:40:02,0001193125-17-139624.txt,[],GS,886982.0,https://www.sec.gov/Archives/edgar/data/886982...,Financials,Financials
3,0001140361-19-005591.txt : 20190326 0001140361...,2019-03-26 07:05:07,0001140361-19-005591.txt,[],BMY,14272.0,https://www.sec.gov/Archives/edgar/data/14272/...,Health Care,Health Care
4,0001140361-19-004566.txt : 20190308 0001140361...,2019-03-08 07:05:56,0001140361-19-004566.txt,[Item 9.01],BMY,14272.0,https://www.sec.gov/Archives/edgar/data/14272/...,Health Care,Health Care


In [58]:
df3000_4000.to_csv('Data\df3000_4000.csv') #保存

#### 将所有分次爬取的数据合并成一个表格

In [60]:
df1 = pd.read_csv('Data\df1_400.csv')
df2 = pd.read_csv('Data\df400_799.csv')
df3 = pd.read_csv('Data\df800_1200.csv')
df4 = pd.read_csv('Data\df1200_1500.csv')
df5 = pd.read_csv('Data\df1500_2000.csv')
df6 = pd.read_csv('Data\df2000_2500.csv')
df7 = pd.read_csv('Data\df2500_3000.csv')
df8 = pd.read_csv('Data\df3000_4000.csv')

In [61]:
df = pd.concat([df1,df2,df3,df4,df5,df6,df7,df8],axis=0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [3]:
len(df)

3845

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,GICS Sector,GICS Sub Industry,cik,doc_name,items,release_date,text,ticker,txt_link
0,0,Information Technology,Information Technology,789019.0,0001193125-19-022553.txt,"['Item 2.02', 'Item 9.01']",2019-01-30 16:03:36,0001193125-19-022553.txt : 20190130 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
1,1,Information Technology,Information Technology,789019.0,0001193125-18-337951.txt,['Item 5.07'],2018-11-29 15:29:57,0001193125-18-337951.txt : 20181129 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
2,2,Information Technology,Information Technology,789019.0,0001193125-18-306365.txt,"['Item 2.02', 'Item 9.01']",2018-10-24 16:03:06,0001193125-18-306365.txt : 20181024 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
3,3,Information Technology,Information Technology,789019.0,0001193125-18-277602.txt,"['Item 5.02', 'Item 9.01']",2018-09-19 16:15:58,0001193125-18-277602.txt : 20180919 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
4,4,Information Technology,Information Technology,789019.0,0001193125-18-221458.txt,"['Item 2.02', 'Item 9.01']",2018-07-19 16:02:35,0001193125-18-221458.txt : 20180719 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...


#### 进行一些预处理

In [5]:
df = df.drop('Unnamed: 0',axis=1)
df.head()

Unnamed: 0,GICS Sector,GICS Sub Industry,cik,doc_name,items,release_date,text,ticker,txt_link
0,Information Technology,Information Technology,789019.0,0001193125-19-022553.txt,"['Item 2.02', 'Item 9.01']",2019-01-30 16:03:36,0001193125-19-022553.txt : 20190130 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
1,Information Technology,Information Technology,789019.0,0001193125-18-337951.txt,['Item 5.07'],2018-11-29 15:29:57,0001193125-18-337951.txt : 20181129 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
2,Information Technology,Information Technology,789019.0,0001193125-18-306365.txt,"['Item 2.02', 'Item 9.01']",2018-10-24 16:03:06,0001193125-18-306365.txt : 20181024 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
3,Information Technology,Information Technology,789019.0,0001193125-18-277602.txt,"['Item 5.02', 'Item 9.01']",2018-09-19 16:15:58,0001193125-18-277602.txt : 20180919 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
4,Information Technology,Information Technology,789019.0,0001193125-18-221458.txt,"['Item 2.02', 'Item 9.01']",2018-07-19 16:02:35,0001193125-18-221458.txt : 20180719 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...


In [6]:
gc.collect()

0

In [7]:
df['items'] = df['items'].map(lambda x: ast.literal_eval(x)) #将items转化成安全格式

In [85]:
df = df.reset_index() #重新编排索引
df.index

RangeIndex(start=0, stop=3873, step=1)

In [9]:
def str_to_time(time):#写个函数将string的时间变成时间格式
    return datetime.datetime.strptime(time,"%Y-%m-%d %H:%M:%S")

In [10]:
df['release_date'] = df['release_date'].apply(str_to_time)
df.head()

Unnamed: 0,GICS Sector,GICS Sub Industry,cik,doc_name,items,release_date,text,ticker,txt_link
0,Information Technology,Information Technology,789019.0,0001193125-19-022553.txt,"[Item 2.02, Item 9.01]",2019-01-30 16:03:36,0001193125-19-022553.txt : 20190130 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
1,Information Technology,Information Technology,789019.0,0001193125-18-337951.txt,[Item 5.07],2018-11-29 15:29:57,0001193125-18-337951.txt : 20181129 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
2,Information Technology,Information Technology,789019.0,0001193125-18-306365.txt,"[Item 2.02, Item 9.01]",2018-10-24 16:03:06,0001193125-18-306365.txt : 20181024 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
3,Information Technology,Information Technology,789019.0,0001193125-18-277602.txt,"[Item 5.02, Item 9.01]",2018-09-19 16:15:58,0001193125-18-277602.txt : 20180919 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
4,Information Technology,Information Technology,789019.0,0001193125-18-221458.txt,"[Item 2.02, Item 9.01]",2018-07-19 16:02:35,0001193125-18-221458.txt : 20180719 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...


In [97]:

#删除没有日期的数据（布尔值）
df = df.loc[~(df['release_date'] >= pd.datetime(year=2019,month=5,day=1))]
df = df.drop_duplicates(subset="doc_name")#drop掉文件名重复的数据


再次检查index是否正确

In [11]:
df.index

RangeIndex(start=0, stop=3845, step=1)

In [100]:
df = df.drop('index',axis=1)

In [101]:
df.head()

Unnamed: 0,GICS Sector,GICS Sub Industry,cik,doc_name,items,release_date,text,ticker,txt_link
0,Information Technology,Information Technology,789019.0,0001193125-19-022553.txt,"[Item 2.02, Item 9.01]",2019-01-30 16:03:36,0001193125-19-022553.txt : 20190130 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
1,Information Technology,Information Technology,789019.0,0001193125-18-337951.txt,[Item 5.07],2018-11-29 15:29:57,0001193125-18-337951.txt : 20181129 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
2,Information Technology,Information Technology,789019.0,0001193125-18-306365.txt,"[Item 2.02, Item 9.01]",2018-10-24 16:03:06,0001193125-18-306365.txt : 20181024 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
3,Information Technology,Information Technology,789019.0,0001193125-18-277602.txt,"[Item 5.02, Item 9.01]",2018-09-19 16:15:58,0001193125-18-277602.txt : 20180919 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...
4,Information Technology,Information Technology,789019.0,0001193125-18-221458.txt,"[Item 2.02, Item 9.01]",2018-07-19 16:02:35,0001193125-18-221458.txt : 20180719 0001193125...,MSFT,https://www.sec.gov/Archives/edgar/data/789019...


#### 文本数据爬取完成

In [12]:
df.to_pickle('Pickles\\sec_df.pkl')