scrape.py
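"""Scrape the SEC EDGAR monthly XBRL index feeds.

Each monthly index is downloaded over FTP, every filing's metadata is
upserted into the database, and the full submission text plus the
individual XBRL files are ingested into the document collection for
filings in the selected SIC codes.
"""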
import os
import ftplib

from scrapekit import Scraper
from lxml import etree

from common import collection, engine
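# SEC EDGAR FTP location of the monthly XBRL index feeds, the namespace
# used in the feed XML, and the SIC industry codes whose filings are
# ingested in full.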
HOST = 'ftp.sec.gov'
BASE_DIR = 'edgar/monthly'
EDGNS = '{http://www.sec.gov/Archives/edgar}'
SICS = [1311, 1381, 1382, 1389, 2911, 2990, 3532,
        3533, 5171, 5172, 6792, None]

scraper = Scraper('sec-edgar')


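# Download every monthly index feed from the EDGAR FTP server, skipping
# files already on disk, and queue each one for parsing.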
@scraper.task
def monthly_indexes():
    ftp = ftplib.FTP(HOST)
    ftp.login('anonymous', '@anonymous')
    ftp.cwd(BASE_DIR)
    for file_name in ftp.nlst():
        path = os.path.join(scraper.config.data_path, file_name)
        if not os.path.exists(path):
            with open(path, 'wb') as fh:
                ftp.retrbinary("RETR " + file_name, fh.write)
        parse_feed.queue(path)
    ftp.quit()


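# Parse one index feed: upsert each filing's metadata, then ingest the
# full submission text and the individual XBRL files for filings whose
# SIC code is of interest (or that have no SIC code assigned).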
@scraper.task
def parse_feed(file_name):
    doc = etree.parse(file_name)
    for item in doc.findall('.//item'):
        data = {}
        for c in item.iterchildren():
            if EDGNS in c.tag:
                continue
            if c.tag == 'enclosure':
                data[c.tag] = c.get('url')
            else:
                data[c.tag] = c.text

        for fc in item.findall(EDGNS + 'xbrlFiling/*'):
            tag = fc.tag.replace(EDGNS, '')
            if tag == 'xbrlFiles':
                continue
            if fc.text:
                data[tag] = fc.text

        if data.get('guid') is None:
            data['guid'] = data.get('link')

        scraper.log.info('Filing title: %s, %s', data.get('title'),
                         data.get('guid'))
        engine['filings'].upsert(data, ['guid'])

        if data.get('assignedSic') is not None and \
                int(data['assignedSic']) not in SICS:
            continue

        whole = data.copy()
        whole['url'] = data.get('link').replace('-index.htm', '.txt')
        whole['full'] = True
        collection.ingest(whole.get('url'), **whole)

        for fc in item.findall(EDGNS + 'xbrlFiling//' + EDGNS + 'xbrlFile'):
            file_data = data.copy()
            file_rec = {'guid': data.get('guid')}
            for k, v in fc.attrib.items():
                file_data[k.replace(EDGNS, 'xbrlfile_')] = v
                file_rec[k.replace(EDGNS, '')] = v
            scraper.log.info('XBRL Filing: %s', file_data.get('xbrlfile_url'))
            engine['files'].upsert(file_rec, ['guid', 'url'])
            collection.ingest(file_data.get('xbrlfile_url'), **file_data)


if __name__ == '__main__':
    monthly_indexes.run()