scrape.py
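"""Scrape the SEC EDGAR monthly XBRL index feeds.

Each monthly index is downloaded over FTP, every filing's metadata is
upserted into the database, and the full submission text plus the
individual XBRL files are ingested into the document collection for
filings in the selected SIC codes.
"""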
import os
import ftplib

from scrapekit import Scraper
from lxml import etree

from common import collection, engine
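# SEC EDGAR FTP location of the monthly XBRL index feeds, the namespace
# used in the feed XML, and the SIC industry codes whose filings are
# ingested in full.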
HOST = 'ftp.sec.gov'
BASE_DIR = 'edgar/monthly'
EDGNS = '{http://www.sec.gov/Archives/edgar}'
SICS = [1311, 1381, 1382, 1389, 2911, 2990, 3532,
        3533, 5171, 5172, 6792, None]

scraper = Scraper('sec-edgar')


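# Download every monthly index feed from the EDGAR FTP server, skipping
# files already on disk, and queue each one for parsing.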
@scraper.task
def monthly_indexes():
    ftp = ftplib.FTP(HOST)
    ftp.login('anonymous', '@anonymous')
    ftp.cwd(BASE_DIR)
    for file_name in ftp.nlst():
        path = os.path.join(scraper.config.data_path, file_name)
        if not os.path.exists(path):
            with open(path, 'wb') as fh:
                ftp.retrbinary("RETR " + file_name, fh.write)
        parse_feed.queue(path)
    ftp.quit()


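# Parse one index feed: upsert each filing's metadata, then ingest the
# full submission text and the individual XBRL files for filings whose
# SIC code is of interest (or that have no SIC code assigned).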
@scraper.task
def parse_feed(file_name):
    doc = etree.parse(file_name)
    for item in doc.findall('.//item'):
        data = {}
        for c in item.iterchildren():
            if EDGNS in c.tag:
                continue
            if c.tag == 'enclosure':
                data[c.tag] = c.get('url')
            else:
                data[c.tag] = c.text

        for fc in item.findall(EDGNS + 'xbrlFiling/*'):
            tag = fc.tag.replace(EDGNS, '')
            if tag == 'xbrlFiles':
                continue
            if fc.text:
                data[tag] = fc.text

        if data.get('guid') is None:
            data['guid'] = data.get('link')

        scraper.log.info('Filing title: %s, %s', data.get('title'),
                         data.get('guid'))
        engine['filings'].upsert(data, ['guid'])

        if data.get('assignedSic') is not None and \
                int(data['assignedSic']) not in SICS:
            continue

        whole = data.copy()
        whole['url'] = data.get('link').replace('-index.htm', '.txt')
        whole['full'] = True
        collection.ingest(whole.get('url'), **whole)

        for fc in item.findall(EDGNS + 'xbrlFiling//' + EDGNS + 'xbrlFile'):
            file_data = data.copy()
            file_rec = {'guid': data.get('guid')}
            for k, v in fc.attrib.items():
                file_data[k.replace(EDGNS, 'xbrlfile_')] = v
                file_rec[k.replace(EDGNS, '')] = v
            scraper.log.info('XBRL Filing: %s', file_data.get('xbrlfile_url'))
            engine['files'].upsert(file_rec, ['guid', 'url'])
            collection.ingest(file_data.get('xbrlfile_url'), **file_data)


if __name__ == '__main__':
    monthly_indexes.run()