In [13]:
import requests
import os
from bs4 import BeautifulSoup
import errno
import datetime
import argparse
import time
import re
from tqdm import trange

from nltk import word_tokenize
from nltk.corpus import stopwords

In [14]:
class EDGARQueryError(Exception):
    """
    Raised when an EDGAR query receives a response that is not a 200 response.

    Attributes:
        response: the value handed to the constructor (the crawler passes the
            HTTP status code); it is interpolated into the error message.
    """

    def __init__(self, response):
        # Initialise the base class as well so ``args`` is populated the
        # conventional way.
        super().__init__(response)
        self.response = response

    def __str__(self):
        # Spelling of "occurred" corrected in the user-facing message.
        return "An error occurred while making the query. Received {response} response".format(
            response=self.response
        )

In [15]:
class EDGARFieldError(Exception):
    """
    Raised when an endpoint is given a field it does not know.

    Attributes:
        endpoint: the endpoint that was queried.
        field: the field name that was rejected.
    """

    def __init__(self, endpoint, field):
        self.field = field
        self.endpoint = endpoint

    def __str__(self):
        template = "Field {field} not found in endpoint {endpoint}"
        return template.format(endpoint=self.endpoint, field=self.field)

In [16]:
class CIKError(Exception):
    """
    Raised when a CIK fails validation.

    Attributes:
        cik: the offending value, echoed back in the error message.
    """

    def __init__(self, cik):
        self.cik = cik

    def __str__(self):
        message = "CIK {cik} is not valid. Must be str or int with 10 digits."
        return message.format(cik=self.cik)

In [17]:
# Root directory for downloaded filings. The SEC_DATA_PATH environment
# variable overrides the location; when it is unset the original hard-coded
# directory is used, so existing runs behave exactly as before.
default_data_path = os.path.abspath(
    os.environ.get(
        "SEC_DATA_PATH",
        "/Users/anan_mac/Projects/SEC-risks/SEC-company-data/test",
    )
)

In [18]:
# with open("sample.txt", "r") as f:
#     sample = f.readlines()
#     compile = re.findall(r"(?<=TABLE OF CONTENTS)(.*)(?=PART I)",sample[0])
#     print(compile)


In [23]:
def clean_data(data):
    """
    Extract and tidy the text found between "Item 1A." and "Item 1B.".

    Processing steps, in order:
      1. Lowercase the whole input.
      2. Keep only the text between every "item 1a." and the next
         "item 1b." (several matches are joined with a single space).
      3. Turn the apostrophe references ``&#8217;`` and ``&#39;`` into ``'``.
      4. Replace every remaining numeric character reference (``&#...;`` or
         ``&#...`` without the trailing semicolon) with a space.
      5. Replace HTML tags with a space.
      6. Collapse all runs of whitespace into single spaces.

    The former "TABLE OF CONTENTS ... PART I" substitution is not performed:
    it ran after lowercasing with an upper-case pattern, so it could never
    match and had no effect.

    Args:
        data (str): raw text of a filing.

    Returns:
        str: the cleaned section text, or an empty string when no
        "item 1a." ... "item 1b." span is present.
    """
    text = data.lower()

    # re.DOTALL lets the section span line breaks; without it "." stops at
    # every newline and multi-line filings never match.
    sections = re.findall(r"item 1a\.(.+?)item 1b\.", text, flags=re.DOTALL)
    text = " ".join(sections)

    # Apostrophes are preserved as a plain quote before the generic
    # character-reference removal below would blank them out.
    text = text.replace('&#8217;', "'").replace('&#39;', "'")

    # Numeric character references, first with and then without the closing
    # semicolon. Named entities such as "&nbsp;" contain no "#" and are left
    # untouched.
    text = re.sub(r"&#\w+;", ' ', text)
    text = re.sub(r"&#\w+", ' ', text)

    # HTML tags. "[^>]*" matches the same spans as the previous
    # "(.|\s)*?" form (including tags broken across lines) but cannot
    # backtrack exponentially on an unclosed "<" followed by whitespace.
    text = re.sub(r"<[^>]*>", ' ', text)

    # Numbers are intentionally kept; only whitespace is normalised.
    return " ".join(text.split())

In [24]:
class SecCrawler(object):
    """
    Downloads EDGAR filings for a company into a local directory.

    Every document is written to ``<data_path>/<company_code>/<doc_name>``,
    where ``doc_name`` is the last path component of the document URL.
    """

    # Seconds to wait on any single HTTP request before giving up, so that a
    # stalled connection cannot hang the run forever.
    REQUEST_TIMEOUT = 30

    def __init__(self, data_path=default_data_path):
        """
        Args:
            data_path (str): root directory under which filings are stored.
        """
        self.data_path = data_path
        print("Directory where reports are stored:  " + self.data_path)

    def __repr__(self):
        return "SecCrawler(data_path={0!r})".format(self.data_path)

    def __str__(self):
        return "SecCrawler(data_path={0})".format(self.data_path)

    def _make_directory(self, company_code, cik, priorto, filing_type):
        """
        Create ``<data_path>/<company_code>`` if needed and return its path.

        ``cik``, ``priorto`` and ``filing_type`` are accepted for signature
        compatibility but do not influence the path.
        """
        path = os.path.join(self.data_path, company_code)
        # exist_ok makes the call safe when the directory is already there.
        os.makedirs(path, exist_ok=True)
        return path

    def _save_in_directory(self, company_code, cik, priorto, filing_type, docs):
        """
        Download every ``(url, doc_name)`` pair in ``docs`` and write it to
        ``<data_path>/<company_code>/<doc_name>``.

        A document whose download does not return status 200 is reported and
        skipped, so an error page is never stored as a filing. Files are
        overwritten rather than appended to, so re-running the crawl does not
        duplicate their content.
        """
        for (url, doc_name) in docs:
            r = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            if r.status_code != 200:
                print("Skipping {url}: received {status} response".format(
                    url=url, status=r.status_code))
                continue

            path = os.path.join(self.data_path, company_code, doc_name)
            with open(path, "wb") as f:
                # Non-ASCII characters are dropped when writing.
                f.write(r.text.encode('ascii', 'ignore'))

    @staticmethod
    def _create_document_list(data):
        """
        Build the list of ``(txt_url, doc_name)`` pairs from an EDGAR listing.

        Args:
            data (str): the XML text returned by the company query.

        Returns:
            list[tuple[str, str]]: for every ``filinghref`` element, the URL
            obtained by cutting the link at its last "-" and appending
            ".txt", paired with that URL's final path component.
        """
        # An explicit parser is required by BeautifulSoup.
        soup = BeautifulSoup(data, features='html.parser')
        # Elements without a plain string value are ignored.
        link_list = [link.string for link in soup.find_all('filinghref')
                     if link.string]

        print("Number of files to download: {0}".format(len(link_list)))
        print("Starting download...")

        txt_urls = [link[:link.rfind("-")] + ".txt" for link in link_list]
        doc_names = [url.split("/")[-1] for url in txt_urls]

        return list(zip(txt_urls, doc_names))

    @staticmethod
    def _sanitize_date(date):
        """
        Normalise a date to the ``YYYYMMDD`` string used in the query.

        Args:
            date: ``datetime.date``/``datetime.datetime``, an 8-digit string,
                an 8-digit int, or ``None`` (meaning "no date filter").

        Returns:
            str or None: the ``YYYYMMDD`` string, or ``None`` when ``date``
            is ``None``.

        Raises:
            TypeError: when the value is not in one of the accepted forms.
        """
        if date is None:
            return None
        # datetime.datetime is a subclass of datetime.date, so both pass.
        if isinstance(date, datetime.date):
            return date.strftime("%Y%m%d")
        if isinstance(date, str):
            if len(date) != 8 or not date.isdigit():
                raise TypeError('Date must be of the form YYYYMMDD')
            return date
        if isinstance(date, int):
            # Exactly eight digits: 10_000_000 .. 99_999_999.
            if not 10**7 <= date < 10**8:
                raise TypeError('Date must be of the form YYYYMMDD')
            return str(date)
        raise TypeError('Date must be of the form YYYYMMDD')

    @staticmethod
    def _check_cik(cik):
        """
        Validate a CIK and return it unchanged.

        A valid CIK is either a string of exactly 10 digits or an int with
        exactly 10 digits.

        Raises:
            CIKError: when ``cik`` is neither of those.
        """
        valid_str = isinstance(cik, str) and len(cik) == 10 and cik.isdigit()
        valid_int = isinstance(cik, int) and 10**9 <= cik < 10**10
        if not (valid_str or valid_int):
            raise CIKError(cik)
        return cik

    def _fetch_report(self, company_code, cik, priorto, count, filing_type):
        """
        Query EDGAR for a company's filings of one type and download them.

        Args:
            company_code (str): name of the output sub-directory.
            cik: company CIK, validated by ``_check_cik``.
            priorto: cut-off date, normalised by ``_sanitize_date`` and sent
                as the ``dateb`` query parameter.
            count: value sent as the ``count`` query parameter.
            filing_type (str): value sent as the ``type`` query parameter.

        Returns:
            str: the directory the documents were written to.

        Raises:
            EDGARQueryError: when the listing query does not return 200.
        """
        priorto = self._sanitize_date(priorto)
        cik = self._check_cik(cik)
        directory = self._make_directory(company_code, cik, priorto, filing_type)

        # NOTE(review): EDGAR may reject requests that lack a descriptive
        # User-Agent header -- confirm against the current SEC access policy.
        base_url = "https://www.sec.gov/cgi-bin/browse-edgar"
        params = {'action': 'getcompany', 'owner': 'exclude', 'output': 'xml',
                  'CIK': cik, 'type': filing_type, 'dateb': priorto, 'count': count}
        print("started {filing_type} {company_code}".format(
            filing_type=filing_type, company_code=company_code))

        r = requests.get(base_url, params=params, timeout=self.REQUEST_TIMEOUT)
        if r.status_code != 200:
            raise EDGARQueryError(r.status_code)

        docs = self._create_document_list(r.text)
        try:
            self._save_in_directory(
                company_code, cik, priorto, filing_type, docs)
        except Exception as e:
            # Best effort: a failure while saving is reported, not raised.
            print(str(e))
        return directory

    def filing_10Q(self, company_code, cik, priorto, count):
        """Download 10-Q filings; returns the output directory."""
        return self._fetch_report(company_code, cik, priorto, count, '10-Q')

    def filing_10K(self, company_code, cik, priorto, count):
        """Download 10-K filings; returns the output directory."""
        return self._fetch_report(company_code, cik, priorto, count, '10-K')

    def filing_8K(self, company_code, cik, priorto, count):
        """Download 8-K filings; returns the output directory."""
        return self._fetch_report(company_code, cik, priorto, count, '8-K')

    def filing_13F(self, company_code, cik, priorto, count):
        """Download filings of type '13-F'; returns the output directory."""
        return self._fetch_report(company_code, cik, priorto, count, '13-F')

    def filing_SD(self, company_code, cik, priorto, count):
        """Download SD filings; returns the output directory."""
        return self._fetch_report(company_code, cik, priorto, count, 'SD')

    def filing_4(self, company_code, cik, priorto, count):
        """Download filings of type '4'; returns the output directory."""
        return self._fetch_report(company_code, cik, priorto, count, '4')
            

In [25]:
def get_filings(a,b,c,d):
    """
    Download the 10-K filings of one company and print the elapsed time.

    Args:
        a: company code (used by the crawler as the output sub-directory).
        b: CIK of the company.
        c: date handed to the crawler as its ``priorto`` argument.
        d: number of filings requested from the crawler.
    """
    started = time.time()

    # A fresh crawler with the default data path; constructing it prints the
    # directory the reports are stored in.
    crawler = SecCrawler()
    crawler.filing_10K(a, b, c, d)

    print("Successfully downloaded all the files")

    # Wall-clock duration of the whole download.
    elapsed = time.time() - started
    print("Total time taken: {0}".format(elapsed))

In [27]:
if __name__ == '__main__':

    # Companies to download; companyCode[k] belongs with cik[k].
    companyCode = ['AAPL']
    cik = ['0000320193']

    # Alternative input: load the company list from sp500.txt.
    # with open('sp500.txt') as f:
    #     df = pd.read_csv(f,sep=',')
    #     df["CIK"] = df.CIK.map("{:010}".format)
    #     for index, row in df.iterrows():
    #         companyCode.append(row['Name'])
    #         cik.append(row['CIK'])

    # Alternative input: load the company list from cik_ticker_database.csv.
    #    with open('cik_ticker_database.csv') as f:
    #        df = pd.read_csv(f, sep='|')
    #        df["CIK"] = df.CIK.map("{:010}".format)
    #        for index, row in df.iterrows():
    #            companyCode.append(row['Name'])
    #            cik.append(row['CIK'])

    # Alternative input: a fixed list of twenty companies and their CIKs.
    # companyCode = ['Norfolk Southern','Southwest Airlines','International Paper'
    #               ,'PG&E','Freeport-McMoRan','Bristol-Myers Squibb'
    #               ,'Texas Instruments','Las Vegas Sands','Las Vegas Sands b'
    #               ,'Abbott Laboratories','Marriott International','Biogen'
    #               ,'Monsanto','Andeavor','AmerisourceBergen','Applied Materials'
    #               ,'General Motors','Cisco Systems','TJX Cos'
    #               ,'American International Group']
    # cik = ['0000702165','0000092380','0000051434','0001004980','0000831259'
    #       ,'0000014272','0000097476','0001300514','0000850994' ,'0001441848'
    #       ,'0001048286','0000875045','0001110783','0000050104','0001140859'
    #       ,'0000006951','0000040730','0000858877','0000109198','0000005272']

    date = '20201231'       # date passed to the crawler, in YYYYMMDD form
    count = 25            # number of filings requested per company

    # Fetch the filings of each company in turn.
    for code, company_cik in zip(companyCode, cik):
        get_filings(code, company_cik, date, count)

Directory where reports are stored:  /Users/anan_mac/Projects/SEC-risks/SEC-company-data/test
started 10-K AAPL
Number of files to download: 20
Starting download...
Successfully downloaded all the files
Total time taken: 8.658684253692627
