# Homework exercise 1

__Name:__ Serhii Horbachov, 12026116


__SEC EDGAR__

How to access SEC filings: https://www.sec.gov/edgar/searchedgar/accessing-edgar-data.htm

Please write a function that

* downloads index files sorted by form type for a particular day or a list of days
* then downloads the _HTML versions_ of the filings made on that day (or each day in the list), with an optional argument that can specify the form type if you want to access only files of one such form type. Note that you can identify the file containing the main filing, which is the file to be downloaded, by considering the column 'Type' in the table, e.g., here: https://www.sec.gov/Archives/edgar/data/946644/0001493152-21-005524-index.htm

Please write another function that 
* downloads the HTML versions of all filings of form type 10-Q made on a given day
* removes all tables and images from the files if there are any
* returns a DataFrame in which the columns correspond to the different parts/items of the form and the content of each filing is written to one row of the DataFrame. "Item" is a technical term here, as you will see when looking at such filings, e.g., here: https://www.sec.gov/Archives/edgar/data/1530425/000147793221001290/arrt_10q.htm ; the items are numbered, and items with the same number that are contained in the same part of the filing always have the same name.

Please test your code for days comprising a total of at least 10 filings.

In [None]:
import requests
from bs4 import BeautifulSoup as bs
import lxml.etree as xml                 # a library for processing XML and HTML
from tqdm import tqdm                    # a progress bar library just for convenience to see how much time it's taking
import pandas as pd
import os                                # provides functions for interacting with the operating system
import csv                               # csv module implements classes to read and write tabular data in CSV format
import re

In [None]:
def make_df(filename):
    """Parse a downloaded form index file into a DataFrame.

    The file is expected to contain a header block, a rule made of dashes,
    and then one line per filing with the fields form type, company name,
    CIK, filing date and file name separated by runs of blanks
    (NOTE(review): this layout is assumed from the files this notebook
    downloads -- confirm against the EDGAR index documentation).

    Args:
        filename: Path of the index file to read.

    Returns:
        A DataFrame with the columns 'Form Type', 'Company Name', 'CIK',
        'Date Field' and 'File Name' (all values are strings). The label
        'Date Field' is kept as-is because other code may rely on it.
    """
    columns = ['Form Type', 'Company Name', 'CIK', 'Date Field', 'File Name']
    fallback_header_rows = 11   # used only when no dashed rule is present

    with open(filename, 'r') as f:
        lines = f.readlines()

    # Data begins right after the dashed rule that closes the header; this is
    # sturdier than trusting a fixed header length.
    start = fallback_header_rows
    for i, line in enumerate(lines):
        if line.startswith('-----'):
            start = i + 1
            break

    rows = []
    for line in lines[start:]:
        # The last three fields never contain blanks, so peel them off from
        # the right. This works with or without trailing whitespace.
        parts = line.rsplit(None, 3)
        if len(parts) < 4:
            continue                        # blank or truncated line
        head, cik, date_field, file_name = parts
        head = head.strip()

        # Form types may contain a single blank ("SC 13G/A"); the gap between
        # the form type and the company name is at least two blanks wide.
        form_type, gap, company = head.partition('  ')
        if not gap:
            # Only single blanks available: fall back to the first token.
            form_type, _, company = head.partition(' ')

        rows.append([
            form_type,
            ' '.join(company.split()),      # collapse runs of blanks
            cik,
            date_field,
            file_name,
        ])

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Testing validity of parsing:

# with open('form.20090116.idx', 'r') as f:
#     a = csv.reader(f , delimiter=' ', dialect='skip_space')
#     for  i, item in enumerate(a):
#         if i == 0:
#             print(item)

In [None]:
def create_needed_dirs(year, quarter, day):
    """Create the nested ``<year>/<quarter>/<day>`` directory tree.

    Each argument is converted with ``str`` and appended to the path of the
    previous level; every level is created in turn, and levels that already
    exist are left unaltered.
    """
    current = ''
    for level in (year, quarter, day):
        # Grow the path one level at a time: year, then quarter, then day.
        current = os.path.join(current, str(level))
        # exist_ok=True turns this into a no-op for existing directories.
        os.makedirs(current, exist_ok=True)

In [None]:
def get_idx_files_by_days(days=(), form_type=None, *, headers=None, timeout=60):
    """Download the daily form index and the files it lists for each day.

    For every day the index ``form.<day>.idx`` is fetched from the EDGAR
    daily-index tree and saved in the current working directory, parsed with
    ``make_df``, optionally filtered by form type, and every listed file is
    downloaded into ``<year>/<QTRn>/<dd>/``.

    Args:
        days: Iterable of day strings in ``YYYYMMDD`` form.
        form_type: If given, only index rows whose 'Form Type' equals this
            value exactly are downloaded; ``None`` downloads every row.
        headers: Optional HTTP headers sent with every request.
            NOTE(review): SEC's access guidance asks automated clients to
            declare a User-Agent -- confirm the current wording and pass e.g.
            ``{'User-Agent': 'Your Name you@example.com'}``.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        List of local paths of the downloaded files, in index order. A path
        appears once per index row that references it.

    Failures are reported with ``print`` and skipped: a missing or unreadable
    index skips the whole day, a failed filing download skips only that
    filing. A malformed day string still raises ``ValueError``.
    """
    base = "https://www.sec.gov/Archives/"
    # Errors that mean "this index is unusable": network, disk, or parsing.
    index_errors = (requests.RequestException, OSError, ValueError,
                    IndexError, csv.Error)

    idx_files = []                                         # paths of downloaded files

    for day in tqdm(days):                                 # tqdm only draws a progress bar
        year = int(day[:4])
        month = int(day[4:6])

        if month < 4:
            quarter = 'QTR1'
        elif month < 7:
            quarter = 'QTR2'
        elif month < 10:
            quarter = 'QTR3'
        else:
            quarter = 'QTR4'

        file_name = 'form.{}.idx'.format(day)
        day_dir = os.path.join(str(year), str(quarter), str(day[-2:]))

        create_needed_dirs(year, quarter, day[-2:])

        url = "https://www.sec.gov/Archives/edgar/daily-index/{}/{}/{}".format(year, quarter, file_name)

        # --- index file --------------------------------------------------
        df = None
        try:
            r = requests.get(url, allow_redirects=False,
                             headers=headers, timeout=timeout)
            # Redirects are not followed, so anything but 200 (3xx, 403,
            # 404, ...) carries no index data and must not be parsed.
            if r.status_code == 200:
                with open(file_name, 'wb') as f:
                    f.write(r.content)
                df = make_df(file_name)
        except index_errors:
            df = None

        if df is None:
            print('Could not get index file for {}'.format(day))
            continue

        if form_type is None:
            wanted = df
        else:
            wanted = df[df['Form Type'] == form_type]

        # --- listed files ------------------------------------------------
        fetched = set()                                    # paths already downloaded for this day
        for remote_name in wanted['File Name']:
            path = os.path.join(day_dir, remote_name.split('/')[-1])

            # The same file name can occur on several index rows; download
            # it once but still report it once per row, as before.
            if path not in fetched:
                try:
                    r = requests.get(base + remote_name,
                                     headers=headers, timeout=timeout)
                    r.raise_for_status()
                    with open(path, 'wb') as f:
                        f.write(r.content)
                except (requests.RequestException, OSError):
                    print('Could not get filing {} for {}'.format(remote_name, day))
                    continue
                fetched.add(path)

            idx_files.append(path)

    return idx_files

In [None]:
# Smoke test: fetch the form index and every listed file (no form-type filter) for three days.
get_idx_files_by_days(["20080212", "20090220", "20090116"])  # 20900220

 33%|███▎      | 1/3 [06:01<12:03, 361.86s/it]

Could not get index file for 20080212


 67%|██████▋   | 2/3 [06:03<04:13, 253.75s/it]

Could not get index file for 20090220


100%|██████████| 3/3 [06:04<00:00, 121.49s/it]

Could not get index file for 20090116





In [None]:
# Uncomment to delete the downloaded '2008' folder and everything in it:

# import shutil
# shutil.rmtree('2008')

In [None]:
# Uncomment to check that a DataFrame can be built from a downloaded index file:

# df = make_df('form.20090220.idx')

- downloads the HTML versions of all filings of form type 10-Q made on a given day
- removes all tables and images from the files if there are any
- returns a DataFrame in which the columns correspond to the different parts/items of the form and the content of each filing is written to one row of the DataFrame. "Item" is a technical term here, as you will see when looking at such filings, e.g., here: https://www.sec.gov/Archives/edgar/data/1530425/000147793221001290/arrt_10q.htm ; the items are numbered, and items with the same number that are contained in the same part of the filing always have the same name.

In [None]:
# "Item", the item number, an optional one-letter suffix (the "A" in "1A"),
# any separating punctuation, then the title. The suffix must not be followed
# by another letter, so "5Other" is number 5 + title "Other", not suffix "O".
_ITEM_HEADING_RE = re.compile(
    r'^item\s*(\d+)([a-z](?![a-z]))?[\s.:\-\u2013\u2014]*(.*)$',
    re.IGNORECASE,
)
# Characters removed from titles so that spelling variants of the same
# heading ("Management's" / "Management\u2019s", stray periods) coincide.
_TITLE_DROP = str.maketrans('', '', ".'\u2019")


def format_item(txt):
    """Normalise an item heading to the form ``'item <no>. <title>'``.

    Whitespace (including non-breaking spaces and newlines) is collapsed,
    the item number is separated from the word "Item" and from the title,
    a one-letter suffix stays attached to the number, periods and
    apostrophes are removed from the title, and the result is lower-cased.

    Examples:
        'Item 5Other Information' -> 'item 5. other information'
        'Item1A. Risk Factors '   -> 'item 1a. risk factors'

    Args:
        txt: Raw heading text.

    Returns:
        The normalised heading. Text that does not start with "Item"
        followed by a number is returned whitespace-normalised and
        lower-cased instead of raising.
    """
    text = re.sub(r'\s+', ' ', txt.replace('\xa0', ' ')).strip()

    match = _ITEM_HEADING_RE.match(text)
    if match is None:
        return text.lower()

    number, suffix, title = match.groups()
    title = ' '.join(title.translate(_TITLE_DROP).split())

    # strip() drops the trailing blank when the heading has no title.
    return f"item {number}{suffix or ''}. {title}".strip().lower()


# assert(format_item('Item3. Quantitative and Qualitative Disclosures About Market Risk ') == \
    #    'Item 3. Quantitative and Qualitative Disclosures About Market Risk')
# assert(format_item('Item 5Other Information') == 'item 5. other information')

# assert(format_item('Item1A. Risk Factors ') == 'item 1a. risk factors')
# format_item('Item1A. Risk Factors ')
# Manual checks; the notebook displays only the value of the last expression.
format_item('Item 5Other Information')
format_item('Item 6Exhibits ')

'item 6. exhibits'

In [None]:
# with open('2008/QTR1/12/0000919465-08-000013.txt', 'r') as f:
#     cont = f.read()
# # cont

In [None]:
# A div counts as an item heading when its text starts with "Item" followed
# (after optional whitespace) by a digit.
_ITEM_START_RE = re.compile(r'Item\s*\d')


def _items_from_filing(data):
    """Return ``{normalised item heading: text}`` for one filing's markup."""
    soup = bs(data, 'lxml')

    # Drop tables and images. HTML images are <img>; <image> is kept in the
    # list because the previous version targeted it.
    for tag in soup.find_all(['table', 'img', 'image']):
        tag.extract()

    sections = {}
    current = None                      # heading the following divs belong to

    # NOTE(review): find_all('div') also returns nested divs, so text inside
    # nested divs may be collected more than once -- unchanged from before.
    for text in soup.find_all('text'):
        for div in text.find_all('div'):
            raw = div.text

            if _ITEM_START_RE.match(raw):
                current = format_item(raw)
                continue

            if current is None:
                continue                # content before the first heading

            if len(raw.replace('\n', '').replace(u'\xa0', '')) < 3:
                continue                # (nearly) empty div

            sections.setdefault(current, []).append(
                raw.replace(u'\xa0', ' ').replace('\n', ' '))

    return {heading: '\n'.join(parts) for heading, parts in sections.items()}


def func2(days=()):
    """Build a table of 10-Q items for the filings made on the given days.

    The files are obtained with ``get_idx_files_by_days(days, '10-Q')``;
    tables and images are removed from each one and the text following every
    item heading is collected under the normalised heading.

    Args:
        days: Iterable of day strings in ``YYYYMMDD`` form.

    Returns:
        A DataFrame with one row per file in which at least one item was
        found (indexed by the file's position in the download list) and one
        column per distinct item heading; items a filing lacks are NaN.
    """
    files = get_idx_files_by_days(days, '10-Q')

    rows = []
    index = []

    for n, file in enumerate(files):
        # Undecodable bytes are replaced rather than aborting the whole run.
        with open(file, 'r', errors='replace') as f:
            data = f.read()

        items = _items_from_filing(data)
        if items:                       # files without items were dropped anyway
            rows.append(items)
            index.append(n)

    # Build the frame in one go: DataFrame.append is gone in pandas 2.x and
    # was quadratic when called in a loop.
    return pd.DataFrame(rows, index=index).dropna(how='all')

In [None]:
# Build the 10-Q item table for a single day.
result = func2(["20080212"])

100%|██████████| 1/1 [00:05<00:00,  5.39s/it]


2008/QTR1/12/0000919465-08-000013.txt


In [None]:
# Display the resulting DataFrame.
result

Unnamed: 0,item 1. financial statements,item 3. quantitative and qualitative disclosures about market risk,item 4. controls and procedures,item 1. legal proceedings,item 1. risk factors,item 2. unregistered sales of equity securities and use of proceeds,item 3. defaults upon senior securities,item 4. submission of matters to a vote of security holders,item 5. other information,item 6. exhibits,item 2. managements discussion and analysis of financial condition and results of operations,item 1.  financial statements,item 2. managements discussion and analysis of financial condition and results of operations,item 3. qualitative and quantitative disclosures about market risk,item 2. change in securities and use of proceeds,item 6. (a)exhibits,item 6. exhibits -,item 1. a risk factors,item 4. submission of matters to vote of security holders
2,Berry Plastics Corporation\nConsolidated Balan...,Interest Rate Risk\nWe are exposed to market r...,We maintain disclosure controls and procedures...,There has been no material changes in legal pr...,You should carefully consider the risks descri...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,SIGNATURE\nPursuant to the requirements of the...,,,,,,,,,
7,Berry Plastics Corporation\nConsolidated Balan...,Interest Rate Risk\nWe are exposed to market r...,We maintain disclosure controls and procedures...,There has been no material changes in legal pr...,You should carefully consider the risks descri...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,SIGNATURE\nPursuant to the requirements of the...,,,,,,,,,
8,Berry Plastics Corporation\nConsolidated Balan...,Interest Rate Risk\nWe are exposed to market r...,We maintain disclosure controls and procedures...,There has been no material changes in legal pr...,You should carefully consider the risks descri...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,SIGNATURE\nPursuant to the requirements of the...,,,,,,,,,
9,Berry Plastics Corporation\nConsolidated Balan...,Interest Rate Risk\nWe are exposed to market r...,We maintain disclosure controls and procedures...,There has been no material changes in legal pr...,You should carefully consider the risks descri...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,SIGNATURE\nPursuant to the requirements of the...,,,,,,,,,
10,Berry Plastics Corporation\nConsolidated Balan...,Interest Rate Risk\nWe are exposed to market r...,We maintain disclosure controls and procedures...,There has been no material changes in legal pr...,You should carefully consider the risks descri...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,SIGNATURE\nPursuant to the requirements of the...,,,,,,,,,
11,Berry Plastics Corporation\nConsolidated Balan...,Interest Rate Risk\nWe are exposed to market r...,We maintain disclosure controls and procedures...,There has been no material changes in legal pr...,You should carefully consider the risks descri...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,SIGNATURE\nPursuant to the requirements of the...,,,,,,,,,
12,Berry Plastics Corporation\nConsolidated Balan...,Interest Rate Risk\nWe are exposed to market r...,We maintain disclosure controls and procedures...,There has been no material changes in legal pr...,You should carefully consider the risks descri...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,SIGNATURE\nPursuant to the requirements of the...,,,,,,,,,
14,Berry Plastics Corporation\nConsolidated Balan...,Interest Rate Risk\nWe are exposed to market r...,We maintain disclosure controls and procedures...,There has been no material changes in legal pr...,You should carefully consider the risks descri...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,SIGNATURE\nPursuant to the requirements of the...,,,,,,,,,
15,Berry Plastics Corporation\nConsolidated Balan...,Interest Rate Risk\nWe are exposed to market r...,We maintain disclosure controls and procedures...,There has been no material changes in legal pr...,You should carefully consider the risks descri...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,SIGNATURE\nPursuant to the requirements of the...,,,,,,,,,
16,Berry Plastics Corporation\nConsolidated Balan...,Interest Rate Risk\nWe are exposed to market r...,We maintain disclosure controls and procedures...,There has been no material changes in legal pr...,You should carefully consider the risks descri...,Not Applicable,Not Applicable,Not Applicable,Not Applicable,SIGNATURE\nPursuant to the requirements of the...,,,,,,,,,


In [None]:
# result.loc[9]['item 1. financial statements']