# Homework exercise 1
## Deadline: upload to Moodle by 17 May 18:00 h

__Please submit your homework either as a Jupyter Notebook or using .py files.__

If you use .py files, please also include a PDF containing the output of your code and your explanations. Either way, the code needs to be in a form that can be easily run on another computer.

__Name:__


The name of the file that you upload should be named *Homework1_YourLastName_YourStudentID*.

Reminder: you are required to attend class on 18 May to earn points for this homework exercise unless you have a valid reason for your absence.

You are expected to work on this exercise individually. If any part of the questions is unclear, please ask on the Moodle forum.

__SEC EDGAR__

Filings made by companies to the regulator are another very useful source of text data. The most important source in this regard is the US Securities and Exchange Commission (SEC).

The SEC provides information on how to access their filings here: https://www.sec.gov/edgar/searchedgar/accessing-edgar-data.htm

Please write a function that

* downloads index files sorted by form type for a particular day or a list of days
* then downloads the _HTML versions_ of the filings made on that day (or each day in the list), with an optional argument that can specify the form type if you want to access only files of one such form type. Note that you can identify the file containing the main filing, which is the file to be downloaded, by considering the column 'Type' in the table, e.g., here: https://www.sec.gov/Archives/edgar/data/946644/0001493152-21-005524-index.htm

Please write another function that 
* downloads the HTML versions of the files of form type 10-Q file on a given day
* removes all tables and images from the files if there are any
* returns a DataFrame in which the columns correspond to the different parts/items of the form and the content of each filing is written to one row of the DataFrame. Item here is a technical term here as you will see when looking at such filings, e.g., here: https://www.sec.gov/Archives/edgar/data/1530425/000147793221001290/arrt_10q.htm ;  the items are numbered and items with the same number that are contained in the same part of the filing always have the same name.

Please test your code for days comprising a total of at least 10 filings.

In [1]:
from tqdm import tqdm as tqdm # progress bars
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urljoin
from math import ceil # round up
import pandas as pd
import re, io, requests, time, wget, glob, unidecode

PAUSE_TIME_SEC = 1
# downloads index files sorted by form type for a particular day or a list of days
# soup methods https://github.com/jonathanoheix/scraping_basics_with_beautifulsoup/blob/master/scraping_basics_with_beautifulsoup.ipynb
main_url = "https://www.sec.gov/Archives/edgar/daily-index/"
base_url = "https://www.sec.gov/Archives/"

In [2]:
# This function takes partial_text and input_soup arguments and follows the first link added to current_url variable    
def follow_link(partial_text, input_soup, current_url):
    # we look up all links that start with partial_text
    a_tags = input_soup.find_all("a", href=lambda href: href and href.startswith(partial_text)) # https://stackoverflow.com/questions/49840504/using-lambda-functions-in-beautiful-soup
    if len(a_tags)>0:
        # we add the link to the current_url
        output_url = urljoin(current_url, a_tags[0]["href"])
        print(f"Making a request, url={output_url}")
        time.sleep(PAUSE_TIME_SEC)  
        
        # we request the content of the new url    
        # we create a loop to request url until we get a ban message for some reason
        flag_new_result_bad = True
        ban_message_line = "Request Rate Threshold Exceeded"
        while flag_new_result_bad:
            new_result = requests.get(output_url)
            if ban_message_line in new_result.text:
                time.sleep(PAUSE_TIME_SEC)
            else:
                flag_new_result_bad = False
        # we parse the output 
        output_soup = BeautifulSoup(new_result.text, 'html.parser')
        return output_url, output_soup
    else:
        print("Have not found any links to follow!")
        print("Printing the soup below...")
        print(input_soup)

# this function tells us which quarter the input_date belongs to         
def return_quarter(input_month):
    # Return quarter's abbreviation for a given month
    return f"QTR{ceil(int(input_month)/3.)}"

# this function downloads the index files
def download_dataframe(input_soup, file_suffix):
    # we look up all links that lead to the download of all index files
    a_list = input_soup.find_all("a", href=lambda href: href and href.endswith(file_suffix))
    if len(a_list)>0:
        # we add the link to the current url
        file_url = urljoin(quarter_url, a_list[0]["href"])
        print(f"Waiting {PAUSE_TIME_SEC}")
        time.sleep(PAUSE_TIME_SEC) 
        
        # we manually create the column names because .idx is not properly formatted
        headers = ["Company name", "Form Type", "CIK", "Date filed", "File name"]  
        print(f"Trying to download table, url={file_url}")
        # we define our data frame
        df = pd.read_fwf(file_url,
                         skiprows=11,
                         names=headers)
        return df
    else:
        print("Have not found any links to download dataframes!")
        print(input_soup.prettify())
        
# this function parses websites by their url and returns their soup
def parse_html_by_url(url):
    flag_soup_bad = True
    ban_message_line = "Request Rate Threshold Exceeded"
    # we get source html of the url through a loop until we get a ban message for some reason
    while flag_soup_bad:
        print(f"Waiting {PAUSE_TIME_SEC}")
        time.sleep(PAUSE_TIME_SEC)
        html = requests.get(url)
        # we parse source html and receive its soup
        soup = BeautifulSoup(html.text, "html.parser")
        if not ban_message_line in soup.text:
            flag_soup_bad = False
    return soup

# this function downloads the index files
def dowload_idx_file(year,month,day):
    # we manually create the column names because .idx is not properly formatted
    headers = ["Company name", "Form Type", "CIK", "Date filed", "File name"]  
    print(f"Waiting {PAUSE_TIME_SEC}")
    time.sleep(PAUSE_TIME_SEC) 

    # this variable is assigned to a variable link
    file_url = f"https://www.sec.gov/Archives/edgar/daily-index/{year}/{return_quarter(month)}/company.{year}{month}{day}.idx"
    print(f"Trying to download table, url={file_url}")
    # we define our data frame
    df = pd.read_fwf(file_url,
                     skiprows=11,
                     names=headers)
    return df

In [3]:
# this function removes all images and tables in the soup of html through a loop
def remove_imgs_and_tbls(soup):
    selected_images = soup.findAll('img')
    for match in selected_images:
        match.decompose()
    selected_tables = soup.findAll('table')
    for match in selected_tables:
        match.decompose()
    with open("test.html", "w", encoding='utf-8') as file:
        file.write(str(soup))
    return soup

# this function removes a header from an index file
def remove_sec_header_from_filing(txt_filing):
    start = filing_text.index("</SEC-HEADER>")
    filing_text_short = txt_filing[start:]
    start = filing_text_short.index("<TYPE>10-Q")
    end = filing_text_short.index("</DOCUMENT>")
    filing_text_short = filing_text_short[start:end]
    return filing_text_short

# this function returns text between items
def get_items_text(soups_text):
    rows = list()
    for i in range(len(items_names)-1):
        # https://stackoverflow.com/questions/3368969/find-string-between-two-substrings
        try:
            begin = soups_text.index(items_names[i].upper())
            end = soups_text.index(items_names[i+1].upper())
            result_substring = soups_text[begin+len(items_names[i]):end]
        except:
            print(f"{items_names[i]}-{items_names[i+1]} did not work out")
            result_substring = ""
        rows.append(result_substring)
    # for the last item
    try:
        begin = soups_text.index(items_names[i+1].upper())
        result_substring = soups_text[begin+len(items_names[i+1]):]
    except:
        print(f"{items_names[i+1]}-END did not work out")
        result_substring = ""
    rows.append(result_substring)
    return [rows]

# this function adds items' text in data frame
def append_10form_text(df_10q_items_texts, source_text):
    filing_text_short = remove_sec_header_from_filing(source_text)
    soup_no_img_and_tbl = remove_imgs_and_tbls(BeautifulSoup(filing_text_short, "html.parser")) 
    filing_text_only = soup_no_img_and_tbl.get_text(" ", strip=True)  # " " is needed to preserve space between words
    
    #parse non-breakable whitespaces and other non-unicode characters
    filing_text_only = unidecode.unidecode(filing_text_only)
    # we extract items' text by the function to fill the data frame columns later 
    items_texts_10q = get_items_text(filing_text_only)
    
    # finally we create a buffer data frame and append it to the original
    df_buf = pd.DataFrame(items_texts_10q,columns=items_names)
    df_10q_items_texts = df_10q_items_texts.append(df_buf, ignore_index=True)
    return df_10q_items_texts

In [4]:
# here we assume the input to be the form of year-month(zero padded)-day(zero padded)
input_dates = ["2021-05-14"]

# here we put the desired form type to sort the files by
input_form_type = "10-Q"

In [5]:
full_visit = False
for input_date in input_dates:
    day = input_date.split("-")[2]
    month = input_date.split("-")[1]
    year =  input_date.split("-")[0]
    if full_visit:
        # we parse the reply from the server upon main_url request using html parser
        main_url_soup = parse_html_by_url(main_url)

        # now we split input_date into elements to use later
        print(f"Day = {day}, month={month}, year={year}")
        #algorithm:
        # 1) follow the first(year) link: we return year_url and its html content
        year_url, yearly_soup = follow_link(year, main_url_soup, main_url)

        # 2) follow the second (quartal) link: we return quarter_url and its html content
        quarter_url, quarterly_soup = follow_link(return_quarter(month), yearly_soup, year_url)

        # 3) we introduce a variable for selection of index files
        file_suffix = f".{year}{month}{day}.idx"

        # all records
        df_source = download_dataframe(quarterly_soup, file_suffix)
        # records sorted by form type
        df = df_source.loc[df_source["Form Type"] == input_form_type]
        print(df)
    else:
        df_source = dowload_idx_file(year,month,day)
        df = df_source.loc[df_source["Form Type"] == input_form_type]
        print(df)

Waiting 1
Trying to download table, url=https://www.sec.gov/Archives/edgar/daily-index/2021/QTR2/company.20210514.idx
                          Company name Form Type      CIK  Date filed  \
11         1st FRANKLIN FINANCIAL CORP      10-Q    38723    20210514   
19                       374Water Inc.      10-Q   933972    20210514   
23                         89bio, Inc.      10-Q  1785173    20210514   
28        A-Mark Precious Metals, Inc.      10-Q  1591588    20210514   
30    AB Private Credit Investors Corp      10-Q  1634452    20210514   
...                                ...       ...      ...         ...   
6382           ZETA ACQUISITION CORP I      10-Q  1422141    20210514   
6420                   aTYR PHARMA INC      10-Q  1339970    20210514   
6430                 iCap Vault 1, LLC      10-Q  1800199    20210514   
6433                        iQSTEL Inc      10-Q  1527702    20210514   
6436                    nDivision Inc.      10-Q  1659183    20210514   

     

In [8]:
for i in tqdm(range(len(df.index)), desc="Downloading HTML files"):
    url_to_download = urljoin(base_url,df.iloc[i]["File name"])
    fname = wget.download(url_to_download)
    time.sleep(PAUSE_TIME_SEC*10)
    print(f"Downloaded {fname}")

Downloading HTML files:   0%|          | 1/374 [00:11<1:08:49, 11.07s/it]

Downloaded 0001376474-21-000157.txt


Downloading HTML files:   1%|          | 2/374 [00:21<1:07:26, 10.88s/it]

Downloaded 0001575705-21-000287.txt


Downloading HTML files:   1%|          | 3/374 [00:32<1:07:27, 10.91s/it]

Downloaded 0001564590-21-027662.txt





HTTPError: HTTP Error 403: Forbidden

In [6]:
items_names = ['Item 1. Financial Statements',
               'Item 2. Management\'s Discussion and Analysis of Financial Condition and Results of Operations',
               'Item 3. Quantitative and Qualitative Disclosures About Market Risk',
               'Item 4. Controls and Procedures',
               'Item 1. Legal Proceedings',
               'Item 1A. Risk Factors',
               'Item 2. Unregistered Sales of Equity Securities and Use of Proceeds',
               'Item 3. Defaults Upon Senior Securities',
               'Item 4. Mine Safety Disclosures',
               'Item 5. Other Information',
               'Item 6. Exhibits']

df_filings = pd.DataFrame({colname:[] for colname in items_names})

for fname in glob.glob("*.txt"):
    with open(fname, "r") as fp:
        filing_text = fp.readlines()
        filing_text = "".join(filing_text)
    df_filings = append_10form_text(df_filings, filing_text)
df_filings

Item 1. Financial Statements-Item 2. Management's Discussion and Analysis of Financial Condition and Results of Operations did not work out
Item 2. Management's Discussion and Analysis of Financial Condition and Results of Operations-Item 3. Quantitative and Qualitative Disclosures About Market Risk did not work out
Item 3. Quantitative and Qualitative Disclosures About Market Risk-Item 4. Controls and Procedures did not work out
Item 1A. Risk Factors-Item 2. Unregistered Sales of Equity Securities and Use of Proceeds did not work out
Item 2. Unregistered Sales of Equity Securities and Use of Proceeds-Item 3. Defaults Upon Senior Securities did not work out
Item 1. Financial Statements-Item 2. Management's Discussion and Analysis of Financial Condition and Results of Operations did not work out
Item 2. Management's Discussion and Analysis of Financial Condition and Results of Operations-Item 3. Quantitative and Qualitative Disclosures About Market Risk did not work out
Item 3. Quantita

Unnamed: 0,Item 1. Financial Statements,Item 2. Management's Discussion and Analysis of Financial Condition and Results of Operations,Item 3. Quantitative and Qualitative Disclosures About Market Risk,Item 4. Controls and Procedures,Item 1. Legal Proceedings,Item 1A. Risk Factors,Item 2. Unregistered Sales of Equity Securities and Use of Proceeds,Item 3. Defaults Upon Senior Securities,Item 4. Mine Safety Disclosures,Item 5. Other Information,Item 6. Exhibits
0,,,,. (a) Disclosure Controls and Procedures. Und...,. There are no material pending legal proceedi...,,,. None.,. Not Applicable.,. None.,. 31.1 Certification of President of General P...
1,,,,. (a) Disclosure Controls and Procedures. Und...,. There are no material pending legal proceedi...,,,. None.,. Not Applicable.,. None.,. 10.1 Assignment of Purchase Agreement dated ...
2,,,,. (a) Disclosure Controls and Procedures. Und...,. There are no material pending legal proceedi...,,,. None.,. Not Applicable.,. None. 18,. 31.1 Certification of President of Managing ...
3,,,,. (a) Disclosure Controls and Procedures. Und...,. There are no material pending legal proceedi...,,,. None.,. Not Applicable.,. None. 20,. 31.1 Certification of President of Managing ...
4,,,,. (a) Disclosure Controls and Procedures. Und...,. There are no material pending legal proceedi...,,,. None.,. Not Applicable.,. None. 19,. 31.1 Certification of President of Managing ...
5,,,,,,,,,,,
6,,,,,,,,,,,
7,,,,,,,,,,,
8,,,,,,,,,,,
9,,,,,,,,,,,


In [60]:
## below is the obsolete approach to the second task

In [4]:
# collected_links = list()
# current_entry_num = 0
# timeout = PAUSE_TIME_SEC*3
# feed_count = 100
# stop_number_of_filings = 10

#while True:
#    current_rss_feed = iterate_rss_feed(start_feed=current_entry_num,
#                                        feed_count=feed_count)
#    if len(current_rss_feed["entries"])>0:
#        current_entry_num+= len(current_rss_feed["entries"])
#        for correct_link in return_filtered_entries(current_rss_feed, year, month, day, form_type=input_form_type):
#            collected_links.append(correct_link)
#    else:
#        print(f"Request threshold achieved! TIMEOUT ({timeout} sec)")
#        time.sleep(timeout)
#
#    time.sleep(PAUSE_TIME_SEC)
#    print(f"Number of correct links = {len(collected_links)}")
#    if len(collected_links)>=stop_number_of_filings:
#        print(f"Links n>={stop_number_of_filings}, stopping")
#        break

In [5]:
#https://stackoverflow.com/questions/13240700/python-beautifulsoup-to-find-all-a-href-with-specific-anchor-text
#TODO: RSS url supports filter by form type
#import os
#
#download_file_num = 0
#
#for link in tqdm(collected_links, desc="Downloading HTML filings"):
#    filing_html_result = requests.get(link)
#    time.sleep(PAUSE_TIME_SEC)
#    filing_html_soup = BeautifulSoup(filing_html_result.text, "html.parser")
#
#    a_tags = filing_html_soup.find_all("a", href=True, text=lambda text: text and text.endswith(".html"))
#    base_url = "https://www.sec.gov/"
#    filing_url = urljoin(base_url, a_tags[0]["href"])
#
#    filing_html_result_document = requests.get(filing_url)
#    time.sleep(PAUSE_TIME_SEC)
#    filing_html_document_soup = BeautifulSoup(filing_html_result_document.text, "html.parser")
#
#    if input_form_type:
#        path = f'filings_form_{input_form_type}'
#    else:
#        path = f'filings_all_forms'
#    os.makedirs(path, exist_ok=True)
#
#    with open(f"{path}/{download_file_num}.html", "w", encoding='utf-8') as file:
#        file.write(str(filing_html_document_soup))
#    download_file_num+=1


Downloading HTML filings: 0it [00:00, ?it/s]


In [4]:
# task 2 objectives
# 1)download the HTML files of form type 10-Q on X day
# 2)remove all tables and images from the files if there are any
# 3)return a DataFrame where columns correspond to items and rows to filings
# https://www.sec.gov/Archives/edgar/data/1530425/000147793221001290/arrt_10q.htm

In [45]:
#def get_clean_text_from_soup(soup):
#    text_ascii_apostophe = soup.get_text(strip=True).replace("Management’s", "Management's")
#    text_remove_non_ascii = re.sub(r'[^\x00-\x7F]+',' ', text_ascii_apostophe)
#    text_replace_double_spaces = text_remove_non_ascii.replace("  ", " ")
#    final_text_clean = text_replace_double_spaces.replace("Management's","Management’s")
#    return final_text_clean
#
## this function parses websites by their url and returns their soup
#def parse_html_by_url(url):
#    flag_soup_bad = True
#    ban_message_line = "Request Rate Threshold Exceeded"
#    # we get source html of the url
#    while flag_soup_bad:
#        time.sleep(PAUSE_TIME_SEC)
#        html = requests.get(url)
#        # we parse source html and receive its soup
#        soup = BeautifulSoup(html.text, "html.parser")
#        if not ban_message_line in soup.text:
#            flag_soup_bad = False
#    return soup
#
## this function removes all images and tables in the soup of html
#def remove_imgs_and_tbls(soup):
#    selected_images = soup.findAll('img')
#    for match in selected_images:
#        match.decompose()
#    selected_tables = soup.findAll('table')
#    for match in selected_tables:
#        match.decompose()
#    with open("test.html", "w", encoding='utf-8') as file:
#        file.write(str(soup))
#    return soup
#
## we extract the items' names and put them into a list
#def extract_items_names(filing_soup):
#    #links_to_items_in_10q_form = filing_soup.find_all("a", text=lambda text: text and text.startswith("Item "))
#    #if len(links_to_items_in_10q_form)>0:
#    #    items_10q_names = list()
#    #    for item_link in links_to_items_in_10q_form :
#    #        items_10q_names.append(item_link.text)
#    #    return items_10q_names
#    #else:
#    #    print(f"Have not found any links to items in the 10q form, falling back to the default values")
#    return ['Item 1. Financial Statements',
#            'Item 2. Management’s Discussion and Analysis of Financial Condition and Results of Operations',
#            'Item 3. Quantitative and Qualitative Disclosures About Market Risk',
#            'Item 4. Controls and Procedures',
#            'Item 1. Legal Proceedings',
#            'Item 1A. Risk Factors',
#            'Item 2. Unregistered Sales of Equity Securities and Use of Proceeds',
#            'Item 3. Defaults Upon Senior Securities',
#            'Item 4. Mine Safety Disclosures',
#            'Item 5. Other Information',
#            'Item 6. Exhibits']
#
## this function returns text between items
#def get_items_text(items_names, soup_10q):
#    soups_text = get_clean_text_from_soup(soup_10q)
#    if len(items_names)>0:
#        rows = list()
#        filing_end_10q = "\xa0  \xa0 \xa0  \xa0\n\n\n"
#        for i in range(len(items_names)-1):
#            # https://stackoverflow.com/questions/3368969/find-string-between-two-substrings
#            try:
#                result = re.search('%s(.*)%s' % (items_names[i], items_names[i+1]), soups_text).group(1)
#            except:
#                result = ""
#            rows.append(result)
#        try:
#            # we had to add the filing end to the end since it is removed by the get_clean_text_from_soup function
#            result = re.search('%s(.*)%s' % (items_names[i+1], filing_end_10q), soups_text+filing_end_10q).group(1)
#        except:
#            result = ""
#        rows.append(result)
#        return [rows]
#    else:
#        print(f"Received empty list items_names={items_names}")
#
## this function queries rss server untill it fetches good rss feed
#def fetch_rss_feed(url):
#    flag_rss_bad = True
#    while flag_rss_bad:
#        rss_feed = feedparser.parse(url)
#        if rss_feed["bozo"] == 0:
#            flag_rss_bad = False
#        else:
#            time.sleep(PAUSE_TIME_SEC)
#    return rss_feed

In [6]:
#rss_10q_url = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&CIK=&type=10-Q&company=&dateb=&owner=include&start=0&count=100&output=atom"
#base_url = "https://www.sec.gov/"
#
#rss_10q_feed = fetch_rss_feed(rss_10q_url)
#print(f"Fetched n={len(rss_10q_feed['entries'])} RSS entries")

Fetched n=100 RSS entries


In [98]:
#df_filings = None
#desired_form_type = "10-Q"
#i = 0
#for rss_entry in tqdm(rss_10q_feed["entries"], desc="RSS entries processed"):
#    if rss_entry["tags"][0]["term"] == desired_form_type:
#        flag_entry_not_processed = True
#        while flag_entry_not_processed:
#            link_to_filing_detail_form10q = rss_entry["links"][0]["href"]
#            filing_detail_soup = parse_html_by_url(link_to_filing_detail_form10q)
#
#            urls_to_10q_filings = filing_detail_soup.find_all("a",
#                                                              href=True,
#                                                              text=lambda text: text and (("10q" in text) or ("10-q" in text) or ("10_q" in text)) and text.endswith(".htm"))
#            if urls_to_10q_filings==[]:
#                print(f"Not found any .htm 10Q filings here= {link_to_filing_detail_form10q}  ; Skipping this entry...")
#                flag_entry_not_processed = False
#            else:
#                filing_10q_url = urljoin(base_url, urls_to_10q_filings[0]["href"])
#                filing_10q_soup = parse_html_by_url(filing_10q_url)
#
#                # we clean the soup of the filing by calling our function
#                filing_10q_soup = remove_imgs_and_tbls(filing_10q_soup)
#
#                # we put items' names into a dataframe by calling the constructor function
#                dataframe_col_names = extract_items_names(filing_10q_soup)
#                if df_filings is not None:
#                    df_filings = pd.DataFrame({colname:[] for colname in dataframe_col_names})
#
#                # we extract items' text by the function to fill the data frame columns later 
#                items_texts_10q = get_items_text(dataframe_col_names, filing_10q_soup)
#
#                # finally we create a buffer data frame and append it to the original
#                # https://stackoverflow.com/questions/16597265/appending-to-an-empty-dataframe-in-pandas
#                df_buf = pd.DataFrame(items_texts_10q,columns=dataframe_col_names)
#                df_filings = pd.concat([df_filings,df_buf],
#                                       ignore_index=True,
#                                       verify_integrity=False)
#                flag_entry_not_processed = False
#    i+=1
#    if i==1:
#        break
#    print(f"Filings in the data frame N={len(df_filings.index)}")
#df_filings

RSS entries processed:   0%|          | 0/100 [00:40<?, ?it/s]


Unnamed: 0,Item 1. Financial Statements,Item 2. Management’s Discussion and Analysis of Financial Condition and Results of Operations,Item 3. Quantitative and Qualitative Disclosures About Market Risk,Item 4. Controls and Procedures,Item 1. Legal Proceedings,Item 1A. Risk Factors,Item 2. Unregistered Sales of Equity Securities and Use of Proceeds,Item 3. Defaults Upon Senior Securities,Item 4. Mine Safety Disclosures,Item 5. Other Information,Item 6. Exhibits
0,NOTE 1 BUSINESS SUMMARY OF SIG...,Cautionary Statements This Form 10-Q contai...,Not applicable.,Evaluation of Disclosure Controls and Proced...,"As of March 31, 2021, we were not a party to...",We are not obligated to disclose our risk fa...,"The Company issued 444,764 shares of the Com...",. There have been no defaults upon senior sec...,Not applicable.,Not Applicable Item 6. Exhibits (a) Exhib...,


In [8]:
## then downloads the HTML versions of the filings made on that day (or each day in the list), with an
## optional argument that can specify the form type if you want to access only files of one such form
## type. Note that you can identify the file containing the main filing, which is the file to be downloaded,
## by considering the column 'Type' in the table
#
#"""
#filings_main_url = "https://www.sec.gov/cgi-bin/browse-edgar?company=&CIK=&type=&owner=include&count=100&action=getcurrent"
#result = requests.get(filings_main_url)
#time.sleep(PAUSE_TIME_SEC)
#result.text[:1000]
#filings_soup = BeautifulSoup(result.text, 'html.parser')
##print(soup.prettify())
#
## 1) download html files submitted on X date
#a_tags = filings_soup.find_all("a", href=lambda href: href and href.endswith("-index.htm"))
## 2) filter out all forms != Y
#3) print the result
#"""

In [2]:
#import feedparser  #from https://gis.stackexchange.com/questions/21319/parse-xml-files-in-python-elementtree
#from datetime import datetime
#
#def iterate_rss_feed(start_feed, feed_count):
#    rss_url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcurrent&CIK=&type=&company=&dateb=&owner=include&start={start_feed}&count={feed_count}&output=atom"    
#    return feedparser.parse(rss_url)

In [3]:
#def return_filtered_entries(rss_feed, year, month, day, form_type=None):
#    print(f"Number of entires in the RSS feed ={len(rss_feed['entries'])}")
#    correct_links = list()    
#    for elem in rss_feed["entries"]:
#        date = datetime.fromisoformat(elem["updated"])
#        if (date.year == int(year)) and (date.month == int(month)) and (date.day == int(day)):
#            if form_type:
#                if form_type == d["entries"][0]["tags"][0]["term"]:
#                    correct_links.append(elem["links"][0]["href"])
#            else:
#                correct_links.append(elem["links"][0]["href"])
#    return correct_links    