# Homework exercise 1
## Deadline: upload to Moodle by 17 May 18:00 h

__Please submit your homework either as a Jupyter Notebook or using .py files.__

If you use .py files, please also include a PDF containing the output of your code and your explanations. Either way, the code needs to be in a form that can be easily run on another computer.

__Name:__ Fabian Basler


The file that you upload should be named *Homework1_YourLastName_YourStudentID*.

Reminder: you are required to attend class on 18 May to earn points for this homework exercise unless you have a valid reason for your absence.

You are expected to work on this exercise individually. If any part of the questions is unclear, please ask on the Moodle forum.

__SEC EDGAR__

Filings made by companies to the regulator are another very useful source of text data. The most important source in this regard is the US Securities and Exchange Commission (SEC).

The SEC provides information on how to access their filings here: https://www.sec.gov/edgar/searchedgar/accessing-edgar-data.htm

Please write a function that

* downloads index files sorted by form type for a particular day or a list of days
* then downloads the _HTML versions_ of the filings made on that day (or each day in the list), with an optional argument that can specify the form type if you want to access only files of one such form type. Note that you can identify the file containing the main filing, which is the file to be downloaded, by considering the column 'Type' in the table, e.g., here: https://www.sec.gov/Archives/edgar/data/946644/0001493152-21-005524-index.htm

Please write another function that 
* downloads the HTML versions of the filings of form type 10-Q made on a given day
* removes all tables and images from the files if there are any
* returns a DataFrame in which the columns correspond to the different parts/items of the form and the content of each filing is written to one row of the DataFrame. "Item" is a technical term here, as you will see when looking at such filings, e.g., https://www.sec.gov/Archives/edgar/data/1530425/000147793221001290/arrt_10q.htm ; the items are numbered, and items with the same number that are contained in the same part of the filing always have the same name.

Please test your code for days comprising a total of at least 10 filings.

In [1]:
import datetime as dt
import os
import re
import time
from math import ceil
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup, SoupStrainer

In [2]:
def save_htm(url, dire, name, form):
    """
    Downloads the documents that are linked on a filing index page.

    url  : address of the "-index.htm" page of one filing
    dire : existing directory the documents are written to
    name : base name of the saved files (download_1 passes the CIK)
    form : form type, e.g. "8-K". Only links whose text contains this string
           (case-insensitive) are downloaded. If None, every link whose
           target ends in ".htm" is downloaded.

    The first document is saved as "<name>.htm". Further documents of the
    same index page are saved as "<name>_1.htm", "<name>_2.htm", ... instead
    of overwriting the first one. Nothing is returned.
    """
    # request the url and put it in bs
    result = requests.get(url)
    soup = BeautifulSoup(result.text, 'html.parser')
    if form is None:
        # select all .htm documents; the dot is escaped so that only a real
        # ".htm" extension matches
        anchors = soup.find_all("a", href=re.compile(r"\.htm$"))
    else:
        # select only links with a text similar to the form
        form_re = re.compile(re.escape(form), re.I)
        anchors = soup.find_all("a", text=form_re)
    # anchors without a href attribute cannot be downloaded
    doc_links = [a.get("href") for a in anchors if a.get("href")]
    # downloads the documents to the directory
    for number, link in enumerate(doc_links):
        time.sleep(0.3)
        # urljoin copes with relative as well as absolute hrefs and avoids
        # the double slash that plain string concatenation produced
        r = requests.get(urljoin("https://www.sec.gov/", link))
        suffix = "" if number == 0 else f"_{number}"
        # the with-block closes the file even if writing fails
        with open(f'{dire}/{name}{suffix}.htm', 'wb') as file:
            file.write(r.content)

In [3]:
def download_1(dates, form=None,
               base_dir="C:/Users/Fabian/Desktop/Python/projects/PY for Finance II/bs_scrap"):
    """
    This function downloads the SEC files for given dates.

    dates    : one date string or a list of date strings, format yyyy-mm-dd
    form     : optional form type (e.g. "8-K"); if given, only filings of
               exactly this form type are downloaded
    base_dir : directory below which one folder "SEC<yyyymmdd>" per date is
               created; the default is the path that was hard-coded before

    For every date the daily master index is saved to the date folder and the
    documents of the listed filings are stored there by save_htm.
    Nothing is returned.
    """
    # makes the input to a sorted list of dates
    if isinstance(dates, str):
        dates = [dates]
    dates_dt = sorted(dt.datetime.strptime(date, '%Y-%m-%d').date() for date in dates)

    for date in dates_dt:
        print("new date started")
        # extracting items of the date
        year = str(date.year)
        qrt = str(ceil(date.month/3))
        month = date.strftime('%m')
        day = date.strftime('%d')
        dire = f"{base_dir}/SEC{year}{month}{day}"
        # creates a new directory for the date if it doesn't exist
        os.makedirs(dire, exist_ok=True)
        # download of the .idx file
        idx_path = f'{dire}/master.{year}{month}{day}.idx'
        url = f"https://www.sec.gov/Archives/edgar/daily-index/{year}/QTR{qrt}/master.{year}{month}{day}.idx"
        r = requests.get(url)
        with open(idx_path, 'wb') as file:
            file.write(r.content)

        # opens the idx file and stores it in a dataframe. cleans the data
        with open(idx_path, "r", encoding="utf-8") as fp:
            df = pd.read_csv(fp, sep="|", header=4, skip_blank_lines=True)
        # the first row is dropped (presumably the dashed separator line below
        # the column names - confirm against an index file); the copy avoids
        # assigning to a view of the original frame
        df = df[1:].copy()
        df["Date Filed"] = pd.to_datetime(df["Date Filed"], format='%Y%m%d')
        # regex=False: ".txt" is meant literally, not as "any character + txt"
        df["File Name"] = "https://www.sec.gov/Archives/" + df["File Name"].str.replace(".txt", "-index.htm", regex=False)
        # selects only the searched form in the df
        if form is not None:
            df = df[df['Form Type'] == form]
        df = df.sort_values(by=['Form Type'])
        # downloads the htm files
        # NOTE(review): files are named after the CIK only, so a second filing
        # of the same company on the same day replaces the first one
        for url, CIK in zip(df['File Name'], df["CIK"]):
            time.sleep(1)
            save_htm(url, dire, CIK, form)


In [4]:
 # downloads the 8-K filings listed in the daily indexes of 2020-03-23 and 2012-12-17
 download_1(["2020-03-23","2012-12-17"],"8-K")

new date started
new date started


In [5]:
def dict_split(text, regex, company_CIK):
    """
    Cuts text at every match of regex and returns a one-row DataFrame.

    The upper-cased matches become the column names, the text following a
    match becomes the cell content. Pieces that belong to the same match
    text are concatenated in order of appearance. The row is labelled with
    str(company_CIK).
    """
    headings = [match.upper() for match in regex.findall(text)]
    # everything before the first match belongs to no heading and is skipped
    pieces = regex.split(text)[1:]
    sections = {}
    for heading, piece in zip(headings, pieces):
        # extend an existing entry, start a new one otherwise
        sections[heading] = piece if heading not in sections else sections[heading] + piece
    return pd.DataFrame(data=sections, index=[str(company_CIK)])

In [6]:
def save_10Q(url, dire, name):
    """
    Downloads the 10-Q document(s) of one filing and splits them into items.

    url  : address of the "-index.htm" page of the filing
    dire : existing directory the cleaned document is written to
    name : file name (without extension) and row label of the result
           (download_2 passes the CIK)

    Tables and images are removed from each document, the cleaned HTML is
    saved as "<name>.html" and the remaining text is split at the item
    headings by dict_split.

    Returns a DataFrame with one row per downloaded document and one column
    per item heading; an empty DataFrame if no link matched.
    """
    # request the url and put it in bs
    result = requests.get(url)
    soup = BeautifulSoup(result.text, 'html.parser')
    # gets link of 10-Q htm; anchors without a href attribute are ignored
    anchors = soup.find_all("a", text=re.compile(r"10.?q.htm"))
    doc_links = [a.get("href") for a in anchors if a.get("href")]
    time.sleep(1)
    # compiled once instead of once per document
    # NOTE(review): the pattern needs a plain space after "ITEM" and reads
    # only one digit, so headings written differently (e.g. "Item 1A") are
    # not separated - verify against real filings
    re_item = re.compile(r"ITEM [0-9]", re.I)
    frames = []
    for link in doc_links:
        # urljoin copes with relative as well as absolute hrefs
        r = requests.get(urljoin("https://www.sec.gov/", link))
        time.sleep(2)
        soup = BeautifulSoup(r.text, 'html.parser')
        # deletes tables and pictures; <img> is the tag HTML images normally
        # use, "picture" alone left them in the document
        for item in soup.find_all(["table", "img", "picture"]):
            item.decompose()

        # NOTE(review): every document of the filing goes to the same file,
        # so only the last one stays on disk
        with open(f'{dire}/{name}.html', "w", encoding='utf-8') as file:
            file.write(str(soup))

        text = soup.get_text()
        frames.append(dict_split(text, re_item, name))
    # DataFrame.append was removed in pandas 2.0, pd.concat replaces it;
    # concat refuses an empty list, hence the explicit empty frame
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, sort=False)


In [9]:
def download_2(dates,
               base_dir="C:/Users/Fabian/Desktop/Python/projects/PY for Finance II/bs_scrap2"):
    """
    This function downloads the 10-Q filings for given dates and returns
    their items.

    dates    : one date string or a list of date strings, format yyyy-mm-dd
    base_dir : directory below which one folder "SEC<yyyymmdd>" per date is
               created; the default is the path that was hard-coded before

    For every date the daily master index is saved to the date folder and
    every filing of form type "10-Q" is processed by save_10Q.

    Returns a DataFrame that stacks the rows returned by save_10Q for all
    filings of all dates; an empty DataFrame if there were none.
    """
    # makes the input to a sorted list of dates
    if isinstance(dates, str):
        dates = [dates]
    dates_dt = sorted(dt.datetime.strptime(date, '%Y-%m-%d').date() for date in dates)

    frames = []
    for date in dates_dt:
        print("new date started")
        # extracting items of the date
        year = str(date.year)
        qrt = str(ceil(date.month/3))
        month = date.strftime('%m')
        day = date.strftime('%d')
        dire = f"{base_dir}/SEC{year}{month}{day}"
        # creates a new directory for the date if it doesn't exist
        os.makedirs(dire, exist_ok=True)
        # download of the .idx file
        idx_path = f'{dire}/master.{year}{month}{day}.idx'
        url = f"https://www.sec.gov/Archives/edgar/daily-index/{year}/QTR{qrt}/master.{year}{month}{day}.idx"
        r = requests.get(url)
        time.sleep(1)
        with open(idx_path, 'wb') as file:
            file.write(r.content)

        # opens the idx file and stores it in a dataframe. cleans the data
        with open(idx_path, "r", encoding="utf-8") as fp:
            df = pd.read_csv(fp, sep="|", header=4, skip_blank_lines=True)
        # the first row is dropped (presumably the dashed separator line below
        # the column names - confirm against an index file); the copy avoids
        # assigning to a view of the original frame
        df = df[1:].copy()
        df["Date Filed"] = pd.to_datetime(df["Date Filed"], format='%Y%m%d')
        # regex=False: ".txt" is meant literally, not as "any character + txt"
        df["File Name"] = "https://www.sec.gov/Archives/" + df["File Name"].str.replace(".txt", "-index.htm", regex=False)
        df = df[df['Form Type'] == "10-Q"]
        # downloads the htm files and collects the returned frames
        for url, CIK in zip(df['File Name'], df["CIK"]):
            time.sleep(5)
            frame = save_10Q(url, dire, CIK)
            # frames without rows (no document found) add nothing; frames
            # with a row but no columns are kept so the filing stays visible
            if len(frame.index):
                frames.append(frame)
    # DataFrame.append was removed in pandas 2.0, pd.concat replaces it;
    # concat refuses an empty list, hence the explicit empty frame
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, sort=False)


In [10]:
# collects the items of the 10-Q filings listed in the daily index of 2020-03-23
scrap2 = download_2("2020-03-23")

new date started


In [12]:
# shows the first rows of the result; each row is labelled with a CIK
print(scrap2.head())

                                                    ITEM 1  \
1129096                                                NaN   
1375618                                                NaN   
1404935                                                NaN   
1546853  : FINANCIAL STATEMENTS   SKKYNET CLOUD SYSTEMS...   
1715611  . CONDENSED CONSOLIDATED INTERIM FINANCIAL STA...   

                                                    ITEM 2  \
1129096                                                NaN   
1375618                                                NaN   
1404935                                                NaN   
1546853  : MANAGEMENT’S DISCUSSION AND ANALYSIS OF FINA...   
1715611   – MANAGEMENT’S DISCUSSION AND ANALYSIS OF FIN...   

                                                    ITEM 3  \
1129096                                                NaN   
1375618                                                NaN   
1404935                                                NaN   
154685