In [None]:
import json
import os
import sys
import logging
import math
import tempfile
import zipfile
import itertools
from datetime import datetime
from typing import List, Optional, Union

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.exceptions import (
    ConnectionError,
	RetryError,
	RequestException,
	HTTPError,
	Timeout,
)

# Resolve the project root as two directory levels above the current working
# directory and make it importable.
project_root = os.path.abspath(os.path.join(os.getcwd(), '../..'))
# Guard against stacking duplicate entries when this cell is executed again.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

PROJECT_ROOT = project_root

# Directory under the project root that holds everything this file writes.
DATASET_DIR = os.path.join(project_root, 'datasets')

# exist_ok avoids the check-then-create race of `exists()` followed by `mkdir()`.
os.makedirs(DATASET_DIR, exist_ok=True)

In [24]:
def download_indices(
    start_year: int,
    end_year: int,
    quarters: List[int],
    skip_present_indices: bool,
    indices_folder: str,
    user_agent: str,
) -> None:
    """
    Downloads EDGAR Index files for the specified years and quarters.

    For every requested (year, quarter) the quarterly ``master.zip`` is fetched,
    its ``master.idx`` member is read, the first 11 lines are skipped, and each
    remaining row gets one extra pipe-separated column: the row's last field with
    ``.txt`` replaced by ``-index.html``. The result is written to
    ``{year}_QTR{quarter}.tsv`` inside ``indices_folder``.

    Periods that lie after the current calendar quarter are skipped. Downloads
    that fail (request error, HTTP error status, or an unreadable archive) are
    collected, and the user is asked interactively whether to retry them; only
    the failed periods are retried.

    Args:
            start_year (int): The first year of the indices to be downloaded.
            end_year (int): The last year of the indices to be downloaded.
            quarters (List[int]): A list of quarters (in the format 1, 2, 3, 4) for which the indices will be downloaded.
            skip_present_indices (bool): If True, the function will skip downloading indices that are already present in the directory.
            indices_folder (str): Directory where the indices will be saved.
            user_agent (str): The User-Agent string that will be declared to SEC EDGAR.

    Raises:
            ValueError: If an invalid quarter is passed.
    """

    base_url = "https://www.sec.gov/Archives/edgar/full-index"

    print("Downloading index files from SEC...")

    for quarter in quarters:
        if quarter not in (1, 2, 3, 4):
            raise ValueError(f'Invalid quarter "{quarter}"')

    # Latest period that can exist; tuples compare year first, then quarter.
    now = datetime.now()
    latest_period = (now.year, math.ceil(now.month / 3))

    # Work out once which periods actually need downloading.
    pending = []
    for year in range(start_year, end_year + 1):
        for quarter in quarters:
            # `continue` rather than `break`: `quarters` is not guaranteed to
            # be sorted, so later entries may still be valid.
            if (year, quarter) > latest_period:
                continue

            index_filename = f"{year}_QTR{quarter}.tsv"

            # Check if the index file is already present
            if skip_present_indices and os.path.exists(
                os.path.join(indices_folder, index_filename)
            ):
                print(f"Skipping {index_filename}")
                continue

            pending.append((year, quarter, index_filename))

    # Download everything pending; on failures, optionally retry only those.
    while pending:
        failed = []
        for year, quarter, index_filename in pending:
            url = f"{base_url}/{year}/QTR{quarter}/master.zip"

            try:
                response = requests.get(url=url, headers={"User-agent": user_agent})
                # An error page must not be handed to ZipFile as if it were an archive.
                response.raise_for_status()

                with tempfile.TemporaryFile(mode="w+b") as tmp:
                    tmp.write(response.content)

                    with zipfile.ZipFile(tmp) as archive, archive.open(
                        "master.idx"
                    ) as idx:
                        # The appended column is taken from the unstripped
                        # line, so it carries the row's original line ending.
                        lines = [
                            (decoded := line.decode("latin-1")).strip()
                            + "|"
                            + decoded.split("|")[-1].replace(".txt", "-index.html")
                            for line in itertools.islice(idx, 11, None)
                        ]
            except (
                requests.exceptions.RequestException,
                zipfile.BadZipFile,
                KeyError,  # raised by ZipFile.open when "master.idx" is missing
            ) as exc:
                print(f"Failed to download {index_filename}: {exc}")
                failed.append((year, quarter, index_filename))
                continue

            # Save the processed index file
            with open(
                os.path.join(indices_folder, index_filename),
                "w",
                encoding="utf-8",
            ) as out:
                out.write("".join(lines))
            print(f"{index_filename} downloaded")

        # Handle failed downloads
        if not failed:
            break

        failed_indices = [name for _, _, name in failed]
        print(f"Could not download the following indices:\n{failed_indices}")
        user_input = input("Retry (Y/N): ")
        if user_input not in ["Y", "y", "yes"]:
            break
        print("Retry downloading failed indices")
        pending = failed

In [None]:
def get_specific_indices(
    tsv_filenames: List[str],
    filing_types: List[str],
    user_agent: str,
    cik_tickers: Union[List[str], str, None] = None,
) -> pd.DataFrame:
    """
    Loops through all the indexes and keeps only the rows/Series for the specific filing types.

    NOTE(review): the implementation is unfinished. At present it only
    normalizes ``cik_tickers`` and, when any were given, downloads and parses
    SEC's ``company_tickers.json``. ``tsv_filenames`` and ``filing_types`` are
    not used yet and nothing is returned, despite the annotated return type.

    Args:
            tsv_filenames (List[str]): The filenames of the indices.
            filing_types (List[str]): The filing types to download, e.g., ['10-K', '8-K'].
            user_agent (str): The User-Agent string that will be declared to SEC EDGAR.
            cik_tickers (Union[List[str], str, None]): List of CIKs or Tickers, or the path
                    of a text file holding one CIK/ticker per line. If None (or empty),
                    no company mapping is downloaded.

    Returns:
            pd.DataFrame: A dataframe which contains series only for the specific indices
            (intended contract; not produced by the current code).

    Raises:
            FileNotFoundError: If ``cik_tickers`` is a string that is not an existing file.
            requests.exceptions.RequestException: If downloading the company tickers
                    fails or the server answers with an error status.

    Note:
            cik_tickers: annotated as Union rather than Optional because a plain
            string (file path) is accepted in addition to a list.
    """
    # Initialize list for CIKs
    # TODO: not populated yet; reserved for the ticker -> CIK resolution step.
    ciks = []

    # Normalize the cik_tickers input into a plain list of strings
    target_list = []
    if cik_tickers:
        if isinstance(cik_tickers, str):
            # A string is treated as the path of a file with one entry per line
            if not os.path.isfile(cik_tickers):
                raise FileNotFoundError(f"File not found: {cik_tickers}")
            with open(cik_tickers) as f:
                target_list = [line.strip() for line in f if line.strip()]

        elif isinstance(cik_tickers, list):
            target_list = cik_tickers

    # Download the SEC company ticker data used for the mapping
    if target_list:
        # Define the company_tickers_url
        company_tickers_url = "https://www.sec.gov/files/company_tickers.json"

        try:
            request = requests.get(
                url=company_tickers_url, headers={"User-agent": user_agent}
            )
            # Without this an HTTP error status would never surface as HTTPError.
            request.raise_for_status()
        except RequestException as e:
            # RequestException is the base class of HTTPError, ConnectionError,
            # Timeout and RetryError, so one clause covers them all.
            print(f'Failed downloading "{company_tickers_url}" - {e}')
            raise

        # Load the company ticker data (response body is bytes, hence loads)
        # TODO: parsed but not used yet.
        company_tickers = json.loads(request.content)

NameError: name 'Union' is not defined

In [None]:
def main():
    """
    Orchestrates the flow of crawling and downloading filings from SEC EDGAR.

    The code in this function performs the following steps:
    1. Loads the "download_filings" section of config.json from the project root.
    2. Stops if no filing types are configured.
    3. Creates the indices folder, the raw filings folder and one subfolder per
       filing type, plus an empty companies_info.json if none exists.
    4. Downloads the indices.
    5. Collects the index .tsv files that exist on disk and passes them to
       get_specific_indices together with the filing types and CIKs/tickers.

    NOTE(review): the original docstring also listed comparing new indices with
    old ones and crawling each index to download the filings; those steps are
    not part of the code visible here.

    Raises:
            SystemExit: If no filing types are provided.
    """

    # Load the configuration file
    config_path = os.path.join(PROJECT_ROOT, "config.json")
    with open(config_path) as fin:
        config = json.load(fin)["download_filings"]

    # Define the directories and filepaths
    raw_filings_folder = os.path.join(DATASET_DIR, config["raw_filings_folder"])
    indices_folder = os.path.join(DATASET_DIR, config["indices_folder"])
    filings_metadata_filepath = os.path.join(
        DATASET_DIR, config["filings_metadata_file"]
    )

    # Check if at least one filing type is provided
    if not config["filing_types"]:
        print("Please provide at least one filing type")
        # sys.exit always raises SystemExit; the bare `exit()` helper comes
        # from `site`/IPython and is not guaranteed to stop execution.
        sys.exit()

    # Create the indices and download folders if needed. makedirs also copes
    # with nested configured paths and with folders that already exist.
    os.makedirs(indices_folder, exist_ok=True)
    os.makedirs(raw_filings_folder, exist_ok=True)

    # We also create subfolders for each filing type in the raw_filings_folder for better organization
    for filing_type in config["filing_types"]:
        os.makedirs(os.path.join(raw_filings_folder, filing_type), exist_ok=True)

    # If companies_info.json doesn't exist, create it with empty JSON
    companies_info_path = os.path.join(DATASET_DIR, "companies_info.json")
    if not os.path.isfile(companies_info_path):
        with open(companies_info_path, "w") as f:
            json.dump(obj={}, fp=f)

    download_indices(
        start_year=config["start_year"],
        end_year=config["end_year"],
        quarters=config["quarters"],
        skip_present_indices=config["skip_present_indices"],
        indices_folder=indices_folder,
        user_agent=config["user_agent"],
    )

    # Keep only the index files that are actually present on disk
    # (periods that failed to download or were skipped have no file).
    candidate_paths = (
        os.path.join(indices_folder, f"{year}_QTR{quarter}.tsv")
        for year in range(config["start_year"], config["end_year"] + 1)
        for quarter in config["quarters"]
    )
    tsv_filenames = [path for path in candidate_paths if os.path.isfile(path)]

    df = get_specific_indices(
        tsv_filenames=tsv_filenames,
        filing_types=config["filing_types"],
        cik_tickers=config["cik_tickers"],
        user_agent=config["user_agent"],
    )