In [1]:
pip install sec_edgar_downloader

Collecting sec_edgar_downloader
  Downloading sec_edgar_downloader-5.0.3-py3-none-any.whl.metadata (11 kB)
Collecting pyrate-limiter>=3.6.0 (from sec_edgar_downloader)
  Downloading pyrate_limiter-3.9.0-py3-none-any.whl.metadata (28 kB)
Downloading sec_edgar_downloader-5.0.3-py3-none-any.whl (14 kB)
Downloading pyrate_limiter-3.9.0-py3-none-any.whl (33 kB)
Installing collected packages: pyrate-limiter, sec_edgar_downloader
Successfully installed pyrate-limiter-3.9.0 sec_edgar_downloader-5.0.3
Note: you may need to restart the kernel to use updated packages.


In [11]:
import os
import re
from bs4 import BeautifulSoup
from sec_edgar_downloader import Downloader

# ✅ MAIN FOLDER STRUCTURE CONFIG
DATA_DIR = "Data Collection/sec-edgar-filings"

# ✅ 1. Download latest filings into your target folder
def download_latest_filing(form_type: str = "10-Q") -> str:
    dl = Downloader("dvschenone@ucsd.edu", DATA_DIR)
    dl.get(form_type, "NVDA")
    return get_latest_filing_file(form_type)


# ✅ 2. Traverse folders to find latest primary-document
def get_latest_filing_file(form_type: str) -> str:
    filings_root = os.path.join(DATA_DIR)
    latest_file = None
    latest_mtime = 0

    for cik_folder in os.listdir(filings_root):
        form_path = os.path.join(filings_root, cik_folder, form_type)
        if not os.path.exists(form_path):
            continue

        for root, _, files in os.walk(form_path):
            for file in files:
                if "primary-document" in file and (file.endswith(".html") or file.endswith(".txt")):
                    file_path = os.path.join(root, file)
                    mtime = os.path.getmtime(file_path)
                    if mtime > latest_mtime:
                        latest_file = file_path
                        latest_mtime = mtime

    return latest_file


# ✅ 3. Extract "Item 1A. Risk Factors" from .html or .txt
def extract_item_1a(file_path: str) -> str:
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        content = f.read()

    # If it's HTML, parse and extract just the text
    if "<html" in content.lower():
        soup = BeautifulSoup(content, "html.parser")
        text = soup.get_text(separator="\n")
    else:
        text = content

    # Normalize and extract Item 1A section
    text = re.sub(r'\n+', '\n', text)
    pattern = r'(?i)(item\s+1A\.?\s*[-–:]?\s*Risk\s+Factors)(.*?)(item\s+1B\.?|item\s+2\.)'
    match = re.search(pattern, text, re.DOTALL)

    if match:
        return match.group(2).strip()
    else:
        return "Item 1A section not found."

# ✅ 4. Example run
if __name__ == "__main__":
    file_path = download_latest_filing("10-Q")
if file_path:
    print(f"✅ Found latest filing: {file_path}")
    print(extract_item_1a(file_path)[:1000])
else:
    print("❌ Still no filing found.")





FileNotFoundError: [Errno 2] No such file or directory: 'Data Collection/sec-edgar-filings'