In [11]:
import os
from sec_edgar_downloader import Downloader
from datetime import datetime
import glob
from bs4 import BeautifulSoup

In [2]:
# the directory to save the filings
DOWNLOAD_DIR = "sec_filings"

In [4]:
# create the downloader instance
email_address = "sude.tiras@sabanciuniv.edu"  # Replace with your email address
dl = Downloader(DOWNLOAD_DIR, email_address)

In [5]:
# Define the company's Ticker or CIK
ticker = "AAPL"  # Example: Apple Inc.

# Define the starting year and current year
start_year = 2022
end_year = datetime.now().year

# Function to download filings for a specific year
def download_filings_by_year(ticker, filing_type, start_year, end_year):
    """
    Download all filings for a given ticker between start_year and end_year.

    :param ticker: Company ticker or CIK.
    :param filing_type: '10-K' or '10-Q'.
    :param start_year: Year to start fetching reports from.
    :param end_year: The last year to fetch reports.
    """
    for year in range(start_year, end_year + 1):
        try:
            print(f"Downloading {filing_type} filings for {ticker} in {year}...")
            dl.get(filing_type, ticker, after=f"{year}-01-01", before=f"{year}-12-31")
        except Exception as e:
            print(f"Error downloading {filing_type} for {year}: {e}")

# Download 10-K and 10-Q reports for each year
download_filings_by_year(ticker, "10-K", start_year, end_year)
download_filings_by_year(ticker, "10-Q", start_year, end_year)

# Print the directory where the files are saved
print(f"Files are saved in: {os.path.abspath(DOWNLOAD_DIR)}")


Downloading 10-K filings for AAPL in 2022...
Downloading 10-K filings for AAPL in 2023...
Downloading 10-K filings for AAPL in 2024...
Downloading 10-K filings for AAPL in 2025...
Downloading 10-Q filings for AAPL in 2022...
Downloading 10-Q filings for AAPL in 2023...
Downloading 10-Q filings for AAPL in 2024...
Downloading 10-Q filings for AAPL in 2025...
Files are saved in: c:\Users\sudet\Desktop\finai\finai\Data Collection\sec_filings


In [10]:
import os
import glob
from bs4 import BeautifulSoup


# Directory where SEC filings are stored
SEC_FILINGS_DIR = "sec-edgar-filings"

# Function to extract readable text from SEC filing
def extract_clean_text(file_path):
    with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
        content = file.read()
        
        # Parse HTML/XML content
        soup = BeautifulSoup(content, "lxml")
        
        # Extract all readable text from document body
        for tag in ["script", "style", "ix:header", "ix:nonNumeric", "ix:nonFraction"]:
            [s.extract() for s in soup.find_all(tag)]
        
        # Convert to plain text
        text = soup.get_text(separator="\n")
        
        # Remove excessive blank lines
        text = "\n".join([line.strip() for line in text.split("\n") if line.strip()])
        
        return text

# Process all SEC filing files
for company_folder in glob.glob(f"{SEC_FILINGS_DIR}/*"):
    cik = os.path.basename(company_folder)
    print(f"Processing filings for CIK: {cik}")

    for filing_type in ["10-K", "10-Q"]:
        filing_dir = os.path.join(company_folder, filing_type)
        if not os.path.exists(filing_dir):
            continue
        
        for file in glob.glob(f"{filing_dir}/*/*.txt"):
            try:
                print(f"Extracting: {file}")
                clean_text = extract_clean_text(file)

                # Save cleaned text
                output_file = file.replace(".txt", "_cleaned.txt")
                with open(output_file, "w", encoding="utf-8") as out:
                    out.write(clean_text)

                print(f"Saved cleaned file: {output_file}")
            except Exception as e:
                print(f"Error processing {file}: {e}")


Processing filings for CIK: AAPL
Extracting: sec-edgar-filings\AAPL\10-K\0000320193-22-000108\full-submission.txt
Saved cleaned file: sec-edgar-filings\AAPL\10-K\0000320193-22-000108\full-submission_cleaned.txt
Extracting: sec-edgar-filings\AAPL\10-K\0000320193-23-000106\full-submission.txt
Saved cleaned file: sec-edgar-filings\AAPL\10-K\0000320193-23-000106\full-submission_cleaned.txt
Extracting: sec-edgar-filings\AAPL\10-K\0000320193-24-000123\full-submission.txt
Saved cleaned file: sec-edgar-filings\AAPL\10-K\0000320193-24-000123\full-submission_cleaned.txt
Extracting: sec-edgar-filings\AAPL\10-Q\0000320193-22-000007\full-submission.txt
Saved cleaned file: sec-edgar-filings\AAPL\10-Q\0000320193-22-000007\full-submission_cleaned.txt
Extracting: sec-edgar-filings\AAPL\10-Q\0000320193-22-000059\full-submission.txt
Saved cleaned file: sec-edgar-filings\AAPL\10-Q\0000320193-22-000059\full-submission_cleaned.txt
Extracting: sec-edgar-filings\AAPL\10-Q\0000320193-22-000070\full-submission.