In [3]:
import requests
import os
import time
from datetime import datetime

# --- What to download -------------------------------------------------------
# Each entry pairs a stock ticker with the company's SEC CIK, written as a
# 10-character zero-padded string.
COMPANIES = [
    dict(ticker="GOOGL", cik="0001652044"),
    dict(ticker="MSFT", cik="0000789019"),
    dict(ticker="NVDA", cik="0001045810"),
]
# Fiscal years to fetch a 10-K for (2022 through 2024 inclusive).
YEARS = list(range(2022, 2025))

# --- Where to download from -------------------------------------------------
# Filing documents are read from the Archives tree; filing metadata comes from
# the data.sec.gov host.
BASE_SEC_ARCHIVES_URL = "https://www.sec.gov/Archives/edgar/data"
BASE_SEC_DATA_API_URL = "https://data.sec.gov"

# --- Where to save ----------------------------------------------------------
# PROJECT_ROOT is the current working directory at the time this code runs,
# so downloads land in "<cwd>/data". This is only the project root if the
# notebook is launched from there.
PROJECT_ROOT = os.getcwd()
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "data")


# --- How to identify ourselves ----------------------------------------------
# Sent with every request made by this module. The value below is a
# placeholder: replace the project name and e-mail address with your own
# before running against SEC servers.
HEADERS = {
    "User-Agent": "Uniqus.AI Financial RAG Project Bot (your@email.com)",
}

def create_output_directory():
    """Make sure the download folder (OUTPUT_DIR) is present.

    Missing parent directories are created as well, and an already existing
    folder is left untouched. A confirmation line is printed either way.
    """
    target_dir = OUTPUT_DIR
    os.makedirs(target_dir, exist_ok=True)
    print(f"Ensured output directory '{target_dir}' exists.")

def get_company_filings_data(cik):
    """
    Fetches the JSON data for a company's submissions from the SEC API.

    Args:
        cik (str | int): Company CIK. It is left-padded with zeros to 10
            digits before being placed in the URL, so "789019", 789019 and
            "0000789019" all address the same company.

    Returns:
        The decoded JSON payload on success, or None when the request fails,
        the server answers with an HTTP error status, or the response body is
        not valid JSON. The failure reason is printed.
    """
    # The submissions endpoint is addressed as CIK##########.json, so the CIK
    # is normalised here rather than trusting every caller to pre-pad it.
    padded_cik = str(cik).strip().zfill(10)
    api_url = f"{BASE_SEC_DATA_API_URL}/submissions/CIK{padded_cik}.json"
    print(f"Fetching filing data from API: {api_url}")
    try:
        response = requests.get(api_url, headers=HEADERS, timeout=15)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError is listed explicitly because a JSON decoding failure is
        # not guaranteed to be a RequestException on every requests version.
        print(f"Error fetching company filings data for CIK {cik}: {e}")
        return None

def download_10k_filing(ticker, cik, year):
    """
    Downloads the 10-K filing for a given company and fiscal year.

    The company's submissions JSON is fetched via get_company_filings_data(),
    its 'recent' filings are scanned in the order the API returns them, and the
    first '10-K' whose reportDate falls in `year` is saved to
    OUTPUT_DIR/<ticker>_<year>_10K.html. If that download fails with a request
    error, the scan continues with the next matching filing.

    Args:
        ticker (str): Company stock ticker (e.g., "GOOGL"); used for messages
            and the output filename.
        cik (str): Company CIK code; used for the API lookup and the archive URL.
        year (int): Calendar year that the filing's reportDate must fall in.

    Returns:
        bool: True if a filing was downloaded and written to disk, False if the
        filings data could not be fetched, was malformed, or no matching 10-K
        could be downloaded.

    Raises:
        OSError: If the output file cannot be opened or written (not caught here).
    """
    print(f"\nAttempting to download 10-K for {ticker} (fiscal year {year})...")

    filings_data = get_company_filings_data(cik)
    if not filings_data:
        print(f"Failed to get filings data for {ticker} ({year}).")
        return False

    recent_filings = (filings_data.get('filings') or {}).get('recent') or {}

    # Every column read in the loop below must be present and be a list;
    # otherwise the payload is treated as malformed instead of raising
    # KeyError/IndexError part-way through the scan.
    required_columns = ('form', 'reportDate', 'accessionNumber', 'primaryDocument')
    columns_ok = isinstance(recent_filings, dict) and all(
        isinstance(recent_filings.get(column), list) for column in required_columns
    )
    if not columns_ok:
        print(f"No recent filings data or malformed data for {ticker} ({year}).")
        return False

    # The columns are parallel lists: position i in each describes filing i.
    filing_rows = zip(*(recent_filings[column] for column in required_columns))

    for form_type, report_date_str, accession_number, primary_document in filing_rows:
        if form_type != '10-K':
            continue

        # Only the date parsing is guarded, so unrelated errors raised while
        # downloading are never misreported as a bad reportDate.
        try:
            report_date = datetime.strptime(report_date_str, '%Y-%m-%d')
        except (ValueError, TypeError):
            print(f"Could not parse reportDate '{report_date_str}' for {ticker} 10-K.")
            continue  # Skip to next filing if date is missing, malformed or invalid

        # Strict match: the reportDate's calendar year must equal the target year.
        if report_date.year != year:
            continue

        # URL layout: <archives>/<CIK>/<accession number without dashes>/<primary document>
        accession_no_dashes = accession_number.replace('-', '')
        full_10k_url = (f"{BASE_SEC_ARCHIVES_URL}/"
                        f"{cik}/"
                        f"{accession_no_dashes}/"
                        f"{primary_document}")

        filename = os.path.join(OUTPUT_DIR, f"{ticker}_{year}_10K.html")

        try:
            print(f"Found 10-K for {ticker} (fiscal year {year}). Downloading from: {full_10k_url}")
            doc_response = requests.get(full_10k_url, headers=HEADERS, timeout=20)
            doc_response.raise_for_status()  # Check for HTTP errors

            with open(filename, 'wb') as f:
                f.write(doc_response.content)
        except requests.exceptions.RequestException as e:
            print(f"Error downloading {full_10k_url}: {e}")
            # Fall through to the next filing in the list if this one fails
        else:
            print(f"Successfully downloaded {ticker} {year} 10-K to '{filename}'")
            return True  # First successful download wins; stop scanning
        finally:
            # Be polite to SEC servers: pause after every download attempt,
            # whether it succeeded or failed.
            time.sleep(0.5)

    # Reaching this point means no matching 10-K was downloaded.
    print(f"Could not find a suitable 10-K filing for {ticker} for fiscal year {year} in recent filings.")
    return False

def main():
    """Download one 10-K per configured company and year.

    Ensures the output directory exists, then walks every (company, year)
    combination from COMPANIES x YEARS, company-major, calling
    download_10k_filing() for each. The result of each call is ignored.
    """
    create_output_directory()

    # Lazily enumerate every company/year pair in the same order as a nested loop.
    jobs = ((entry, fiscal_year) for entry in COMPANIES for fiscal_year in YEARS)

    for entry, fiscal_year in jobs:
        download_10k_filing(entry['ticker'], entry['cik'], fiscal_year)
        # Pause between company-year requests so SEC servers are not hammered.
        time.sleep(1)

# NOTE: the usual script entry-point guard is wrapped in a bare string literal,
# which disables it: the string is evaluated and discarded, so main() is NOT
# called automatically when this code is executed.
'''if __name__ == "__main__":
    main()
'''
def run_downloader():
    """Run the full download job by delegating to main().

    Provides a callable entry point for the downloader; it takes no arguments
    and returns nothing.
    """
    main()

Ensured output directory 'C:\Users\Sai Vaishnavi\Downloads\Intern project\data' exists.

Attempting to download 10-K for GOOGL (fiscal year 2022)...
Fetching filing data from API: https://data.sec.gov/submissions/CIK0001652044.json
Found 10-K for GOOGL (fiscal year 2022). Downloading from: https://www.sec.gov/Archives/edgar/data/0001652044/000165204423000016/goog-20221231.htm
Successfully downloaded GOOGL 2022 10-K to 'C:\Users\Sai Vaishnavi\Downloads\Intern project\data\GOOGL_2022_10K.html'

Attempting to download 10-K for GOOGL (fiscal year 2023)...
Fetching filing data from API: https://data.sec.gov/submissions/CIK0001652044.json
Found 10-K for GOOGL (fiscal year 2023). Downloading from: https://www.sec.gov/Archives/edgar/data/0001652044/000165204424000022/goog-20231231.htm
Successfully downloaded GOOGL 2023 10-K to 'C:\Users\Sai Vaishnavi\Downloads\Intern project\data\GOOGL_2023_10K.html'

Attempting to download 10-K for GOOGL (fiscal year 2024)...
Fetching filing data from API: ht