In [5]:
#####################################
# edgar_npx_downloader.ipynb CONTENT
#####################################

# 1. Imports and Globals
import os
import requests
import pandas as pd
from datetime import datetime
import time

AttributeError: partially initialized module 'pandas' has no attribute '_pandas_parser_CAPI' (most likely due to a circular import)

In [None]:
# 2. Configuration
# Tunable settings for the whole notebook; the cells below read these names.

# Filing year whose quarterly master indexes are fetched.
TARGET_YEAR = 2024

# Upper bound on the number of filings to download (None means every listed filing).
MAX_DOWNLOADS = 10

# Local folders: one for the downloaded index files, one for the filings themselves.
MASTER_INDEX_DIR = "./edgar_index"
NPX_DOWNLOAD_DIR = "./npx_filings"

# HTTP headers sent with every request made by this notebook.
HEADERS = {
    "User-Agent": "N-PX-Downloader (myemail@domain.com)",
}


In [None]:
# 3. Download Master Index Function
def download_master_index(year: int, quarter: int, base_dir: str = MASTER_INDEX_DIR,
                          timeout: float = 30.0) -> str:
    """
    Download the EDGAR full-index ``master.idx`` for one year and quarter.

    The file is written to ``{base_dir}/{year}_QTR{quarter}_master.idx``;
    ``base_dir`` is created if it does not exist yet.

    Args:
        year: Index year used in the URL and the local filename.
        quarter: Quarter number used in the URL and the local filename.
        base_dir: Directory that receives the index file.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        The local file path on success, or an empty string when the request
        fails (network error, timeout, or any non-200 HTTP status).
    """
    url = f"https://www.sec.gov/Archives/edgar/full-index/{year}/QTR{quarter}/master.idx"
    os.makedirs(base_dir, exist_ok=True)
    local_filename = os.path.join(base_dir, f"{year}_QTR{quarter}_master.idx")

    print(f"Downloading: {url}")
    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(url, headers=HEADERS, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport failures through the same "" return value as HTTP
        # errors so callers need only one failure check.
        print(f"Failed to retrieve {url}: {exc}")
        return ""

    if response.status_code != 200:
        print(f"Failed to retrieve {url} [HTTP {response.status_code}].")
        return ""

    with open(local_filename, "wb") as f:
        f.write(response.content)
    print(f"Saved to: {local_filename}")
    return local_filename



In [None]:
# 4. Parse Master Index
def parse_master_index(idx_filepath: str) -> pd.DataFrame:
    """
    Parse a local pipe-delimited master index (.idx) file into a DataFrame.

    Everything up to and including the header line
    ``CIK|Company Name|Form Type|Date Filed|Filename`` is skipped. After it,
    blank lines and lines with fewer than five ``|``-separated fields are
    ignored; fields beyond the fifth are dropped.

    Args:
        idx_filepath: Path to the index file on disk.

    Returns:
        A DataFrame with string columns
        [cik, company_name, form_type, date_filed, filename]. The frame is
        empty, but still carries those columns, when the file is missing,
        zero-length, lacks the header line, or contains no data rows.
    """
    columns = ["cik", "company_name", "form_type", "date_filed", "filename"]
    header_prefix = "CIK|Company Name|Form Type|Date Filed|Filename"

    if not os.path.exists(idx_filepath) or os.path.getsize(idx_filepath) == 0:
        print(f"Index file not found or empty: {idx_filepath}")
        return pd.DataFrame(columns=columns)

    records = []
    header_found = False
    # Stream the file line by line rather than loading it whole.
    with open(idx_filepath, "r", encoding="latin-1") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not header_found:
                # Still in the preamble: keep skipping until the header shows up.
                header_found = line.startswith(header_prefix)
                continue
            if not line:
                continue
            parts = line.split("|")
            if len(parts) < 5:
                # Also discards the dashed separator that follows the header.
                continue
            records.append(dict(zip(columns, parts[:5])))

    if not header_found:
        print(f"Could not find the data header in: {idx_filepath}")
        return pd.DataFrame(columns=columns)

    # Passing columns keeps the schema stable even when no rows were parsed.
    return pd.DataFrame(records, columns=columns)



In [None]:
# 5. Combine & Filter for N-PX
def get_all_npx_for_year(year: int,
                         base_dir: str = MASTER_INDEX_DIR,
                         output_csv: str = "npx_list.csv") -> pd.DataFrame:
    """
    Build the list of N-PX filings for one year from the quarterly master indexes.

    For each of QTR1-QTR4 the master index is downloaded and parsed; quarters
    whose download fails or whose parse yields no rows are skipped. The
    remaining rows are combined and filtered to those whose form type equals
    "N-PX" (case-insensitive, exact match, so a variant such as "N-PX/A" is
    not included). The filtered listing is written to {base_dir}/{output_csv}.

    Args:
        year: Year whose four quarterly indexes are fetched.
        base_dir: Directory for the index files and the output CSV.
        output_csv: Filename of the CSV written inside ``base_dir``.

    Returns:
        The filtered N-PX DataFrame, or an empty DataFrame (no CSV written)
        when no quarter produced any rows.
    """
    # Collect per-quarter frames and concatenate once, instead of repeatedly
    # concatenating onto a growing (initially empty) DataFrame.
    quarterly_frames = []
    for quarter in (1, 2, 3, 4):
        idx_path = download_master_index(year, quarter, base_dir=base_dir)
        if not idx_path:
            # No file or failed download
            continue
        df_q = parse_master_index(idx_path)
        if not df_q.empty:
            quarterly_frames.append(df_q)

    if not quarterly_frames:
        print("No data found for the entire year.")
        return pd.DataFrame()

    combined_df = pd.concat(quarterly_frames, ignore_index=True)

    # Filter for N-PX only
    mask_npx = combined_df["form_type"].str.upper() == "N-PX"
    npx_df = combined_df[mask_npx].copy()

    # Save to CSV
    os.makedirs(base_dir, exist_ok=True)
    output_path = os.path.join(base_dir, output_csv)
    npx_df.to_csv(output_path, index=False)
    print(f"Saved N-PX listings to {output_path}")

    return npx_df



In [None]:
# 6. Aggregate N-PX for the Chosen Year
# Fetch, parse, and filter the four quarterly indexes; the listing is also saved as a CSV.
npx_df = get_all_npx_for_year(year=TARGET_YEAR, base_dir=MASTER_INDEX_DIR, output_csv=f"npx_{TARGET_YEAR}_list.csv")



In [None]:
# 7. Download N-PX Filings
def download_npx_filings(npx_df: pd.DataFrame,
                         max_downloads: int = 10,
                         output_dir: str = NPX_DOWNLOAD_DIR,
                         timeout: float = 30.0):
    """
    Download the first ``max_downloads`` filings listed in ``npx_df``.

    Each row's ``filename`` is appended to https://www.sec.gov/Archives/ and
    the response body is saved in ``output_dir`` as
    ``{date_filed}_{form_type}_{basename of filename}``. A failed download
    (non-200 status or an exception) is reported and skipped; it does not
    stop the loop.

    Args:
        npx_df: Listing with string columns ``filename``, ``form_type`` and
            ``date_filed``.
        max_downloads: Maximum number of rows to attempt; ``None`` means all rows.
        output_dir: Directory that receives the files (created if missing).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Progress and a final summary are printed.
    """
    if npx_df.empty:
        print("No N-PX filings in the DataFrame.")
        return

    if max_downloads is None:
        max_downloads = len(npx_df)

    os.makedirs(output_dir, exist_ok=True)
    subset_df = npx_df.head(max_downloads)
    # The listing may hold fewer rows than max_downloads; report the real total.
    total = len(subset_df)

    saved = 0
    for position, (_, row) in enumerate(subset_df.iterrows(), start=1):
        filename = row["filename"].strip()
        form_type = row["form_type"].strip()
        date_filed = row["date_filed"].strip()

        full_url = f"https://www.sec.gov/Archives/{filename}"
        # A "/" in the form type would otherwise act as a path separator
        # inside the local filename.
        safe_form_type = form_type.replace("/", "-")
        local_name = f"{date_filed}_{safe_form_type}_{os.path.basename(filename)}"
        local_path = os.path.join(output_dir, local_name)

        print(f"\n[{position}/{total}] Downloading: {full_url}")
        try:
            # Without a timeout a stalled connection would block the loop forever.
            r = requests.get(full_url, headers=HEADERS, timeout=timeout)
            if r.status_code == 200:
                with open(local_path, "wb") as f:
                    f.write(r.content)
                print(f"Saved to: {local_path}")
                # Count only files that were actually written.
                saved += 1
            else:
                print(f"Failed to download [HTTP {r.status_code}].")
        except Exception as e:
            # Best effort: report the failure and move on to the next filing.
            print(f"Error downloading {full_url}: {e}")

        # Sleep 0.2s for ~5 requests/sec
        time.sleep(0.2)

    print(f"\nCompleted. Downloaded {saved} of {total} files.")



In [None]:
# 8. Execute the Download Step
# Guard first: there is nothing to fetch when the yearly listing is empty.
if npx_df.empty:
    print("No N-PX data found for the specified year.")
else:
    download_npx_filings(npx_df, max_downloads=MAX_DOWNLOADS, output_dir=NPX_DOWNLOAD_DIR)