In [1]:
##################################################
# edgar_npx_downloader_improved.py
##################################################
import os
import random
import time
from datetime import datetime

import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

In [2]:
# ------------------------------------------------
# 1. Global Configuration & Setup
# ------------------------------------------------
TARGET_YEAR = 2024  # filing year whose QTR1-QTR4 master indexes are scanned
MAX_DOWNLOADS = 25  # can be set to None to download all
MASTER_INDEX_DIR = "./edgar_index"  # master.idx files and the N-PX listing CSV are saved here
NPX_DOWNLOAD_DIR = "./npx_filings"  # downloaded filing documents are saved here

# User-Agent sent with every request made by this notebook.
# Set the SEC_USER_AGENT environment variable to identify yourself, e.g.
#   SEC_USER_AGENT="N-PX-Downloader (myemail@domain.com)"
# When the variable is unset or empty, the previous generic value is used so
# existing runs behave exactly as before.
# NOTE(review): SEC's EDGAR access guidance reportedly expects a descriptive
# User-Agent with contact details and may throttle generic ones - confirm.
HEADERS = {
    "User-Agent": os.environ.get("SEC_USER_AGENT") or "PythonRequests/3.0 (Generic Scraper)"
}

# For robust networking
def create_requests_session(
    retries: int = 3, backoff_factor: float = 0.3, status_forcelist=(500, 502, 503, 504)
) -> requests.Session:
    """
    Build a requests Session whose HTTP and HTTPS traffic is retried automatically.

    Args:
        retries: Value used for the total, read and connect retry limits.
        backoff_factor: Backoff factor handed to the Retry policy.
        status_forcelist: HTTP status codes handed to the Retry policy as
            retry-worthy responses.

    Returns:
        A new Session with one retrying adapter mounted on both URL schemes.
    """
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http_session = requests.Session()
    # The same adapter instance serves both schemes.
    for scheme in ("http://", "https://"):
        http_session.mount(scheme, retrying_adapter)
    return http_session

# One shared session for the whole script, built with the default retry
# settings; download_master_index and download_npx_filings both issue their
# GET requests through it.
REQUESTS_SESSION = create_requests_session()

In [3]:
# ------------------------------------------------
# 2. Master Index Download & Parsing
# ------------------------------------------------

def download_master_index(year: int, quarter: int, base_dir: str = MASTER_INDEX_DIR) -> str:
    """
    Downloads the master index for the specified year & quarter,
    saving it locally as {base_dir}/{year}_QTR{quarter}_master.idx.

    The body is first written to a sibling "<name>.part" file and only renamed
    to the final name once the write has finished. The resume check below
    treats any non-empty file at the final path as a complete download, so the
    final path must never hold a half-written file.

    Args:
        year: Year component of the full-index URL.
        quarter: Quarter number used in the "QTR{quarter}" URL component.
        base_dir: Directory for the saved index; created if missing.

    Returns:
        The local filepath on success (or if a non-empty file already exists),
        or "" on any failure (non-200 response, network error, write error).
    """
    url = f"https://www.sec.gov/Archives/edgar/full-index/{year}/QTR{quarter}/master.idx"
    os.makedirs(base_dir, exist_ok=True)
    local_filename = os.path.join(base_dir, f"{year}_QTR{quarter}_master.idx")

    # Check if file already exists to avoid re-downloading (resume logic)
    if os.path.exists(local_filename) and os.path.getsize(local_filename) > 0:
        print(f"File already exists, skipping download: {local_filename}")
        return local_filename

    print(f"Downloading: {url}")
    partial_filename = local_filename + ".part"
    try:
        response = REQUESTS_SESSION.get(url, headers=HEADERS, timeout=10)
        if response.status_code == 200:
            with open(partial_filename, "wb") as f:
                f.write(response.content)
            # Promote the finished file in one step so an interrupted run
            # cannot leave a truncated index under the final name.
            os.replace(partial_filename, local_filename)
            print(f"Saved to: {local_filename}")
            return local_filename
        else:
            print(f"Failed to retrieve {url} [HTTP {response.status_code}].")
            return ""
    except Exception as e:
        # Best-effort by design: report and signal failure with "".
        print(f"Error connecting to {url}: {e}")
        try:
            # Drop any leftover partial file; it may not exist at all.
            os.remove(partial_filename)
        except OSError:
            pass
        return ""


def parse_master_index(idx_filepath: str) -> pd.DataFrame:
    """
    Parses a local .idx master index file and returns a DataFrame.
    DataFrame columns: [cik, company_name, form_type, date_filed, filename].

    Every return path yields a frame with exactly those five columns, including
    the failure paths (missing/empty file, unreadable file, header not found)
    and a file that has a header but no data rows. In those cases the frame is
    empty, so `.empty` checks keep working while column lookups such as
    df["form_type"] no longer raise KeyError.

    Parsing rules:
      - Everything up to and including the first line that starts with
        "CIK|Company Name|Form Type|Date Filed|Filename" is skipped.
      - Blank lines and lines with fewer than five "|"-separated parts are
        skipped (this also drops a separator line following the header).
      - Only the first five parts of a line are kept; values stay as strings.

    Args:
        idx_filepath: Path to the local index file (read as latin-1).

    Returns:
        DataFrame with one row per parsed record.
    """
    columns = ["cik", "company_name", "form_type", "date_filed", "filename"]
    header_prefix = "CIK|Company Name|Form Type|Date Filed|Filename"

    if not os.path.exists(idx_filepath) or os.path.getsize(idx_filepath) == 0:
        print(f"Index file not found or empty: {idx_filepath}")
        return pd.DataFrame(columns=columns)

    records = []
    header_found = False
    try:
        # Stream the file instead of holding every line in memory at once.
        with open(idx_filepath, "r", encoding="latin-1") as f:
            for raw_line in f:
                line = raw_line.strip()
                if not header_found:
                    # Still in the preamble: look for the column header line.
                    header_found = line.startswith(header_prefix)
                    continue
                if not line:
                    continue
                parts = line.split("|")
                if len(parts) < 5:
                    continue
                records.append(dict(zip(columns, parts[:5])))
    except Exception as e:
        print(f"Failed to read {idx_filepath}: {e}")
        return pd.DataFrame(columns=columns)

    if not header_found:
        print(f"Could not find the data header in: {idx_filepath}")
        return pd.DataFrame(columns=columns)

    # Passing columns= fixes the schema even when no records were parsed.
    return pd.DataFrame(records, columns=columns)


def get_all_npx_for_year(
    year: int,
    base_dir: str = MASTER_INDEX_DIR,
    output_csv: str = "npx_list.csv"
) -> pd.DataFrame:
    """
    Downloads & parses QTR1-QTR4 master indexes for the given year,
    filters for N-PX form types, saves combined results to a CSV:
    {base_dir}/{output_csv}. Returns the N-PX DataFrame.

    Quarters whose index cannot be downloaded or parses to an empty frame are
    skipped. If no quarter yields data, an empty DataFrame is returned and no
    CSV is written.

    Args:
        year: Year whose four quarterly indexes are processed.
        base_dir: Directory used for the index files and the output CSV.
        output_csv: File name of the CSV written inside base_dir.

    Returns:
        DataFrame holding only the rows whose form_type equals "N-PX"
        (case-insensitive, exact match).
    """
    # Collect the per-quarter frames and concatenate once, rather than
    # growing a DataFrame with pd.concat on every loop iteration.
    quarterly_frames = []
    for quarter in (1, 2, 3, 4):
        idx_path = download_master_index(year, quarter, base_dir=base_dir)
        if not idx_path:
            # No file or failed download
            continue
        df_q = parse_master_index(idx_path)
        if df_q.empty:
            continue
        quarterly_frames.append(df_q)

    if not quarterly_frames:
        print("No data found for the entire year.")
        return pd.DataFrame()

    combined_df = pd.concat(quarterly_frames, ignore_index=True)

    # Filter for N-PX only. This is an exact comparison, so any other form
    # type string (for example an amendment variant, if the index lists one)
    # is excluded.
    mask_npx = combined_df["form_type"].str.upper() == "N-PX"
    npx_df = combined_df[mask_npx].copy()

    # Save to CSV
    os.makedirs(base_dir, exist_ok=True)
    output_path = os.path.join(base_dir, output_csv)
    npx_df.to_csv(output_path, index=False)
    print(f"Saved N-PX listings to {output_path} (Count: {len(npx_df)})")

    return npx_df

In [4]:
# ------------------------------------------------
# 3. Download N-PX Filings With Improvements
# ------------------------------------------------

def download_npx_filings(
    npx_df: pd.DataFrame,
    max_downloads: int = 10,
    output_dir: str = NPX_DOWNLOAD_DIR,
    random_sample: bool = False,
    min_sleep: float = 0.5,
    max_sleep: float = 1.0
):
    """
    Download up to 'max_downloads' N-PX filings from npx_df.
    - Resumes by skipping files that already exist locally.
    - Allows random sampling of the data set.
    - Rate-limits requests with random sleep intervals [min_sleep, max_sleep].

    Each filing is written to "<name>.part" and renamed to its final name only
    after the write completes, so the resume check (non-empty file at the final
    path) never mistakes an interrupted download for a finished one.

    Args:
        npx_df: Frame with string columns "filename", "form_type" and
            "date_filed"; "filename" is the path appended to
            https://www.sec.gov/Archives/.
        max_downloads: Number of rows to process; None (or a value larger than
            the frame) means all rows. Rows skipped because the file already
            exists still count towards this number.
        output_dir: Directory for the saved filings; created if missing.
        random_sample: If True, pick rows with DataFrame.sample using a fixed
            random_state of 42; otherwise take the first rows.
        min_sleep: Lower bound, in seconds, of the pause after each request.
        max_sleep: Upper bound, in seconds, of the pause after each request.

    Returns:
        None. Progress and failures are reported with print; a failed
        download does not stop the loop.
    """
    if npx_df.empty:
        print("No N-PX filings in the DataFrame.")
        return

    # If max_downloads=None, download them all
    total_filings = len(npx_df)
    if max_downloads is None or max_downloads > total_filings:
        max_downloads = total_filings

    # Decide how to pick the subset
    if random_sample:
        # random sample of size max_downloads
        subset_df = npx_df.sample(n=max_downloads, random_state=42).reset_index(drop=True)
        print(f"Randomly sampling {max_downloads} out of {total_filings} possible filings.")
    else:
        # just take the first N rows
        subset_df = npx_df.head(max_downloads)
        print(f"Taking top {max_downloads} of {total_filings} filings.")

    os.makedirs(output_dir, exist_ok=True)

    for _, row in subset_df.iterrows():
        filename = row["filename"].strip()
        form_type = row["form_type"].strip()
        date_filed = row["date_filed"].strip()

        full_url = f"https://www.sec.gov/Archives/{filename}"
        # A form type containing a path separator (e.g. "N-PX/A") would
        # otherwise point into a non-existent subdirectory of output_dir.
        safe_form_type = form_type.replace("/", "-").replace("\\", "-")
        local_name = f"{date_filed}_{safe_form_type}_{os.path.basename(filename)}"
        local_path = os.path.join(output_dir, local_name)

        # Resume/Skip logic
        if os.path.exists(local_path) and os.path.getsize(local_path) > 0:
            print(f"File already exists, skipping: {local_path}")
            continue

        print(f"\nDownloading: {full_url}")
        partial_path = local_path + ".part"
        try:
            r = REQUESTS_SESSION.get(full_url, headers=HEADERS, timeout=10)
            if r.status_code == 200:
                with open(partial_path, "wb") as f:
                    f.write(r.content)
                # Promote the finished file in a single step.
                os.replace(partial_path, local_path)
                print(f"Saved to: {local_path}")
            else:
                print(f"Failed to download [HTTP {r.status_code}].")
        except Exception as e:
            # Best-effort by design: report and move on to the next filing.
            print(f"Error downloading {full_url}: {e}")
            try:
                # Drop any leftover partial file; it may not exist at all.
                os.remove(partial_path)
            except OSError:
                pass

        # Rate-limiting with random sleep (only reached when a request was made)
        sleep_duration = random.uniform(min_sleep, max_sleep)
        time.sleep(sleep_duration)

    print("\nDownload process complete.")

In [5]:
# ------------------------------------------------
# 4. Putting It All Together
# ------------------------------------------------

if __name__ == "__main__":
    # Step 1: build the N-PX listing for TARGET_YEAR; the listing is also
    # written to a year-specific CSV inside MASTER_INDEX_DIR.
    npx_df = get_all_npx_for_year(
        year=TARGET_YEAR,
        base_dir=MASTER_INDEX_DIR,
        output_csv=f"npx_{TARGET_YEAR}_list.csv",
    )

    # Step 2: nothing listed means nothing to fetch; otherwise download a
    # random sample of at most MAX_DOWNLOADS filings.
    if npx_df.empty:
        print("No N-PX data found for the specified year. No downloads performed.")
    else:
        download_npx_filings(
            npx_df,
            max_downloads=MAX_DOWNLOADS,
            output_dir=NPX_DOWNLOAD_DIR,
            random_sample=True,  # sample across the listing instead of taking the first rows
            min_sleep=0.5,  # pause at least 0.5s after each request
            max_sleep=1.0,  # pause at most 1.0s after each request
        )

Downloading: https://www.sec.gov/Archives/edgar/full-index/2024/QTR1/master.idx
Saved to: ./edgar_index\2024_QTR1_master.idx
Downloading: https://www.sec.gov/Archives/edgar/full-index/2024/QTR2/master.idx
Saved to: ./edgar_index\2024_QTR2_master.idx
Downloading: https://www.sec.gov/Archives/edgar/full-index/2024/QTR3/master.idx
Saved to: ./edgar_index\2024_QTR3_master.idx
Downloading: https://www.sec.gov/Archives/edgar/full-index/2024/QTR4/master.idx
Saved to: ./edgar_index\2024_QTR4_master.idx
Saved N-PX listings to ./edgar_index\npx_2024_list.csv (Count: 10321)
Randomly sampling 25 out of 10321 possible filings.

Downloading: https://www.sec.gov/Archives/edgar/data/1511985/0001085146-24-003669.txt
Saved to: ./npx_filings\2024-08-09_N-PX_0001085146-24-003669.txt

Downloading: https://www.sec.gov/Archives/edgar/data/1639997/0001376474-24-000319.txt
Saved to: ./npx_filings\2024-07-16_N-PX_0001376474-24-000319.txt

Downloading: https://www.sec.gov/Archives/edgar/data/1826136/0001172661-2