In [15]:
################################################################################
# npx_parsing_notebook_hybrid.ipynb
# A Jupyter Notebook Example for Parsing N-PX Filings from `npx_filings` folder,
# with file size detection and an lxml.etree.iterparse approach for large docs.
################################################################################

import os
import re
import pandas as pd
from bs4 import BeautifulSoup

# For the iterparse approach:
from lxml import etree

# Folder that the driver cell scans for downloaded N-PX submission ".txt" files.
NPX_DOWNLOAD_DIR = "./npx_filings"
# Default size cut-off, in bytes, for parse_npx_file_hybrid: files smaller than
# this use the streaming parser, everything else uses the iterparse parser.
FILE_SIZE_THRESHOLD = 50 * 1024 * 1024  # e.g., 50 MB

In [16]:
################################################################################
# 1. Streaming Approach for "smaller" files
################################################################################

def parse_sec_header_streaming(filepath):
    """
    Read the header fields that precede the first <DOCUMENT> block.

    The file is read line-by-line and scanning stops at the first line that
    contains "<DOCUMENT>" (case-insensitive) or at end of file, so document
    bodies are never loaded.

    Args:
        filepath: Path to the submission text file.

    Returns:
        dict with the keys 'accessionNumber', 'filingDate', 'conformedPeriod',
        'headerCik' and 'amendmentNo'. A field that never appears stays "".
        When a label appears on several lines, the last occurrence wins.
    """
    header_info = {
        'accessionNumber': "",
        'filingDate': "",
        'conformedPeriod': "",
        'headerCik': "",
        'amendmentNo': ""
    }

    # (label looked up in the upper-cased line, token the value follows, output key).
    # Order matters: a line is assigned to the first label it contains.
    # The tokens are compiled case-insensitively so that the split agrees with
    # the case-insensitive label test; a case-sensitive split would leave the
    # whole line as the "value" for a line such as "Accession Number: ...".
    field_specs = (
        ("AMENDMENT NO:", re.compile("NO:", re.IGNORECASE), 'amendmentNo'),
        ("ACCESSION NUMBER:", re.compile("NUMBER:", re.IGNORECASE), 'accessionNumber'),
        ("FILED AS OF DATE:", re.compile("DATE:", re.IGNORECASE), 'filingDate'),
        ("CONFORMED PERIOD OF REPORT:", re.compile("REPORT:", re.IGNORECASE), 'conformedPeriod'),
        ("CENTRAL INDEX KEY:", re.compile("KEY:", re.IGNORECASE), 'headerCik'),
    )

    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line_upper = line.upper()
            if "<DOCUMENT>" in line_upper:
                break

            for label, token, key in field_specs:
                if label in line_upper:
                    # Value is whatever follows the last occurrence of the token.
                    header_info[key] = token.split(line)[-1].strip()
                    break

    return header_info

def stream_documents(filepath):
    """
    Generator that yields (doc_type, doc_content) for each <DOCUMENT> block,
    reading the file line-by-line.

    Args:
        filepath: Path to the submission text file.

    Yields:
        Tuples of (doc_type, doc_content). doc_type is the upper-cased text
        following the first <TYPE> tag found inside the block, or
        "NO_TYPE_FOUND" when the block has none. doc_content is the block's
        text, from the line containing <DOCUMENT> through the line containing
        </DOCUMENT>, inclusive.

    A block that is never closed by </DOCUMENT> is not yielded.
    """
    type_regex = re.compile(r"<TYPE>(?P<type>[^\r\n<]+)", re.IGNORECASE)

    inside_document = False
    doc_lines = []
    doc_type = None  # None until the block's first <TYPE> line is seen

    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            upper_line = line.upper()

            if "<DOCUMENT>" in upper_line:
                # Start (or restart) a block; anything buffered so far is dropped.
                inside_document = True
                doc_lines = [line]
                doc_type = None
                continue

            if not inside_document:
                continue

            doc_lines.append(line)

            if "</DOCUMENT>" in upper_line:
                yield ((doc_type or "NO_TYPE_FOUND").upper(), "".join(doc_lines))
                inside_document = False
                doc_lines = []
                doc_type = None
            elif doc_type is None:
                # Only the first <TYPE> counts. The pattern is case-insensitive,
                # so without this guard a <type> element inside the payload
                # would overwrite the header value.
                match = type_regex.search(line)
                if match:
                    doc_type = match.group("type").strip()




In [17]:
################################################################################
# 2. Helper Extraction Functions (smaller-file approach)
################################################################################

def extract_primary_doc_xml(document_text):
    """
    Return the <TEXT> payload of a <DOCUMENT> block whose <FILENAME> names
    primary_doc.xml.

    Args:
        document_text: Full text of one <DOCUMENT> block.

    Returns:
        The text between <TEXT> and </TEXT> (first match, surrounding
        whitespace preserved), or None when the block's file name does not
        contain "primary_doc.xml" or the block has no <TEXT>...</TEXT> section.
    """
    # Read the file name up to end of line or the next "<", the same convention
    # this file uses for <TYPE> and <DESCRIPTION>. Requiring a closing
    # </FILENAME> made header lines without one never match; the closed form
    # "<FILENAME>x</FILENAME>" is still accepted.
    filename_match = re.search(r"<FILENAME>(?P<name>[^\r\n<]+)", document_text, re.IGNORECASE)
    filename = filename_match.group("name").strip() if filename_match else ""
    if "primary_doc.xml" not in filename.lower():
        return None

    text_match = re.search(r"<TEXT>(?P<xml>.*?)</TEXT>", document_text, re.IGNORECASE | re.DOTALL)
    return text_match.group("xml") if text_match else None

def extract_vote_table_xml(document_text):
    """
    Return the <TEXT> payload of a <DOCUMENT> block that looks like a vote table.

    A block qualifies when its <TYPE> value contains "PROXY VOTING RECORD" or,
    failing that, when its <DESCRIPTION> value contains "VOTE TABLE" (both
    compared case-insensitively).

    Returns:
        The text between <TEXT> and </TEXT>, or None when the block does not
        qualify or has no <TEXT>...</TEXT> section.
    """
    type_hit = re.search(r"<TYPE>(?P<type>[^\r\n<]+)", document_text, re.IGNORECASE)
    type_value = "" if type_hit is None else type_hit.group("type").strip()
    qualifies = "PROXY VOTING RECORD" in type_value.upper()

    if not qualifies:
        # Fall back to the description only when the type did not match.
        desc_hit = re.search(r"<DESCRIPTION>(?P<desc>[^\r\n<]+)", document_text, re.IGNORECASE)
        desc_value = "" if desc_hit is None else desc_hit.group("desc").strip()
        qualifies = "VOTE TABLE" in desc_value.upper()

    if not qualifies:
        return None

    payload = re.search(r"<TEXT>(?P<xml>.*?)</TEXT>", document_text, re.IGNORECASE | re.DOTALL)
    if payload is None:
        return None
    return payload.group("xml")

def parse_primary_npx_xml(xml_str):
    """
    Pull the filing-level fields out of the primary N-PX XML document.

    Args:
        xml_str: XML text containing an <edgarSubmission> element.

    Returns:
        {} when no <edgarSubmission> element is found; otherwise a dict with
        the keys "seriesId", "periodOfReport", "submissionType",
        "registrantType" and "investmentCompanyType", each holding the stripped
        text of the first matching element or "" when that element is absent.
    """
    submission = BeautifulSoup(xml_str, "lxml-xml").find("edgarSubmission")
    if not submission:
        return {}

    # Extend this tuple to capture more fields.
    wanted_fields = (
        "seriesId",
        "periodOfReport",
        "submissionType",
        "registrantType",
        "investmentCompanyType",
    )

    parsed = {}
    for field in wanted_fields:
        node = submission.find(field)
        parsed[field] = node.text.strip() if node else ""
    return parsed

def parse_vote_table_xml(xml_str):
    """
    Convert every <proxyTable> (or <inf:proxyTable>) element into a row dict.

    Args:
        xml_str: XML text of a vote-table document.

    Returns:
        A list with one dict per table element. Each dict has the keys
        "issuerName", "cusip", "isin", "figi", "meetingDate", "sharesVoted",
        "howVoted", "managementRecommendation", "voteDescription" (stripped
        text of the first matching descendant, "" when absent) and
        "forAgainstMgmt": "FOR" when howVoted equals managementRecommendation
        ignoring case, "AGAINST" when they differ, "" when either is missing.
    """
    field_names = (
        "issuerName",
        "cusip",
        "isin",
        "figi",
        "meetingDate",
        "sharesVoted",
        "howVoted",
        "managementRecommendation",
        "voteDescription",
    )

    def child_text(table, name):
        # Beautiful Soup's Tag has no find_text() method: an unknown attribute
        # is resolved as a child-tag lookup, so `table.find_text` was None and
        # calling it raised TypeError. Use find() and read the text explicitly.
        # The "inf:" variant mirrors the table lookup below, in case child
        # tags carry the same prefix as the table element.
        node = table.find([name, "inf:" + name])
        return node.get_text().strip() if node is not None else ""

    soup = BeautifulSoup(xml_str, "lxml-xml")
    results = []
    for vt in soup.find_all(["proxyTable", "inf:proxyTable"]):
        row = {name: child_text(vt, name) for name in field_names}

        how_voted = row["howVoted"]
        mgmt_rec = row["managementRecommendation"]
        if how_voted and mgmt_rec:
            row["forAgainstMgmt"] = "FOR" if how_voted.upper() == mgmt_rec.upper() else "AGAINST"
        else:
            row["forAgainstMgmt"] = ""

        results.append(row)
    return results

def parse_npx_file_streaming(filepath):
    """
    Parse one N-PX submission file by streaming its <DOCUMENT> blocks.

    Args:
        filepath: Path to the submission text file.

    Returns:
        (doc_info, all_votes). doc_info starts as a copy of the header fields
        from parse_sec_header_streaming and is updated with the fields of every
        primary_doc.xml found in a document whose type contains "N-PX".
        all_votes is the concatenation of the rows parsed from every document
        that extract_vote_table_xml recognises as a vote table.
    """
    doc_info = dict(parse_sec_header_streaming(filepath))
    all_votes = []

    for doc_type, doc_content in stream_documents(filepath):
        if "N-PX" in doc_type:
            primary_xml = extract_primary_doc_xml(doc_content)
            if primary_xml:
                doc_info.update(parse_primary_npx_xml(primary_xml))

        # Let extract_vote_table_xml decide: it checks both <TYPE> and
        # <DESCRIPTION>. Pre-filtering on doc_type here meant its description
        # fallback could never fire for a document with any other type.
        vote_xml = extract_vote_table_xml(doc_content)
        if vote_xml:
            all_votes.extend(parse_vote_table_xml(vote_xml))

    return doc_info, all_votes



In [18]:
################################################################################
# 3. lxml.etree.iterparse approach for big single-doc files
################################################################################

def parse_npx_file_iterparse(filepath):
    """
    Parse a large N-PX file incrementally with lxml.etree.iterparse.

    This path is meant for a file that is (close to) one big XML document. The
    parser runs with recover=True, so on input that is not well-formed -- for
    example a submission holding several <DOCUMENT> blocks -- it returns
    whatever could be read; treat the result as best-effort there.

    Args:
        filepath: Path to the submission file.

    Returns:
        (doc_info, all_votes). doc_info holds the header fields from
        parse_sec_header_streaming plus "seriesId" when a <seriesId> element is
        seen (first occurrence wins). all_votes holds one dict per <proxyTable>
        element, with the same keys parse_vote_table_xml assigns.
    """
    vote_fields = (
        "issuerName",
        "cusip",
        "isin",
        "figi",
        "meetingDate",
        "sharesVoted",
        "howVoted",
        "managementRecommendation",
        "voteDescription",
    )

    def _local_name(tag):
        # Comments and processing instructions expose a non-string tag.
        if not isinstance(tag, str):
            return ""
        # Drop a "{namespace}" qualifier, then any literal "prefix:" left on a
        # tag. Plain string handling avoids the ValueError etree.QName raises
        # for a tag name that contains a colon.
        return tag.rsplit("}", 1)[-1].rsplit(":", 1)[-1]

    def _build_vote_row(table):
        # The first descendant found for each field wins.
        found = {}
        for node in table.iter():
            name = _local_name(node.tag)
            if name in vote_fields and name not in found:
                found[name] = (node.text or "").strip()
        row = {name: found.get(name, "") for name in vote_fields}

        how_voted = row["howVoted"]
        mgmt_rec = row["managementRecommendation"]
        if how_voted and mgmt_rec:
            row["forAgainstMgmt"] = "FOR" if how_voted.upper() == mgmt_rec.upper() else "AGAINST"
        else:
            row["forAgainstMgmt"] = ""
        return row

    doc_info = dict(parse_sec_header_streaming(filepath))
    all_votes = []
    open_tables = 0  # number of <proxyTable> elements currently open

    # If the file has multiple <DOCUMENT> blocks, you might do a prior step to isolate the big one.
    context = etree.iterparse(filepath, events=('start', 'end'), recover=True, encoding='utf-8')

    for event, elem in context:
        tag_name = _local_name(elem.tag)

        if event == 'start':
            # Start events only track nesting. An element must not be cleared
            # here, because its content has not been parsed yet.
            if tag_name == "proxyTable":
                open_tables += 1
            continue

        if tag_name == "seriesId" and "seriesId" not in doc_info:
            doc_info["seriesId"] = (elem.text or "").strip()
        elif tag_name == "proxyTable":
            open_tables = max(open_tables - 1, 0)
            if open_tables == 0:
                all_votes.append(_build_vote_row(elem))

        if open_tables == 0:
            # Free memory only outside an open table: the children of a table
            # must survive until the table itself ends and its row is built.
            elem.clear()
            parent = elem.getparent()
            if parent is not None:
                # Drop the emptied siblings that precede this element.
                while elem.getprevious() is not None:
                    del parent[0]

    return doc_info, all_votes



In [19]:
################################################################################
# 4. Hybrid parse_npx_file that chooses approach based on file size
################################################################################

def parse_npx_file_hybrid(filepath, size_threshold=FILE_SIZE_THRESHOLD):
    """
    Pick a parser for `filepath` from its size on disk.

    Files strictly smaller than `size_threshold` bytes are handled by
    parse_npx_file_streaming. Anything else is reported on stdout and handed
    to parse_npx_file_iterparse. (Keep in mind that a large file holding
    several <DOCUMENT> blocks may need more robust logic.)

    Returns whatever the selected parser returns.
    """
    file_size = os.path.getsize(filepath)

    is_small = file_size < size_threshold
    if is_small:
        return parse_npx_file_streaming(filepath)

    print(f"File is {file_size} bytes, using iterparse approach for {filepath}")
    return parse_npx_file_iterparse(filepath)



In [20]:
################################################################################
# 5. Putting It All Together
################################################################################

# One record per parsed file (metadata) and one per vote row, collected as
# plain dicts and turned into DataFrames once the loop is done.
metadata_records = []
votes_records = []

# Sorted so the run is reproducible: os.listdir returns entries in arbitrary
# order, which made the row order of the CSV outputs vary between runs.
all_files = sorted(f for f in os.listdir(NPX_DOWNLOAD_DIR) if f.lower().endswith(".txt"))

for filename in all_files:
    filepath = os.path.join(NPX_DOWNLOAD_DIR, filename)
    doc_info, all_votes = parse_npx_file_hybrid(filepath)

    if doc_info:
        doc_info["filename"] = filename
        metadata_records.append(doc_info)

    # Tag every vote with its source file; "filename" is the join key below.
    for v in all_votes:
        v["filename"] = filename
        votes_records.append(v)

df_metadata = pd.DataFrame(metadata_records)
df_votes = pd.DataFrame(votes_records)

# Merge them if you want a single CSV
if not df_metadata.empty and not df_votes.empty:
    df_final = df_votes.merge(df_metadata, on="filename", how="inner")
else:
    df_final = pd.DataFrame()

print(f"Metadata rows: {len(df_metadata)}")
print(f"Vote rows: {len(df_votes)}")
print(f"Final joined rows: {len(df_final)}")

df_metadata.to_csv("parsed_npx_metadata.csv", index=False)
df_votes.to_csv("parsed_npx_votes.csv", index=False)
df_final.to_csv("parsed_npx_final.csv", index=False)

print("Hybrid parse (size-based) complete! See CSV outputs for details.")

FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml-xml. Do you need to install a parser library?