In [7]:
################################################################################
# npx_parsing_notebook_hybrid_fixed.ipynb
# A Jupyter Notebook Example for Parsing N-PX Filings from `npx_filings` folder,
# with file size detection and an lxml.etree.iterparse approach for large docs.
# AND with bug fixes related to .find_text(...) calls.
################################################################################

import os
import re
import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree  # for iterparse in large-file scenario

NPX_DOWNLOAD_DIR = "./npx_filings"
FILE_SIZE_THRESHOLD = 50 * 1024 * 1024  # e.g., 50 MB

In [8]:
################################################################################
# 1. Streaming Approach for "smaller" files
################################################################################

def parse_sec_header_streaming(filepath):
    """
    Reads the file line-by-line until we hit the first <DOCUMENT> or run out of lines.
    Extracts fields like AMENDMENT_NO, ACCESSION_NUMBER, FILING_DATE, etc.
    """
    header_info = {
        'accessionNumber': "",
        'filingDate': "",
        'conformedPeriod': "",
        'headerCik': "",
        'amendmentNo': ""
    }

    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line_upper = line.upper()
            if "<DOCUMENT>" in line_upper:
                break

            if "AMENDMENT NO:" in line_upper:
                val = line.split("NO:")[-1].strip()
                header_info['amendmentNo'] = val
            elif "ACCESSION NUMBER:" in line_upper:
                val = line.split("NUMBER:")[-1].strip()
                header_info['accessionNumber'] = val
            elif "FILED AS OF DATE:" in line_upper:
                val = line.split("DATE:")[-1].strip()
                header_info['filingDate'] = val
            elif "CONFORMED PERIOD OF REPORT:" in line_upper:
                val = line.split("REPORT:")[-1].strip()
                header_info['conformedPeriod'] = val
            elif "CENTRAL INDEX KEY:" in line_upper:
                val = line.split("KEY:")[-1].strip()
                header_info['headerCik'] = val

    return header_info


def stream_documents(filepath):
    """
    Generator that yields (doc_type, doc_content) for each <DOCUMENT> block,
    reading line-by-line.
    """
    inside_document = False
    doc_lines = []
    doc_type = "NO_TYPE_FOUND"

    type_regex = re.compile(r"<TYPE>(?P<type>[^\r\n<]+)", re.IGNORECASE)

    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            upper_line = line.upper()

            if "<DOCUMENT>" in upper_line:
                inside_document = True
                doc_lines = [line]
                doc_type = "NO_TYPE_FOUND"

            elif "</DOCUMENT>" in upper_line and inside_document:
                doc_lines.append(line)
                doc_text = "".join(doc_lines)
                yield (doc_type.upper(), doc_text)

                inside_document = False
                doc_lines = []
                doc_type = "NO_TYPE_FOUND"

            else:
                if inside_document:
                    doc_lines.append(line)
                    match = type_regex.search(line)
                    if match:
                        doc_type = match.group("type").strip()




In [9]:
################################################################################
# 2. Helper Extraction Functions (smaller-file approach)
################################################################################

def extract_primary_doc_xml(document_text):
    filename_match = re.search(r"<FILENAME>(.*?)</FILENAME>", document_text, re.IGNORECASE)
    filename = filename_match.group(1).strip() if filename_match else ""
    if "primary_doc.xml" in filename.lower():
        text_match = re.search(r"<TEXT>(?P<xml>.*?)</TEXT>", document_text, re.IGNORECASE | re.DOTALL)
        return text_match.group("xml") if text_match else None
    return None


def extract_vote_table_xml(document_text):
    type_match = re.search(r"<TYPE>(?P<type>[^\r\n<]+)", document_text, re.IGNORECASE)
    doc_type = type_match.group("type").strip() if type_match else ""
    if "PROXY VOTING RECORD" in doc_type.upper():
        text_match = re.search(r"<TEXT>(?P<xml>.*?)</TEXT>", document_text, re.IGNORECASE | re.DOTALL)
        return text_match.group("xml") if text_match else None

    desc_match = re.search(r"<DESCRIPTION>(?P<desc>[^\r\n<]+)", document_text, re.IGNORECASE)
    desc = desc_match.group("desc").strip() if desc_match else ""
    if "VOTE TABLE" in desc.upper():
        text_match2 = re.search(r"<TEXT>(?P<xml>.*?)</TEXT>", document_text, re.IGNORECASE | re.DOTALL)
        return text_match2.group("xml") if text_match2 else None
    return None


def parse_primary_npx_xml(xml_str):
    # Use lxml-xml parser to avoid "FeatureNotFound: xml" error
    soup = BeautifulSoup(xml_str, "lxml-xml")
    edgar_sub = soup.find("edgarSubmission")
    if not edgar_sub:
        return {}

    doc_info = {}
    series_id_tag = edgar_sub.find("seriesId")
    doc_info["seriesId"] = series_id_tag.get_text(strip=True) if series_id_tag else ""

    period_of_report = edgar_sub.find("periodOfReport")
    submission_type = edgar_sub.find("submissionType")
    registrant_type = edgar_sub.find("registrantType")
    investment_type = edgar_sub.find("investmentCompanyType")

    doc_info["periodOfReport"] = period_of_report.get_text(strip=True) if period_of_report else ""
    doc_info["submissionType"] = submission_type.get_text(strip=True) if submission_type else ""
    doc_info["registrantType"] = registrant_type.get_text(strip=True) if registrant_type else ""
    doc_info["investmentCompanyType"] = investment_type.get_text(strip=True) if investment_type else ""
    # parse more fields if needed (e.g. <reportType>, <amendmentType>)

    return doc_info


def parse_vote_table_xml(xml_str):
    # Use lxml-xml parser
    soup = BeautifulSoup(xml_str, "lxml-xml")
    vote_tables = soup.find_all(["proxyTable", "inf:proxyTable"])

    results = []
    for vt in vote_tables:
        row = {}

        # Bug fix: Replacing vt.find_text(...) with standard .find("tag") + .get_text()
        issuer_tag = vt.find("issuerName")
        row["issuerName"] = issuer_tag.get_text(strip=True) if issuer_tag else ""

        cusip_tag = vt.find("cusip")
        row["cusip"] = cusip_tag.get_text(strip=True) if cusip_tag else ""

        isin_tag = vt.find("isin")
        row["isin"] = isin_tag.get_text(strip=True) if isin_tag else ""

        figi_tag = vt.find("figi")
        row["figi"] = figi_tag.get_text(strip=True) if figi_tag else ""

        meeting_date_tag = vt.find("meetingDate")
        row["meetingDate"] = meeting_date_tag.get_text(strip=True) if meeting_date_tag else ""

        shares_voted_tag = vt.find("sharesVoted")
        row["sharesVoted"] = shares_voted_tag.get_text(strip=True) if shares_voted_tag else ""

        how_voted_tag = vt.find("howVoted")
        row["howVoted"] = how_voted_tag.get_text(strip=True) if how_voted_tag else ""

        mgmt_rec_tag = vt.find("managementRecommendation")
        row["managementRecommendation"] = mgmt_rec_tag.get_text(strip=True) if mgmt_rec_tag else ""

        vote_desc_tag = vt.find("voteDescription")
        row["voteDescription"] = vote_desc_tag.get_text(strip=True) if vote_desc_tag else ""

        # forAgainstMgmt logic
        if row["howVoted"] and row["managementRecommendation"]:
            row["forAgainstMgmt"] = "FOR" if row["howVoted"].upper() == row["managementRecommendation"].upper() else "AGAINST"
        else:
            row["forAgainstMgmt"] = ""

        results.append(row)

    return results


def parse_npx_file_streaming(filepath):
    header_info = parse_sec_header_streaming(filepath)
    doc_info = dict(header_info)
    all_votes = []

    for doc_type, doc_content in stream_documents(filepath):
        if "N-PX" in doc_type:
            possible_xml = extract_primary_doc_xml(doc_content)
            if possible_xml:
                info = parse_primary_npx_xml(possible_xml)
                doc_info.update(info)
        if "PROXY VOTING RECORD" in doc_type or "VOTE TABLE" in doc_type:
            vote_xml = extract_vote_table_xml(doc_content)
            if vote_xml:
                votes = parse_vote_table_xml(vote_xml)
                all_votes.extend(votes)

    return doc_info, all_votes




In [10]:
################################################################################
# 3. lxml.etree.iterparse approach for big single-doc files
################################################################################

def parse_npx_file_iterparse(filepath):
    """
    Example of using lxml.etree.iterparse if the file is a single large well-formed XML doc.
    In reality, EDGAR .txt might not be strictly well-formed if it has multiple <DOCUMENT> blocks.
    This is a simplified demonstration.
    """
    doc_info = {}
    all_votes = []

    # Possibly parse header lines if desired
    header_info = parse_sec_header_streaming(filepath)
    doc_info.update(header_info)

    context = etree.iterparse(filepath, events=('start', 'end'), recover=True, encoding='utf-8')
    for event, elem in context:
        tag_name = etree.QName(elem.tag).localname

        if event == 'end':
            if tag_name == "seriesId":
                doc_info["seriesId"] = (elem.text or "").strip()

            # In a real scenario, parse <proxyTable> or <inf:proxyTable> to create vote records
            if tag_name in ["proxyTable", "inf:proxyTable"]:
                # This is a placeholder approach
                pass

        elem.clear()

    return doc_info, all_votes




In [None]:
################################################################################
# 4. Hybrid parse_npx_file that chooses approach based on file size
################################################################################

def parse_npx_file_hybrid(filepath, size_threshold=FILE_SIZE_THRESHOLD):
    file_size = os.path.getsize(filepath)

    if file_size < size_threshold:
        return parse_npx_file_streaming(filepath)
    else:
        print(f"File is {file_size} bytes, using iterparse approach for {filepath}")
        return parse_npx_file_iterparse(filepath)


In [12]:
################################################################################
# 5. Putting It All Together
################################################################################

metadata_records = []
votes_records = []

all_files = [f for f in os.listdir(NPX_DOWNLOAD_DIR) if f.lower().endswith(".txt")]

for filename in all_files:
    filepath = os.path.join(NPX_DOWNLOAD_DIR, filename)

    doc_info, all_votes = parse_npx_file_hybrid(filepath)

    if doc_info:
        doc_info["filename"] = filename
        metadata_records.append(doc_info)

    for v in all_votes:
        v["filename"] = filename
        votes_records.append(v)

df_metadata = pd.DataFrame(metadata_records)
df_votes = pd.DataFrame(votes_records)

# Merge them if you want a single CSV
if not df_metadata.empty and not df_votes.empty:
    df_final = df_votes.merge(df_metadata, on="filename", how="inner")
else:
    df_final = pd.DataFrame()

print(f"Metadata rows: {len(df_metadata)}")
print(f"Vote rows: {len(df_votes)}")
print(f"Final joined rows: {len(df_final)}")

df_metadata.to_csv("parsed_npx_metadata.csv", index=False)
df_votes.to_csv("parsed_npx_votes.csv", index=False)
df_final.to_csv("parsed_npx_final.csv", index=False)

print("Hybrid parse (size-based) complete! See CSV outputs for details.")

File is 97342826 bytes, using iterparse approach for ./npx_filings\2024-08-28_N-PX_0001021408-24-005385.txt
Metadata rows: 25
Vote rows: 10940
Final joined rows: 10940
Hybrid parse (size-based) complete! See CSV outputs for details.
