In [10]:
import os
import re
import datetime
import pandas as pd
import lxml.etree as ET

# Module-level accumulators, one list per output table (a data class structure
# would work equally well). Each list collects rows as dictionaries, which are
# later converted to DataFrame -> CSV.

FORM_NPX_ROWS = []
INSTITUTIONAL_MANAGER_ROWS = []
SERIES_ROWS = []
PROXY_VOTING_RECORD_ROWS = []
MATTER_CATEGORY_ROWS = []
PROXY_VOTING_RECORD_CATEGORY_ROWS = []
VOTING_RECORD_MANAGER_ROWS = []
VOTING_RECORD_SERIES_ROWS = []

# Maps each category string already seen to the category_id assigned to it, so
# the same category is never inserted into MATTER_CATEGORY_ROWS twice.
KNOWN_CATEGORIES = {}

# Utility: Function to safely parse date from text like "06/30/2024"
def parse_date(date_string):
    """
    Convert a date string such as "06/30/2024" into a datetime.date.

    The accepted layouts are mm/dd/yyyy, yyyy-mm-dd and mm-dd-yyyy, tried in that
    order; surrounding whitespace is ignored. Returns None for empty/None input
    or when no layout matches.
    """
    if not date_string:
        return None

    candidate = date_string.strip()
    for layout in ("%m/%d/%Y", "%Y-%m-%d", "%m-%d-%Y"):
        try:
            parsed = datetime.datetime.strptime(candidate, layout)
        except ValueError:
            # Wrong layout for this string; fall through to the next one.
            continue
        return parsed.date()

    return None  # nothing matched


In [11]:
def extract_xml_blocks(file_path):
    """
    Read a complete N-PX text file and pull out the contents of every
    <XML>...</XML> section.

    The file is decoded as UTF-8 with undecodable bytes replaced. Returns a list
    of raw XML strings (the text between the tags, tags excluded), in file order.
    """
    with open(file_path, "r", encoding="utf-8", errors="replace") as handle:
        contents = handle.read()

    # Case-insensitive and non-greedy; DOTALL lets one block span many lines.
    return re.findall(r"<XML>(.*?)</XML>", contents, flags=re.IGNORECASE | re.DOTALL)


def parse_xml_fragment(xml_string):
    """
    Parse one XML fragment with lxml using a recovering (lenient) parser.

    Returns the root element. If lxml still raises XMLSyntaxError, a warning is
    printed and None is returned.
    """
    lenient_parser = ET.XMLParser(recover=True, encoding="utf-8")
    try:
        return ET.fromstring(xml_string.encode("utf-8"), parser=lenient_parser)
    except ET.XMLSyntaxError as e:
        print(f"  [Warning] parse error: {e}")
    return None


def get_text(node, xpath_expr):
    """
    Evaluate `xpath_expr` relative to `node` and return the stripped .text of the
    first match. Returns "" when nothing matches or the match has no text.
    """
    matches = node.xpath(xpath_expr)
    if not matches:
        return ""

    first = matches[0]
    if first is None:
        return ""

    content = first.text
    return content.strip() if content else ""


def get_bool(node, xpath_expr, default=False):
    """
    Read a flag element and normalise it to a bool.

    "Y"/"TRUE"/"YES" map to True and "N"/"FALSE"/"NO" map to False (case is
    ignored); any other text, including a missing element, yields `default`.
    """
    recognised = {
        "Y": True, "TRUE": True, "YES": True,
        "N": False, "FALSE": False, "NO": False,
    }
    token = get_text(node, xpath_expr).upper()
    return recognised.get(token, default)


def get_decimal(node, xpath_expr):
    """
    Read the first match of `xpath_expr` as a number.

    Thousands separators (",") are removed before conversion. Returns a float,
    or None when the element is missing/empty or its text is not numeric.
    """
    raw = get_text(node, xpath_expr)
    if raw:
        try:
            return float(raw.replace(",", ""))
        except ValueError:
            pass  # not a number; report it the same way as a missing value
    return None


In [12]:
def parse_edgar_submission(root):
    """
    Build the 'form_npx' row from an <edgarSubmission> element.

    Expected layout (namespaces are ignored; elements are matched by local name):
       <edgarSubmission>
         <headerData>
           <submissionType>...</submissionType>
         </headerData>
         <filerInfo>...</filerInfo>
         <formData>
           <coverPage>...</coverPage>
           <signaturePage>...</signaturePage>
           <summaryPage>...</summaryPage>
         </formData>
       </edgarSubmission>

    Returns a dict whose keys are the 'form_npx' columns. Elements that are
    missing come back as "" (or the documented default); accession_number,
    date_filed, the amendment fields and the sec/crd/lei numbers are placeholders
    that this function never reads from the XML.
    """

    def text_of(*tags):
        # Descendant path matched by local name only, e.g.
        # .//*[local-name()='a']/*[local-name()='b']
        path = ".//" + "/".join(f"*[local-name()='{tag}']" for tag in tags)
        return get_text(root, path)

    # Values that need a default or feed more than one column.
    report_type = text_of("reportType") or "INSTITUTIONAL MANAGER VOTING REPORT"  # default guess
    is_confidential = text_of("confidentialTreatment").upper() == "Y"
    managers_count_text = text_of("otherIncludedManagersCount")

    # Key order is kept stable because it determines the CSV column order.
    return {
        "form_type": text_of("submissionType"),
        "phone_number": text_of("reportingPerson", "phoneNumber"),
        # Usually found in the <SEC-HEADER> block rather than in <edgarSubmission>.
        "accession_number": "",
        "cik": text_of("issuerCredentials", "cik"),
        "conformed_period": parse_date(text_of("periodOfReport")),
        "date_filed": None,  # typically from <ACCEPTANCE-DATETIME> or <SEC-HEADER>
        "investment_company_type": text_of("investmentCompanyType"),  # "N-1A", "N-2", etc.
        "report_type": report_type,
        "confidential_treatment": "Y" if is_confidential else "N",
        "is_notice_report": "NOTICE" in report_type.upper(),
        "explanatory_choice": text_of("explanatoryChoice") or "N",
        # coverPage/reportingPerson -> name, address
        "reporting_person_name": text_of("reportingPerson", "name"),
        "address_street1": text_of("reportingPerson", "address", "street1"),
        "address_street2": text_of("reportingPerson", "address", "street2"),
        "address_city": text_of("reportingPerson", "address", "city"),
        "address_state": text_of("reportingPerson", "address", "stateOrCountry"),
        "address_zip": text_of("reportingPerson", "address", "zipCode"),
        # <summaryPage><otherIncludedManagersCount>; anything non-numeric counts as 0
        "other_included_managers_count": int(managers_count_text) if managers_count_text.isdigit() else 0,
        # Amendment info (not extracted yet)
        "is_amendment": False,
        "amendment_no": None,
        "amendment_type": None,
        "notice_explanation": None,
        # signaturePage
        "signatory_name": text_of("signaturePage", "txSignature"),
        "signatory_name_printed": text_of("signaturePage", "txPrintedSignature"),
        "signatory_title": text_of("signaturePage", "txTitle"),
        "signatory_date": parse_date(text_of("signaturePage", "txAsOfDate")),
        # Additional placeholders from the schema
        "sec_file_number": "",
        "crd_number": "",
        "sec_file_number_other": "",
        "lei_number": "",
    }


In [13]:
def parse_institutional_managers(root):
    """
    Collect the sub-managers listed in a filing for the 'institutional_manager' table.

    Manager nodes are looked up as <investmentManagers> elements under
    <otherManagers2>; if none exist, every <otherManager> element is used instead.
    Elements are matched by local name, so namespaces do not matter.

    Returns a list of dicts, one per manager node, with keys
    serial_no (int or None), name, form13f_number, crd_number, sec_file_number
    and lei_number (strings, "" when the element is absent). The real structure
    of N-PX filings can vary, so this is deliberately simple.
    """
    # Example: <summaryPage> -> <otherManagers2> -> <investmentManagers> -> <serialNo>, <name>, ...
    manager_nodes = root.xpath(".//*[local-name()='otherManagers2']//*[local-name()='investmentManagers']")
    if not manager_nodes:
        # Fallback: some filings use <otherManager> elements directly.
        manager_nodes = root.xpath(".//*[local-name()='otherManager']")

    def first_text(node, tag):
        # Stripped first text node of the first descendant called `tag`, or "".
        hits = node.xpath(f".//*[local-name()='{tag}']/text()")
        return hits[0].strip() if hits else ""

    results = []
    for mn in manager_nodes:
        # Strip before the digit check: the text may carry surrounding whitespace,
        # which would otherwise turn a valid serial number into None.
        serial_text = first_text(mn, "serialNo")
        results.append({
            "serial_no": int(serial_text) if serial_text.isdigit() else None,
            "name": first_text(mn, "name"),
            "form13f_number": first_text(mn, "form13FFileNumber"),
            "crd_number": first_text(mn, "crdNumber"),
            "sec_file_number": first_text(mn, "secFileNumber"),
            "lei_number": first_text(mn, "leiNumber"),
        })

    return results


def parse_series_info(root):
    """
    Build the rows for table 'series' from every <seriesReports> element
    (normally found under <seriesPage> -> <seriesDetails>).

    Each returned dict has:
       series_code  <- <idOfSeries>
       series_name  <- <nameOfSeries>
       series_lei   <- <leiOfSeries>
    Missing elements yield "".
    """
    field_paths = (
        ("series_code", ".//*[local-name()='idOfSeries']"),
        ("series_name", ".//*[local-name()='nameOfSeries']"),
        ("series_lei", ".//*[local-name()='leiOfSeries']"),
    )
    return [
        {column: get_text(series_node, path) for column, path in field_paths}
        for series_node in root.xpath(".//*[local-name()='seriesReports']")
    ]


In [14]:
def parse_proxy_vote_table(vote_table_node, form_id):
    """
    Turn every <proxyTable> under the given <proxyVoteTable> node into a
    'proxy_voting_record' row, and collect the category, manager and series
    values attached to those tables.

    Parameters:
        vote_table_node: element whose descendants are searched for <proxyTable>.
        form_id: value stored in each row's "form_id" column.

    Returns a 4-tuple of lists:
        (proxy_voting_record_rows, category_link_rows, manager_link_rows, series_link_rows)
    The link rows hold only {"category_id": ...}, {"manager_id_code": ...} and
    {"series_code": ...}; they carry no reference to the record they came from.

    Side effects: a category string seen for the first time is registered in the
    module-level KNOWN_CATEGORIES and appended to MATTER_CATEGORY_ROWS.
    Whitespace-only category or manager entries are ignored so that no empty
    category or empty manager code is recorded.
    """
    pvr_rows = []
    cat_link_rows = []
    mgr_link_rows = []
    ser_link_rows = []

    for pt in vote_table_node.xpath(".//*[local-name()='proxyTable']"):
        row = {
            "form_id": form_id,
            "issuer_name":  get_text(pt, ".//*[local-name()='issuerName']"),
            "cusip":        get_text(pt, ".//*[local-name()='cusip']"),
            "isin":         get_text(pt, ".//*[local-name()='isin']"),
            "figi":         get_text(pt, ".//*[local-name()='figi']"),
            "meeting_date": parse_date(get_text(pt, ".//*[local-name()='meetingDate']")),
            "vote_description": get_text(pt, ".//*[local-name()='voteDescription']"),
            # proposed_by => <voteSource> or something similar
            "proposed_by":  get_text(pt, ".//*[local-name()='voteSource']"),
            "shares_voted": get_decimal(pt, ".//*[local-name()='sharesVoted'][1]"),
            "shares_on_loan": get_decimal(pt, ".//*[local-name()='sharesOnLoan'][1]"),
            "vote_cast": None,
            "vote_cast_shares": None,
            "management_rec": None,
            "other_notes": None
        }

        # A <proxyTable> may hold several <voteRecord> elements. Only the first one
        # is stored in the row; when there are more, their count goes to other_notes.
        vote_records = pt.xpath(".//*[local-name()='voteRecord']")
        if vote_records:
            first_vr = vote_records[0]
            row["vote_cast"]       = get_text(first_vr, ".//*[local-name()='howVoted']")
            row["vote_cast_shares"] = get_decimal(first_vr, ".//*[local-name()='sharesVoted']")
            row["management_rec"]  = get_text(first_vr, ".//*[local-name()='managementRecommendation']")
        if len(vote_records) > 1:
            row["other_notes"] = f"{len(vote_records)} total voteRecord items."

        pvr_rows.append(row)

        # Categories. Usually: <voteCategories><voteCategory><categoryType>...
        for cat_str in pt.xpath(".//*[local-name()='voteCategories']//*[local-name()='categoryType']/text()"):
            category = cat_str.strip()
            if not category:
                continue  # whitespace-only text is not a category
            category_id = KNOWN_CATEGORIES.get(category)
            if category_id is None:
                # Naive sequential id for CSV output; a real DB would use a serial column.
                category_id = len(KNOWN_CATEGORIES) + 1
                KNOWN_CATEGORIES[category] = category_id
                MATTER_CATEGORY_ROWS.append({
                    "category_id": category_id,
                    "category_type": category
                })
            cat_link_rows.append({"category_id": category_id})

        # Managers named for this proxy table: <voteManager> ... <otherManager>
        for mv in pt.xpath(".//*[local-name()='voteManager']//*[local-name()='otherManager']/text()"):
            manager_code = mv.strip()
            if manager_code:
                mgr_link_rows.append({"manager_id_code": manager_code})

        # Series named for this proxy table: <voteSeries>
        series_val = get_text(pt, ".//*[local-name()='voteSeries']")
        if series_val:
            ser_link_rows.append({"series_code": series_val})

    return pvr_rows, cat_link_rows, mgr_link_rows, ser_link_rows


In [15]:
def process_npx_files(folder_path="npx_filings"):
    """
    Parse every .txt filing in `folder_path` and fill the module-level row lists.

    For each file (processed in sorted filename order):
       - Extract the <XML> blocks and parse each one with lxml.
       - For an <edgarSubmission> block, add a 'form_npx' row plus its
         institutional managers and series.
       - For every <proxyVoteTable> in a block, add the voting records and the
         category/manager/series link rows. If no form row exists yet for the
         file, a placeholder form row is added first.

    Each file with at least one <XML> block consumes one form_id (1, 2, 3, ...);
    files without any <XML> block are reported and skipped. Nothing is returned
    and nothing is written to disk here; a summary is printed at the end.
    """
    # os.listdir() returns names in arbitrary order; sorting keeps the
    # filename -> form_id assignment reproducible between runs and platforms.
    txt_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith(".txt"))

    # Incrementing form_id used as the CSV primary key.
    global_form_id = 1

    for filename in txt_files:
        file_path = os.path.join(folder_path, filename)
        print(f"\nProcessing file: {file_path}")

        xml_fragments = extract_xml_blocks(file_path)
        if not xml_fragments:
            print("  No <XML> blocks found.")
            continue

        # Usually one block carries the form data and another the proxy table.
        has_form_row = False

        for frag in xml_fragments:
            root = parse_xml_fragment(frag)
            if root is None:
                continue

            # Is this an <edgarSubmission>?
            es = root.xpath("//*[local-name()='edgarSubmission']")
            if es:
                submission = es[0]

                # Top-level form data; one form row per file is assumed.
                submission_data = parse_edgar_submission(submission)
                submission_data["form_id"] = global_form_id
                FORM_NPX_ROWS.append(submission_data)

                for im in parse_institutional_managers(submission):
                    INSTITUTIONAL_MANAGER_ROWS.append({
                        "manager_id": None,  # assigned later
                        "form_id": global_form_id,
                        "serial_no": im["serial_no"],
                        "name": im["name"],
                        "form13f_number": im["form13f_number"],
                        "crd_number": im["crd_number"],
                        "sec_file_number": im["sec_file_number"],
                        "lei_number": im["lei_number"]
                    })

                for s in parse_series_info(submission):
                    SERIES_ROWS.append({
                        "series_id": None,  # assigned later
                        "form_id": global_form_id,
                        "series_code": s["series_code"],
                        "series_name": s["series_name"],
                        "series_lei": s["series_lei"]
                    })

                has_form_row = True

            # Maybe there is a <proxyVoteTable> in this fragment.
            vote_tables = root.xpath("//*[local-name()='proxyVoteTable']")
            if vote_tables and not has_form_row:
                # Voting records need a parent form row; create a minimal one.
                FORM_NPX_ROWS.append(_empty_form_row(global_form_id))
                has_form_row = True

            # Handle every vote table in the fragment, not just the first, so no
            # records are silently dropped.
            for vote_table in vote_tables:
                pvr_rows, cat_links, mgr_links, ser_links = parse_proxy_vote_table(vote_table, global_form_id)
                # The link rows do not know their 'vote_id' yet; ids are only
                # assigned once everything has been collected.
                PROXY_VOTING_RECORD_ROWS.extend(pvr_rows)
                PROXY_VOTING_RECORD_CATEGORY_ROWS.extend(cat_links)
                VOTING_RECORD_MANAGER_ROWS.extend(mgr_links)
                VOTING_RECORD_SERIES_ROWS.extend(ser_links)

        global_form_id += 1  # Move to next form ID for next file

    print("\nParsing complete!")
    print(f"Forms collected: {len(FORM_NPX_ROWS)}")
    print(f"Proxy Votes collected: {len(PROXY_VOTING_RECORD_ROWS)}")


def _empty_form_row(form_id):
    """Return a placeholder 'form_npx' row (blank/default columns) for `form_id`."""
    return {
        "form_id": form_id,
        "reporting_person_name": "",
        "phone_number": "",
        "address_street1": "",
        "address_street2": "",
        "address_city": "",
        "address_state": "",
        "address_zip": "",
        "accession_number": "",
        "cik": "",
        "conformed_period": None,
        "date_filed": None,
        "report_type": "",
        "form_type": "",
        "sec_file_number": "",
        "crd_number": "",
        "sec_file_number_other": "",
        "lei_number": "",
        "investment_company_type": "",
        "confidential_treatment": "N",
        "is_notice_report": False,
        "explanatory_choice": "N",
        "other_included_managers_count": 0,
        "is_amendment": False,
        "amendment_no": None,
        "amendment_type": None,
        "notice_explanation": None,
        "signatory_name": "",
        "signatory_name_printed": "",
        "signatory_title": "",
        "signatory_date": None
    }


In [16]:
def write_to_csv(output_folder="output_csv"):
    """
    Convert the module-level row lists to DataFrames and write one CSV per table
    into `output_folder` (created if it does not exist).

    Artificial primary keys are assigned here: manager_id, series_id and vote_id
    are 1..n in row order, after exact-duplicate manager/series rows have been
    removed. In a real DB scenario you would rely on SERIAL/IDENTITY columns.

    NOTE: the three link tables (proxy_voting_record_category,
    voting_record_manager, voting_record_series) are placeholders. The parsed
    link rows do not record which proxy record, manager or series they belong
    to, so ids are paired round-robin purely for demonstration. A correct
    implementation has to capture the association at parse time.
    """
    os.makedirs(output_folder, exist_ok=True)

    # 1) form_npx: form_id was already assigned while parsing.
    df_form = pd.DataFrame(FORM_NPX_ROWS).drop_duplicates()
    _save_csv(df_form, output_folder, "form_npx.csv")

    # 2) institutional_manager and 3) series: drop exact duplicates, then number
    #    the remaining rows.
    df_im = _dedupe_with_ids(INSTITUTIONAL_MANAGER_ROWS, "manager_id")
    _save_csv(df_im, output_folder, "institutional_manager.csv")

    df_s = _dedupe_with_ids(SERIES_ROWS, "series_id")
    _save_csv(df_s, output_folder, "series.csv")

    # 4) proxy_voting_record: no de-duplication, so the index is already 0..n-1.
    df_pvr = pd.DataFrame(PROXY_VOTING_RECORD_ROWS)
    df_pvr["vote_id"] = df_pvr.index + 1
    _save_csv(df_pvr, output_folder, "proxy_voting_record.csv")

    # 5) matter_category
    df_mc = pd.DataFrame(MATTER_CATEGORY_ROWS).drop_duplicates(subset=["category_type"])
    _save_csv(df_mc, output_folder, "matter_category.csv")

    # 6) proxy_voting_record_category: placeholder linking. The category_id is
    #    real; the vote_id is assigned round-robin.
    vote_count = len(df_pvr)
    category_pairs = []
    if vote_count:
        category_pairs = [
            {"vote_id": (idx % vote_count) + 1, "category_id": row["category_id"]}
            for idx, row in enumerate(PROXY_VOTING_RECORD_CATEGORY_ROWS)
        ]
    df_pvrc = pd.DataFrame(category_pairs, columns=["vote_id", "category_id"]).drop_duplicates()
    _save_csv(df_pvrc, output_folder, "proxy_voting_record_category.csv")

    # 7) voting_record_manager: same linking challenge; "manager_id_code" is never
    #    matched against the manager table, both ids are round-robin.
    df_vrm = _round_robin_links(len(VOTING_RECORD_MANAGER_ROWS), vote_count, "manager_id", len(df_im))
    _save_csv(df_vrm, output_folder, "voting_record_manager.csv")

    # 8) voting_record_series: likewise for "series_code".
    df_vrs = _round_robin_links(len(VOTING_RECORD_SERIES_ROWS), vote_count, "series_id", len(df_s))
    _save_csv(df_vrs, output_folder, "voting_record_series.csv")

    print("\nAll CSVs written. You can load them into your DB as needed.")


def _dedupe_with_ids(rows, id_column):
    """Drop exact-duplicate rows and number the survivors 1..n in `id_column`."""
    # The index must be reset after drop_duplicates(); otherwise the ids inherit
    # the gaps left by removed rows and the link tables point at missing ids.
    frame = pd.DataFrame(rows).drop_duplicates().reset_index(drop=True)
    frame[id_column] = frame.index + 1
    return frame


def _round_robin_links(link_count, vote_count, target_column, target_count):
    """Pair `link_count` placeholder links with vote ids and target ids round-robin."""
    pairs = []
    # Without records on either side no valid link exists, so none is written
    # (the category links are skipped in the same situation).
    if vote_count and target_count:
        pairs = [
            {"vote_id": (idx % vote_count) + 1, target_column: (idx % target_count) + 1}
            for idx in range(link_count)
        ]
    return pd.DataFrame(pairs, columns=["vote_id", target_column]).drop_duplicates()


def _save_csv(frame, output_folder, filename):
    """Write `frame` to output_folder/filename without the index and report it."""
    csv_path = os.path.join(output_folder, filename)
    frame.to_csv(csv_path, index=False)
    print(f"Saved {csv_path} with {len(frame)} rows.")

    print("\nAll CSVs written. You can load them into your DB as needed.")


In [17]:
def run_all(folder_path="npx_filings", output_folder="output_csv"):
    """
    Convenience entry point for a complete run:
      1) Rebind every module-level accumulator to a fresh, empty container
      2) Parse the .txt files found in 'folder_path'
      3) Write the resulting tables as CSV into 'output_folder'
    """
    global FORM_NPX_ROWS, INSTITUTIONAL_MANAGER_ROWS, SERIES_ROWS
    global PROXY_VOTING_RECORD_ROWS, MATTER_CATEGORY_ROWS
    global PROXY_VOTING_RECORD_CATEGORY_ROWS, VOTING_RECORD_MANAGER_ROWS, VOTING_RECORD_SERIES_ROWS
    global KNOWN_CATEGORIES

    # Start from a clean slate so rows from an earlier run are not written again.
    FORM_NPX_ROWS, INSTITUTIONAL_MANAGER_ROWS, SERIES_ROWS = [], [], []
    PROXY_VOTING_RECORD_ROWS, MATTER_CATEGORY_ROWS = [], []
    PROXY_VOTING_RECORD_CATEGORY_ROWS, VOTING_RECORD_MANAGER_ROWS, VOTING_RECORD_SERIES_ROWS = [], [], []
    KNOWN_CATEGORIES = {}

    process_npx_files(folder_path=folder_path)
    write_to_csv(output_folder=output_folder)





In [18]:
# Parse every .txt filing under "npx_filings" and write the CSV tables to "output".
run_all("npx_filings", "output")


Processing file: npx_filings\2024-07-16_N-PX_0001376474-24-000319.txt

Processing file: npx_filings\2024-07-17_N-PX_0001915315-24-000003.txt

Processing file: npx_filings\2024-08-01_N-PX_0001172661-24-003023.txt

Processing file: npx_filings\2024-08-09_N-PX_0001085146-24-003669.txt

Processing file: npx_filings\2024-08-12_N-PX_0001437749-24-026024.txt

Processing file: npx_filings\2024-08-16_N-PX_0001021408-24-002025.txt

Processing file: npx_filings\2024-08-20_N-PX_0001172661-24-003627.txt

Processing file: npx_filings\2024-08-20_N-PX_0001580642-24-004662.txt

Processing file: npx_filings\2024-08-26_N-PX_0001135428-24-000141.txt

Processing file: npx_filings\2024-08-26_N-PX_0001705819-24-000051.txt

Processing file: npx_filings\2024-08-27_N-PX_0001013594-24-000698.txt

Processing file: npx_filings\2024-08-27_N-PX_0001172661-24-003746.txt

Processing file: npx_filings\2024-08-27_N-PX_0001846352-24-000006.txt

Processing file: npx_filings\2024-08-28_N-PX_0001021408-24-005385.txt

Proce