In [1]:
import os
import re
import datetime
import pandas as pd
import lxml.etree as ET

# -----------------------------------------------------------------------
# Global Data Lists (matching the final schema)
# -----------------------------------------------------------------------
# One list of row dicts per output table. They are filled while the filings
# are parsed and turned into one CSV file each by write_to_csv().
FORM_NPX_ROWS = []                      # one row per parsed (or placeholder) form
INSTITUTIONAL_MANAGER_ROWS = []         # managers listed on a form's summary page
SERIES_ROWS = []                        # series listed on a form's series page
PROXY_VOTING_RECORD_ROWS = []           # one row per <proxyTable>
MATTER_CATEGORY_ROWS = []               # distinct category_type values seen so far
PROXY_VOTING_RECORD_CATEGORY_ROWS = []  # vote_id <-> category_id links
VOTING_RECORD_MANAGER_ROWS = []         # vote_id <-> manager links (manager_id resolved later)
VOTING_RECORD_SERIES_ROWS = []          # vote_id <-> series links (series_id resolved later)

# We'll keep track of known categories (category_type -> category_id)
KNOWN_CATEGORIES = {}

# We'll generate incremental IDs for each form and each vote record
NEXT_FORM_ID = 1
NEXT_VOTE_ID = 1

# -----------------------------------------------------------------------
# Helper Functions
# -----------------------------------------------------------------------

def parse_date(date_string):
    """
    Convert a date string into a ``datetime.date``.

    The layouts tried, in order, are MM/DD/YYYY, YYYY-MM-DD, MM-DD-YYYY and
    YYYYMMDD; surrounding whitespace is ignored. Returns None for empty input
    or when none of the layouts matches.
    """
    if not date_string:
        return None

    candidate = date_string.strip()
    for layout in ("%m/%d/%Y", "%Y-%m-%d", "%m-%d-%Y", "%Y%m%d"):
        try:
            parsed = datetime.datetime.strptime(candidate, layout)
        except ValueError:
            # Wrong layout for this string; move on to the next one.
            continue
        return parsed.date()
    return None

def get_text(node, xpath_expr):
    """
    Evaluate ``xpath_expr`` on ``node`` and return the stripped ``.text`` of
    the first hit. Returns '' when nothing matches or the hit has no text.
    """
    matches = node.xpath(xpath_expr)
    if not matches:
        return ""

    first = matches[0]
    if first is None:
        return ""

    content = first.text
    return content.strip() if content else ""

def get_decimal(node, xpath_expr):
    """
    Read the first matching element's text and parse it as a float after
    removing commas. Returns None when the text is missing or not numeric.
    """
    raw_value = get_text(node, xpath_expr)
    if raw_value:
        try:
            return float(raw_value.replace(",", ""))
        except ValueError:
            pass
    return None

def extract_sec_header_info(file_path):
    """
    Pull the accession number and filing date out of a filing's text.

    The whole file is searched (case-insensitively) for an
    "ACCESSION NUMBER:" line and a "FILED AS OF DATE:" line followed by
    eight digits.

    Returns a dict with:
      - 'accession_number': str, '' when the line is absent
      - 'date_filed': the value parse_date() produces, None when absent
    """
    with open(file_path, "r", encoding="utf-8", errors="replace") as handle:
        contents = handle.read()

    accession_hit = re.search(r"ACCESSION\s+NUMBER:\s*([^\r\n]+)", contents, re.IGNORECASE)
    filed_hit = re.search(r"FILED\s+AS\s+OF\s+DATE:\s*(\d{8})", contents, re.IGNORECASE)

    return {
        "accession_number": accession_hit.group(1).strip() if accession_hit else "",
        "date_filed": parse_date(filed_hit.group(1).strip()) if filed_hit else None,
    }

def parse_edgar_submission(root, sec_header):
    """
    Build the 'form_npx' row for one <edgarSubmission> element.

    Parameters:
      root:       the <edgarSubmission> element (anything get_text() accepts).
      sec_header: dict with 'accession_number' and 'date_filed', as returned
                  by extract_sec_header_info().

    Returns a dict whose keys are the form_npx columns. A fresh form_id is
    taken from the global NEXT_FORM_ID counter, which is incremented.
    'series_count' starts at 0 here and is filled in by parse_series_info().
    """
    global NEXT_FORM_ID

    form_id = NEXT_FORM_ID
    NEXT_FORM_ID += 1

    def field(*tags):
        # Text of the first element reached by the parent/child chain `tags`,
        # matched by local name anywhere below root ('' when absent).
        chain = "/".join(f"*[local-name()='{tag}']" for tag in tags)
        return get_text(root, ".//" + chain)

    def is_affirmative(*tags):
        return field(*tags).upper() in ("Y", "YES", "TRUE", "1")

    def int_or(default, *tags):
        digits = field(*tags)
        return int(digits) if digits.isdigit() else default

    report_type = (field("reportInfo", "reportType") or "FUND VOTING REPORT")[:100]
    amendment_type = field("amendmentInfo", "amendmentType")
    # Only reportInfo/noticeExplanation feeds notice_explanation;
    # explanatoryInformation/explanatoryNotes is deliberately not read.
    notice_explanation = field("reportInfo", "noticeExplanation")

    return {
        "form_id": form_id,

        # Reporting person / filer info. get_text() yields '' (never None),
        # which keeps reporting_person_name compatible with a NOT NULL column.
        "reporting_person_name": field("reportingPerson", "name")[:250],
        "phone_number": field("reportingPerson", "phoneNumber")[:50],
        "address_street1": field("reportingPerson", "address", "street1")[:250],
        "address_street2": field("reportingPerson", "address", "street2")[:250],
        "address_city": field("reportingPerson", "address", "city")[:100],
        "address_state": field("reportingPerson", "address", "stateOrCountry")[:100],
        "address_zip": field("reportingPerson", "address", "zipCode")[:30],

        # Form N-PX filing info
        "accession_number": sec_header["accession_number"][:30],
        "cik": field("issuerCredentials", "cik")[:15],
        "conformed_period": parse_date(field("periodOfReport")),
        "date_filed": sec_header["date_filed"],
        "report_type": report_type,
        "form_type": (field("submissionType") or "N-PX")[:10],
        "sec_file_number": field("fileNumber")[:20],
        "crd_number": field("reportingCrdNumber")[:20],
        "sec_file_number_other": field("reportingSecFileNumber")[:20],
        "lei_number": field("leiNumber")[:40],
        "investment_company_type": field("investmentCompanyType")[:20],

        # 'Y' / 'N' flags and counters
        "confidential_treatment": "Y" if is_affirmative("reportInfo", "confidentialTreatment") else "N",
        "is_notice_report": "NOTICE" in report_type.upper(),
        "explanatory_choice": "Y" if is_affirmative("explanatoryInformation", "explanatoryChoice") else "N",
        "other_included_managers_count": int_or(0, "summaryPage", "otherIncludedManagersCount"),
        "series_count": 0,

        # Amendment fields
        "is_amendment": is_affirmative("amendmentInfo", "isAmendment"),
        "amendment_no": int_or(None, "amendmentInfo", "amendmentNo"),
        "amendment_type": amendment_type[:20] if amendment_type else None,
        "notice_explanation": notice_explanation[:200] if notice_explanation else None,

        # Signature fields
        "signatory_name": field("signaturePage", "txSignature")[:250],
        "signatory_name_printed": field("signaturePage", "txPrintedSignature")[:250],
        "signatory_title": field("signaturePage", "txTitle")[:100],
        "signatory_date": parse_date(field("signaturePage", "txAsOfDate")),
    }

def _manager_field(node, candidates):
    """
    Return the first non-blank value among ``candidates`` for one manager node.

    ``candidates`` is a sequence of (tag, max_len) pairs tried in order. The
    stripped text of the first tag that has non-blank text is truncated to its
    max_len and returned; '' is returned when no tag has text.
    """
    for tag, max_len in candidates:
        hits = node.xpath(f".//*[local-name()='{tag}']/text()")
        if hits:
            value = hits[0].strip()
            if value:
                return value[:max_len]
    return ""


def parse_institutional_managers(root, form_id):
    """
    Extract institutional managers from <summaryPage> -> <otherManagers2> or <otherManager>.
    Return a list of dicts for the 'institutional_manager' table.

    Manager nodes are the <investmentManagers> elements below <otherManagers2>;
    if there are none, every <otherManager> element is used instead.

    Per-field tags (first tag with non-blank text wins, then it is truncated):
      - name:            <managerName> (150)        else <name> (250)
      - form13f_number:  <icaOr13FFileNumber> (17)  else <form13FFileNumber> (20)
      - crd_number:      <crdNumber> (20)
      - sec_file_number: <otherFileNumber> (17)     else <secFileNumber> (20)
      - lei_number:      <leiNumberOM> (20)         else <leiNumber> (40)

    'serial_no' is the integer value of <serialNo>, or None when it is missing
    or not a plain decimal number. 'manager_id' is left as None; it is assigned
    later in write_to_csv().
    """
    manager_nodes = root.xpath(".//*[local-name()='otherManagers2']//*[local-name()='investmentManagers']")
    if not manager_nodes:
        manager_nodes = root.xpath(".//*[local-name()='otherManager']")

    results = []
    for mn in manager_nodes:
        # Strip before testing: pretty-printed XML pads the value with
        # whitespace, and the vote records strip theirs too, so the
        # (form_id, serial_no) join needs both sides normalised the same way.
        # isdecimal() (rather than isdigit()) only accepts what int() can parse.
        serial_hits = mn.xpath(".//*[local-name()='serialNo']/text()")
        serial_txt = serial_hits[0].strip() if serial_hits else ""
        serial_no = int(serial_txt) if serial_txt.isdecimal() else None

        results.append({
            "manager_id": None,   # Will assign in write_to_csv
            "form_id": form_id,
            "serial_no": serial_no,
            "name": _manager_field(mn, (("managerName", 150), ("name", 250))),
            "form13f_number": _manager_field(mn, (("icaOr13FFileNumber", 17), ("form13FFileNumber", 20))),
            "crd_number": _manager_field(mn, (("crdNumber", 20),)),
            "sec_file_number": _manager_field(mn, (("otherFileNumber", 17), ("secFileNumber", 20))),
            "lei_number": _manager_field(mn, (("leiNumberOM", 20), ("leiNumber", 40))),
        })

    return results

def parse_series_info(root, form_dict):
    """
    Collect one 'series' row per <seriesReports> element found below ``root``.

    Each row carries the owning form's form_id, the series code, name and LEI
    (truncated to 25 / 250 / 40 characters) and a series_id of None.
    Side effect: form_dict['series_count'] is increased by the number of rows.
    """
    owner_id = form_dict["form_id"]

    def child_text(series_node, tag, limit):
        return get_text(series_node, f".//*[local-name()='{tag}']")[:limit]

    rows = [
        {
            "series_id": None,
            "form_id": owner_id,
            "series_code": child_text(series_node, "idOfSeries", 25),
            "series_name": child_text(series_node, "nameOfSeries", 250),
            "series_lei": child_text(series_node, "leiOfSeries", 40),
        }
        for series_node in root.xpath(".//*[local-name()='seriesReports']")
    ]

    form_dict["series_count"] = form_dict["series_count"] + len(rows)
    return rows

def parse_proxy_vote_table(proxy_vote_node, form_id):
    """
    Turn every <proxyTable> below ``proxy_vote_node`` into output rows.

    For each table this appends to the module-level lists:
      - one 'proxy_voting_record' row (new vote_id from NEXT_VOTE_ID),
      - one category link per <categoryType>, registering unseen categories,
      - one manager link per integer <otherManager> code under <voteManager>,
      - one series link when <voteSeries> has text.
    Manager and series links store (form_id, serial_no) / (form_id, series_code)
    with the real id left as None. Returns None.
    """
    global NEXT_VOTE_ID

    def by_tag(tag):
        return f".//*[local-name()='{tag}']"

    for table in proxy_vote_node.xpath(by_tag("proxyTable")):
        vote_id = NEXT_VOTE_ID
        NEXT_VOTE_ID += 1

        # Only the first <voteRecord> is stored; any extras are just counted.
        vote_cast = vote_cast_shares = management_rec = other_notes = None
        records = table.xpath(by_tag("voteRecord"))
        if records:
            lead = records[0]
            vote_cast = get_text(lead, by_tag("howVoted"))[:50]
            vote_cast_shares = get_decimal(lead, by_tag("sharesVoted"))
            management_rec = get_text(lead, by_tag("managementRecommendation"))[:50]
            if len(records) > 1:
                other_notes = f"{len(records)} total <voteRecord> items found."

        PROXY_VOTING_RECORD_ROWS.append({
            "vote_id": vote_id,
            "form_id": form_id,
            "issuer_name": get_text(table, by_tag("issuerName"))[:250],
            "cusip": get_text(table, by_tag("cusip"))[:30],
            "isin": get_text(table, by_tag("isin"))[:30],
            "figi": get_text(table, by_tag("figi"))[:30],
            "meeting_date": parse_date(get_text(table, by_tag("meetingDate"))),
            "vote_description": get_text(table, by_tag("voteDescription")),
            "proposed_by": get_text(table, by_tag("voteSource"))[:20],
            "shares_voted": get_decimal(table, by_tag("sharesVoted") + "[1]"),
            "shares_on_loan": get_decimal(table, by_tag("sharesOnLoan") + "[1]"),
            "vote_cast": vote_cast,
            "vote_cast_shares": vote_cast_shares,
            "management_rec": management_rec,
            "other_notes": other_notes,
        })

        # Categories: reuse the id of a known category_type, else register it.
        category_path = ".//*[local-name()='voteCategories']//*[local-name()='categoryType']/text()"
        for raw_category in table.xpath(category_path):
            label = raw_category.strip()[:100]
            category_id = KNOWN_CATEGORIES.get(label)
            if category_id is None:
                category_id = len(KNOWN_CATEGORIES) + 1
                KNOWN_CATEGORIES[label] = category_id
                MATTER_CATEGORY_ROWS.append({"category_id": category_id, "category_type": label})
            PROXY_VOTING_RECORD_CATEGORY_ROWS.append({
                "vote_id": vote_id,
                "category_id": category_id
            })

        # Manager references: codes that are not integers are skipped.
        manager_path = ".//*[local-name()='voteManager']//*[local-name()='otherManager']/text()"
        for raw_code in table.xpath(manager_path):
            try:
                serial_no = int(raw_code.strip())
            except ValueError:
                continue
            VOTING_RECORD_MANAGER_ROWS.append({
                "vote_id": vote_id,
                "manager_id": None,  # resolved by unify_voting_references()
                "form_id": form_id,
                "serial_no": serial_no
            })

        # Series reference (first <voteSeries> only).
        series_code = get_text(table, by_tag("voteSeries"))
        if series_code:
            VOTING_RECORD_SERIES_ROWS.append({
                "vote_id": vote_id,
                "series_id": None,  # resolved by unify_voting_references()
                "form_id": form_id,
                "series_code": series_code.strip()[:25]
            })

def extract_xml_blocks(file_path):
    """
    Read ``file_path`` and return the text between each <XML> ... </XML> pair,
    in file order. Tags are matched case-insensitively and the content may
    span several lines. Returns [] when the file has no such pair.
    """
    with open(file_path, "r", encoding="utf-8", errors="replace") as handle:
        document = handle.read()
    return re.findall(r"<XML>(.*?)</XML>", document, flags=re.IGNORECASE | re.DOTALL)

def parse_xml_fragment(xml_string):
    """
    Parse one XML fragment with lxml.etree in recovery mode.

    The fragment is stripped first: the text captured between <XML> and
    </XML> can begin with whitespace, and an XML declaration is only valid at
    the very start of a document. A blank fragment is not handed to the
    parser at all.

    Returns the root element, or None when the fragment is blank or cannot be
    parsed (a warning is printed for parser errors).
    """
    fragment = xml_string.strip()
    if not fragment:
        return None

    parser = ET.XMLParser(recover=True, encoding="utf-8")
    try:
        return ET.fromstring(fragment.encode("utf-8"), parser=parser)
    except ET.XMLSyntaxError as e:
        print(f"  [Warning] parse error: {e}")
        return None

def _blank_form_row(form_id, header_info):
    """
    Placeholder 'form_npx' row for a filing that has a <proxyVoteTable> but no
    <edgarSubmission>: only the SEC-header values are known, everything else
    gets the same defaults parse_edgar_submission() uses.
    """
    return {
        "form_id": form_id,
        "reporting_person_name": "",  # NOT NULL => empty if unknown
        "phone_number": "",
        "address_street1": "",
        "address_street2": "",
        "address_city": "",
        "address_state": "",
        "address_zip": "",
        "accession_number": header_info["accession_number"][:30],
        "cik": "",
        "conformed_period": None,
        "date_filed": header_info["date_filed"],
        "report_type": "FUND VOTING REPORT",
        "form_type": "N-PX",
        "sec_file_number": "",
        "crd_number": "",
        "sec_file_number_other": "",
        "lei_number": "",
        "investment_company_type": "",
        "confidential_treatment": "N",
        "is_notice_report": False,
        "explanatory_choice": "N",
        "other_included_managers_count": 0,
        "series_count": 0,
        "is_amendment": False,
        "amendment_no": None,
        "amendment_type": None,
        "notice_explanation": None,
        "signatory_name": "",
        "signatory_name_printed": "",
        "signatory_title": "",
        "signatory_date": None,
    }


def process_npx_files(folder_path="npx_filings"):
    """
    Parse every .txt file in ``folder_path`` and fill the module-level row lists.

    For each file:
      - read the SEC header (accession number, filing date)
      - extract the <XML> blocks and parse each one
      - <edgarSubmission> -> form row, institutional managers, series
      - <proxyVoteTable>  -> voting records, attached to the most recent form
        of that file (a placeholder form is created if none was seen yet)

    Files are processed in sorted name order. os.listdir() returns entries in
    arbitrary order, and form_id / vote_id are handed out sequentially, so
    sorting keeps the generated IDs reproducible between runs and machines.
    """
    global NEXT_FORM_ID

    txt_files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith(".txt"))

    for fname in txt_files:
        file_path = os.path.join(folder_path, fname)
        print(f"\nProcessing: {file_path}")

        header_info = extract_sec_header_info(file_path)
        xml_fragments = extract_xml_blocks(file_path)
        if not xml_fragments:
            print("  No <XML> blocks found.")
            continue

        # Form that vote tables in this file are attached to (None until one exists).
        current_form = None

        for frag in xml_fragments:
            root = parse_xml_fragment(frag)
            if root is None:
                continue

            es_nodes = root.xpath("//*[local-name()='edgarSubmission']")
            if es_nodes:
                es = es_nodes[0]
                current_form = parse_edgar_submission(es, header_info)
                FORM_NPX_ROWS.append(current_form)

                INSTITUTIONAL_MANAGER_ROWS.extend(
                    parse_institutional_managers(es, current_form["form_id"])
                )
                # Also bumps current_form["series_count"].
                SERIES_ROWS.extend(parse_series_info(es, current_form))

            pvt_nodes = root.xpath("//*[local-name()='proxyVoteTable']")
            if pvt_nodes:
                if current_form is None:
                    # Vote data without any <edgarSubmission> so far.
                    current_form = _blank_form_row(NEXT_FORM_ID, header_info)
                    NEXT_FORM_ID += 1
                    FORM_NPX_ROWS.append(current_form)

                for pvt in pvt_nodes:
                    parse_proxy_vote_table(pvt, current_form["form_id"])

    print("\nFinished parsing N-PX.")
    print(f"  Forms: {len(FORM_NPX_ROWS)}")
    print(f"  Proxy Voting Rows: {len(PROXY_VOTING_RECORD_ROWS)}")

def unify_voting_references():
    """
    Resolve the placeholder keys stored on the bridging rows.

    VOTING_RECORD_MANAGER_ROWS are matched to INSTITUTIONAL_MANAGER_ROWS on
    (form_id, serial_no) and receive that row's manager_id;
    VOTING_RECORD_SERIES_ROWS are matched to SERIES_ROWS on
    (form_id, series_code) and receive that row's series_id.
    Rows without a match get None. When several source rows share a key, the
    last one wins.
    """
    # 1) managers
    manager_ids = {
        (manager["form_id"], manager["serial_no"]): manager["manager_id"]
        for manager in INSTITUTIONAL_MANAGER_ROWS
    }
    for link in VOTING_RECORD_MANAGER_ROWS:
        link["manager_id"] = manager_ids.get((link["form_id"], link["serial_no"]))

    # 2) series
    series_ids = {
        (series["form_id"], series["series_code"]): series["series_id"]
        for series in SERIES_ROWS
    }
    for link in VOTING_RECORD_SERIES_ROWS:
        link["series_id"] = series_ids.get((link["form_id"], link["series_code"]))

def _coerce_date_columns(frame, columns):
    """Convert each listed column that exists in ``frame`` to date objects (unparseable -> NaT)."""
    for col in columns:
        if col in frame.columns:
            frame[col] = pd.to_datetime(frame[col], errors="coerce").dt.date
    return frame


def _dedupe(frame, subset=None):
    """drop_duplicates() that tolerates a frame with no rows/columns."""
    if frame.empty:
        return frame
    return frame.drop_duplicates(subset=subset)


def _assign_surrogate_ids(rows, id_col, key_cols):
    """
    De-duplicate ``rows`` on ``key_cols``, number the survivors 1..n in
    ``id_col`` and return them as a DataFrame.

    The id is also written back into every dict of ``rows`` by matching on the
    key columns, so duplicates get the id of the row that was kept. (Writing
    back by position in the de-duplicated frame would shift ids onto the wrong
    rows as soon as one duplicate had been dropped.)
    """
    frame = pd.DataFrame(rows)
    if frame.empty:
        return frame

    # .copy() so the column assignment below works on an independent frame.
    frame = frame.drop_duplicates(subset=key_cols).copy()
    frame[id_col] = range(1, len(frame) + 1)

    # Surviving rows keep their original index label, i.e. their position in `rows`.
    id_by_key = {
        tuple(rows[position][col] for col in key_cols): int(new_id)
        for position, new_id in zip(frame.index, frame[id_col])
    }
    for row in rows:
        row[id_col] = id_by_key.get(tuple(row[col] for col in key_cols))
    return frame


def _write_table(frame, columns, output_folder, filename):
    """Write ``frame`` with exactly ``columns`` (missing ones left empty) to output_folder/filename."""
    frame = frame.reindex(columns=columns)
    frame.to_csv(os.path.join(output_folder, filename), index=False)
    print(f"Saved {filename}: {len(frame)} rows.")


def write_to_csv(output_folder="output_csv"):
    """
    Write each module-level row list to its own CSV file in ``output_folder``
    (created if needed), with the columns in schema order.

    Order matters: institutional_manager and series are written first because
    that is where manager_id / series_id are assigned; unify_voting_references()
    then copies those ids onto the bridging rows before the remaining tables
    are written.

    Every table, including form_npx, is written as a header-only file when its
    row list is empty.
    """
    os.makedirs(output_folder, exist_ok=True)

    # -----------------------------------------------------------------------
    # form_npx
    # -----------------------------------------------------------------------
    df_form = _coerce_date_columns(
        pd.DataFrame(FORM_NPX_ROWS),
        ["conformed_period", "date_filed", "signatory_date"],
    )
    # Guarded like the other tables: with no forms there is no 'form_id'
    # column and drop_duplicates(subset=...) would raise KeyError.
    df_form = _dedupe(df_form, subset=["form_id"])
    _write_table(
        df_form,
        [
            "form_id",
            "reporting_person_name",
            "phone_number",
            "address_street1",
            "address_street2",
            "address_city",
            "address_state",
            "address_zip",
            "accession_number",
            "cik",
            "conformed_period",
            "date_filed",
            "report_type",
            "form_type",
            "sec_file_number",
            "crd_number",
            "sec_file_number_other",
            "lei_number",
            "investment_company_type",
            "confidential_treatment",
            "is_notice_report",
            "explanatory_choice",
            "other_included_managers_count",
            "series_count",
            "is_amendment",
            "amendment_no",
            "amendment_type",
            "notice_explanation",
            "signatory_name",
            "signatory_name_printed",
            "signatory_title",
            "signatory_date",
        ],
        output_folder,
        "form_npx.csv",
    )

    # -----------------------------------------------------------------------
    # institutional_manager (assigns manager_id)
    # -----------------------------------------------------------------------
    df_im = _assign_surrogate_ids(
        INSTITUTIONAL_MANAGER_ROWS,
        "manager_id",
        ["form_id", "serial_no", "name", "form13f_number", "crd_number", "sec_file_number", "lei_number"],
    )
    _write_table(
        df_im,
        [
            "manager_id",
            "form_id",
            "serial_no",
            "name",
            "form13f_number",
            "crd_number",
            "sec_file_number",
            "lei_number",
        ],
        output_folder,
        "institutional_manager.csv",
    )

    # -----------------------------------------------------------------------
    # series (assigns series_id)
    # -----------------------------------------------------------------------
    df_s = _assign_surrogate_ids(
        SERIES_ROWS,
        "series_id",
        ["form_id", "series_code", "series_name", "series_lei"],
    )
    _write_table(
        df_s,
        ["series_id", "form_id", "series_code", "series_name", "series_lei"],
        output_folder,
        "series.csv",
    )

    # -----------------------------------------------------------------------
    # unify bridging references (manager_id, series_id)
    # -----------------------------------------------------------------------
    unify_voting_references()

    # -----------------------------------------------------------------------
    # proxy_voting_record
    # -----------------------------------------------------------------------
    df_pvr = _coerce_date_columns(pd.DataFrame(PROXY_VOTING_RECORD_ROWS), ["meeting_date"])
    df_pvr = _dedupe(df_pvr, subset=["vote_id"])
    _write_table(
        df_pvr,
        [
            "vote_id",
            "form_id",
            "issuer_name",
            "cusip",
            "isin",
            "figi",
            "meeting_date",
            "vote_description",
            "proposed_by",
            "shares_voted",
            "shares_on_loan",
            "vote_cast",
            "vote_cast_shares",
            "management_rec",
            "other_notes",
        ],
        output_folder,
        "proxy_voting_record.csv",
    )

    # -----------------------------------------------------------------------
    # matter_category
    # -----------------------------------------------------------------------
    df_mc = _dedupe(pd.DataFrame(MATTER_CATEGORY_ROWS), subset=["category_type"])
    _write_table(df_mc, ["category_id", "category_type"], output_folder, "matter_category.csv")

    # -----------------------------------------------------------------------
    # proxy_voting_record_category
    # -----------------------------------------------------------------------
    df_pvrc = _dedupe(pd.DataFrame(PROXY_VOTING_RECORD_CATEGORY_ROWS))
    _write_table(df_pvrc, ["vote_id", "category_id"], output_folder, "proxy_voting_record_category.csv")

    # -----------------------------------------------------------------------
    # voting_record_manager
    # -----------------------------------------------------------------------
    df_vrm = _dedupe(pd.DataFrame(VOTING_RECORD_MANAGER_ROWS))
    _write_table(df_vrm, ["vote_id", "manager_id"], output_folder, "voting_record_manager.csv")

    # -----------------------------------------------------------------------
    # voting_record_series
    # -----------------------------------------------------------------------
    df_vrs = _dedupe(pd.DataFrame(VOTING_RECORD_SERIES_ROWS))
    _write_table(df_vrs, ["vote_id", "series_id"], output_folder, "voting_record_series.csv")

    print("\nAll CSVs have been written. Check your output folder to confirm.")

def run_all(folder_path="npx_filings", output_folder="output_csv"):
    """
    Run the whole pipeline from a clean slate.

    Rebinds every module-level row list, the category registry and both ID
    counters to their initial values, parses the .txt filings found in
    ``folder_path``, then writes the CSV files to ``output_folder``.
    """
    global FORM_NPX_ROWS, INSTITUTIONAL_MANAGER_ROWS, SERIES_ROWS
    global PROXY_VOTING_RECORD_ROWS, MATTER_CATEGORY_ROWS
    global PROXY_VOTING_RECORD_CATEGORY_ROWS, VOTING_RECORD_MANAGER_ROWS
    global VOTING_RECORD_SERIES_ROWS, KNOWN_CATEGORIES
    global NEXT_FORM_ID, NEXT_VOTE_ID

    # Fresh containers so a second call does not accumulate onto the first.
    FORM_NPX_ROWS, INSTITUTIONAL_MANAGER_ROWS, SERIES_ROWS = [], [], []
    PROXY_VOTING_RECORD_ROWS, MATTER_CATEGORY_ROWS = [], []
    PROXY_VOTING_RECORD_CATEGORY_ROWS = []
    VOTING_RECORD_MANAGER_ROWS, VOTING_RECORD_SERIES_ROWS = [], []
    KNOWN_CATEGORIES = {}

    NEXT_FORM_ID = NEXT_VOTE_ID = 1

    process_npx_files(folder_path)
    write_to_csv(output_folder)


In [2]:
# Parse every .txt filing under npx_filings/ and write the CSV tables to output/.
run_all("npx_filings", "output")


Processing: npx_filings\2024-05-17_N-PX_A_0001193125-24-141098.txt
  No <XML> blocks found.

Processing: npx_filings\2024-05-17_N-PX_A_0001193125-24-141099.txt
  No <XML> blocks found.

Processing: npx_filings\2024-06-20_N-PX_0001829126-24-004328.txt
  No <XML> blocks found.

Processing: npx_filings\2024-07-03_N-PX_0001896711-24-000005.txt

Processing: npx_filings\2024-07-09_N-PX_0001512026-24-000004.txt

Processing: npx_filings\2024-07-17_N-PX_0001140361-24-033397.txt

Processing: npx_filings\2024-07-22_N-PX_0001085146-24-003136.txt

Processing: npx_filings\2024-07-23_N-PX_0001754960-24-000314.txt

Processing: npx_filings\2024-07-24_N-PX_0001801413-24-000006.txt

Processing: npx_filings\2024-07-26_N-PX_0001172661-24-002922.txt

Processing: npx_filings\2024-07-30_N-PX_0001044929-24-000010.txt

Processing: npx_filings\2024-07-31_N-PX_0001667731-24-000356.txt

Processing: npx_filings\2024-08-02_N-PX_0001085146-24-003449.txt

Processing: npx_filings\2024-08-05_N-PX_0001413042-24-000583.t