In [7]:
import os
import re
import datetime
import pandas as pd
import lxml.etree as ET

# -----------------------------------------------------------------------
# Global Data Lists (improved, with direct linking)
# -----------------------------------------------------------------------
FORM_NPX_ROWS = []
INSTITUTIONAL_MANAGER_ROWS = []
SERIES_ROWS = []
PROXY_VOTING_RECORD_ROWS = []
MATTER_CATEGORY_ROWS = []
PROXY_VOTING_RECORD_CATEGORY_ROWS = []
VOTING_RECORD_MANAGER_ROWS = []
VOTING_RECORD_SERIES_ROWS = []

# We'll keep track of known categories (category_type -> category_id) so we don't re-insert them
KNOWN_CATEGORIES = {}

# We'll generate incremental IDs for each form and for each vote record:
NEXT_FORM_ID = 1
NEXT_VOTE_ID = 1

# -----------------------------------------------------------------------
# Helper Functions
# -----------------------------------------------------------------------

def parse_date(date_string):
    """
    Try to parse a date string in multiple formats, returning a Python date or None.
    """
    if not date_string:
        return None
    patterns = ["%m/%d/%Y", "%Y-%m-%d", "%m-%d-%Y", "%Y%m%d"]
    for fmt in patterns:
        try:
            return datetime.datetime.strptime(date_string.strip(), fmt).date()
        except ValueError:
            pass
    return None

def get_text(node, xpath_expr):
    """
    Returns the .text of the first match or '' if none.
    """
    result = node.xpath(xpath_expr)
    if result and result[0] is not None and result[0].text:
        return result[0].text.strip()
    return ""

def get_decimal(node, xpath_expr):
    """
    Convert matched element text to float, ignoring commas. Return None if invalid.
    """
    txt = get_text(node, xpath_expr)
    if not txt:
        return None
    try:
        return float(txt.replace(",", ""))
    except ValueError:
        return None

def extract_sec_header_info(file_path):
    """
    Parse <SEC-HEADER> lines: "ACCESSION NUMBER:" and "FILED AS OF DATE:" with regex.
    """
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        raw = f.read()

    info = {
        "accession_number": "",
        "date_filed": None  # We'll parse date to a Python date
    }

    # Accession
    match_acc = re.search(r"ACCESSION\s+NUMBER:\s*([^\r\n]+)", raw, re.IGNORECASE)
    if match_acc:
        info["accession_number"] = match_acc.group(1).strip()

    # FILED AS OF DATE
    match_filed = re.search(r"FILED\s+AS\s+OF\s+DATE:\s*(\d{8})", raw, re.IGNORECASE)
    if match_filed:
        info["date_filed"] = parse_date(match_filed.group(1).strip())

    return info

def parse_edgar_submission(root, sec_header):
    """
    Extract top-level data from <edgarSubmission>.
    Returns a dictionary for one row in 'form_npx'.

    We also handle the address parsing so that <stateOrCountry> becomes
    address_state or address_country if possible.
    """
    global NEXT_FORM_ID

    form_id = NEXT_FORM_ID

    # Basic dictionary we’ll fill
    data = {
        "form_id": form_id,
        "form_type": get_text(root, ".//*[local-name()='submissionType']"),
        "registrant_type": get_text(root, ".//*[local-name()='registrantType']"),
        "live_test_flag": get_text(root, ".//*[local-name()='liveTestFlag']"),

        # CIK can sometimes be numeric or string w/ leading zeros
        "cik": get_text(root, ".//*[local-name()='issuerCredentials']/*[local-name()='cik']"),

        # Potential phone number from <reportingPerson><phoneNumber>
        "phone_number": get_text(root, ".//*[local-name()='reportingPerson']/*[local-name()='phoneNumber']"),

        # e.g. "N-1A", "N-2", etc.
        "investment_company_type": get_text(root, ".//*[local-name()='investmentCompanyType']"),

        # parse conformed period
        "conformed_period": parse_date(get_text(root, ".//*[local-name()='periodOfReport']")),

        # year_or_quarter, plus optional calendar/quarter year
        "year_or_quarter": get_text(root, ".//*[local-name()='coverPage']/*[local-name()='yearOrQuarter']"),
        "report_calendar_year": get_text(root, ".//*[local-name()='coverPage']/*[local-name()='reportCalendarYear']"),
        "report_quarter_year": get_text(root, ".//*[local-name()='coverPage']/*[local-name()='reportQuarterYear']"),

        # N-PX or an alternative
        "report_type": get_text(root, ".//*[local-name()='reportInfo']/*[local-name()='reportType']") or "INSTITUTIONAL MANAGER VOTING REPORT",

        # Y or N
        "confidential_treatment": "N",
        "is_notice_report": False,
        "is_amendment": False,
        "amendment_no": None,
        "amendment_type": None,
        "reason_for_non_confidentiality": None,

        # Some filers have <crdNumber> in <reportInfo>
        "crd_number": get_text(root, ".//*[local-name()='reportingCrdNumber']"),
        # rename from 'reporting_sec_file_number' → 'sec_file_number_other'
        "sec_file_number_other": get_text(root, ".//*[local-name()='reportingSecFileNumber']"),
        "lei_number": get_text(root, ".//*[local-name()='leiNumber']"),

        "sec_file_number": get_text(root, ".//*[local-name()='fileNumber']"),

        "explanatory_choice": "N",
        "explanatory_notes": None,
        "other_included_managers_count": 0,

        "reporting_person_name": get_text(root, ".//*[local-name()='reportingPerson']/*[local-name()='name']"),
        # We'll parse address below
        "address_street1": get_text(root, ".//*[local-name()='reportingPerson']/*[local-name()='address']/*[local-name()='street1']"),
        "address_street2": get_text(root, ".//*[local-name()='reportingPerson']/*[local-name()='address']/*[local-name()='street2']"),
        "address_city": get_text(root, ".//*[local-name()='reportingPerson']/*[local-name()='address']/*[local-name()='city']"),

        # We'll figure out if <stateOrCountry> is indeed a 2-letter state or a full country
        "address_state": "",
        "address_country": "",
        "address_zip": get_text(root, ".//*[local-name()='reportingPerson']/*[local-name()='address']/*[local-name()='zipCode']"),

        "signatory_name": get_text(root, ".//*[local-name()='signaturePage']/*[local-name()='txSignature']"),
        "signatory_name_printed": get_text(root, ".//*[local-name()='signaturePage']/*[local-name()='txPrintedSignature']"),
        "signatory_title": get_text(root, ".//*[local-name()='signaturePage']/*[local-name()='txTitle']"),
        "signatory_date": parse_date(get_text(root, ".//*[local-name()='signaturePage']/*[local-name()='txAsOfDate']")),

        "accession_number": sec_header["accession_number"],
        "date_filed": sec_header["date_filed"],
    }

    # parse <confidentialTreatment>
    conf_treat = get_text(root, ".//*[local-name()='reportInfo']/*[local-name()='confidentialTreatment']").upper()
    if conf_treat in ["Y", "YES", "TRUE", "1"]:
        data["confidential_treatment"] = "Y"

    # detect if notice report
    rt_up = data["report_type"].upper()
    data["is_notice_report"] = ("NOTICE" in rt_up)

    # parse isAmendment
    is_amd = get_text(root, ".//*[local-name()='amendmentInfo']/*[local-name()='isAmendment']").upper()
    if is_amd == "Y":
        data["is_amendment"] = True

    # parse amendment_no
    amd_no = get_text(root, ".//*[local-name()='amendmentInfo']/*[local-name()='amendmentNo']")
    if amd_no.isdigit():
        data["amendment_no"] = int(amd_no)

    # parse amendment_type
    amd_type = get_text(root, ".//*[local-name()='amendmentInfo']/*[local-name()='amendmentType']")
    if amd_type:
        data["amendment_type"] = amd_type

    # parse reason_for_non_confidentiality
    rfn = get_text(root, ".//*[local-name()='amendmentInfo']/*[local-name()='reasonForNonConfidentiality']")
    if rfn:
        data["reason_for_non_confidentiality"] = rfn

    # parse other_included_managers_count
    oimc = get_text(root, ".//*[local-name()='summaryPage']/*[local-name()='otherIncludedManagersCount']")
    if oimc.isdigit():
        data["other_included_managers_count"] = int(oimc)

    # parse explanatoryInformation
    expl_choice = get_text(root, ".//*[local-name()='explanatoryInformation']/*[local-name()='explanatoryChoice']").upper()
    if expl_choice in ["Y", "YES", "TRUE", "1"]:
        data["explanatory_choice"] = "Y"

    expl_notes = get_text(root, ".//*[local-name()='explanatoryInformation']/*[local-name()='explanatoryNotes']")
    if expl_notes:
        data["explanatory_notes"] = expl_notes

    # Now handle <stateOrCountry> logic
    raw_state_country = get_text(root, ".//*[local-name()='reportingPerson']/*[local-name()='address']/*[local-name()='stateOrCountry']")
    if raw_state_country:
        if len(raw_state_country) == 2:  # e.g. "TX", "NY"
            data["address_state"] = raw_state_country
            data["address_country"] = "US"
        else:
            # We'll assume it's the full country (e.g. "UNITED KINGDOM")
            data["address_state"] = ""
            data["address_country"] = raw_state_country

    return data

def parse_institutional_managers(root, form_id):
    """
    Extract institutional managers from <summaryPage> -> <otherManagers2> or <otherManager>.
    Return a list of manager dicts referencing the same form_id.
    We rename 'reporting_sec_file_number' -> 'sec_file_number' to avoid confusion here.
    """
    results = []
    manager_nodes = root.xpath(".//*[local-name()='otherManagers2']//*[local-name()='investmentManagers']")
    if not manager_nodes:
        manager_nodes = root.xpath(".//*[local-name()='otherManager']")

    for mn in manager_nodes:
        # We do not try to unify them with <otherManager> references from <proxyTable>.
        # This is strictly the 'institutional_manager' table from the summary page.
        row_im = {
            "manager_id": None,  # Will assign in write_to_csv
            "form_id": form_id,
            "serial_no": None,
            "name": "",
            "form13f_number": "",
            "crd_number": "",
            "sec_file_number": "",
            "lei_number": ""
        }

        # serialNo
        sn = mn.xpath(".//*[local-name()='serialNo']/text()")
        if sn and sn[0].isdigit():
            row_im["serial_no"] = float(sn[0])  # store as float or int

        # name
        nm = mn.xpath(".//*[local-name()='name']/text()")
        if nm:
            row_im["name"] = nm[0].strip()

        # form13FFileNumber
        f13 = mn.xpath(".//*[local-name()='form13FFileNumber']/text()")
        if f13:
            row_im["form13f_number"] = f13[0].strip()

        # crdNumber
        crd = mn.xpath(".//*[local-name()='crdNumber']/text()")
        if crd:
            row_im["crd_number"] = crd[0].strip()

        # secFileNumber
        sfn = mn.xpath(".//*[local-name()='secFileNumber']/text()")
        if sfn:
            row_im["sec_file_number"] = sfn[0].strip()

        # leiNumber
        lei = mn.xpath(".//*[local-name()='leiNumber']/text()")
        if lei:
            row_im["lei_number"] = lei[0].strip()

        results.append(row_im)

    return results

def parse_series_info(root, form_id):
    """
    <seriesPage> -> <seriesDetails> -> <seriesReports> -> <idOfSeries>, <nameOfSeries>, <leiOfSeries>.
    Return list referencing form_id.
    """
    results = []
    series_nodes = root.xpath(".//*[local-name()='seriesReports']")
    for sn in series_nodes:
        s_data = {
            "series_id": None,  # assigned later
            "form_id": form_id,
            "series_code": get_text(sn, ".//*[local-name()='idOfSeries']"),
            "series_name": get_text(sn, ".//*[local-name()='nameOfSeries']"),
            "series_lei": get_text(sn, ".//*[local-name()='leiOfSeries']")
        }
        results.append(s_data)
    return results

def parse_proxy_vote_table(proxy_vote_node, form_id):
    """
    Parse each <proxyTable> inside <proxyVoteTable>, generate one row in PROXY_VOTING_RECORD_ROWS
    per <proxyTable>, plus direct link rows for categories, managers, and series.

    This eliminates the old "round-robin" approach by assigning vote_id as we parse.
    """
    global NEXT_VOTE_ID, KNOWN_CATEGORIES

    for pt in proxy_vote_node.xpath(".//*[local-name()='proxyTable']"):
        vote_id = NEXT_VOTE_ID
        NEXT_VOTE_ID += 1

        # Build the main proxy voting record
        row = {
            "vote_id": vote_id,
            "form_id": form_id,
            "issuer_name": get_text(pt, ".//*[local-name()='issuerName']"),
            "cusip": get_text(pt, ".//*[local-name()='cusip']"),
            "isin": get_text(pt, ".//*[local-name()='isin']"),
            "figi": get_text(pt, ".//*[local-name()='figi']"),

            "meeting_date": parse_date(get_text(pt, ".//*[local-name()='meetingDate']")),
            "vote_description": get_text(pt, ".//*[local-name()='voteDescription']"),
            "proposed_by": get_text(pt, ".//*[local-name()='voteSource']"),

            "shares_voted": get_decimal(pt, ".//*[local-name()='sharesVoted'][1]"),
            "shares_on_loan": get_decimal(pt, ".//*[local-name()='sharesOnLoan'][1]"),

            "vote_cast": None,
            "vote_cast_shares": None,
            "management_rec": None,
            "other_notes": None
        }

        # If there is <voteRecord>, pull out howVoted, managementRecommendation, etc.
        vote_records = pt.xpath(".//*[local-name()='voteRecord']")
        if vote_records:
            # We'll just grab the first <voteRecord>
            vr = vote_records[0]
            row["vote_cast"] = get_text(vr, ".//*[local-name()='howVoted']")
            row["vote_cast_shares"] = get_decimal(vr, ".//*[local-name()='sharesVoted']")
            row["management_rec"] = get_text(vr, ".//*[local-name()='managementRecommendation']")

            if len(vote_records) > 1:
                row["other_notes"] = f"{len(vote_records)} total voteRecord items."

        # Append to global PROXY_VOTING_RECORD_ROWS
        PROXY_VOTING_RECORD_ROWS.append(row)

        # Parse categories (<voteCategories>)
        categories = pt.xpath(".//*[local-name()='voteCategories']//*[local-name()='categoryType']/text()")
        for cat_str in categories:
            cat_clean = cat_str.strip()
            if cat_clean not in KNOWN_CATEGORIES:
                new_cat_id = len(KNOWN_CATEGORIES) + 1
                KNOWN_CATEGORIES[cat_clean] = new_cat_id
                MATTER_CATEGORY_ROWS.append({"category_id": new_cat_id, "category_type": cat_clean})

            cat_id = KNOWN_CATEGORIES[cat_clean]
            PROXY_VOTING_RECORD_CATEGORY_ROWS.append({
                "vote_id": vote_id,
                "category_id": cat_id
            })

        # Parse manager references (<voteManager><otherManagers> -> <otherManager>)
        # Typically these are codes or short strings. We'll store them directly:
        other_mgrs = pt.xpath(".//*[local-name()='voteManager']//*[local-name()='otherManager']/text()")
        for mgr_code in other_mgrs:
            # We'll store the text in VOTING_RECORD_MANAGER_ROWS with manager_id=None
            # If you want to unify it with institutional_manager, you'd do more matching logic.
            VOTING_RECORD_MANAGER_ROWS.append({
                "vote_id": vote_id,
                "manager_id": mgr_code.strip()  # We'll store the code in this field for now
            })

        # Parse series reference (<voteSeries>)
        vs = get_text(pt, ".//*[local-name()='voteSeries']")
        if vs:
            # For now, we store the raw code. If needed, you match it to the 'series' table.
            VOTING_RECORD_SERIES_ROWS.append({
                "vote_id": vote_id,
                "series_id": vs
            })

def extract_xml_blocks(file_path):
    """
    Return a list of <XML> ... </XML> substrings from the file.
    """
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        text = f.read()
    pattern = re.compile(r"<XML>(.*?)</XML>", re.IGNORECASE | re.DOTALL)
    return pattern.findall(text)

def parse_xml_fragment(xml_string):
    """
    Attempt to parse an XML fragment with lxml.etree in recovery mode.
    """
    parser = ET.XMLParser(recover=True, encoding="utf-8")
    try:
        root = ET.fromstring(xml_string.encode("utf-8"), parser=parser)
        return root
    except ET.XMLSyntaxError as e:
        print(f"  [Warning] parse error: {e}")
        return None


def process_npx_files(folder_path="npx_filings"):
    """
    Main loop:
      1) For each .txt in folder_path
      2) Extract <SEC-HEADER> info
      3) Extract <XML> blocks
      4) Parse <edgarSubmission> (build 'form_npx' row, plus managers, series)
      5) Parse <proxyVoteTable> (build 'proxy_voting_record' + link rows)
    """
    global NEXT_FORM_ID

    all_files = os.listdir(folder_path)
    txt_files = [f for f in all_files if f.lower().endswith(".txt")]

    for fname in txt_files:
        file_path = os.path.join(folder_path, fname)
        print(f"\nProcessing: {file_path}")

        # 1) SEC header info
        header_info = extract_sec_header_info(file_path)

        # 2) <XML> blocks
        xml_fragments = extract_xml_blocks(file_path)
        if not xml_fragments:
            print("  No <XML> blocks found.")
            continue

        found_form_data = False
        current_form_id = None

        for frag in xml_fragments:
            root = parse_xml_fragment(frag)
            if root is None:
                continue

            # find <edgarSubmission>
            es_nodes = root.xpath("//*[local-name()='edgarSubmission']")
            if es_nodes:
                es = es_nodes[0]
                # parse submission
                form_row = parse_edgar_submission(es, header_info)
                current_form_id = form_row["form_id"]
                NEXT_FORM_ID += 1

                # add to FORM_NPX_ROWS
                FORM_NPX_ROWS.append(form_row)

                # parse managers
                im_list = parse_institutional_managers(es, current_form_id)
                INSTITUTIONAL_MANAGER_ROWS.extend(im_list)

                # parse series
                s_list = parse_series_info(es, current_form_id)
                SERIES_ROWS.extend(s_list)

                found_form_data = True

            # find <proxyVoteTable>
            pvt_nodes = root.xpath("//*[local-name()='proxyVoteTable']")
            if pvt_nodes:
                # If we haven't found any <edgarSubmission>, we must create a minimal form row
                # so that we have a form_id for these votes.
                if not found_form_data:
                    minimal_form = {
                        "form_id": NEXT_FORM_ID,
                        "form_type": "",
                        "registrant_type": "",
                        "live_test_flag": "",
                        "cik": "",
                        "phone_number": "",
                        "investment_company_type": "",
                        "conformed_period": None,
                        "year_or_quarter": "",
                        "report_calendar_year": "",
                        "report_quarter_year": "",
                        "report_type": "",
                        "confidential_treatment": "N",
                        "is_notice_report": False,
                        "is_amendment": False,
                        "amendment_no": None,
                        "amendment_type": None,
                        "reason_for_non_confidentiality": None,
                        # rename to match final schema
                        "crd_number": "",
                        "sec_file_number_other": "",
                        "lei_number": "",
                        "sec_file_number": "",
                        "explanatory_choice": "N",
                        "explanatory_notes": None,
                        "other_included_managers_count": 0,
                        "reporting_person_name": "",
                        "address_street1": "",
                        "address_street2": "",
                        "address_city": "",
                        "address_state": "",
                        "address_country": "",
                        "address_zip": "",
                        "signatory_name": "",
                        "signatory_name_printed": "",
                        "signatory_title": "",
                        "signatory_date": None,
                        "accession_number": header_info["accession_number"],
                        "date_filed": header_info["date_filed"],
                    }
                    current_form_id = minimal_form["form_id"]
                    FORM_NPX_ROWS.append(minimal_form)
                    NEXT_FORM_ID += 1
                    found_form_data = True

                # parse each proxyVoteTable node
                for pvt in pvt_nodes:
                    parse_proxy_vote_table(pvt, current_form_id)

    # End for
    print("\nFinished parsing N-PX.")
    print(f"  Forms: {len(FORM_NPX_ROWS)}")
    print(f"  Proxy Voting Rows: {len(PROXY_VOTING_RECORD_ROWS)}")

def write_to_csv(output_folder="output_csv"):
    """
    Writes each global list to CSV, creating an ID for managers, series, etc.
    Note that manager linking and series linking in the code now references actual
    <otherManager> text or <voteSeries> text. If you want to unify them with the
    institutional_manager or series table, you must do matching logic or store them
    differently.
    """
    os.makedirs(output_folder, exist_ok=True)

    # form_npx
    df_form = pd.DataFrame(FORM_NPX_ROWS)
    # Convert date-like columns to date or string (here we keep them as string if None).
    date_cols = ["conformed_period", "date_filed", "signatory_date"]
    for dc in date_cols:
        if dc in df_form.columns:
            df_form[dc] = pd.to_datetime(df_form[dc]).dt.date
    csv_form = os.path.join(output_folder, "form_npx.csv")
    df_form.drop_duplicates().to_csv(csv_form, index=False)
    print(f"Saved form_npx.csv: {len(df_form)} rows.")

    # institutional_manager
    # We'll assign manager_id from the index + 1
    df_im = pd.DataFrame(INSTITUTIONAL_MANAGER_ROWS)
    if not df_im.empty:
        df_im.drop_duplicates(inplace=True)
        df_im["manager_id"] = range(1, len(df_im) + 1)
    csv_im = os.path.join(output_folder, "institutional_manager.csv")
    df_im.to_csv(csv_im, index=False)
    print(f"Saved institutional_manager.csv: {len(df_im)} rows.")

    # series
    df_s = pd.DataFrame(SERIES_ROWS)
    if not df_s.empty:
        df_s.drop_duplicates(inplace=True)
        df_s["series_id"] = range(1, len(df_s) + 1)
    csv_s = os.path.join(output_folder, "series.csv")
    df_s.to_csv(csv_s, index=False)
    print(f"Saved series.csv: {len(df_s)} rows.")

    # proxy_voting_record
    df_pvr = pd.DataFrame(PROXY_VOTING_RECORD_ROWS)
    # We already assigned a vote_id, so just drop duplicates
    if not df_pvr.empty:
        df_pvr.drop_duplicates(subset=["vote_id"], inplace=True)
        # convert date columns
        if "meeting_date" in df_pvr.columns:
            df_pvr["meeting_date"] = pd.to_datetime(df_pvr["meeting_date"]).dt.date
    csv_pvr = os.path.join(output_folder, "proxy_voting_record.csv")
    df_pvr.to_csv(csv_pvr, index=False)
    print(f"Saved proxy_voting_record.csv: {len(df_pvr)} rows.")

    # matter_category
    df_mc = pd.DataFrame(MATTER_CATEGORY_ROWS)
    if not df_mc.empty:
        df_mc.drop_duplicates(subset=["category_type"], inplace=True)
    csv_mc = os.path.join(output_folder, "matter_category.csv")
    df_mc.to_csv(csv_mc, index=False)
    print(f"Saved matter_category.csv: {len(df_mc)} rows.")

    # proxy_voting_record_category
    df_pvrc = pd.DataFrame(PROXY_VOTING_RECORD_CATEGORY_ROWS)
    # Here we keep the exact links we generated, no round-robin:
    if not df_pvrc.empty:
        df_pvrc.drop_duplicates(inplace=True)
    csv_pvrc = os.path.join(output_folder, "proxy_voting_record_category.csv")
    df_pvrc.to_csv(csv_pvrc, index=False)
    print(f"Saved proxy_voting_record_category.csv: {len(df_pvrc)} rows.")

    # voting_record_manager
    # This code captures the raw text from <otherManager> as manager_id,
    # so the table might have manager_id as strings. If you want to unify them,
    # you'd match them to institutional_manager. For now, we just store them.
    df_vrm = pd.DataFrame(VOTING_RECORD_MANAGER_ROWS)
    if not df_vrm.empty:
        df_vrm.drop_duplicates(inplace=True)
    csv_vrm = os.path.join(output_folder, "voting_record_manager.csv")
    df_vrm.to_csv(csv_vrm, index=False)
    print(f"Saved voting_record_manager.csv: {len(df_vrm)} rows.")

    # voting_record_series
    df_vrs = pd.DataFrame(VOTING_RECORD_SERIES_ROWS)
    if not df_vrs.empty:
        df_vrs.drop_duplicates(inplace=True)
    csv_vrs = os.path.join(output_folder, "voting_record_series.csv")
    df_vrs.to_csv(csv_vrs, index=False)
    print(f"Saved voting_record_series.csv: {len(df_vrs)} rows.")

    print("\nAll CSVs have been written. Check your output folder to confirm.")


def run_all(folder_path="npx_filings", output_folder="output_csv"):
    """
    High-level convenience function to:
      1) Reset global data
      2) Parse the N-PX .txt files
      3) Write results to CSV
    """
    global FORM_NPX_ROWS, INSTITUTIONAL_MANAGER_ROWS, SERIES_ROWS, \
       PROXY_VOTING_RECORD_ROWS, MATTER_CATEGORY_ROWS, \
       PROXY_VOTING_RECORD_CATEGORY_ROWS, VOTING_RECORD_MANAGER_ROWS, \
       VOTING_RECORD_SERIES_ROWS, KNOWN_CATEGORIES, NEXT_FORM_ID, NEXT_VOTE_ID

    # Reset everything
    FORM_NPX_ROWS = []
    INSTITUTIONAL_MANAGER_ROWS = []
    SERIES_ROWS = []
    PROXY_VOTING_RECORD_ROWS = []
    MATTER_CATEGORY_ROWS = []
    PROXY_VOTING_RECORD_CATEGORY_ROWS = []
    VOTING_RECORD_MANAGER_ROWS = []
    VOTING_RECORD_SERIES_ROWS = []
    KNOWN_CATEGORIES = {}

    NEXT_FORM_ID = 1
    NEXT_VOTE_ID = 1

    # Parse the files
    process_npx_files(folder_path)

    # Write the results to CSV
    write_to_csv(output_folder)





In [8]:
run_all("npx_filings", "output")



Processing: npx_filings\2024-06-20_N-PX_0001829126-24-004328.txt
  No <XML> blocks found.

Processing: npx_filings\2024-07-01_N-PX_0001062993-24-013157.txt

Processing: npx_filings\2024-07-01_N-PX_0001172661-24-002616.txt

Processing: npx_filings\2024-07-02_N-PX_0001788241-24-000005.txt

Processing: npx_filings\2024-07-03_N-PX_0000354923-24-000004.txt

Processing: npx_filings\2024-07-03_N-PX_0001545812-24-000002.txt

Processing: npx_filings\2024-07-03_N-PX_0001896711-24-000005.txt

Processing: npx_filings\2024-07-05_N-PX_0001768130-24-000002.txt

Processing: npx_filings\2024-07-05_N-PX_0001786241-24-000005.txt

Processing: npx_filings\2024-07-09_N-PX_0001085146-24-002797.txt

Processing: npx_filings\2024-07-09_N-PX_0001512026-24-000004.txt

Processing: npx_filings\2024-07-09_N-PX_0001641296-24-000003.txt

Processing: npx_filings\2024-07-09_N-PX_0001765380-24-000233.txt

Processing: npx_filings\2024-07-10_N-PX_0001172661-24-002724.txt

Processing: npx_filings\2024-07-11_N-PX_0001754960

OutOfBoundsDatetime: Out of bounds nanosecond timestamp: 0247-05-22, at position 385

In [None]:
import os
import re
import datetime
import pandas as pd
import lxml.etree as ET

# -----------------------------------------------------------------------
# Global Data Lists (matching the final schema)
# -----------------------------------------------------------------------
FORM_NPX_ROWS = []
INSTITUTIONAL_MANAGER_ROWS = []
SERIES_ROWS = []
PROXY_VOTING_RECORD_ROWS = []
MATTER_CATEGORY_ROWS = []
PROXY_VOTING_RECORD_CATEGORY_ROWS = []
VOTING_RECORD_MANAGER_ROWS = []
VOTING_RECORD_SERIES_ROWS = []

# We’ll keep track of known categories (category_type -> category_id)
KNOWN_CATEGORIES = {}

# We'll generate incremental IDs for each form and each vote record
NEXT_FORM_ID = 1
NEXT_VOTE_ID = 1

# -----------------------------------------------------------------------
# Helper Functions
# -----------------------------------------------------------------------

def parse_date(date_string):
    """
    Try to parse a date string in multiple formats, returning a Python date or None.
    """
    if not date_string:
        return None
    patterns = ["%m/%d/%Y", "%Y-%m-%d", "%m-%d-%Y", "%Y%m%d"]
    for fmt in patterns:
        try:
            return datetime.datetime.strptime(date_string.strip(), fmt).date()
        except ValueError:
            pass
    return None

def get_text(node, xpath_expr):
    """
    Returns the .text of the first match or '' if none.
    """
    result = node.xpath(xpath_expr)
    if result and result[0] is not None and result[0].text:
        return result[0].text.strip()
    return ""

def get_decimal(node, xpath_expr):
    """
    Convert matched element text to float, ignoring commas. Return None if invalid.
    """
    txt = get_text(node, xpath_expr)
    if not txt:
        return None
    try:
        return float(txt.replace(",", ""))
    except ValueError:
        return None

def extract_sec_header_info(file_path):
    """
    Parse <SEC-HEADER> lines: "ACCESSION NUMBER:" and "FILED AS OF DATE:" with regex.
    Returns a dict with 'accession_number' (str) and 'date_filed' (date).
    """
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        raw = f.read()

    info = {
        "accession_number": "",
        "date_filed": None
    }

    # Accession
    match_acc = re.search(r"ACCESSION\s+NUMBER:\s*([^\r\n]+)", raw, re.IGNORECASE)
    if match_acc:
        info["accession_number"] = match_acc.group(1).strip()

    # FILED AS OF DATE
    match_filed = re.search(r"FILED\s+AS\s+OF\s+DATE:\s*(\d{8})", raw, re.IGNORECASE)
    if match_filed:
        info["date_filed"] = parse_date(match_filed.group(1).strip())

    return info

def parse_edgar_submission(root, sec_header):
    """
    Extract top-level data from <edgarSubmission> for the 'form_npx' table
    (matching the final schema).
    """
    global NEXT_FORM_ID  # Must be at the top of the function scope

    form_id = NEXT_FORM_ID
    NEXT_FORM_ID += 1

    # Create a dict that only contains columns from the final "form_npx" schema.
    data = {
        # Primary Key (auto-assigned)
        "form_id": form_id,

        # Reporting Person / Filer Info
        "reporting_person_name": get_text(root, ".//*[local-name()='reportingPerson']/*[local-name()='name']"),
        "phone_number": get_text(root, ".//*[local-name()='reportingPerson']/*[local-name()='phoneNumber']"),
        "address_street1": get_text(root, ".//*[local-name()='reportingPerson']/*[local-name()='address']/*[local-name()='street1']"),
        "address_street2": get_text(root, ".//*[local-name()='reportingPerson']/*[local-name()='address']/*[local-name()='street2']"),
        "address_city": get_text(root, ".//*[local-name()='reportingPerson']/*[local-name()='address']/*[local-name()='city']"),
        "address_state": "",  # We'll set below if found
        "address_zip": get_text(root, ".//*[local-name()='reportingPerson']/*[local-name()='address']/*[local-name()='zipCode']"),

        # Form N-PX Filing Info
        "accession_number": sec_header["accession_number"],
        "cik": get_text(root, ".//*[local-name()='issuerCredentials']/*[local-name()='cik']"),
        "conformed_period": parse_date(get_text(root, ".//*[local-name()='periodOfReport']")),
        "date_filed": sec_header["date_filed"],

        # We'll parse these from the submission
        "report_type": get_text(root, ".//*[local-name()='reportInfo']/*[local-name()='reportType']") or "FUND VOTING REPORT",
        "form_type": get_text(root, ".//*[local-name()='submissionType']") or "N-PX",
        "sec_file_number": get_text(root, ".//*[local-name()='fileNumber']"),
        "crd_number": get_text(root, ".//*[local-name()='reportingCrdNumber']"),
        "sec_file_number_other": get_text(root, ".//*[local-name()='reportingSecFileNumber']"),
        "lei_number": get_text(root, ".//*[local-name()='leiNumber']"),
        "investment_company_type": get_text(root, ".//*[local-name()='investmentCompanyType']"),

        # Y or N
        "confidential_treatment": "N",

        # Additional flags
        "is_notice_report": False,
        "explanatory_choice": "N",
        "other_included_managers_count": 0,
        "series_count": 0,  # Will update after parsing <seriesReports>

        # Amendment fields
        "is_amendment": False,
        "amendment_no": None,
        "amendment_type": None,
        "notice_explanation": None,

        # Signature fields
        "signatory_name": get_text(root, ".//*[local-name()='signaturePage']/*[local-name()='txSignature']"),
        "signatory_name_printed": get_text(root, ".//*[local-name()='signaturePage']/*[local-name()='txPrintedSignature']"),
        "signatory_title": get_text(root, ".//*[local-name()='signaturePage']/*[local-name()='txTitle']"),
        "signatory_date": parse_date(get_text(root, ".//*[local-name()='signaturePage']/*[local-name()='txAsOfDate']")),
    }

    # Confidential treatment
    conf_treat = get_text(root, ".//*[local-name()='reportInfo']/*[local-name()='confidentialTreatment']").upper()
    if conf_treat in ["Y", "YES", "TRUE", "1"]:
        data["confidential_treatment"] = "Y"

    # Check if the report_type indicates a NOTICE report
    rt_up = data["report_type"].upper()
    if "NOTICE" in rt_up:
        data["is_notice_report"] = True

    # Check explanatory_choice
    expl_choice = get_text(root, ".//*[local-name()='explanatoryInformation']/*[local-name()='explanatoryChoice']").upper()
    if expl_choice in ["Y", "YES", "TRUE", "1"]:
        data["explanatory_choice"] = "Y"

    # If we have an explanatoryNotes field, treat it as "notice_explanation"
    expl_notes = get_text(root, ".//*[local-name()='explanatoryInformation']/*[local-name()='explanatoryNotes']")
    if expl_notes:
        data["notice_explanation"] = expl_notes

    # otherIncludedManagersCount
    oimc = get_text(root, ".//*[local-name()='summaryPage']/*[local-name()='otherIncludedManagersCount']")
    if oimc.isdigit():
        data["other_included_managers_count"] = int(oimc)

    # isAmendment
    is_amd = get_text(root, ".//*[local-name()='amendmentInfo']/*[local-name()='isAmendment']").upper()
    if is_amd in ["Y", "YES", "TRUE", "1"]:
        data["is_amendment"] = True

    # amendment_no
    amd_no = get_text(root, ".//*[local-name()='amendmentInfo']/*[local-name()='amendmentNo']")
    if amd_no.isdigit():
        data["amendment_no"] = int(amd_no)

    # amendment_type
    amd_type = get_text(root, ".//*[local-name()='amendmentInfo']/*[local-name()='amendmentType']")
    if amd_type:
        data["amendment_type"] = amd_type

    # stateOrCountry logic: not in final schema except we do have address_state
    raw_state_country = get_text(root, ".//*[local-name()='reportingPerson']/*[local-name()='address']/*[local-name()='stateOrCountry']")
    # If it's 2 letters, store it directly. Otherwise, store the full string.
    if raw_state_country:
        data["address_state"] = raw_state_country

    return data

def parse_institutional_managers(root, form_id):
    """
    Extract institutional managers from <summaryPage> -> <otherManagers2> or <otherManager>.
    Return a list of dicts for the 'institutional_manager' table.
    """
    results = []
    manager_nodes = root.xpath(".//*[local-name()='otherManagers2']//*[local-name()='investmentManagers']")
    if not manager_nodes:
        manager_nodes = root.xpath(".//*[local-name()='otherManager']")

    for mn in manager_nodes:
        row_im = {
            "manager_id": None,  # Assigned later in write_to_csv
            "form_id": form_id,
            "serial_no": None,
            "name": "",
            "form13f_number": "",
            "crd_number": "",
            "sec_file_number": "",
            "lei_number": ""
        }

        # Parse possible fields
        sn = mn.xpath(".//*[local-name()='serialNo']/text()")
        if sn and sn[0].isdigit():
            row_im["serial_no"] = int(sn[0])

        nm = mn.xpath(".//*[local-name()='name']/text()")
        if nm:
            row_im["name"] = nm[0].strip()

        f13 = mn.xpath(".//*[local-name()='form13FFileNumber']/text()")
        if f13:
            row_im["form13f_number"] = f13[0].strip()

        crd = mn.xpath(".//*[local-name()='crdNumber']/text()")
        if crd:
            row_im["crd_number"] = crd[0].strip()

        sfn = mn.xpath(".//*[local-name()='secFileNumber']/text()")
        if sfn:
            row_im["sec_file_number"] = sfn[0].strip()

        lei = mn.xpath(".//*[local-name()='leiNumber']/text()")
        if lei:
            row_im["lei_number"] = lei[0].strip()

        results.append(row_im)

    return results

def parse_series_info(root, form_dict):
    """
    <seriesPage> -> <seriesDetails> -> <seriesReports>
    Returns list referencing form_id for the 'series' table.
    Also updates 'series_count' in form_dict to the total # of series found.
    """
    form_id = form_dict["form_id"]
    series_nodes = root.xpath(".//*[local-name()='seriesReports']")
    results = []
    for sn in series_nodes:
        s_data = {
            "series_id": None,  # assigned later
            "form_id": form_id,
            "series_code": get_text(sn, ".//*[local-name()='idOfSeries']"),
            "series_name": get_text(sn, ".//*[local-name()='nameOfSeries']"),
            "series_lei": get_text(sn, ".//*[local-name()='leiOfSeries']")
        }
        results.append(s_data)

    # If we found any series, update the count in the form dict
    form_dict["series_count"] += len(results)
    return results

def parse_proxy_vote_table(proxy_vote_node, form_id):
    """
    Parse each <proxyTable> inside <proxyVoteTable>, generate rows for 'proxy_voting_record'
    plus linking rows (categories, managers, series).
    """
    global NEXT_VOTE_ID  # Must be declared at top of function if we're incrementing it

    for pt in proxy_vote_node.xpath(".//*[local-name()='proxyTable']"):
        vote_id = NEXT_VOTE_ID
        NEXT_VOTE_ID += 1

        # Build the main proxy voting record
        row = {
            "vote_id": vote_id,
            "form_id": form_id,
            "issuer_name": get_text(pt, ".//*[local-name()='issuerName']"),
            "cusip": get_text(pt, ".//*[local-name()='cusip']"),
            "isin": get_text(pt, ".//*[local-name()='isin']"),
            "figi": get_text(pt, ".//*[local-name()='figi']"),
            "meeting_date": parse_date(get_text(pt, ".//*[local-name()='meetingDate']")),
            "vote_description": get_text(pt, ".//*[local-name()='voteDescription']"),
            "proposed_by": get_text(pt, ".//*[local-name()='voteSource']"),
            "shares_voted": get_decimal(pt, ".//*[local-name()='sharesVoted'][1]"),
            "shares_on_loan": get_decimal(pt, ".//*[local-name()='sharesOnLoan'][1]"),
            "vote_cast": None,
            "vote_cast_shares": None,
            "management_rec": None,
            "other_notes": None
        }

        # If there is <voteRecord>, pull out howVoted, managementRecommendation, etc.
        vote_records = pt.xpath(".//*[local-name()='voteRecord']")
        if vote_records:
            vr = vote_records[0]
            row["vote_cast"] = get_text(vr, ".//*[local-name()='howVoted']")
            row["vote_cast_shares"] = get_decimal(vr, ".//*[local-name()='sharesVoted']")
            row["management_rec"] = get_text(vr, ".//*[local-name()='managementRecommendation']")

            if len(vote_records) > 1:
                row["other_notes"] = f"{len(vote_records)} total <voteRecord> items found."

        # Append to global PROXY_VOTING_RECORD_ROWS
        PROXY_VOTING_RECORD_ROWS.append(row)

        # Parse categories (<voteCategories>)
        categories = pt.xpath(".//*[local-name()='voteCategories']//*[local-name()='categoryType']/text()")
        for cat_str in categories:
            cat_clean = cat_str.strip()
            if cat_clean not in KNOWN_CATEGORIES:
                new_cat_id = len(KNOWN_CATEGORIES) + 1
                KNOWN_CATEGORIES[cat_clean] = new_cat_id
                MATTER_CATEGORY_ROWS.append({"category_id": new_cat_id, "category_type": cat_clean})

            cat_id = KNOWN_CATEGORIES[cat_clean]
            PROXY_VOTING_RECORD_CATEGORY_ROWS.append({
                "vote_id": vote_id,
                "category_id": cat_id
            })

        # Parse manager references (<voteManager><otherManagers> -> <otherManager>)
        # According to the final schema, we store (vote_id, manager_id).
        # We do a simple approach: if <otherManager> text matches a serialNo in institutional_manager for the same form.
        other_mgrs = pt.xpath(".//*[local-name()='voteManager']//*[local-name()='otherManager']/text()")
        for mgr_code in other_mgrs:
            mgr_code = mgr_code.strip()
            match_id = None
            for im_row in INSTITUTIONAL_MANAGER_ROWS:
                if im_row["form_id"] == form_id and im_row["serial_no"] is not None:
                    try:
                        if int(mgr_code) == im_row["serial_no"]:
                            match_id = im_row["manager_id"]  # Will be assigned later in write_to_csv
                            break
                    except ValueError:
                        pass

            if match_id is not None:
                VOTING_RECORD_MANAGER_ROWS.append({
                    "vote_id": vote_id,
                    "manager_id": match_id
                })

        # Parse series reference (<voteSeries>)
        vs_code = get_text(pt, ".//*[local-name()='voteSeries']")
        if vs_code:
            # We must match this to the series table by 'series_code' for the same form_id
            VOTING_RECORD_SERIES_ROWS.append({
                "vote_id": vote_id,
                # Temporarily storing the code in 'series_id' until we unify after we create them
                "series_id": vs_code
            })

def extract_xml_blocks(file_path):
    """
    Return a list of <XML> ... </XML> substrings from the file.
    """
    with open(file_path, "r", encoding="utf-8", errors="replace") as f:
        text = f.read()
    pattern = re.compile(r"<XML>(.*?)</XML>", re.IGNORECASE | re.DOTALL)
    return pattern.findall(text)

def parse_xml_fragment(xml_string):
    """
    Attempt to parse an XML fragment with lxml.etree in recovery mode.
    """
    parser = ET.XMLParser(recover=True, encoding="utf-8")
    try:
        root = ET.fromstring(xml_string.encode("utf-8"), parser=parser)
        return root
    except ET.XMLSyntaxError as e:
        print(f"  [Warning] parse error: {e}")
        return None

def process_npx_files(folder_path="npx_filings"):
    """
    Main loop:
      1) For each .txt in folder_path
      2) Extract <SEC-HEADER> info
      3) Extract <XML> blocks
      4) Parse <edgarSubmission> (build 'form_npx' row)
      5) Parse managers, series
      6) Parse <proxyVoteTable>
    """
    global NEXT_FORM_ID  # Must appear at the top of the function, before usage
    global FORM_NPX_ROWS, INSTITUTIONAL_MANAGER_ROWS, SERIES_ROWS

    all_files = os.listdir(folder_path)
    txt_files = [f for f in all_files if f.lower().endswith(".txt")]

    for fname in txt_files:
        file_path = os.path.join(folder_path, fname)
        print(f"\nProcessing: {file_path}")

        # 1) SEC header info
        header_info = extract_sec_header_info(file_path)

        # 2) <XML> blocks
        xml_fragments = extract_xml_blocks(file_path)
        if not xml_fragments:
            print("  No <XML> blocks found.")
            continue

        found_form_data = False
        current_form = None

        for frag in xml_fragments:
            root = parse_xml_fragment(frag)
            if root is None:
                continue

            # find <edgarSubmission>
            es_nodes = root.xpath("//*[local-name()='edgarSubmission']")
            if es_nodes:
                es = es_nodes[0]
                # parse submission for the form_npx table
                form_row = parse_edgar_submission(es, header_info)
                current_form = form_row
                FORM_NPX_ROWS.append(form_row)
                found_form_data = True

                # parse managers
                im_list = parse_institutional_managers(es, form_row["form_id"])
                INSTITUTIONAL_MANAGER_ROWS.extend(im_list)

                # parse series (will also update series_count in form_row)
                s_list = parse_series_info(es, form_row)
                SERIES_ROWS.extend(s_list)

            # find <proxyVoteTable>
            pvt_nodes = root.xpath("//*[local-name()='proxyVoteTable']")
            if pvt_nodes:
                if not found_form_data:
                    # If we haven't created a form row yet, create a minimal one
                    form_id = NEXT_FORM_ID
                    NEXT_FORM_ID += 1

                    minimal_form = {
                        "form_id": form_id,
                        "reporting_person_name": "",
                        "phone_number": "",
                        "address_street1": "",
                        "address_street2": "",
                        "address_city": "",
                        "address_state": "",
                        "address_zip": "",
                        "accession_number": header_info["accession_number"],
                        "cik": "",
                        "conformed_period": None,
                        "date_filed": header_info["date_filed"],
                        "report_type": "FUND VOTING REPORT",
                        "form_type": "N-PX",
                        "sec_file_number": "",
                        "crd_number": "",
                        "sec_file_number_other": "",
                        "lei_number": "",
                        "investment_company_type": "",
                        "confidential_treatment": "N",
                        "is_notice_report": False,
                        "explanatory_choice": "N",
                        "other_included_managers_count": 0,
                        "series_count": 0,
                        "is_amendment": False,
                        "amendment_no": None,
                        "amendment_type": None,
                        "notice_explanation": None,
                        "signatory_name": "",
                        "signatory_name_printed": "",
                        "signatory_title": "",
                        "signatory_date": None,
                    }
                    FORM_NPX_ROWS.append(minimal_form)
                    current_form = minimal_form
                    found_form_data = True

                for pvt in pvt_nodes:
                    parse_proxy_vote_table(pvt, current_form["form_id"])

    # End of main loop
    print("\nFinished parsing N-PX.")
    print(f"  Forms: {len(FORM_NPX_ROWS)}")
    print(f"  Proxy Voting Rows: {len(PROXY_VOTING_RECORD_ROWS)}")

def unify_voting_references():
    """
    Optional step:
    Because we captured 'series_id' in VOTING_RECORD_SERIES_ROWS as text (the code),
    we need to replace it with the actual integer series_id if we find a matching series_code.

    Similarly for manager_id if we store it after the CSV assignment.

    Typically, you'd do this after CSV creation or handle it in the DB with a JOIN statement.
    """
    pass

def write_to_csv(output_folder="output_csv"):
    """
    Writes each global list to CSV, creating an ID for managers, series, etc.
    Then if needed, unify references.
    """
    os.makedirs(output_folder, exist_ok=True)

    # form_npx
    df_form = pd.DataFrame(FORM_NPX_ROWS)
    date_cols = ["conformed_period", "date_filed", "signatory_date"]
    for dc in date_cols:
        if dc in df_form.columns:
            df_form[dc] = pd.to_datetime(df_form[dc]).dt.date
    df_form.drop_duplicates(subset=["form_id"], inplace=True)
    csv_form = os.path.join(output_folder, "form_npx.csv")
    df_form.to_csv(csv_form, index=False)
    print(f"Saved form_npx.csv: {len(df_form)} rows.")

    # institutional_manager
    df_im = pd.DataFrame(INSTITUTIONAL_MANAGER_ROWS)
    if not df_im.empty:
        df_im.drop_duplicates(
            subset=["form_id","serial_no","name","form13f_number","crd_number","sec_file_number","lei_number"],
            inplace=True
        )
        df_im["manager_id"] = range(1, len(df_im) + 1)
    csv_im = os.path.join(output_folder, "institutional_manager.csv")
    df_im.to_csv(csv_im, index=False)
    print(f"Saved institutional_manager.csv: {len(df_im)} rows.")

    # series
    df_s = pd.DataFrame(SERIES_ROWS)
    if not df_s.empty:
        df_s.drop_duplicates(
            subset=["form_id","series_code","series_name","series_lei"], 
            inplace=True
        )
        df_s["series_id"] = range(1, len(df_s) + 1)
    csv_s = os.path.join(output_folder, "series.csv")
    df_s.to_csv(csv_s, index=False)
    print(f"Saved series.csv: {len(df_s)} rows.")

    # proxy_voting_record
    df_pvr = pd.DataFrame(PROXY_VOTING_RECORD_ROWS)
    if not df_pvr.empty:
        df_pvr.drop_duplicates(subset=["vote_id"], inplace=True)
        if "meeting_date" in df_pvr.columns:
            df_pvr["meeting_date"] = pd.to_datetime(df_pvr["meeting_date"]).dt.date
    csv_pvr = os.path.join(output_folder, "proxy_voting_record.csv")
    df_pvr.to_csv(csv_pvr, index=False)
    print(f"Saved proxy_voting_record.csv: {len(df_pvr)} rows.")

    # matter_category
    df_mc = pd.DataFrame(MATTER_CATEGORY_ROWS)
    if not df_mc.empty:
        df_mc.drop_duplicates(subset=["category_type"], inplace=True)
    csv_mc = os.path.join(output_folder, "matter_category.csv")
    df_mc.to_csv(csv_mc, index=False)
    print(f"Saved matter_category.csv: {len(df_mc)} rows.")

    # proxy_voting_record_category
    df_pvrc = pd.DataFrame(PROXY_VOTING_RECORD_CATEGORY_ROWS)
    if not df_pvrc.empty:
        df_pvrc.drop_duplicates(inplace=True)
    csv_pvrc = os.path.join(output_folder, "proxy_voting_record_category.csv")
    df_pvrc.to_csv(csv_pvrc, index=False)
    print(f"Saved proxy_voting_record_category.csv: {len(df_pvrc)} rows.")

    # voting_record_manager
    df_vrm = pd.DataFrame(VOTING_RECORD_MANAGER_ROWS)
    if not df_vrm.empty:
        df_vrm.drop_duplicates(inplace=True)
    csv_vrm = os.path.join(output_folder, "voting_record_manager.csv")
    df_vrm.to_csv(csv_vrm, index=False)
    print(f"Saved voting_record_manager.csv: {len(df_vrm)} rows.")

    # voting_record_series
    df_vrs = pd.DataFrame(VOTING_RECORD_SERIES_ROWS)
    if not df_vrs.empty:
        df_vrs.drop_duplicates(inplace=True)
    csv_vrs = os.path.join(output_folder, "voting_record_series.csv")
    df_vrs.to_csv(csv_vrs, index=False)
    print(f"Saved voting_record_series.csv: {len(df_vrs)} rows.")

    print("\nAll CSVs have been written. Check your output folder to confirm.")


def write_to_csv(output_folder="output_csv"):
    os.makedirs(output_folder, exist_ok=True)

    # form_npx
    df_form = pd.DataFrame(FORM_NPX_ROWS)
    date_cols = ["conformed_period", "date_filed", "signatory_date"]
    for dc in date_cols:
        if dc in df_form.columns:
            # Use errors="coerce" so that any out-of-bounds or invalid date becomes NaT
            df_form[dc] = pd.to_datetime(df_form[dc], errors="coerce").dt.date
    df_form.drop_duplicates(subset=["form_id"], inplace=True)
    csv_form = os.path.join(output_folder, "form_npx.csv")
    df_form.to_csv(csv_form, index=False)
    print(f"Saved form_npx.csv: {len(df_form)} rows.")

    # institutional_manager
    df_im = pd.DataFrame(INSTITUTIONAL_MANAGER_ROWS)
    if not df_im.empty:
        df_im.drop_duplicates(
            subset=["form_id","serial_no","name","form13f_number","crd_number","sec_file_number","lei_number"],
            inplace=True
        )
        df_im["manager_id"] = range(1, len(df_im) + 1)
    csv_im = os.path.join(output_folder, "institutional_manager.csv")
    df_im.to_csv(csv_im, index=False)
    print(f"Saved institutional_manager.csv: {len(df_im)} rows.")

    # series
    df_s = pd.DataFrame(SERIES_ROWS)
    if not df_s.empty:
        df_s.drop_duplicates(
            subset=["form_id","series_code","series_name","series_lei"], 
            inplace=True
        )
        df_s["series_id"] = range(1, len(df_s) + 1)
    csv_s = os.path.join(output_folder, "series.csv")
    df_s.to_csv(csv_s, index=False)
    print(f"Saved series.csv: {len(df_s)} rows.")

    # proxy_voting_record
    df_pvr = pd.DataFrame(PROXY_VOTING_RECORD_ROWS)
    if not df_pvr.empty:
        df_pvr.drop_duplicates(subset=["vote_id"], inplace=True)
        # Same 'errors="coerce"' approach for meeting_date
        if "meeting_date" in df_pvr.columns:
            df_pvr["meeting_date"] = pd.to_datetime(
                df_pvr["meeting_date"], 
                errors="coerce"   # <-- key
            ).dt.date
    csv_pvr = os.path.join(output_folder, "proxy_voting_record.csv")
    df_pvr.to_csv(csv_pvr, index=False)
    print(f"Saved proxy_voting_record.csv: {len(df_pvr)} rows.")

    # matter_category
    df_mc = pd.DataFrame(MATTER_CATEGORY_ROWS)
    if not df_mc.empty:
        df_mc.drop_duplicates(subset=["category_type"], inplace=True)
    csv_mc = os.path.join(output_folder, "matter_category.csv")
    df_mc.to_csv(csv_mc, index=False)
    print(f"Saved matter_category.csv: {len(df_mc)} rows.")

    # proxy_voting_record_category
    df_pvrc = pd.DataFrame(PROXY_VOTING_RECORD_CATEGORY_ROWS)
    if not df_pvrc.empty:
        df_pvrc.drop_duplicates(inplace=True)
    csv_pvrc = os.path.join(output_folder, "proxy_voting_record_category.csv")
    df_pvrc.to_csv(csv_pvrc, index=False)
    print(f"Saved proxy_voting_record_category.csv: {len(df_pvrc)} rows.")

    # voting_record_manager
    df_vrm = pd.DataFrame(VOTING_RECORD_MANAGER_ROWS)
    if not df_vrm.empty:
        df_vrm.drop_duplicates(inplace=True)
    csv_vrm = os.path.join(output_folder, "voting_record_manager.csv")
    df_vrm.to_csv(csv_vrm, index=False)
    print(f"Saved voting_record_manager.csv: {len(df_vrm)} rows.")

    # voting_record_series
    df_vrs = pd.DataFrame(VOTING_RECORD_SERIES_ROWS)
    if not df_vrs.empty:
        df_vrs.drop_duplicates(inplace=True)
    csv_vrs = os.path.join(output_folder, "voting_record_series.csv")
    df_vrs.to_csv(csv_vrs, index=False)
    print(f"Saved voting_record_series.csv: {len(df_vrs)} rows.")

    print("\nAll CSVs have been written. Check your output folder to confirm.")


def run_all(folder_path="npx_filings", output_folder="output_csv"):
    """
    High-level convenience function to:
      1) Reset global data
      2) Parse the N-PX .txt files
      3) Write results to CSV
    """
    # Declare global right away
    global FORM_NPX_ROWS, INSTITUTIONAL_MANAGER_ROWS, SERIES_ROWS
    global PROXY_VOTING_RECORD_ROWS, MATTER_CATEGORY_ROWS
    global PROXY_VOTING_RECORD_CATEGORY_ROWS, VOTING_RECORD_MANAGER_ROWS
    global VOTING_RECORD_SERIES_ROWS, KNOWN_CATEGORIES
    global NEXT_FORM_ID, NEXT_VOTE_ID

    # Reset everything
    FORM_NPX_ROWS = []
    INSTITUTIONAL_MANAGER_ROWS = []
    SERIES_ROWS = []
    PROXY_VOTING_RECORD_ROWS = []
    MATTER_CATEGORY_ROWS = []
    PROXY_VOTING_RECORD_CATEGORY_ROWS = []
    VOTING_RECORD_MANAGER_ROWS = []
    VOTING_RECORD_SERIES_ROWS = []
    KNOWN_CATEGORIES = {}

    NEXT_FORM_ID = 1
    NEXT_VOTE_ID = 1

    # Parse the files
    process_npx_files(folder_path)

    # unify if needed
    # unify_voting_references()

    # Write the results to CSV
    write_to_csv(output_folder)


In [None]:
run_all("npx_filings", "output")