# In [None]:
#!/usr/bin/env python
# coding: utf-8

import os
import re
import datetime
import psycopg2  # For PostgreSQL
import lxml.etree as ET

# -----------------------------------------------------------------------
# Database Config: Fill these with your AWS RDS info
# NOTE(review): the values below are placeholders. Avoid committing real
# credentials here; consider loading them from the environment instead.
# -----------------------------------------------------------------------
DB_HOST = "your-rds-instance.xxxx.us-east-1.rds.amazonaws.com"
DB_NAME = "your_database_name"
DB_USER = "your_username"
DB_PASS = "your_password"
DB_PORT = 5432  # or whichever port your RDS uses

# -----------------------------------------------------------------------
# Global Data Structures (NO auto-increment IDs)
# -----------------------------------------------------------------------
# The parser never invents primary keys. Rows are accumulated in these
# module-level lists and carry "local" indexes; the insertion step maps each
# local index to the ID the database returns.

FORMS = []  # Each item is a dict with all columns for form_npx (minus the PK)
MANAGERS = []  # Each item a dict with manager data, referencing "local_form_index"
SERIES_LIST = []  # Each item a dict with series data, referencing "local_form_index"
PROXY_VOTES = []  # Each item a dict for one vote; its list position is its "local_vote_index"
MATTER_CATEGORIES = []  # Unique categories: {"local_category_id", "category_type"}
PROXY_VOTE_CATEGORIES = []  # bridging (local_vote_index -> local_category_id)
VOTING_RECORD_MANAGERS = []  # bridging (vote -> manager), keyed by form index + serial_no
VOTING_RECORD_SERIES = []    # bridging (vote -> series), keyed by form index + series_code

# Local indexes drive the bridging: each form has a "local_form_index", and
# every manager, series and vote row references that index. Once the form is
# inserted, the real form_id returned by the DB replaces it.

# Maps category_type -> local_category_id so each distinct category is
# recorded in MATTER_CATEGORIES only once.
CATEGORY_LOOKUP = {}

# -----------------------------------------------------------------------
# 1. Parsing Logic
#    (No Next ID counters needed; we store data in lists)
# -----------------------------------------------------------------------

def parse_date(date_string):
    """Convert a date string into a ``datetime.date``.

    The surrounding whitespace is ignored and the following layouts are tried
    in order: MM/DD/YYYY, YYYY-MM-DD, MM-DD-YYYY, YYYYMMDD.

    Returns None when the input is empty/None or matches none of the layouts.
    """
    if not date_string:
        return None

    candidate = date_string.strip()
    for layout in ("%m/%d/%Y", "%Y-%m-%d", "%m-%d-%Y", "%Y%m%d"):
        try:
            parsed = datetime.datetime.strptime(candidate, layout)
        except ValueError:
            # Wrong layout for this string; fall through to the next one.
            continue
        return parsed.date()
    return None

def get_text(node, xpath_expr):
    """Return the stripped text of the first element matched by *xpath_expr*.

    Only the first match is inspected. An empty string is returned when there
    is no match, the first match is None, or its ``.text`` is empty/None.
    """
    matches = node.xpath(xpath_expr)
    if not matches:
        return ""

    first = matches[0]
    if first is None or not first.text:
        return ""
    return first.text.strip()

def get_decimal(node, xpath_expr):
    """Read the text at *xpath_expr* (via ``get_text``) and parse it as a float.

    Thousands separators (",") are removed before conversion. Returns None
    when the text is empty or is not a valid float literal.
    """
    raw = get_text(node, xpath_expr)
    if raw:
        try:
            return float(raw.replace(",", ""))
        except ValueError:
            # Non-numeric content is treated the same as a missing value.
            pass
    return None

def extract_sec_header_info(file_path):
    """Pull the accession number and filing date out of a filing's text.

    The file is read as UTF-8 (undecodable bytes are replaced) and searched,
    case-insensitively, for the "ACCESSION NUMBER:" and "FILED AS OF DATE:"
    labels.

    Returns a dict with:
        "accession_number": the rest of the label's line, stripped ("" if absent)
        "date_filed": the 8-digit date passed through ``parse_date`` (None if absent)
    """
    with open(file_path, "r", encoding="utf-8", errors="replace") as handle:
        contents = handle.read()

    accession = re.search(r"ACCESSION\s+NUMBER:\s*([^\r\n]+)", contents, re.IGNORECASE)
    filed = re.search(r"FILED\s+AS\s+OF\s+DATE:\s*(\d{8})", contents, re.IGNORECASE)

    return {
        "accession_number": accession.group(1).strip() if accession else "",
        "date_filed": parse_date(filed.group(1).strip()) if filed else None,
    }

def parse_edgar_submission(root, sec_header, local_form_index):
    """Build the form_npx row dict for one <edgarSubmission> element.

    Args:
        root: element searched with namespace-agnostic ``local-name()`` xpaths.
        sec_header: dict providing "accession_number" and "date_filed".
        local_form_index: placeholder index stored in the row so child rows
            can refer back to this form.

    Returns:
        A dict holding every form column. Text values are truncated to the
        lengths used below; missing text becomes "" and missing optional
        values become None (or the defaults "N" / False / 0).
    """
    def path(*tags):
        # ("a", "b") -> ".//*[local-name()='a']/*[local-name()='b']"
        return ".//" + "/".join(f"*[local-name()='{t}']" for t in tags)

    def text(*tags):
        return get_text(root, path(*tags))

    def is_yes(*tags):
        return text(*tags).upper() in ("Y", "YES", "TRUE", "1")

    def int_or(default, *tags):
        raw = text(*tags)
        return int(raw) if raw.isdigit() else default

    report_type = (text("reportInfo", "reportType") or "FUND VOTING REPORT")[:100]
    explanatory_notes = text("explanatoryInformation", "explanatoryNotes")
    amendment_type = text("amendmentInfo", "amendmentType")

    return {
        "local_form_index": local_form_index,
        "reporting_person_name": text("reportingPerson", "name")[:250],
        "phone_number": text("reportingPerson", "phoneNumber")[:50],
        "address_street1": text("reportingPerson", "address", "street1")[:250],
        "address_street2": text("reportingPerson", "address", "street2")[:250],
        "address_city": text("reportingPerson", "address", "city")[:100],
        "address_state": text("reportingPerson", "address", "stateOrCountry")[:100],
        "address_zip": text("reportingPerson", "address", "zipCode")[:30],

        "accession_number": sec_header["accession_number"][:30],
        "cik": text("issuerCredentials", "cik")[:15],
        "conformed_period": parse_date(text("periodOfReport")),
        "date_filed": sec_header["date_filed"],
        "report_type": report_type,
        "form_type": (text("submissionType") or "N-PX")[:10],
        "sec_file_number": text("fileNumber")[:20],
        "crd_number": text("reportingCrdNumber")[:20],
        "sec_file_number_other": text("reportingSecFileNumber")[:20],
        "lei_number": text("leiNumber")[:40],
        "investment_company_type": text("investmentCompanyType")[:20],

        "confidential_treatment": "Y" if is_yes("reportInfo", "confidentialTreatment") else "N",
        # A report counts as a notice report when its type mentions NOTICE.
        "is_notice_report": "NOTICE" in report_type.upper(),
        "explanatory_choice": "Y" if is_yes("explanatoryInformation", "explanatoryChoice") else "N",
        "other_included_managers_count": int_or(0, "summaryPage", "otherIncludedManagersCount"),
        # Filled in later by parse_series_info.
        "series_count": 0,

        "is_amendment": is_yes("amendmentInfo", "isAmendment"),
        "amendment_no": int_or(None, "amendmentInfo", "amendmentNo"),
        "amendment_type": amendment_type[:20] if amendment_type else None,
        "notice_explanation": explanatory_notes[:200] if explanatory_notes else None,

        "signatory_name": text("signaturePage", "txSignature")[:250],
        "signatory_name_printed": text("signaturePage", "txPrintedSignature")[:250],
        "signatory_title": text("signaturePage", "txTitle")[:100],
        "signatory_date": parse_date(text("signaturePage", "txAsOfDate")),
    }

def parse_institutional_managers(root, local_form_index):
    """Collect one row dict per institutional manager listed under *root*.

    Manager nodes are taken from <otherManagers2>//<investmentManagers>; if
    none exist, every <otherManager> element is used instead. For several
    fields two alternative tag names are tried in order, each with its own
    truncation length.

    Args:
        root: element searched with namespace-agnostic ``local-name()`` xpaths.
        local_form_index: placeholder index of the owning form.

    Returns:
        A list of dicts with keys local_form_index, serial_no (int or None),
        name, form13f_number, crd_number, sec_file_number and lei_number.
    """
    def first_text(node, *candidates):
        # candidates are (tag, max_len) pairs tried in order. The first tag
        # that yields any text node wins, even if it is blank once stripped.
        for tag, max_len in candidates:
            hits = node.xpath(f".//*[local-name()='{tag}']/text()")
            if hits:
                return hits[0].strip()[:max_len]
        return ""

    manager_nodes = root.xpath(".//*[local-name()='otherManagers2']//*[local-name()='investmentManagers']")
    if not manager_nodes:
        manager_nodes = root.xpath(".//*[local-name()='otherManager']")

    results = []
    for mn in manager_nodes:
        serial_hits = mn.xpath(".//*[local-name()='serialNo']/text()")
        # Strip before validating: padded values such as " 7 " must still
        # parse, because vote rows reference managers by this same number
        # and strip their copy before converting it.
        serial_text = serial_hits[0].strip() if serial_hits else ""

        results.append({
            "local_form_index": local_form_index,
            "serial_no": int(serial_text) if serial_text.isdigit() else None,
            "name": first_text(mn, ("managerName", 150), ("name", 250)),
            "form13f_number": first_text(mn, ("icaOr13FFileNumber", 17), ("form13FFileNumber", 20)),
            "crd_number": first_text(mn, ("crdNumber", 20)),
            "sec_file_number": first_text(mn, ("otherFileNumber", 17), ("secFileNumber", 20)),
            "lei_number": first_text(mn, ("leiNumberOM", 20), ("leiNumber", 40)),
        })
    return results

def parse_series_info(root, form_dict):
    """Extract one row per <seriesReports> element found under *root*.

    Each row records the owning form's "local_form_index" together with the
    series code, name and LEI (truncated to 25/250/40 characters).

    Side effect: ``form_dict["series_count"]`` is increased by the number of
    rows found.

    Returns the list of series row dicts.
    """
    form_index = form_dict["local_form_index"]
    series_rows = [
        {
            "local_form_index": form_index,
            "series_code": get_text(node, ".//*[local-name()='idOfSeries']")[:25],
            "series_name": get_text(node, ".//*[local-name()='nameOfSeries']")[:250],
            "series_lei": get_text(node, ".//*[local-name()='leiOfSeries']")[:40],
        }
        for node in root.xpath(".//*[local-name()='seriesReports']")
    ]

    # Keep the parent form's counter in step with what was parsed.
    form_dict["series_count"] += len(series_rows)
    return series_rows

def parse_proxy_vote_table(proxy_vote_node, local_form_index):
    """Record every <proxyTable> entry under *proxy_vote_node* in the global lists.

    For each entry this appends:
      * one vote row to PROXY_VOTES (its list position is its local_vote_index),
      * one PROXY_VOTE_CATEGORIES row per category, registering unseen
        categories in CATEGORY_LOOKUP / MATTER_CATEGORIES,
      * one VOTING_RECORD_MANAGERS row per integer <otherManager> code,
      * one VOTING_RECORD_SERIES row when <voteSeries> has text.

    Only the first <voteRecord> feeds the vote_cast* columns; when there are
    more, the total is noted in "other_notes". Returns None.
    """
    def q(tag):
        return f".//*[local-name()='{tag}']"

    for entry in proxy_vote_node.xpath(q("proxyTable")):
        # The position the row is about to take in PROXY_VOTES is its local key.
        vote_idx = len(PROXY_VOTES)

        vote = {
            "local_vote_index": vote_idx,
            "local_form_index": local_form_index,
            "issuer_name": get_text(entry, q("issuerName"))[:250],
            "cusip": get_text(entry, q("cusip"))[:30],
            "isin": get_text(entry, q("isin"))[:30],
            "figi": get_text(entry, q("figi"))[:30],
            "meeting_date": parse_date(get_text(entry, q("meetingDate"))),
            "vote_description": get_text(entry, q("voteDescription")),
            "proposed_by": get_text(entry, q("voteSource"))[:20],
            "shares_voted": get_decimal(entry, q("sharesVoted") + "[1]"),
            "shares_on_loan": get_decimal(entry, q("sharesOnLoan") + "[1]"),
            "vote_cast": None,
            "vote_cast_shares": None,
            "management_rec": None,
            "other_notes": None
        }

        records = entry.xpath(q("voteRecord"))
        if records:
            primary = records[0]
            vote["vote_cast"] = get_text(primary, q("howVoted"))[:50]
            vote["vote_cast_shares"] = get_decimal(primary, q("sharesVoted"))
            vote["management_rec"] = get_text(primary, q("managementRecommendation"))[:50]
            if len(records) > 1:
                vote["other_notes"] = f"{len(records)} total <voteRecord> items found."

        PROXY_VOTES.append(vote)

        # Categories: register unseen ones, then link the vote to each.
        for raw_category in entry.xpath(
            ".//*[local-name()='voteCategories']//*[local-name()='categoryType']/text()"
        ):
            label = raw_category.strip()[:100]
            category_id = CATEGORY_LOOKUP.get(label)
            if category_id is None:
                category_id = len(CATEGORY_LOOKUP) + 1
                CATEGORY_LOOKUP[label] = category_id
                MATTER_CATEGORIES.append({"local_category_id": category_id, "category_type": label})
            PROXY_VOTE_CATEGORIES.append({
                "local_vote_index": vote_idx,
                "local_category_id": category_id
            })

        # Manager references: codes that are not integers are ignored.
        for raw_code in entry.xpath(
            ".//*[local-name()='voteManager']//*[local-name()='otherManager']/text()"
        ):
            try:
                serial_no = int(raw_code.strip())
            except ValueError:
                continue
            VOTING_RECORD_MANAGERS.append({
                "local_vote_index": vote_idx,
                "local_form_index": local_form_index,
                "serial_no": serial_no
            })

        # Series reference (get_text already returns stripped text).
        series_code = get_text(entry, q("voteSeries"))
        if series_code:
            VOTING_RECORD_SERIES.append({
                "local_vote_index": vote_idx,
                "local_form_index": local_form_index,
                "series_code": series_code[:25]
            })


def extract_xml_blocks(file_path):
    """Return the contents of every <XML>...</XML> section in the file.

    The file is read as UTF-8 with undecodable bytes replaced. Tag matching is
    case-insensitive and non-greedy, and a section may span multiple lines.
    The enclosing tags themselves are not included in the returned strings.
    """
    with open(file_path, "r", encoding="utf-8", errors="replace") as handle:
        document = handle.read()
    return re.findall(r"<XML>(.*?)</XML>", document, flags=re.IGNORECASE | re.DOTALL)

def parse_xml_fragment(xml_string):
    """Parse an XML string with a recovering lxml parser.

    Returns whatever ``ET.fromstring`` produces; if it raises
    ``XMLSyntaxError`` a warning is printed and None is returned.
    """
    lenient_parser = ET.XMLParser(recover=True, encoding="utf-8")
    try:
        return ET.fromstring(xml_string.encode("utf-8"), parser=lenient_parser)
    except ET.XMLSyntaxError as exc:
        print(f"  [Warning] parse error: {exc}")
    return None

def _minimal_form_row(local_form_index, header_info):
    """Build a placeholder form row for a filing that has votes but no <edgarSubmission>."""
    return {
        "local_form_index": local_form_index,
        "reporting_person_name": "",
        "phone_number": "",
        "address_street1": "",
        "address_street2": "",
        "address_city": "",
        "address_state": "",
        "address_zip": "",
        "accession_number": header_info["accession_number"][:30],
        "cik": "",
        "conformed_period": None,
        "date_filed": header_info["date_filed"],
        "report_type": "FUND VOTING REPORT",
        "form_type": "N-PX",
        "sec_file_number": "",
        "crd_number": "",
        "sec_file_number_other": "",
        "lei_number": "",
        "investment_company_type": "",
        "confidential_treatment": "N",
        "is_notice_report": False,
        "explanatory_choice": "N",
        "other_included_managers_count": 0,
        "series_count": 0,
        "is_amendment": False,
        "amendment_no": None,
        "amendment_type": None,
        "notice_explanation": None,
        "signatory_name": "",
        "signatory_name_printed": "",
        "signatory_title": "",
        "signatory_date": None,
    }


def process_npx_files(folder_path="npx_filings"):
    """
    Parse every .txt filing in *folder_path* into the module-level lists.

    For each file, the SEC header is read and every <XML> block is parsed.
    An <edgarSubmission> produces a form row plus its managers and series;
    every <proxyVoteTable> is attached to the most recent form of that file.
    If votes appear before any form exists, a minimal placeholder form is
    created from the header. No DB logic here; insertion happens later.

    Files are processed in sorted name order so that local indexes (and
    therefore insertion order) are reproducible between runs.
    """
    txt_files = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith(".txt")
    )

    for fname in txt_files:
        file_path = os.path.join(folder_path, fname)
        print(f"\nProcessing: {file_path}")

        header_info = extract_sec_header_info(file_path)
        xml_fragments = extract_xml_blocks(file_path)
        if not xml_fragments:
            print("  No <XML> blocks found.")
            continue

        # Index of the form that votes in this file attach to; None until
        # this file has produced a form.
        local_form_index = None

        for frag in xml_fragments:
            root = parse_xml_fragment(frag)
            if root is None:
                continue

            es_nodes = root.xpath("//*[local-name()='edgarSubmission']")
            if es_nodes:
                es = es_nodes[0]
                # Take a fresh index for every form appended. Reusing the
                # index computed at the start of the file would give two
                # forms the same key and corrupt the form_id mapping.
                local_form_index = len(FORMS)
                form_row = parse_edgar_submission(es, header_info, local_form_index)
                FORMS.append(form_row)

                MANAGERS.extend(parse_institutional_managers(es, local_form_index))
                SERIES_LIST.extend(parse_series_info(es, form_row))

            pvt_nodes = root.xpath("//*[local-name()='proxyVoteTable']")
            if pvt_nodes:
                if local_form_index is None:
                    # minimal form if no <edgarSubmission>
                    local_form_index = len(FORMS)
                    FORMS.append(_minimal_form_row(local_form_index, header_info))

                for pvt in pvt_nodes:
                    parse_proxy_vote_table(pvt, local_form_index)

        # end of each .txt file
    print("\nFinished parsing N-PX.")
    print(f"  # Forms: {len(FORMS)}")
    print(f"  # Proxy Votes: {len(PROXY_VOTES)}")

# -----------------------------------------------------------------------
# 2. Database Insertion
#    We'll insert in an order that respects PK/FK constraints:
#    1) form_npx
#    2) institutional_manager (each references form_id)
#    3) series (references form_id)
#    4) proxy_voting_record (references form_id)
#    5) matter_category (unique on category_type)
#    6) proxy_voting_record_category (references vote_id, category_id)
#    7) voting_record_manager (references vote_id, manager_id)
#    8) voting_record_series (references vote_id, series_id)
# -----------------------------------------------------------------------

# Column tuples double as the dict keys read from each parsed row, so the
# INSERT column list and the bound values cannot drift apart.
_FORM_COLUMNS = (
    "reporting_person_name",
    "phone_number",
    "address_street1",
    "address_street2",
    "address_city",
    "address_state",
    "address_zip",
    "accession_number",
    "cik",
    "conformed_period",
    "date_filed",
    "report_type",
    "form_type",
    "sec_file_number",
    "crd_number",
    "sec_file_number_other",
    "lei_number",
    "investment_company_type",
    "confidential_treatment",
    "is_notice_report",
    "explanatory_choice",
    "other_included_managers_count",
    "series_count",
    "is_amendment",
    "amendment_no",
    "amendment_type",
    "notice_explanation",
    "signatory_name",
    "signatory_name_printed",
    "signatory_title",
    "signatory_date",
)
_MANAGER_COLUMNS = (
    "serial_no",
    "name",
    "form13f_number",
    "crd_number",
    "sec_file_number",
    "lei_number",
)
_SERIES_COLUMNS = ("series_code", "series_name", "series_lei")
_VOTE_COLUMNS = (
    "issuer_name",
    "cusip",
    "isin",
    "figi",
    "meeting_date",
    "vote_description",
    "proposed_by",
    "shares_voted",
    "shares_on_loan",
    "vote_cast",
    "vote_cast_shares",
    "management_rec",
    "other_notes",
)


def _insert_row(cur, table, id_column, columns, row, form_id=None):
    """Insert *row* into *table* and return the new row's *id_column* value.

    *columns* names both the table columns and the keys read from *row*.
    When *form_id* is given it is written to a leading ``form_id`` column.
    Table and column names come only from the constants in this module;
    every value is passed as a bound parameter.
    """
    names = tuple(columns)
    values = tuple(row[col] for col in columns)
    if form_id is not None:
        names = ("form_id",) + names
        values = (form_id,) + values

    sql = "INSERT INTO {} ({}) VALUES ({}) RETURNING {}".format(
        table,
        ", ".join(names),
        ", ".join(["%s"] * len(names)),
        id_column,
    )
    cur.execute(sql, values)
    return cur.fetchone()[0]


def insert_data_into_postgres():
    """Write all parsed rows to PostgreSQL inside a single transaction.

    Insert order follows the FK dependencies: form_npx, institutional_manager,
    series, proxy_voting_record, matter_category, then the three bridge
    tables. Local placeholder indexes are translated to the IDs returned by
    the database as each parent table is written.

    On any error during insertion the transaction is rolled back and the
    error is printed (not re-raised). The cursor and connection are always
    closed. A failure to connect propagates to the caller.
    """
    conn = psycopg2.connect(
        host=DB_HOST,
        dbname=DB_NAME,
        user=DB_USER,
        password=DB_PASS,
        port=DB_PORT
    )
    conn.autocommit = False  # we'll commit at the end
    cur = conn.cursor()

    try:
        # 1) form_npx: local_form_index -> form_id
        form_id_map = {}
        for fdict in FORMS:
            form_id_map[fdict["local_form_index"]] = _insert_row(
                cur, "form_npx", "form_id", _FORM_COLUMNS, fdict
            )

        # 2) institutional_manager: (local_form_index, serial_no, name) -> manager_id
        manager_id_map = {}
        for m in MANAGERS:
            new_manager_id = _insert_row(
                cur, "institutional_manager", "manager_id", _MANAGER_COLUMNS, m,
                form_id=form_id_map[m["local_form_index"]],
            )
            manager_id_map[(m["local_form_index"], m["serial_no"], m["name"])] = new_manager_id

        # 3) series: (local_form_index, series_code) -> series_id
        series_id_map = {}
        for s in SERIES_LIST:
            new_series_id = _insert_row(
                cur, "series", "series_id", _SERIES_COLUMNS, s,
                form_id=form_id_map[s["local_form_index"]],
            )
            series_id_map[(s["local_form_index"], s["series_code"])] = new_series_id

        # 4) proxy_voting_record: local_vote_index -> vote_id
        vote_id_map = {}
        for pv in PROXY_VOTES:
            vote_id_map[pv["local_vote_index"]] = _insert_row(
                cur, "proxy_voting_record", "vote_id", _VOTE_COLUMNS, pv,
                form_id=form_id_map[pv["local_form_index"]],
            )

        # 5) matter_category: local_category_id -> category_id.
        # A category may already exist (UNIQUE(category_type)). The no-op
        # DO UPDATE makes RETURNING yield the existing row's ID in that case.
        insert_category_sql = """
            INSERT INTO matter_category (category_type)
            VALUES (%s)
            ON CONFLICT (category_type) DO UPDATE SET category_type = EXCLUDED.category_type
            RETURNING category_id
        """
        category_id_map = {}
        for mc in MATTER_CATEGORIES:
            cur.execute(insert_category_sql, (mc["category_type"],))
            category_id_map[mc["local_category_id"]] = cur.fetchone()[0]

        # 6) proxy_voting_record_category: bridging (vote_id, category_id)
        insert_proxy_cat_sql = """
            INSERT INTO proxy_voting_record_category (vote_id, category_id)
            VALUES (%s, %s)
        """
        for pvc in PROXY_VOTE_CATEGORIES:
            real_vote_id = vote_id_map[pvc["local_vote_index"]]
            real_cat_id = category_id_map[pvc["local_category_id"]]
            cur.execute(insert_proxy_cat_sql, (real_vote_id, real_cat_id))

        # 7) voting_record_manager: bridging (vote_id, manager_id).
        # Votes identify a manager only by (form, serial_no), so index the
        # managers by that pair once instead of scanning every manager for
        # every vote. setdefault keeps the first entry per pair.
        manager_by_serial = {}
        for (form_idx, serial_no, _name), manager_id in manager_id_map.items():
            manager_by_serial.setdefault((form_idx, serial_no), manager_id)

        insert_vrm_sql = """
            INSERT INTO voting_record_manager (vote_id, manager_id)
            VALUES (%s, %s)
        """
        for vrm in VOTING_RECORD_MANAGERS:
            real_vote_id = vote_id_map[vrm["local_vote_index"]]
            real_manager_id = manager_by_serial.get(
                (vrm["local_form_index"], vrm["serial_no"])
            )
            # References to a manager that was never parsed are skipped.
            if real_manager_id is not None:
                cur.execute(insert_vrm_sql, (real_vote_id, real_manager_id))

        # 8) voting_record_series: bridging (vote_id, series_id)
        insert_vrs_sql = """
            INSERT INTO voting_record_series (vote_id, series_id)
            VALUES (%s, %s)
        """
        for vrs in VOTING_RECORD_SERIES:
            real_vote_id = vote_id_map[vrs["local_vote_index"]]
            key = (vrs["local_form_index"], vrs["series_code"])
            # References to a series that was never parsed are skipped.
            if key in series_id_map:
                cur.execute(insert_vrs_sql, (real_vote_id, series_id_map[key]))

        # COMMIT
        conn.commit()
        print("All data inserted successfully into AWS RDS PostgreSQL.")

    except Exception as e:
        conn.rollback()
        print(f"Error during DB insert: {e}")
    finally:
        cur.close()
        conn.close()
        print("Connection closed.")

# -----------------------------------------------------------------------
# 3. Full Notebook-Style Flow
# -----------------------------------------------------------------------

def run_all(folder_path="npx_filings"):
    """Run the whole pipeline: reset state, parse *folder_path*, load into Postgres.

    The module-level accumulators are rebound to fresh empty containers first,
    so repeated calls (e.g. re-running a notebook cell) do not pile rows from
    earlier runs on top of the new ones.
    """
    global FORMS, MANAGERS, SERIES_LIST, PROXY_VOTES
    global MATTER_CATEGORIES, PROXY_VOTE_CATEGORIES
    global VOTING_RECORD_MANAGERS, VOTING_RECORD_SERIES
    global CATEGORY_LOOKUP

    # Step 1: start from empty state (a distinct new object for every name).
    FORMS, MANAGERS, SERIES_LIST, PROXY_VOTES = [], [], [], []
    MATTER_CATEGORIES, PROXY_VOTE_CATEGORIES = [], []
    VOTING_RECORD_MANAGERS, VOTING_RECORD_SERIES = [], []
    CATEGORY_LOOKUP = {}

    # Step 2: parse the filings on disk.
    process_npx_files(folder_path)

    # Step 3: push everything to the database.
    insert_data_into_postgres()

    print("\nDone!")



# In [None]:
# Notebook cell: run the full parse-and-load pipeline on the "npx_filings" folder.
run_all("npx_filings")