3. “Explain it to an eighth grader” — step-by-step

    Think of each N-PX file as a giant worksheet about how a mutual fund voted on company issues.
    We need to read hundreds of those worksheets and copy each fact into the right page of a class binder (the database). Here’s how the program does it:

    Find every worksheet in the folder
    The code loops through all the .txt files.

    Split each worksheet into little XML packets
    (Some worksheets have extra scribbles; we cut out only the clean XML parts.)

    Pull out the cover page
    We write the fund’s name, phone number, address, etc. into the form_npx page.
    If the worksheet isn’t real XML (old or messy), we still add a row but mark is_parsable = false.

    Copy special side lists

        “Included managers” → institutional_manager.

        Series (funds) & share classes → series and the new series_class.

        “Other persons reporting for this manager” → other_reporting_person.

    Go through every vote the fund cast
    For each vote we write a line in proxy_voting_record with details like shares and how they voted.

    Label each vote
    Categories (like “Director Elections”) are saved in matter_category and linked back with a bridge table.
    We also link the vote to the right manager and to the right series if the XML says so.

    Save everything fast
    Instead of handing papers to the teacher one at a time, we stack 1,000 at once (execute_values) and hand over the whole stack—much quicker.

    Move on to the next worksheet until all of them are filed.

When it’s finished, the binder (database) has every page filled in, and the pages reference each other by ID, so you can look up, for example, all votes cast by manager #17 on series A/B/C with a single SQL query that JOINs those tables.

That’s it!

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Form N‑PX loader – full 2025‑04‑24 schema coverage
=====================================================
• Parses every field required by the revised relational schema.
• Streams XML fragments, sets *is_parsable* for legacy/non‑XML filings.
• Uses `psycopg2.extras.execute_values` for batch inserts – far fewer round
  trips than inserting row‑at‑a‑time.
• Keeps all data in memory for one run; good for ≤ 10 k filings. For very
  large jobs, flush in chunks or stream with COPY.
"""

import os, re, datetime as dt
from decimal import Decimal
from collections import defaultdict
import psycopg2
from psycopg2.extras import execute_values
import lxml.etree as ET

# ──────────────────────────────────────────────────────────────────────
# DB CONFIG (set via env or hard‑code for quick test) 
# ──────────────────────────────────────────────────────────────────────
# Connection settings are read from the libpq-style environment variables;
# the literals are the fallbacks used when a variable is not set.
DB_HOST = os.environ.get("PGHOST", "localhost")
DB_PORT = int(os.environ.get("PGPORT", 5432))
DB_USER = os.environ.get("PGUSER", "postgres")
DB_PASS = os.environ.get("PGPASSWORD", "postgres")
DB_NAME = os.environ.get("PGDATABASE", "npx")

# ──────────────────────────────────────────────────────────────────────
# Small helpers
# ──────────────────────────────────────────────────────────────────────
# Date layouts that pdate() tries, in this order.
DATE_FMTS = (
    "%m/%d/%Y",
    "%Y-%m-%d",
    "%m-%d-%Y",
    "%Y%m%d",
)
# Matches any character that is not a digit, a dot or a minus sign.
DEC_RE = re.compile(r"[^0-9.\-]")
# One embedded XML document: from the <?xml declaration to </edgarSubmission>.
XML_RE = re.compile(
    r"(<\?xml.*?</edgarSubmission>)",
    flags=re.DOTALL | re.IGNORECASE,
)
# Captures the accession number and the 8-digit "filed as of" date.
HDR_RE = re.compile(
    r"ACCESSION\s+NUMBER:\s*([^\r\n]+).*?FILED\s+AS\s+OF\s+DATE:\s*(\d{8})",
    flags=re.DOTALL | re.IGNORECASE,
)
# Shared lxml parser configured to recover from errors and accept huge trees.
PARSER = ET.XMLParser(huge_tree=True, recover=True)

def pdate(s):
    """Parse a date string using the layouts listed in ``DATE_FMTS``.

    Args:
        s: Raw date text; may be ``None`` or empty.

    Returns:
        A ``datetime.date`` for the first layout that matches, or ``None``
        when *s* is falsy or matches none of the layouts.
    """
    if not s:
        return None
    text = s.strip()
    for fmt in DATE_FMTS:
        try:
            return dt.datetime.strptime(text, fmt).date()
        except ValueError:
            # Wrong layout for this candidate - try the next one.  Only
            # ValueError is caught so unrelated failures (e.g. Ctrl-C) are
            # no longer silently swallowed.
            continue
    return None

def txt(node, xp, sl=None):
    """Return the stripped text of the first XPath hit, or "" when none.

    Args:
        node: Object exposing an ``xpath(expr)`` method that returns a list.
        xp: XPath expression evaluated against *node*.
        sl: Optional maximum length; a falsy value means "do not truncate".

    Returns:
        The first result as text: string results are used directly, any
        other result contributes its ``.text`` attribute ("" if that is
        empty/None).  The text is stripped, then cut to *sl* characters.
    """
    matches = node.xpath(xp)
    if not matches:
        return ""
    first = matches[0]
    if isinstance(first, str):
        raw = first
    else:
        raw = first.text or ""
    cleaned = raw.strip()
    if sl:
        return cleaned[:sl]
    return cleaned

def dec(node, xp):
    """Read the first XPath hit as a ``Decimal``.

    Args:
        node: Object accepted by ``txt``.
        xp: XPath expression locating the numeric text.

    Returns:
        The value as ``Decimal`` after every character matched by
        ``DEC_RE`` has been removed, or ``None`` when there is no text or
        the remaining text is not a valid number.
    """
    raw = txt(node, xp)
    if not raw:
        return None
    try:
        return Decimal(DEC_RE.sub('', raw))
    except ArithmeticError:
        # decimal.InvalidOperation derives from ArithmeticError; catching
        # only that (instead of a bare except) keeps the "bad number ->
        # None" behaviour without hiding unrelated errors.
        return None

# ──────────────────────────────────────────────────────────────────────
# In‑memory staging lists
# ──────────────────────────────────────────────────────────────────────
# Staged rows, one list per kind of record; each row is a dict that carries
# local (pre-insert) keys such as local_form / local_vote.
FORMS = []
MANAGERS = []
SERIES = []
SERIES_CLASS = []
VOTES = []
CATS = []
VOTE_CATS = []
VOTE_MGR = []
VOTE_SERIES = []
OTHER_PERSONS = []
# category text -> local category id
CAT_LOOKUP = {}

# Local index handed to the next form that gets staged.
_form_idx = 0

# ──────────────────────────────────────────────────────────────────────
# XML parsers
# ──────────────────────────────────────────────────────────────────────

def _int_or(text, default=None):
    """Return ``int(text)`` when *text* consists only of decimal digits, else *default*."""
    # isdecimal() (not isdigit()) so characters like superscript digits,
    # which int() rejects, fall back to the default instead of raising.
    return int(text) if text.isdecimal() else default


def parse_filing(path:str):
    """Parse one N-PX ``.txt`` filing and stage its rows in the module lists.

    The accession number and filing date are taken from the text header
    (``HDR_RE``); every XML fragment matched by ``XML_RE`` is parsed with
    the recovering ``PARSER``.  For each ``edgarSubmission`` element found,
    one row is appended to ``FORMS`` and the related rows to ``MANAGERS``,
    ``SERIES``, ``SERIES_CLASS``, ``OTHER_PERSONS``, ``VOTES``, ``CATS``,
    ``VOTE_CATS``, ``VOTE_MGR`` and ``VOTE_SERIES``.  When no usable
    fragment exists, a single form row with ``is_parsable = False`` is
    staged instead.

    Args:
        path: Path of the filing text file (read as UTF-8, undecodable
            bytes replaced).

    Side effects:
        Appends to the staging lists and advances the global ``_form_idx``
        by one per staged form.
    """
    global _form_idx
    with open(path, encoding='utf-8', errors='replace') as fh:
        raw = fh.read()
    m = HDR_RE.search(raw)
    accession, filed = (m.group(1).strip(), pdate(m.group(2))) if m else ("", None)

    # Resolve every fragment to its <edgarSubmission> element up front.
    # "descendant-or-self" is required: ".//*" only looks at descendants, so
    # it finds nothing (and [0] raised IndexError) when the fragment's root
    # *is* the edgarSubmission element.
    submissions = []
    for frag in XML_RE.findall(raw):
        root = ET.fromstring(frag.encode(), parser=PARSER)
        if root is None:  # recovering parser could not build any tree
            continue
        found = root.xpath("descendant-or-self::*[local-name()='edgarSubmission']")
        if found:
            submissions.append(found[0])
    is_parsable = bool(submissions)

    base_form = {
        'local_form': _form_idx,
        'reporting_person_name': "",
        'phone_number': "", 'address_street1': "", 'address_street2': "",
        'address_city': "", 'address_state': "", 'address_zip': "",
        'accession_number': accession[:30], 'cik': "", 'conformed_period': None,
        'date_filed': filed, 'report_type': "FUND VOTING REPORT", 'form_type': "N-PX",
        'sec_file_number': "", 'crd_number': "", 'sec_file_number_other': "",
        'lei_number': "", 'investment_company_type': "",
        'confidential_treatment': 'N', 'is_notice_report': False,
        'explanatory_choice': 'N', 'other_included_managers_count': 0,
        'series_count': 0, 'is_amendment': False, 'amendment_no': None,
        'amendment_type': None, 'notice_explanation': None,
        'explanatory_notes': "", 'signatory_name': "", 'signatory_name_printed': "",
        'signatory_title': "", 'signatory_date': None,
        'is_parsable': is_parsable
    }

    if not is_parsable:
        FORMS.append(base_form)
        _form_idx += 1
        return

    yes_values = {'Y', 'YES', 'TRUE', '1'}

    # there could be multiple edgarSubmission blocks per TXT (rare)
    for es in submissions:
        f = dict(base_form)  # copy
        # The copy still carries the index of the first fragment; refresh it
        # so every staged form has its own local key (child rows below use
        # the current _form_idx).
        f['local_form'] = _form_idx

        # - cover-page / filer info -
        f['reporting_person_name'] = txt(es, ".//*[local-name()='reportingPerson']/*[local-name()='name']", 250)
        f['phone_number'] = txt(es, ".//*[local-name()='reportingPerson']/*[local-name()='phoneNumber']", 50)
        f['address_street1'] = txt(es, ".//*[local-name()='address']/*[local-name()='street1']", 250)
        f['address_street2'] = txt(es, ".//*[local-name()='address']/*[local-name()='street2']", 250)
        f['address_city']    = txt(es, ".//*[local-name()='address']/*[local-name()='city']", 100)
        f['address_state']   = txt(es, ".//*[local-name()='address']/*[local-name()='stateOrCountry']", 100)
        f['address_zip']     = txt(es, ".//*[local-name()='address']/*[local-name()='zipCode']", 30)
        f['cik']             = txt(es, ".//*[local-name()='issuerCredentials']/*[local-name()='cik']", 15)
        f['conformed_period']= pdate(txt(es, ".//*[local-name()='periodOfReport']"))
        f['report_type']     = txt(es, ".//*[local-name()='reportInfo']/*[local-name()='reportType']", 100) or f['report_type']
        f['form_type']       = txt(es, ".//*[local-name()='submissionType']", 10) or f['form_type']
        f['sec_file_number'] = txt(es, ".//*[local-name()='fileNumber']", 20)
        f['crd_number']      = txt(es, ".//*[local-name()='reportingCrdNumber']", 20)
        f['sec_file_number_other'] = txt(es, ".//*[local-name()='reportingSecFileNumber']", 20)
        f['lei_number']      = txt(es, ".//*[local-name()='leiNumber']", 40)
        f['investment_company_type'] = txt(es, ".//*[local-name()='investmentCompanyType']", 20)
        f['confidential_treatment'] = 'Y' if txt(es, ".//*[local-name()='reportInfo']/*[local-name()='confidentialTreatment']").upper() in yes_values else 'N'
        f['is_notice_report']= 'NOTICE' in f['report_type'].upper()
        f['explanatory_choice']= 'Y' if txt(es, ".//*[local-name()='explanatoryInformation']/*[local-name()='explanatoryChoice']").upper() in yes_values else 'N'
        f['notice_explanation']= txt(es, ".//*[local-name()='reportInfo']/*[local-name()='noticeExplanation']", 200)
        # Non-numeric text falls back to 0 instead of raising ValueError.
        f['other_included_managers_count'] = _int_or(txt(es, ".//*[local-name()='summaryPage']/*[local-name()='otherIncludedManagersCount']"), 0)
        # amendment
        f['is_amendment']   = txt(es, ".//*[local-name()='amendmentInfo']/*[local-name()='isAmendment']").upper() in yes_values
        f['amendment_no']   = _int_or(txt(es, ".//*[local-name()='amendmentInfo']/*[local-name()='amendmentNo']"))
        f['amendment_type'] = txt(es, ".//*[local-name()='amendmentInfo']/*[local-name()='amendmentType']", 20)
        f['explanatory_notes']= txt(es, ".//*[local-name()='explanatoryInformation']/*[local-name()='explanatoryNotes']", 200)
        # signature
        f['signatory_name'] = txt(es, ".//*[local-name()='signaturePage']/*[local-name()='txSignature']", 250)
        f['signatory_name_printed']= txt(es, ".//*[local-name()='signaturePage']/*[local-name()='txPrintedSignature']", 250)
        f['signatory_title']= txt(es, ".//*[local-name()='signaturePage']/*[local-name()='txTitle']", 100)
        f['signatory_date'] = pdate(txt(es, ".//*[local-name()='signaturePage']/*[local-name()='txAsOfDate']"))

        FORMS.append(f)

        # - managers (summaryPage) -
        for mn in es.xpath(".//*[local-name()='summaryPage']//*[local-name()='investmentManagers']"):
            MANAGERS.append({
                'local_form': _form_idx,
                'serial_no': _int_or(txt(mn, ".//*[local-name()='serialNo']")),
                'name': txt(mn, ".//*[local-name()='name']", 250),
                'form13f_number': txt(mn, ".//*[local-name()='form13FFileNumber']", 20) or txt(mn, ".//*[local-name()='icaOr13FFileNumber']", 17),
                'crd_number': txt(mn, ".//*[local-name()='crdNumber']", 20),
                'sec_file_number': txt(mn, ".//*[local-name()='secFileNumber']", 20) or txt(mn, ".//*[local-name()='otherFileNumber']", 17),
                'lei_number': txt(mn, ".//*[local-name()='leiNumber']", 40) or txt(mn, ".//*[local-name()='leiNumberOM']", 20)
            })

        # - series & share-classes -
        series_seen = 0  # counted here instead of rescanning SERIES per filing
        for sr in es.xpath(".//*[local-name()='seriesPage']//*[local-name()='seriesReports']"):
            s_code = txt(sr, ".//*[local-name()='idOfSeries']", 25)
            SERIES.append({
                'local_form': _form_idx,
                'series_code': s_code,
                'series_name': txt(sr, ".//*[local-name()='nameOfSeries']", 250),
                'series_lei': txt(sr, ".//*[local-name()='leiOfSeries']", 40)
            })
            series_seen += 1
            # share classes
            for cls in sr.xpath(".//*[local-name()='classInfo']"):
                SERIES_CLASS.append({
                    'local_form': _form_idx,
                    'series_code': s_code,
                    'class_id': txt(cls, ".//*[local-name()='classId']", 10),
                    'class_name': txt(cls, ".//*[local-name()='className']", 250)
                })

        # f is the same dict object already appended to FORMS.
        f['series_count'] = series_seen

        # - other persons reporting -
        for op in es.xpath(".//*[local-name()='coverPage']//*[local-name()='otherManagersInfo']/*[local-name()='otherManager']"):
            OTHER_PERSONS.append({
                'local_form': _form_idx,
                'ica_form13f_number': txt(op, ".//*[local-name()='icaOr13FFileNumber']", 17),
                'crd_number': txt(op, ".//*[local-name()='crdNumber']", 20),
                'sec_file_number': txt(op, ".//*[local-name()='otherFileNumber']", 17),
                'lei_number': txt(op, ".//*[local-name()='leiNumberOM']", 20),
                'manager_name': txt(op, ".//*[local-name()='managerName']", 150)
            })

        # - proxy votes - (huge; parse last)
        # NOTE(review): votes are only searched inside the edgarSubmission
        # fragment captured by XML_RE - confirm the vote table is not filed
        # as a separate XML document in the inputs you load.
        for pvt in es.xpath(".//*[local-name()='proxyVoteTable']"):
            for pt in pvt.xpath(".//*[local-name()='proxyTable']"):
                v_loc = len(VOTES)
                vote = {
                    'local_vote': v_loc, 'local_form': _form_idx,
                    'issuer_name': txt(pt, './/*[local-name()="issuerName"]', 250),
                    'cusip': txt(pt, './/*[local-name()="cusip"]', 30),
                    'isin': txt(pt, './/*[local-name()="isin"]', 30),
                    'figi': txt(pt, './/*[local-name()="figi"]', 30),
                    'meeting_date': pdate(txt(pt, './/*[local-name()="meetingDate"]')),
                    'vote_description': txt(pt, './/*[local-name()="voteDescription"]'),
                    'proposed_by': txt(pt, './/*[local-name()="voteSource"]', 20),
                    'shares_voted': dec(pt, './/*[local-name()="sharesVoted"][1]'),
                    'shares_on_loan': dec(pt, './/*[local-name()="sharesOnLoan"][1]'),
                    'vote_cast': None, 'vote_cast_shares': None,
                    'management_rec': None, 'other_notes': None
                }
                vr = pt.xpath('.//*[local-name()="voteRecord"]')
                if vr:
                    # Only the first voteRecord is stored; extras are noted.
                    vote['vote_cast'] = txt(vr[0], './/*[local-name()="howVoted"]', 50)
                    vote['vote_cast_shares'] = dec(vr[0], './/*[local-name()="sharesVoted"]')
                    vote['management_rec'] = txt(vr[0], './/*[local-name()="managementRecommendation"]', 50)
                    if len(vr) > 1:
                        vote['other_notes'] = f"{len(vr)} voteRecord tags found"
                VOTES.append(vote)

                # categories
                for c in pt.xpath('.//*[local-name()="categoryType"]/text()'):
                    c_clean = c.strip()[:100]
                    if c_clean not in CAT_LOOKUP:
                        CAT_LOOKUP[c_clean] = len(CAT_LOOKUP)+1
                        CATS.append({'local_cat_id': CAT_LOOKUP[c_clean], 'category_type': c_clean})
                    VOTE_CATS.append({'local_vote': v_loc, 'local_cat_id': CAT_LOOKUP[c_clean]})

                # vote <-> managers (by serialNo)
                for om in pt.xpath('.//*[local-name()="voteManager"]//*[local-name()="otherManager"]/text()'):
                    try:
                        sn = int(om.strip())
                    except ValueError:
                        continue  # not a serial number; skip this reference
                    VOTE_MGR.append({'local_vote': v_loc, 'local_form': _form_idx, 'serial_no': sn})

                # vote <-> series
                sc = txt(pt, './/*[local-name()="voteSeries"]', 25)
                if sc:
                    VOTE_SERIES.append({'local_vote': v_loc, 'local_form': _form_idx, 'series_code': sc})

        _form_idx += 1  # next filing

# ──────────────────────────────────────────────────────────────────────
# Batch insert into RDS
# ──────────────────────────────────────────────────────────────────────

def batch_insert():
    """Insert every staged row into PostgreSQL inside one transaction.

    Tables are written parents-first (forms, managers, series, share
    classes, other reporting persons, categories, votes) and then the three
    bridge tables.  Local keys assigned during parsing are translated to the
    ids returned by the database.  The connection is always closed, also
    when an insert fails (``with conn`` only commits or rolls back).
    """
    conn = psycopg2.connect(host=DB_HOST, port=DB_PORT, dbname=DB_NAME, user=DB_USER, password=DB_PASS)
    try:
        with conn, conn.cursor() as cur:
            # --- form_npx ----------------------------------------------------
            form_cols = ('reporting_person_name','phone_number','address_street1','address_street2','address_city','address_state','address_zip','accession_number','is_parsable','cik','conformed_period','date_filed','report_type','form_type','sec_file_number','crd_number','sec_file_number_other','lei_number','investment_company_type','confidential_treatment','is_notice_report','explanatory_choice','other_included_managers_count','series_count','is_amendment','amendment_no','amendment_type','notice_explanation','explanatory_notes','signatory_name','signatory_name_printed','signatory_title','signatory_date')
            form_rows = [tuple(f[c] for c in form_cols) for f in FORMS]
            form_ids = execute_values(cur, f"INSERT INTO form_npx ({','.join(form_cols)}) VALUES %s RETURNING form_id", form_rows, fetch=True, page_size=1000)
            # Staged forms and returned ids are paired by position.
            # NOTE(review): this assumes RETURNING yields rows in insert order.
            id_map_form = {f['local_form']: fid[0] for f, fid in zip(FORMS, form_ids)}

            # --- institutional_manager --------------------------------------
            mgr_cols = ('form_id','serial_no','name','form13f_number','crd_number','sec_file_number','lei_number')
            mgr_rows = [(
                id_map_form[m['local_form']], m['serial_no'], m['name'], m['form13f_number'], m['crd_number'], m['sec_file_number'], m['lei_number']
            ) for m in MANAGERS]
            mgr_ids = execute_values(cur, f"INSERT INTO institutional_manager ({','.join(mgr_cols)}) VALUES %s RETURNING manager_id, form_id, serial_no", mgr_rows, fetch=True, page_size=1000)
            mgr_lookup = {(fid, sn): mid for mid, fid, sn in mgr_ids}

            # --- series ------------------------------------------------------
            ser_cols = ('form_id','series_code','series_name','series_lei')
            ser_rows = [(
                id_map_form[s['local_form']], s['series_code'], s['series_name'], s['series_lei']
            ) for s in SERIES]
            ser_ids = execute_values(cur, f"INSERT INTO series ({','.join(ser_cols)}) VALUES %s RETURNING series_id, form_id, series_code", ser_rows, fetch=True, page_size=1000)
            ser_lookup = {(fid, sc): sid for sid, fid, sc in ser_ids}

            # --- series_class -----------------------------------------------
            # Classes whose parent series was not inserted are dropped.
            sc_cols = ('series_id','class_id','class_name')
            sc_rows = []
            for sc in SERIES_CLASS:
                series_id = ser_lookup.get((id_map_form[sc['local_form']], sc['series_code']))
                if series_id is not None:
                    sc_rows.append((series_id, sc['class_id'], sc['class_name']))
            execute_values(cur, f"INSERT INTO series_class ({','.join(sc_cols)}) VALUES %s", sc_rows, page_size=1000)

            # --- other_reporting_person -------------------------------------
            op_cols = ('form_id','ica_form13f_number','crd_number','sec_file_number','lei_number','manager_name')
            op_rows = [(
                id_map_form[op['local_form']], op['ica_form13f_number'], op['crd_number'], op['sec_file_number'], op['lei_number'], op['manager_name']
            ) for op in OTHER_PERSONS]
            execute_values(cur, f"INSERT INTO other_reporting_person ({','.join(op_cols)}) VALUES %s", op_rows, page_size=1000)

            # --- matter_category (upsert) -----------------------------------
            # The no-op DO UPDATE makes RETURNING also report rows that
            # already existed.
            cat_rows = [(c['category_type'],) for c in CATS]
            cat_ids = execute_values(cur, "INSERT INTO matter_category (category_type) VALUES %s ON CONFLICT(category_type) DO UPDATE SET category_type=EXCLUDED.category_type RETURNING category_id, category_type", cat_rows, fetch=True, page_size=1000)
            cat_lookup = {typ: cid for cid, typ in cat_ids}

            # --- proxy_voting_record ----------------------------------------
            vote_cols = ('form_id','issuer_name','cusip','isin','figi','meeting_date','vote_description','proposed_by','shares_voted','shares_on_loan','vote_cast','vote_cast_shares','management_rec','other_notes')
            vote_rows = [(
                id_map_form[v['local_form']], v['issuer_name'], v['cusip'], v['isin'], v['figi'], v['meeting_date'], v['vote_description'], v['proposed_by'], v['shares_voted'], v['shares_on_loan'], v['vote_cast'], v['vote_cast_shares'], v['management_rec'], v['other_notes']
            ) for v in VOTES]
            vote_ids = execute_values(cur, f"INSERT INTO proxy_voting_record ({','.join(vote_cols)}) VALUES %s RETURNING vote_id", vote_rows, fetch=True, page_size=1000)
            vote_lookup = {v['local_vote']: vid[0] for v, vid in zip(VOTES, vote_ids)}

            # --- vote <-> category bridge -----------------------------------
            # Invert CAT_LOOKUP once instead of scanning it for every row.
            cat_text_by_local = {local_id: text for text, local_id in CAT_LOOKUP.items()}
            vc_rows = [
                (vote_lookup[vc['local_vote']], cat_lookup[cat_text_by_local[vc['local_cat_id']]])
                for vc in VOTE_CATS
            ]
            execute_values(cur, "INSERT INTO proxy_voting_record_category (vote_id, category_id) VALUES %s ON CONFLICT DO NOTHING", vc_rows, page_size=1000)

            # --- vote <-> manager bridge ------------------------------------
            vm_rows = []
            for vm in VOTE_MGR:
                fk_form = id_map_form[vm['local_form']]
                mid = mgr_lookup.get((fk_form, vm['serial_no']))
                if mid is not None:  # skip references to unknown serial numbers
                    vm_rows.append((vote_lookup[vm['local_vote']], mid))
            execute_values(cur, "INSERT INTO voting_record_manager (vote_id, manager_id) VALUES %s ON CONFLICT DO NOTHING", vm_rows, page_size=1000)

            # --- vote <-> series bridge -------------------------------------
            vs_rows = []
            for vs in VOTE_SERIES:
                sid = ser_lookup.get((id_map_form[vs['local_form']], vs['series_code']))
                if sid is not None:  # skip references to unknown series codes
                    vs_rows.append((vote_lookup[vs['local_vote']], sid))
            execute_values(cur, "INSERT INTO voting_record_series (vote_id, series_id) VALUES %s ON CONFLICT DO NOTHING", vs_rows, page_size=1000)
    finally:
        conn.close()

# ──────────────────────────────────────────────────────────────────────
# Driver
# ──────────────────────────────────────────────────────────────────────

def run_loader(folder="npx_filings"):
    """Parse every ``.txt`` file in *folder* (sorted by name), then insert.

    Args:
        folder: Directory holding the filing text files.
    """
    filing_names = [
        name for name in sorted(os.listdir(folder))
        if name.lower().endswith('.txt')
    ]
    for name in filing_names:
        parse_filing(os.path.join(folder, name))
    batch_insert()
    print("✔ All filings loaded")

# Run the loader with its default folder when executed as a script.
if __name__ == "__main__":
    run_loader()


In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Form N‑PX loader – 2025‑04‑24 schema (polished)
────────────────────────────────────────────────────
• Adds three quality‑of‑life improvements:
  1. **O(1) category look‑ups** (dict instead of next()/search).
  2. **Pending share‑class queue** – if a <classInfo> appears before its
     parent <series>, it is retried after series insert, so no rows are
     lost even in malformed filings.
  3. **Chunked flush** – parses filings in blocks of `FLUSH_EVERY` (e.g.
     250) so RAM stays bounded on >10 k‑filing runs.
"""

import os, re, datetime as dt
from decimal import Decimal
from collections import defaultdict
import psycopg2
from psycopg2.extras import execute_values
import lxml.etree as ET

# ──────────────────────────────────────────────────────────────────────
# Config
# ──────────────────────────────────────────────────────────────────────
# Connection settings are read from the libpq-style environment variables;
# the literals are the fallbacks used when a variable is not set.
DB_HOST = os.environ.get("PGHOST", "localhost")
DB_PORT = int(os.environ.get("PGPORT", 5432))
DB_USER = os.environ.get("PGUSER", "postgres")
DB_PASS = os.environ.get("PGPASSWORD", "postgres")
DB_NAME = os.environ.get("PGDATABASE", "npx")

# filings per batch
FLUSH_EVERY = int(os.environ.get("NPX_FLUSH_EVERY", 250))

# ──────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────
# Date layouts that pdate iterates over, in this order.
DATE_FMTS = (
    "%m/%d/%Y",
    "%Y-%m-%d",
    "%m-%d-%Y",
    "%Y%m%d",
)
# Matches any character that is not a digit, a dot or a minus sign.
DEC_RE = re.compile(r"[^0-9.\-]")
# One embedded XML document: from the <?xml declaration to </edgarSubmission>.
XML_RE = re.compile(
    r"(<\?xml.*?</edgarSubmission>)",
    flags=re.DOTALL | re.IGNORECASE,
)
# Captures the accession number and the 8-digit "filed as of" date.
HDR_RE = re.compile(
    r"ACCESSION\s+NUMBER:\s*([^\r\n]+).*?FILED\s+AS\s+OF\s+DATE:\s*(\d{8})",
    flags=re.DOTALL | re.IGNORECASE,
)
# Shared lxml parser configured to recover from errors and accept huge trees.
PARSER = ET.XMLParser(huge_tree=True, recover=True)

def pdate(s):
    """Parse a date string using the layouts listed in ``DATE_FMTS``.

    The previous one-line lambda called ``strptime`` inside a generator
    without catching ``ValueError``, so any input that did not match the
    *first* layout raised instead of falling through to the next one.

    Args:
        s: Raw date text; may be ``None`` or empty.

    Returns:
        A ``datetime.date`` for the first layout that matches, or ``None``
        when *s* is falsy, blank, or matches none of the layouts.
    """
    if not s:
        return None
    text = s.strip()
    if not text:
        return None
    for fmt in DATE_FMTS:
        try:
            return dt.datetime.strptime(text, fmt).date()
        except ValueError:
            continue  # wrong layout for this candidate; try the next one
    return None

def txt(node, xp, sl=None):
    """Fetch the first XPath result of *xp* on *node* as stripped text.

    Args:
        node: Object exposing an ``xpath(expr)`` method that returns a list.
        xp: XPath expression to evaluate.
        sl: Optional length limit; falsy means no truncation.

    Returns:
        "" when nothing matches; otherwise the first result (a string as
        is, anything else via its ``.text`` attribute, "" if that is
        empty/None), stripped and cut to *sl* characters when *sl* is set.
    """
    hits = node.xpath(xp)
    if hits:
        head = hits[0]
        text = (head if isinstance(head, str) else (head.text or "")).strip()
        return text[:sl] if sl else text
    return ""

def dec(node, xp):
    """Read the first XPath hit as a ``Decimal``.

    Args:
        node: Object accepted by ``txt``.
        xp: XPath expression locating the numeric text.

    Returns:
        The value as ``Decimal`` after every character matched by
        ``DEC_RE`` has been removed, or ``None`` when there is no text or
        the remaining text is not a valid number.
    """
    raw = txt(node, xp)
    if not raw:
        return None
    try:
        return Decimal(DEC_RE.sub('', raw))
    except ArithmeticError:
        # decimal.InvalidOperation derives from ArithmeticError; catching
        # only that (instead of a bare except) keeps the "bad number ->
        # None" behaviour without hiding unrelated errors.
        return None

# ──────────────────────────────────────────────────────────────────────
# In‑memory staging (cleared after each flush)
# ──────────────────────────────────────────────────────────────────────
# Staged rows, grouped roughly by insert order; every name is its own list.
FORMS, MANAGERS, SERIES, SERIES_CLASS = [], [], [], []
PENDING_SERIES_CLASS = []  # queue for retry
VOTES, CATS, VOTE_CATS = [], [], []
VOTE_MGR, VOTE_SERIES = [], []
OTHER_PERSONS = []
# category text -> local category id
CAT_LOOKUP = {}
# Local index handed to the next form that gets staged.
_form_idx = 0

# ──────────────────────────────────────────────────────────────────────
# XML parsing per filing
# ──────────────────────────────────────────────────────────────────────

def _int_or(text, default=None):
    """Return ``int(text)`` when *text* consists only of decimal digits, else *default*."""
    # isdecimal() (not isdigit()) so characters like superscript digits,
    # which int() rejects, fall back to the default instead of raising.
    return int(text) if text.isdecimal() else default


def parse_filing(path):
    """Parse one N-PX ``.txt`` filing and stage its rows in the module lists.

    The accession number and filing date come from the text header
    (``HDR_RE``); every XML fragment matched by ``XML_RE`` is parsed with
    the recovering ``PARSER``.  Each ``edgarSubmission`` element found adds
    one row to ``FORMS`` plus related rows to ``MANAGERS``, ``SERIES``,
    ``SERIES_CLASS``, ``OTHER_PERSONS``, ``VOTES``, ``CATS``, ``VOTE_CATS``,
    ``VOTE_MGR`` and ``VOTE_SERIES``.  When no usable fragment exists, a
    minimal form row with ``is_parsable = False`` is staged instead.

    Args:
        path: Path of the filing text file (read as UTF-8, undecodable
            bytes replaced).

    Side effects:
        Appends to the staging lists and advances the global ``_form_idx``
        by one per staged form.
    """
    global _form_idx
    with open(path, encoding='utf-8', errors='replace') as fh:
        raw = fh.read()
    m = HDR_RE.search(raw)
    accession, filed = (m.group(1).strip(), pdate(m.group(2))) if m else ("", None)

    # Resolve every fragment to its <edgarSubmission> element up front.
    # "descendant-or-self" is required: ".//*" only looks at descendants, so
    # it finds nothing (and [0] raised IndexError) when the fragment's root
    # *is* the edgarSubmission element.
    submissions = []
    for frag in XML_RE.findall(raw):
        root = ET.fromstring(frag.encode(), parser=PARSER)
        if root is None:  # recovering parser could not build any tree
            continue
        found = root.xpath("descendant-or-self::*[local-name()='edgarSubmission']")
        if found:
            submissions.append(found[0])
    is_parsable = bool(submissions)

    base = {
        'local_form': _form_idx,
        'accession_number': accession[:30], 'date_filed': filed,
        'is_parsable': is_parsable, 'explanatory_notes': ""
    }
    if not is_parsable:
        FORMS.append(base)
        _form_idx += 1
        return

    yes_values = {'Y', 'YES', 'TRUE', '1'}

    for es in submissions:
        f = dict(base)
        # The copy still carries the index of the first fragment; refresh it
        # so every staged form has its own local key (child rows below use
        # the current _form_idx).
        f['local_form'] = _form_idx
        # cover + misc
        f.update({
            'reporting_person_name': txt(es, ".//*[local-name()='reportingPerson']/*[local-name()='name']",250),
            'phone_number': txt(es, ".//*[local-name()='phoneNumber']",50),
            'address_street1': txt(es,".//*[local-name()='address']/*[local-name()='street1']",250),
            'address_street2': txt(es,".//*[local-name()='address']/*[local-name()='street2']",250),
            'address_city': txt(es,".//*[local-name()='address']/*[local-name()='city']",100),
            'address_state': txt(es,".//*[local-name()='address']/*[local-name()='stateOrCountry']",100),
            'address_zip': txt(es,".//*[local-name()='address']/*[local-name()='zipCode']",30),
            'cik': txt(es, ".//*[local-name()='cik']",15),
            'conformed_period': pdate(txt(es, ".//*[local-name()='periodOfReport']")),
            'report_type': txt(es, ".//*[local-name()='reportType']",100) or "FUND VOTING REPORT",
            'form_type': txt(es, ".//*[local-name()='submissionType']",10) or "N-PX",
            'sec_file_number': txt(es, ".//*[local-name()='fileNumber']",20),
            'crd_number': txt(es, ".//*[local-name()='reportingCrdNumber']",20),
            'sec_file_number_other': txt(es, ".//*[local-name()='reportingSecFileNumber']",20),
            'lei_number': txt(es, ".//*[local-name()='leiNumber']",40),
            'investment_company_type': txt(es, ".//*[local-name()='investmentCompanyType']",20),
            'confidential_treatment': 'Y' if txt(es,".//*[local-name()='confidentialTreatment']").upper() in yes_values else 'N',
            'is_notice_report': 'NOTICE' in txt(es, ".//*[local-name()='reportType']").upper(),
            'explanatory_choice': 'Y' if txt(es, ".//*[local-name()='explanatoryChoice']").upper() in yes_values else 'N',
            'notice_explanation': txt(es, ".//*[local-name()='noticeExplanation']",200),
            # Non-numeric text falls back to 0 instead of raising ValueError.
            'other_included_managers_count': _int_or(txt(es, ".//*[local-name()='otherIncludedManagersCount']"), 0),
            'is_amendment': txt(es,".//*[local-name()='isAmendment']").upper() in yes_values,
            'amendment_no': _int_or(txt(es, ".//*[local-name()='amendmentNo']")),
            'amendment_type': txt(es, ".//*[local-name()='amendmentType']",20),
            'explanatory_notes': txt(es, ".//*[local-name()='explanatoryNotes']",200),
            'signatory_name': txt(es, ".//*[local-name()='txSignature']",250),
            'signatory_name_printed': txt(es, ".//*[local-name()='txPrintedSignature']",250),
            'signatory_title': txt(es, ".//*[local-name()='txTitle']",100),
            'signatory_date': pdate(txt(es, ".//*[local-name()='txAsOfDate']"))
        })
        FORMS.append(f)

        # managers
        for mn in es.xpath(".//*[local-name()='summaryPage']//*[local-name()='investmentManagers']"):
            MANAGERS.append({
                'local_form': _form_idx,
                'serial_no': _int_or(txt(mn, './/*[local-name()="serialNo"]')),
                'name': txt(mn,'.//*[local-name()="name"]',250),
                'form13f_number': txt(mn,'.//*[local-name()="form13FFileNumber"]',20) or txt(mn,'.//*[local-name()="icaOr13FFileNumber"]',17),
                'crd_number': txt(mn,'.//*[local-name()="crdNumber"]',20),
                'sec_file_number': txt(mn,'.//*[local-name()="secFileNumber"]',20) or txt(mn,'.//*[local-name()="otherFileNumber"]',17),
                'lei_number': txt(mn,'.//*[local-name()="leiNumber"]',40) or txt(mn,'.//*[local-name()="leiNumberOM"]',20)
            })

        # series + class
        series_seen = 0
        for sr in es.xpath('.//*[local-name()="seriesReports"]'):
            s_code = txt(sr,'.//*[local-name()="idOfSeries"]',25)
            SERIES.append({
                'local_form': _form_idx,
                'series_code': s_code,
                'series_name': txt(sr,'.//*[local-name()="nameOfSeries"]',250),
                'series_lei': txt(sr,'.//*[local-name()="leiOfSeries"]',40)
            })
            series_seen += 1
            for cls in sr.xpath('.//*[local-name()="classInfo"]'):
                SERIES_CLASS.append({
                    'local_form': _form_idx,
                    'series_code': s_code,
                    'class_id': txt(cls,'.//*[local-name()="classId"]',10),
                    'class_name': txt(cls,'.//*[local-name()="className"]',250)
                })
        # The form row has a series_count column in the insert list; fill it
        # (f is the same dict object already appended to FORMS).
        f['series_count'] = series_seen

        # other persons
        # Exclude <otherManager> elements inside the vote table: those are
        # the per-vote manager references handled by the manager bridge
        # below, not other reporting persons.
        for op in es.xpath('.//*[local-name()="otherManager"][not(ancestor::*[local-name()="proxyVoteTable"])]'):
            OTHER_PERSONS.append({
                'local_form': _form_idx,
                'ica_form13f_number': txt(op,'.//*[local-name()="icaOr13FFileNumber"]',17),
                'crd_number': txt(op,'.//*[local-name()="crdNumber"]',20),
                'sec_file_number': txt(op,'.//*[local-name()="otherFileNumber"]',17),
                'lei_number': txt(op,'.//*[local-name()="leiNumberOM"]',20),
                'manager_name': txt(op,'.//*[local-name()="managerName"]',150)
            })

        # votes
        # NOTE(review): votes are only searched inside the edgarSubmission
        # fragment captured by XML_RE - confirm the vote table is not filed
        # as a separate XML document in the inputs you load.
        for pvt in es.xpath('.//*[local-name()="proxyVoteTable"]'):
            for pt in pvt.xpath('.//*[local-name()="proxyTable"]'):
                lv = len(VOTES)
                vote = {
                    'local_vote': lv, 'local_form': _form_idx,
                    'issuer_name': txt(pt,'.//*[local-name()="issuerName"]',250),
                    'cusip': txt(pt,'.//*[local-name()="cusip"]',30),
                    'isin': txt(pt,'.//*[local-name()="isin"]',30),
                    'figi': txt(pt,'.//*[local-name()="figi"]',30),
                    'meeting_date': pdate(txt(pt,'.//*[local-name()="meetingDate"]')),
                    'vote_description': txt(pt,'.//*[local-name()="voteDescription"]'),
                    'proposed_by': txt(pt,'.//*[local-name()="voteSource"]',20),
                    'shares_voted': dec(pt,'.//*[local-name()="sharesVoted"][1]'),
                    'shares_on_loan': dec(pt,'.//*[local-name()="sharesOnLoan"][1]'),
                    'vote_cast': None, 'vote_cast_shares': None,
                    'management_rec': None, 'other_notes': None
                }
                vr = pt.xpath('.//*[local-name()="voteRecord"]')
                if vr:
                    # Only the first voteRecord is stored; extras are noted.
                    vote['vote_cast'] = txt(vr[0],'.//*[local-name()="howVoted"]',50)
                    vote['vote_cast_shares'] = dec(vr[0],'.//*[local-name()="sharesVoted"]')
                    vote['management_rec'] = txt(vr[0],'.//*[local-name()="managementRecommendation"]',50)
                    if len(vr) > 1:
                        vote['other_notes'] = f"{len(vr)} voteRecord tags found"
                VOTES.append(vote)
                # categories O(1)
                for cat in pt.xpath('.//*[local-name()="categoryType"]/text()'):
                    c = cat.strip()[:100]
                    if c not in CAT_LOOKUP:
                        CAT_LOOKUP[c] = len(CAT_LOOKUP)+1
                        CATS.append({'local_cat_id': CAT_LOOKUP[c], 'category_type': c})
                    VOTE_CATS.append({'local_vote': lv, 'local_cat_id': CAT_LOOKUP[c]})
                # manager bridge
                for om in pt.xpath('.//*[local-name()="otherManager"]/text()'):
                    try:
                        sn = int(om.strip())
                    except ValueError:
                        continue  # not a serial number; skip this reference
                    VOTE_MGR.append({'local_vote': lv, 'local_form': _form_idx, 'serial_no': sn})
                # series bridge
                sc = txt(pt,'.//*[local-name()="voteSeries"]',25)
                if sc:
                    VOTE_SERIES.append({'local_vote': lv, 'local_form': _form_idx, 'series_code': sc})

        _form_idx += 1

# ──────────────────────────────────────────────────────────────────────
# Batch insert with pending‑class retry + O(1) cat map
# ──────────────────────────────────────────────────────────────────────

def flush_to_db():
    """Persist all buffered rows in a single transaction, then empty the buffers.

    Insert order follows the ID dependencies used below: ``form_npx`` first
    (its generated ``form_id`` values key every child row), then
    ``institutional_manager``, ``series``, ``series_class``,
    ``other_reporting_person``, ``matter_category``, ``proxy_voting_record``
    and finally the three bridge tables.

    Does nothing when ``FORMS`` is empty.  The module-level buffers are only
    modified after the transaction has committed, so a failed flush leaves
    them intact; the connection is closed whether or not the flush succeeds.
    """
    if not FORMS:
        return

    conn = psycopg2.connect(host=DB_HOST, port=DB_PORT, dbname=DB_NAME, user=DB_USER, password=DB_PASS)
    try:
        # `with conn` commits on success / rolls back on error but does NOT
        # close the connection -- hence the surrounding try/finally.
        with conn, conn.cursor() as cur:
            # form_npx
            form_cols = [
                'reporting_person_name', 'phone_number', 'address_street1', 'address_street2',
                'address_city', 'address_state', 'address_zip', 'accession_number', 'is_parsable',
                'cik', 'conformed_period', 'date_filed', 'report_type', 'form_type',
                'sec_file_number', 'crd_number', 'sec_file_number_other', 'lei_number',
                'investment_company_type', 'confidential_treatment', 'is_notice_report',
                'explanatory_choice', 'other_included_managers_count', 'series_count',
                'is_amendment', 'amendment_no', 'amendment_type', 'notice_explanation',
                'explanatory_notes', 'signatory_name', 'signatory_name_printed',
                'signatory_title', 'signatory_date',
            ]
            form_rows = [tuple(f.get(c) for c in form_cols) for f in FORMS]
            f_ids = execute_values(
                cur,
                f"INSERT INTO form_npx ({','.join(form_cols)}) VALUES %s RETURNING form_id",
                form_rows, fetch=True, page_size=1000)
            # NOTE(review): pairing by position assumes RETURNING yields rows in
            # insert order -- confirm this holds for the target server.
            m_form = {f['local_form']: fid[0] for f, fid in zip(FORMS, f_ids)}

            # managers
            mgr_cols = ('form_id', 'serial_no', 'name', 'form13f_number', 'crd_number', 'sec_file_number', 'lei_number')
            mgr_rows = [
                (m_form[m['local_form']], m['serial_no'], m['name'], m['form13f_number'],
                 m['crd_number'], m['sec_file_number'], m['lei_number'])
                for m in MANAGERS
            ]
            m_ids = execute_values(
                cur,
                f"INSERT INTO institutional_manager ({','.join(mgr_cols)}) VALUES %s RETURNING manager_id,form_id,serial_no",
                mgr_rows, fetch=True, page_size=1000)
            m_lookup = {(fid, sn): mid for mid, fid, sn in m_ids}

            # series
            ser_cols = ('form_id', 'series_code', 'series_name', 'series_lei')
            ser_rows = [(m_form[s['local_form']], s['series_code'], s['series_name'], s['series_lei']) for s in SERIES]
            ser_ret = execute_values(
                cur,
                f"INSERT INTO series ({','.join(ser_cols)}) VALUES %s RETURNING series_id,form_id,series_code",
                ser_rows, fetch=True, page_size=1000)
            ser_lookup = {(fid, scode): sid for sid, fid, scode in ser_ret}

            # series_class: rows whose parent series was inserted above are
            # ready; everything else stays pending.  Entries carried over from
            # an earlier flush reference a local_form that is absent from this
            # batch's m_form, so it is looked up with .get() instead of
            # indexing (which used to raise KeyError).
            ready, pending = [], []
            for sc in SERIES_CLASS + PENDING_SERIES_CLASS:
                form_id = m_form.get(sc['local_form'])
                series_id = None if form_id is None else ser_lookup.get((form_id, sc['series_code']))
                if series_id is None:
                    pending.append(sc)
                else:
                    ready.append((series_id, sc['class_id'], sc['class_name']))
            execute_values(cur, "INSERT INTO series_class (series_id,class_id,class_name) VALUES %s", ready, page_size=1000)

            # other persons
            execute_values(
                cur,
                "INSERT INTO other_reporting_person (form_id,ica_form13f_number,crd_number,sec_file_number,lei_number,manager_name) VALUES %s",
                [(m_form[o['local_form']], o['ica_form13f_number'], o['crd_number'],
                  o['sec_file_number'], o['lei_number'], o['manager_name']) for o in OTHER_PERSONS],
                page_size=1000)

            # matter_category upsert.  CAT_LOOKUP is not cleared between
            # flushes while CATS is, so the upsert is driven by CAT_LOOKUP:
            # every local category id a vote may reference gets a database id
            # even when the category was first seen in an earlier batch.  The
            # ON CONFLICT clause makes re-sending known categories harmless.
            cat_ids = execute_values(
                cur,
                "INSERT INTO matter_category (category_type) VALUES %s ON CONFLICT(category_type) DO UPDATE SET category_type=EXCLUDED.category_type RETURNING category_id,category_type",
                [(category_type,) for category_type in CAT_LOOKUP],
                fetch=True, page_size=1000)
            cat_id_map = {ctype: cid for cid, ctype in cat_ids}
            local_to_db_cat = {local_id: cat_id_map[ctype] for ctype, local_id in CAT_LOOKUP.items()}

            # votes
            vote_cols = ('form_id', 'issuer_name', 'cusip', 'isin', 'figi', 'meeting_date', 'vote_description',
                         'proposed_by', 'shares_voted', 'shares_on_loan', 'vote_cast', 'vote_cast_shares',
                         'management_rec', 'other_notes')
            vote_rows = [
                (m_form[v['local_form']], v['issuer_name'], v['cusip'], v['isin'], v['figi'],
                 v['meeting_date'], v['vote_description'], v['proposed_by'], v['shares_voted'],
                 v['shares_on_loan'], v['vote_cast'], v['vote_cast_shares'], v['management_rec'],
                 v['other_notes'])
                for v in VOTES
            ]
            vote_ret = execute_values(
                cur,
                f"INSERT INTO proxy_voting_record ({','.join(vote_cols)}) VALUES %s RETURNING vote_id",
                vote_rows, fetch=True, page_size=1000)
            vote_id_map = {v['local_vote']: vid[0] for v, vid in zip(VOTES, vote_ret)}

            # bridges -- links whose manager/series cannot be resolved are skipped
            cat_links = [(vote_id_map[vc['local_vote']], local_to_db_cat[vc['local_cat_id']]) for vc in VOTE_CATS]
            execute_values(
                cur,
                "INSERT INTO proxy_voting_record_category (vote_id,category_id) VALUES %s ON CONFLICT DO NOTHING",
                cat_links, page_size=1000)

            mgr_links = []
            for vm in VOTE_MGR:
                manager_id = m_lookup.get((m_form[vm['local_form']], vm['serial_no']))
                if manager_id:
                    mgr_links.append((vote_id_map[vm['local_vote']], manager_id))
            execute_values(
                cur,
                "INSERT INTO voting_record_manager (vote_id,manager_id) VALUES %s ON CONFLICT DO NOTHING",
                mgr_links, page_size=1000)

            series_links = []
            for vs in VOTE_SERIES:
                series_id = ser_lookup.get((m_form[vs['local_form']], vs['series_code']))
                if series_id:
                    series_links.append((vote_id_map[vs['local_vote']], series_id))
            execute_values(
                cur,
                "INSERT INTO voting_record_series (vote_id,series_id) VALUES %s ON CONFLICT DO NOTHING",
                series_links, page_size=1000)
    finally:
        conn.close()

    # Transaction committed: drop what was written, keep unresolved share classes.
    PENDING_SERIES_CLASS.clear()
    SERIES_CLASS.clear()
    SERIES_CLASS.extend(pending)
    for buffer in (FORMS, MANAGERS, SERIES, OTHER_PERSONS, VOTES, CATS, VOTE_CATS, VOTE_MGR, VOTE_SERIES):
        buffer.clear()

# ──────────────────────────────────────────────────────────────────────
# Driver with chunked flush
# ──────────────────────────────────────────────────────────────────────

def run_loader(folder='npx_filings'):
    """Parse every ``.txt`` file in *folder* and flush to the database in chunks.

    Files are visited in sorted name order.  After each FLUSH_EVERY parsed
    files the buffers are flushed; a final flush handles the remainder.
    """
    parsed = 0
    txt_names = (name for name in sorted(os.listdir(folder)) if name.lower().endswith('.txt'))
    for name in txt_names:
        parse_filing(os.path.join(folder, name))
        parsed += 1
        if parsed % FLUSH_EVERY == 0:
            flush_to_db()

    flush_to_db()
    # Final retry for share classes that are still pending.
    # NOTE(review): flush_to_db returns immediately when FORMS is empty, which
    # it is right after the flush above -- confirm this retry can do anything.
    if SERIES_CLASS:
        flush_to_db()
    print('✔ finished')


if __name__ == '__main__':
    run_loader()


In [None]:
#!/usr/bin/env python
# coding: utf-8

import os
import re
import datetime
from decimal import Decimal
import psycopg2
from psycopg2.extras import execute_values
import lxml.etree as ET

# Connection settings for PostgreSQL (the values below look like placeholders).
DB_HOST = "your-rds-instance.xxxx.us-east-1.rds.amazonaws.com"
DB_PORT = 5432
DB_NAME = "your_database_name"
DB_USER = "your_username"
DB_PASS = "your_password"

# Module-level staging buffers, one independent list per record type.
FORMS = []
MANAGERS = []
SERIES_LIST = []
PROXY_VOTES = []
MATTER_CATEGORIES = []
PROXY_VOTE_CATEGORIES = []
VOTING_RECORD_MANAGERS = []
VOTING_RECORD_SERIES = []
SERIES_CLASSES = []
OTHER_PERSONS = []

# Lookup table for categories (starts empty).
CATEGORY_LOOKUP = {}

# Parsing Helpers
def parse_date(s):
    """Convert a date string to a ``datetime.date``.

    The layouts are tried in this order: ``MM/DD/YYYY``, ``YYYY-MM-DD``,
    ``MM-DD-YYYY`` and ``YYYYMMDD``.  Surrounding whitespace is ignored.

    Returns:
        The parsed date, or ``None`` when *s* is not a string or matches none
        of the layouts.
    """
    if not isinstance(s, str):
        return None
    cleaned = s.strip()
    for fmt in ("%m/%d/%Y", "%Y-%m-%d", "%m-%d-%Y", "%Y%m%d"):
        try:
            return datetime.datetime.strptime(cleaned, fmt).date()
        except ValueError:
            # Layout mismatch -- try the next one.  Only ValueError is caught
            # so KeyboardInterrupt and real bugs are not swallowed.
            continue
    return None

def get_text(node, xp):
    """Return the stripped ``.text`` of the first result of ``node.xpath(xp)``.

    Yields ``""`` when the XPath matches nothing or the first match has no
    (or empty) text.
    """
    matches = node.xpath(xp)
    if not matches:
        return ""
    raw = matches[0].text
    if not raw:
        return ""
    return raw.strip()

def get_decimal(node, xp):
    """Read the text selected by *xp* under *node* and convert it to ``Decimal``.

    Commas are removed before conversion.

    Returns:
        The parsed ``Decimal``, or ``None`` when the text is empty or is not a
        valid decimal literal.
    """
    # Imported here because the file-level import only brings in Decimal.
    from decimal import InvalidOperation

    digits = get_text(node, xp).replace(",", "")
    try:
        return Decimal(digits)
    except InvalidOperation:
        # Raised by Decimal() for empty or malformed numeric strings; anything
        # else is a genuine error and is allowed to propagate.
        return None

# XML Parsing
def extract_xml_blocks(path):
    """Return every XML fragment embedded in the file at *path*.

    A fragment runs from an ``<?xml`` declaration to the nearest following
    ``</edgarSubmission>`` closing tag (matched case-insensitively, across
    lines).  The file is decoded as UTF-8 with undecodable bytes replaced.

    Returns:
        A list of fragment strings in file order; empty when none are found.
    """
    # Context manager so the handle is closed even if read() raises.
    with open(path, encoding="utf-8", errors="replace") as fh:
        text = fh.read()
    return re.findall(r'(<\?xml.*?</edgarSubmission>)', text, re.S | re.I)

def parse_xml(xml_str):
    """Parse *xml_str* with a recovering lxml parser.

    The string is encoded to UTF-8 bytes before parsing.  Returns whatever
    ``ET.fromstring`` produces, or ``None`` if it raises ``XMLSyntaxError``.
    """
    try:
        payload = xml_str.encode("utf-8")
        lenient_parser = ET.XMLParser(recover=True)
        return ET.fromstring(payload, lenient_parser)
    except ET.XMLSyntaxError:
        return None

# Main Parsing Logic
def process_npx_files(folder):
    """Parse every ``.txt`` filing in *folder* into the module-level buffers.

    For each file one row is appended to ``FORMS`` (accession number, filing
    date, whether any XML fragment was found).  For each XML fragment that
    parses, rows are appended to ``MANAGERS``, ``SERIES_LIST``,
    ``SERIES_CLASSES`` and ``OTHER_PERSONS``, all tagged with the form's
    ``local_form_idx``.

    Files are processed in sorted name order so index assignment is
    reproducible.  A file whose header cannot be matched still gets a
    ``FORMS`` row, with ``accession_number`` and ``date_filed`` set to None.
    """
    txt_names = sorted(n for n in os.listdir(folder) if n.lower().endswith(".txt"))
    for fname in txt_names:
        path = os.path.join(folder, fname)

        # Same decoding as extract_xml_blocks; the handle is closed promptly.
        with open(path, encoding="utf-8", errors="replace") as fh:
            raw = fh.read()
        header = re.search(r"ACCESSION.*?:\s*([^\r\n]+).*?FILED.*?DATE:\s*(\d{8})", raw, re.S|re.I)
        if header:
            accession, filed_date = header.group(1).strip(), parse_date(header.group(2))
        else:
            # No recognisable header: record the form anyway instead of
            # crashing on header.group().
            accession, filed_date = None, None

        fragments = extract_xml_blocks(path)
        is_parsable = bool(fragments)
        local_form_idx = len(FORMS)

        FORMS.append({
            "local_form_idx": local_form_idx,
            "accession_number": accession,
            "date_filed": filed_date,
            "is_parsable": is_parsable,
            "explanatory_notes": ""
        })

        if not is_parsable:
            continue

        for frag in fragments:
            root = parse_xml(frag)
            # Compare with None explicitly: an lxml element without children
            # is falsy, so `not root` would wrongly skip valid documents.
            if root is None:
                continue

            # descendant-or-self so the match also succeeds when
            # edgarSubmission is the root element itself (".//*" only looks
            # at descendants and would come back empty in that case).
            submissions = root.xpath("descendant-or-self::*[local-name()='edgarSubmission']")
            if not submissions:
                continue
            es = submissions[0]

            explanatory_notes = get_text(es, ".//*[local-name()='explanatoryInformation']/*[local-name()='explanatoryNotes']")[:200]
            FORMS[-1]["explanatory_notes"] = explanatory_notes

            for mn in es.xpath(".//*[local-name()='summaryPage']//*[local-name()='investmentManagers']"):
                try:
                    serial_no = int(get_text(mn, ".//*[local-name()='serialNo']") or 0)
                except ValueError:
                    # Non-numeric serial: use the same default as a missing
                    # one rather than aborting the whole load.
                    serial_no = 0
                MANAGERS.append({
                    "local_form_idx": local_form_idx,
                    "serial_no": serial_no,
                    "name": get_text(mn, ".//*[local-name()='name']")[:250],
                })

            for sn in es.xpath(".//*[local-name()='seriesReports']"):
                SERIES_LIST.append({
                    "local_form_idx": local_form_idx,
                    "series_code": get_text(sn, ".//*[local-name()='idOfSeries']")[:25],
                    "series_name": get_text(sn, ".//*[local-name()='nameOfSeries']")[:250],
                    "series_lei": get_text(sn, ".//*[local-name()='leiOfSeries']")[:40]
                })

            for cls in es.xpath(".//*[local-name()='rptSeriesClassInfo']"):
                SERIES_CLASSES.append({
                    "local_form_idx": local_form_idx,
                    "class_id": get_text(cls, ".//*[local-name()='classId']")[:10],
                    "class_name": get_text(cls, ".//*[local-name()='className']")[:250]
                })

            for op in es.xpath(".//*[local-name()='otherManagersInfo']/*[local-name()='otherManager']"):
                OTHER_PERSONS.append({
                    "local_form_idx": local_form_idx,
                    "ica_form13f_number": get_text(op, ".//*[local-name()='icaOr13FFileNumber']")[:17],
                    "crd_number": get_text(op, ".//*[local-name()='crdNumber']")[:20],
                    "sec_file_number": get_text(op, ".//*[local-name()='otherFileNumber']")[:17],
                    "lei_number": get_text(op, ".//*[local-name()='leiNumberOM']")[:20],
                    "manager_name": get_text(op, ".//*[local-name()='managerName']")[:150]
                })

# Main Execution
def run_all(folder_path="npx_filings"):
    """Reset the staging lists, parse *folder_path*, then insert the results.

    Only FORMS, MANAGERS, SERIES_LIST, SERIES_CLASSES and OTHER_PERSONS are
    rebound to fresh empty lists; the other module-level buffers are left
    untouched.
    """
    global FORMS, MANAGERS, SERIES_LIST, SERIES_CLASSES, OTHER_PERSONS
    FORMS = []
    MANAGERS = []
    SERIES_LIST = []
    SERIES_CLASSES = []
    OTHER_PERSONS = []

    process_npx_files(folder_path)
    # NOTE(review): insert_data is not defined in this part of the file --
    # confirm it exists before this function is called.
    insert_data()
    print("Done!")


# Guarded so importing this module does not trigger a full load; executing it
# as a script or notebook cell (where __name__ is "__main__") still runs it.
if __name__ == "__main__":
    run_all()
