# ETP Filing Tracker — Notebook Front‑End (D:\ Ready, Modular Engine)

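**Run order (top → bottom; numbers match the section headings below)**  
1. Paths & Imports  
2. CONFIG (D:\ paths, CIKs)  
3. Init SEC client  
4. Step 2 — Submissions & prospectus subset  
5. Step 3 — Extraction  
6. Step 4 — Roll-up  
7. (Optional) Debug single accession  
8. (Optional) One‑liner end‑to‑end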


In [None]:
# 1) Paths & Imports (D:\ ready) — run this first
import sys, os
from pathlib import Path

# Folder that contains the etp_tracker package.
PROJECT_ROOT = r"D:\REX_ETP_TRACKER"   # <-- change if your folder is different

# Put the project folder at the front of the import search path (added once only).
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Make sure the package folder and its __init__.py are present on disk.
pkg_dir = Path(PROJECT_ROOT, "etp_tracker")
pkg_dir.mkdir(parents=True, exist_ok=True)
init_file = Path(pkg_dir, "__init__.py")
if not init_file.exists():
    # Seed a minimal __init__.py that only declares the engine's submodule names.
    _init_text = "__all__ = ['config','utils','csvio','paths','sec_client','sgml','body_extractors','step2','step3','step4']\n"
    init_file.write_text(_init_text, encoding="utf-8")

# Report which interpreter and project folder this session is using.
print("Python exe:", sys.executable)
print("Using PROJECT_ROOT:", PROJECT_ROOT)

# Imports from the modular engine
from etp_tracker.sec_client import SECClient
from etp_tracker.step2 import step2_submissions_and_prospectus
from etp_tracker.step3 import step3_extract_for_trust
from etp_tracker.step4 import step4_rollup_for_trust
from etp_tracker.paths import output_paths_for_trust
from etp_tracker.sgml import parse_sgml_series_classes

# Display helper: use caas_jupyter_tools when it can be imported, otherwise
# fall back to display() on the first 50 rows.
# NOTE(review): display() is not imported in this notebook — presumably the
# IPython/Jupyter built-in; confirm if this is ever run outside a notebook.
try:
    from caas_jupyter_tools import display_dataframe_to_user as _show_df
except Exception:
    def show_df(name, df):
        """Fallback viewer: display the first 50 rows of df (name is not used)."""
        display(df.head(50))
else:
    def show_df(name, df):
        """Show df under the title `name` via the caas_jupyter_tools helper."""
        _show_df(name, df)

print("Imports OK")


## 2) CONFIG — edit and run

In [None]:
# Identification string sent with SEC requests (required)
USER_AGENT = "REX-SEC-Filer/1.0 (contact: you@rexshares.com)"

# Where outputs and the HTTP cache live (absolute paths on D:\)
OUTPUT_ROOT = r"D:\REX_ETP_TRACKER\outputs"
CACHE_DIR   = r"D:\REX_ETP_TRACKER\http_cache"

# CIKs of the trusts to process
CIKS = ["2043954"]     # REX ETF Trust

# Optional friendly names, keyed by CIK
OVERRIDES = {"2043954": "REX ETF Trust"}

# Optional Step 3 extraction window; None leaves a filter unset.
#   SINCE e.g. "2025-10-01"
#   UNTIL e.g. "2025-11-05"
#   FORMS e.g. ["485A","485B","497"]
SINCE = UNTIL = FORMS = None

# How the submissions data is refreshed
REFRESH_SUBMISSIONS   = True
REFRESH_MAX_AGE_HOURS = 6
REFRESH_FORCE_NOW     = False

# Echo the paths, then create both folders if they are missing.
print("OUTPUT_ROOT:", OUTPUT_ROOT)
print("CACHE_DIR  :", CACHE_DIR)
for _dir in (OUTPUT_ROOT, CACHE_DIR):
    Path(_dir).mkdir(parents=True, exist_ok=True)


## 3) Init SEC client

In [None]:
# Build the shared SEC client. The user agent and cache folder come from CONFIG;
# the timeout and pause values are fixed in this cell.
_client_settings = {
    "user_agent": USER_AGENT,
    "request_timeout": 45,
    "pause": 0.35,
    "cache_dir": CACHE_DIR,
}
client = SECClient(**_client_settings)
print("Client ready")


## 4) Step 2 — Submissions & prospectus subset

In [None]:
import pandas as pd

# Run Step 2 for every configured CIK. No date window is passed here
# (since/until are None) — keep full history for _1_.
_step2_kwargs = dict(
    client=client,
    output_root=OUTPUT_ROOT,
    cik_list=CIKS,
    overrides=OVERRIDES,
    since=None,
    until=None,
    refresh_submissions=REFRESH_SUBMISSIONS,
    refresh_max_age_hours=REFRESH_MAX_AGE_HOURS,
    refresh_force_now=REFRESH_FORCE_NOW,
)
trusts = step2_submissions_and_prospectus(**_step2_kwargs)
print("Trusts processed:", trusts)

# For each trust, load the _1_ and _2_ CSVs as strings and show the last 30 rows of each.
for t in trusts:
    paths = output_paths_for_trust(OUTPUT_ROOT, t)
    _all_csv, _subset_csv = paths["all_filings"], paths["prospectus_base"]
    df1 = pd.read_csv(_all_csv, dtype=str)
    df2 = pd.read_csv(_subset_csv, dtype=str)

    print(f"\n=== {t} — _1_ all filings ({len(df1)}) ===")
    show_df(f"{t} — _1_", df1.tail(30))

    print(f"=== {t} — _2_ prospectus subset ({len(df2)}) ===")
    show_df(f"{t} — _2_", df2.tail(30))


## 5) Step 3 — Extraction (SGML-first, anchored tickers)

In [None]:
# Step 3 for each trust, passing the optional SINCE / UNTIL / FORMS filters from CONFIG.
for t in trusts:
    n = step3_extract_for_trust(
        client, OUTPUT_ROOT, t,
        since=SINCE, until=UNTIL, forms=FORMS,
    )
    print(f"{t}: extracted {n} rows")

# Preview each trust's _3_ file and count rows whose "Class Symbol" is blank.
for t in trusts:
    paths = output_paths_for_trust(OUTPUT_ROOT, t)
    p3 = paths["extracted_funds"]
    if Path(p3).exists():
        df3 = pd.read_csv(p3, dtype=str)
        # Missing values become "" so they are counted as blank tickers below.
        df3["Class Symbol"] = df3["Class Symbol"].fillna("")
        blanks = df3["Class Symbol"].eq("").sum()
        print(f"\n=== {t} — _3_ fund extraction ({len(df3)}) — blank tickers: {blanks} ===")
        show_df(f"{t} — _3_", df3.tail(50))
    else:
        print(f"{t}: no _3_ yet")


## 6) Step 4 — Roll-up latest per fund

In [None]:
# Run the Step 4 roll-up for each trust and preview the first 50 rows of its _4_ file.
for t in trusts:
    k = step4_rollup_for_trust(OUTPUT_ROOT, t)
    print(f"{t}: roll-up rows = {k}")
    paths = output_paths_for_trust(OUTPUT_ROOT, t)
    p4 = paths["latest_record"]
    # Same guard as the Step 3 preview: a trust without a _4_ file is reported
    # and skipped, so it cannot abort the preview of the remaining trusts.
    if not Path(p4).exists():
        print(f"{t}: no _4_ yet")
        continue
    df4 = pd.read_csv(p4, dtype=str)
    show_df(f"{t} — _4_", df4.head(50))


## 7) Debug — Single accession (quick SGML + tentative ticker check)

In [None]:
def debug_extract_from_accession(txt_url: str, primary_html_url: str | None = None):
    """Show the SGML series/classes of one accession with a tentative ticker for each.

    Fetches ``txt_url`` with the notebook-level ``client``, parses its series and
    classes with ``parse_sgml_series_classes``, and searches the collected texts
    (the raw submission, the optional primary HTML, and the text extracted from
    each embedded document) for a ticker next to each class/series name.

    Args:
        txt_url: URL of the accession's full-submission ``.txt`` file.
        primary_html_url: Optional URL of the primary HTML document. If it
            cannot be fetched, a warning is printed and the search continues
            without it.

    Returns:
        A pandas DataFrame with one row per parsed series/class and the columns
        "Series ID", "Series Name", "Class-Contract ID", "Class Contract Name",
        "Ticker?" and "Method". "Method" is "TITLE-PAREN", "LABEL-WINDOW" or "".
        The same frame is also displayed via ``show_df``.
    """
    import re
    import pandas as pd
    from etp_tracker.body_extractors import iter_txt_documents, extract_from_html_string

    txt = client.fetch_text(txt_url)
    series = parse_sgml_series_classes(txt)
    texts = [txt]
    if primary_html_url:
        try:
            texts.append(client.fetch_text(primary_html_url))
        except Exception as exc:
            # Still best-effort, but report the failure instead of hiding it.
            print(f"[debug] could not fetch primary HTML {primary_html_url}: {exc!r}")
    for _doctype, _fname, body_html in iter_txt_documents(txt):
        # NOTE(review): element [1] of the extractor's result is used as the
        # document text — confirm against extract_from_html_string.
        texts.append(extract_from_html_string(body_html)[1])

    # Short upper-case words that fit the ticker shape but are not tickers.
    STOP = frozenset({"THE", "AND", "FOR", "WITH", "ETF", "FUND", "RISK", "USD", "MEMBER"})

    # "Ticker: XYZ" / "Trading Symbol - XYZ". The trailing lookahead stops a
    # longer word (e.g. "Exchange") from being truncated into a 6-character
    # pseudo-ticker. Compiled once here rather than on every pull() call.
    label_rx = re.compile(
        r"(?i)(Ticker|Trading\s*Symbol)\s*[:\-–]\s*([A-Z0-9]{1,6})(?![A-Z0-9])"
    )

    def valid_tkr(s):
        """Return True for a 1-6 character candidate with a letter that is not a stop word."""
        s = s.strip().upper()
        return 1 <= len(s) <= 6 and s not in STOP and any(c.isalpha() for c in s)

    def pull(series_name, texts):
        """Return (ticker, method) for series_name, or ("", "") if nothing is found."""
        s = re.sub(r"\s+", " ", series_name).strip()
        if not s:
            # With an empty name both searches below would match anywhere in
            # the text and report an unrelated token as the ticker.
            return "", ""
        # First choice: the name immediately followed by "(TICKER)".
        rx = re.compile(re.escape(s) + r"\s*\(\s*([A-Z0-9]{1,6})\s*\)", re.I)
        for t in texts:
            m = rx.search(t or "")
            if m and valid_tkr(m.group(1)):
                return m.group(1).upper(), "TITLE-PAREN"
        # Second choice: a ticker label within 600 characters of a name mention.
        name_rx = re.compile(re.escape(s), re.I)
        for t in texts:
            for m in name_rx.finditer(t or ""):
                window = t[max(0, m.start() - 600): m.end() + 600]
                m2 = label_rx.search(window)
                if m2 and valid_tkr(m2.group(2)):
                    return m2.group(2).upper(), "LABEL-WINDOW"
        return "", ""

    rows = []
    for s in series:
        # Prefer the class contract name; fall back to the series name.
        nm = s.get("Class Contract Name") or s.get("Series Name") or ""
        tkr, how = pull(nm, texts)
        rows.append({
            "Series ID": s.get("Series ID", ""),
            "Series Name": s.get("Series Name", ""),
            "Class-Contract ID": s.get("Class-Contract ID", ""),
            "Class Contract Name": nm,
            "Ticker?": tkr,
            "Method": how,
        })
    df = pd.DataFrame(rows)
    show_df("Debug — single accession (series + tentative ticker)", df)
    return df

# Example (uncomment to run):
# debug_extract_from_accession(
#     "https://www.sec.gov/Archives/edgar/data/2043954/000199937125016273/0001999371-25-016273.txt",
#     "https://www.sec.gov/Archives/edgar/data/2043954/000199937125016273/ulti_485bpos102825.htm",
# )


## 8) (Optional) One‑liner end‑to‑end

In [None]:
from etp_tracker.run_pipeline import run_pipeline

# Run the whole pipeline in one call using the CONFIG cell's values.
# The refresh settings come from the REFRESH_* constants (as in the Step 2 cell)
# so that editing CONFIG affects this cell too; the timeout and pause values
# match the ones used when the client is created.
_ = run_pipeline(
    ciks=CIKS, overrides=OVERRIDES, since=SINCE, until=UNTIL,
    output_root=OUTPUT_ROOT, cache_dir=CACHE_DIR,
    user_agent=USER_AGENT, request_timeout=45, pause=0.35,
    refresh_submissions=REFRESH_SUBMISSIONS,
    refresh_max_age_hours=REFRESH_MAX_AGE_HOURS,
    refresh_force_now=REFRESH_FORCE_NOW,
)
print("Pipeline complete.")
