In [8]:
from __future__ import annotations

"""
Detect position concentration breaches (>THRESHOLD of portfolio value).

Fixes vs original:
1) Computes weights for EVERY day in the equity curve (not just trade days).
2) Uses DAILY close_adj pricing (from UNIVERSE_FILE or PRICE_DIR) instead of stale last-trade price.
3) Uses "last known close at or before date" fallback for missing prices (e.g., delistings / gaps).
"""

from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Dict, Any, List, Tuple

import numpy as np
import pandas as pd


# =========================
# USER SETTINGS
# =========================
# Input parquet files: the trade log and the equity curve read by main().
TRADES_FILE   = Path("./13-trading_output_regression_insp500_spyfilter_cap15/13-trades_regression_insp500_spyfilter_cap15.parquet")
EQUITY_FILE   = Path("./13-trading_output_regression_insp500_spyfilter_cap15/13-equity_curve_regression_insp500_spyfilter_cap15.parquet")
UNIVERSE_FILE = Path("./12-tradable_sp500_universe/12-tradable_sp500_universe.parquet")  # source of close_adj prices; only loaded when PRICE_DIR is None

# Optional folder containing per-ticker parquet price files named "<TICKER>.parquet"
# (e.g. "APA.parquet", "NVDA.parquet"). If set, positions are valued with the last
# known close (or best available price column) at or before each day, and
# UNIVERSE_FILE is not loaded.
PRICE_DIR: Optional[Path] = None

# A position is flagged when its value exceeds this fraction of portfolio value.
THRESHOLD = 0.12   # 12% (also used to build the output file names)
OUT_DIR = Path("./15d-position_too_big_concentration_flags")
# Side effect at import/run time: make sure the output folder exists.
OUT_DIR.mkdir(parents=True, exist_ok=True)


# =========================
# HELPERS
# =========================
def pick_first_existing(cols: List[str], available: List[str]) -> Optional[str]:
    """Return the first name in ``cols`` that also appears in ``available``.

    ``cols`` is scanned in order, so it doubles as a priority list.
    Returns ``None`` when none of the candidates is available.
    """
    return next((name for name in cols if name in available), None)


def fast_price_lookup(px_array: np.ndarray, date_val: pd.Timestamp) -> float:
    """
    Return the last known price at or before ``date_val``.

    ``px_array`` is a structured array with fields ``'date'`` and ``'px'``;
    the ``'date'`` field must be sorted ascending for the binary search to be
    valid. ``date_val`` is truncated to midnight before the search.

    Returns ``nan`` when every entry is later than ``date_val`` (or the array
    is empty).
    """
    cutoff = np.datetime64(date_val.normalize(), "ns")
    # side="right" yields the number of entries with date <= cutoff.
    n_on_or_before = np.searchsorted(px_array["date"], cutoff, side="right")
    if n_on_or_before < 1:
        return np.nan
    return float(px_array["px"][n_on_or_before - 1])


def load_trades(path: Path) -> pd.DataFrame:
    """Read the trades parquet and return it in a normalized, date-sorted form.

    Column handling:
      * the first of ``exec_date`` / ``date`` / ``signal_date`` becomes
        ``trade_dt`` (rows whose date cannot be parsed are dropped);
      * the first of ``price`` / ``exec_open_adj`` / ``signal_close_adj``
        becomes ``exec_price`` (numeric, unparseable -> NaN);
      * the first available portfolio/equity before/after column becomes
        ``portfolio_fallback``; if none exists the column is filled with NA;
      * ``side`` is the upper-cased, stripped ``type``;
      * ``shares`` is numeric with unparseable values replaced by 0.0;
      * ``trade_date`` is ``trade_dt`` truncated to midnight;
      * ``ticker`` is upper-cased and stripped.

    Rows are sorted by ``trade_dt`` (then ``trade_id`` when present).

    Raises:
        KeyError: if no date column, no price-like column, or any of
            ``ticker`` / ``type`` / ``shares`` is missing.
    """
    raw = pd.read_parquet(path)

    when_col = pick_first_existing(["exec_date", "date", "signal_date"], list(raw.columns))
    if not when_col:
        raise KeyError(f"No trade date column found. Columns: {list(raw.columns)}")

    raw[when_col] = pd.to_datetime(raw[when_col], errors="coerce")
    trades = raw.dropna(subset=[when_col]).copy()
    trades = trades.rename(columns={when_col: "trade_dt"})

    # Mandatory columns, checked in this order so the first missing one is reported.
    missing_messages = {
        "ticker": "Trades file missing 'ticker' column.",
        "type": "Trades file missing 'type' column (BUY/SELL).",
        "shares": "Trades file missing 'shares' column.",
    }
    for required, message in missing_messages.items():
        if required not in trades.columns:
            raise KeyError(message)

    px_col = pick_first_existing(["price", "exec_open_adj", "signal_close_adj"], list(trades.columns))
    if not px_col:
        raise KeyError(f"No price-like column found. Columns: {list(trades.columns)}")
    trades = trades.rename(columns={px_col: "exec_price"})

    value_col = pick_first_existing(
        ["portfolio_after", "equity_after", "portfolio_before", "equity_before"],
        list(trades.columns),
    )
    if value_col:
        trades = trades.rename(columns={value_col: "portfolio_fallback"})
    else:
        trades["portfolio_fallback"] = pd.NA

    trades["side"] = trades["type"].astype(str).str.upper().str.strip()
    trades["shares"] = pd.to_numeric(trades["shares"], errors="coerce").fillna(0.0)
    trades["exec_price"] = pd.to_numeric(trades["exec_price"], errors="coerce")

    order_by = ["trade_dt", "trade_id"] if "trade_id" in trades.columns else ["trade_dt"]
    trades = trades.sort_values(order_by).reset_index(drop=True)

    trades["trade_date"] = trades["trade_dt"].dt.normalize()
    trades["ticker"] = trades["ticker"].astype(str).str.upper().str.strip()

    return trades


def load_equity_curve(path: Path) -> pd.DataFrame:
    """Read the equity-curve parquet into a date-indexed ``portfolio_value`` frame.

    The date column is the first of ``date`` / ``Date`` / ``datetime`` /
    ``timestamp``; the value column is the first of ``portfolio_value`` /
    ``portfolio`` / ``equity`` / ``equity_value`` / ``total`` /
    ``account_value``. Dates are truncated to midnight. Rows with an
    unparseable date or a non-numeric value are dropped, and only the first
    row per date is kept.

    Returns:
        DataFrame with a single ``portfolio_value`` column, indexed by
        ``date`` in ascending order.

    Raises:
        KeyError: if no date column or no portfolio value column is found.
    """
    curve = pd.read_parquet(path)

    when_col = pick_first_existing(["date", "Date", "datetime", "timestamp"], list(curve.columns))
    if not when_col:
        raise KeyError(f"Equity curve missing date column. Columns: {list(curve.columns)}")

    curve[when_col] = pd.to_datetime(curve[when_col], errors="coerce")
    curve = curve.dropna(subset=[when_col]).copy()
    curve = curve.rename(columns={when_col: "date"})
    curve["date"] = curve["date"].dt.normalize()

    value_candidates = ["portfolio_value", "portfolio", "equity", "equity_value", "total", "account_value"]
    value_col = pick_first_existing(value_candidates, list(curve.columns))
    if not value_col:
        raise KeyError(f"Equity curve missing portfolio value column. Columns: {list(curve.columns)}")

    curve = curve.rename(columns={value_col: "portfolio_value"})
    curve["portfolio_value"] = pd.to_numeric(curve["portfolio_value"], errors="coerce")
    curve = curve.dropna(subset=["portfolio_value"])

    one_row_per_day = curve[["date", "portfolio_value"]].drop_duplicates("date")
    return one_row_per_day.set_index("date").sort_index()


@dataclass
class PriceCache:
    """Lazy per-ticker price store backed by ``<price_dir>/<TICKER>.parquet`` files.

    Each ticker file is read at most once; the outcome (a date-indexed ``px``
    frame, or ``None`` when the file is missing or unusable) is memoized.
    """

    # Folder holding one parquet file per ticker.
    price_dir: Path
    # ticker -> loaded price frame; holds None for tickers that failed to load.
    cache: Dict[str, pd.DataFrame]

    def __init__(self, price_dir: Path):
        self.cache = {}
        self.price_dir = price_dir

    def _load(self, ticker: str) -> Optional[pd.DataFrame]:
        """Read one ticker file into a frame indexed by normalized date with a ``px`` column.

        Returns ``None`` if the file does not exist or has no recognizable
        date or price column. Close-like columns are preferred; open-like
        columns are only used when no close-like column exists.
        """
        source = self.price_dir / f"{ticker}.parquet"
        if not source.exists():
            return None

        frame = pd.read_parquet(source)
        when_col = pick_first_existing(["date", "Date", "datetime", "timestamp"], list(frame.columns))
        if not when_col:
            return None

        frame[when_col] = pd.to_datetime(frame[when_col], errors="coerce")
        frame = frame.dropna(subset=[when_col]).copy()
        frame = frame.rename(columns={when_col: "date"})
        frame["date"] = frame["date"].dt.normalize()

        close_like = ["close_adj", "adj_close", "close", "Close", "close_price", "price"]
        open_like = ["open_adj", "open", "Open"]
        value_col = pick_first_existing(close_like, list(frame.columns))
        if not value_col:
            value_col = pick_first_existing(open_like, list(frame.columns))
        if not value_col:
            return None

        frame = frame.rename(columns={value_col: "px"})
        frame["px"] = pd.to_numeric(frame["px"], errors="coerce")
        frame = frame.dropna(subset=["px"])

        one_row_per_day = frame[["date", "px"]].drop_duplicates("date")
        return one_row_per_day.set_index("date").sort_index()

    def get_price(self, ticker: str, date: pd.Timestamp) -> Optional[float]:
        """Return the last known price for ``ticker`` at or before ``date``.

        Returns ``None`` when the ticker has no usable price file, when all of
        its prices are later than ``date``, or when the lookup raises.
        """
        if ticker not in self.cache:
            # Memoize failures too (value may be None) so the file is not re-read.
            self.cache[ticker] = self._load(ticker)

        prices = self.cache.get(ticker)
        if prices is None:
            return None

        try:
            # side="right" counts the rows dated on or before ``date``.
            n_on_or_before = prices.index.searchsorted(date, side="right")
            if n_on_or_before - 1 < 0:
                return None
            return float(prices["px"].iloc[n_on_or_before - 1])
        except Exception:
            return None


def build_universe_price_arrays(universe_file: Path) -> Dict[str, np.ndarray]:
    """
    Build per-ticker structured arrays for close_adj prices:
      arr dtype=[('date','datetime64[ns]'),('px','float64')]

    Reads the ``date``, ``ticker`` and ``close_adj`` columns of
    ``universe_file``. Tickers are upper-cased and stripped; rows with an
    unparseable date or a non-numeric price are dropped. Each array is sorted
    by date ascending so it can be binary-searched.

    Dates are truncated to midnight, like the other loaders in this file.
    Lookups are made with midnight timestamps, so a row stamped intraday
    (e.g. 16:00) would otherwise sort after the query for its own day and the
    previous close would be returned instead.

    Returns:
        Mapping of ticker -> structured array; an empty dict if
        ``universe_file`` does not exist.
    """
    if not universe_file.exists():
        return {}

    u = pd.read_parquet(universe_file, columns=["date", "ticker", "close_adj"])
    # errors="coerce" turns bad dates into NaT so the dropna below removes them
    # instead of the whole load failing on one malformed row.
    u["date"] = pd.to_datetime(u["date"], errors="coerce").dt.normalize()
    u["ticker"] = u["ticker"].astype(str).str.upper().str.strip()
    u["close_adj"] = pd.to_numeric(u["close_adj"], errors="coerce")
    u = u.dropna(subset=["date", "ticker", "close_adj"])
    u = u.sort_values(["ticker", "date"])

    px_by_ticker: Dict[str, np.ndarray] = {}
    # sort=False: rows are already ordered by ticker, then date.
    for t, sub in u.groupby("ticker", sort=False):
        arr = np.zeros(len(sub), dtype=[("date", "datetime64[ns]"), ("px", "float64")])
        arr["date"] = sub["date"].values.astype("datetime64[ns]")
        arr["px"] = sub["close_adj"].astype(float).values
        px_by_ticker[t] = arr

    return px_by_ticker


# Column order of the flags output. Passing it explicitly means an empty result
# still gets a header row / schema instead of a column-less file.
_FLAG_COLUMNS = [
    "date",
    "ticker",
    "shares",
    "price_used",
    "position_value",
    "portfolio_value",
    "weight",
    "threshold",
    "price_source",
]


def _apply_trades(
    day_trades: pd.DataFrame,
    holdings_shares: Dict[str, float],
    last_trade_px: Dict[str, float],
) -> None:
    """Fold one batch of trades into the share counts and last-trade prices (both mutated in place)."""
    rows = zip(
        day_trades["ticker"],
        day_trades["side"],
        day_trades["shares"],
        day_trades["exec_price"],
    )
    for raw_ticker, raw_side, raw_shares, px in rows:
        ticker = str(raw_ticker).upper().strip()
        side = str(raw_side)
        shares = float(raw_shares)

        # The execution price is remembered even for unrecognized sides; it is
        # only ever used as a last-resort valuation.
        if pd.notna(px):
            last_trade_px[ticker] = float(px)

        cur = holdings_shares.get(ticker, 0.0)
        if "BUY" in side:
            cur += shares
        elif "SELL" in side:
            cur -= shares
        else:
            continue

        # Snap float dust to an exact zero so closed positions are skipped.
        if abs(cur) < 1e-9:
            cur = 0.0
        holdings_shares[ticker] = cur


def _resolve_price(
    ticker: str,
    day: pd.Timestamp,
    price_cache: "Optional[PriceCache]",
    universe_px: Dict[str, np.ndarray],
    last_trade_px: Dict[str, float],
) -> Tuple[Optional[float], Optional[str]]:
    """Return ``(price, source_label)`` from the best available source, or ``(None, None)``."""
    # 1) Preferred: PRICE_DIR daily close (if configured)
    if price_cache is not None:
        p = price_cache.get_price(ticker, day)
        if p is not None and p > 0:
            return float(p), "PRICE_DIR"

    # 2) Next best: UNIVERSE_FILE close_adj
    if ticker in universe_px:
        p = fast_price_lookup(universe_px[ticker], day)
        if np.isfinite(p) and p > 0:
            return float(p), "UNIVERSE_close_adj"

    # 3) Last resort: last trade price (stale; only used if no daily data)
    p = last_trade_px.get(ticker)
    if p is not None and p > 0:
        return float(p), "last_trade_price"

    return None, None


def main():
    """Flag every (day, ticker) whose position weight exceeds THRESHOLD and write the reports.

    For each day in the equity curve, all trades dated on or before that day
    that have not been applied yet are folded into the running holdings, then
    every open position is valued (PRICE_DIR close, else UNIVERSE_FILE
    close_adj, else last trade price) and compared with that day's portfolio
    value. Absolute share counts are used, so negative holdings are weighted
    by size.

    Writes to OUT_DIR: a flags CSV and parquet (always), and a per-ticker
    summary CSV when at least one row was flagged. Prints a short report.
    """
    trades = load_trades(TRADES_FILE)
    eq_index = load_equity_curve(EQUITY_FILE)

    # Trade batches in ascending date order, consumed with a moving cursor.
    pending = list(trades.groupby("trade_date", sort=True))
    next_batch = 0
    carried_batches = 0

    # Price sources
    price_cache = PriceCache(PRICE_DIR) if PRICE_DIR else None
    universe_px = {} if PRICE_DIR else build_universe_price_arrays(UNIVERSE_FILE)

    holdings_shares: Dict[str, float] = {}
    last_trade_px: Dict[str, float] = {}

    flags: List[Dict[str, Any]] = []

    # Iterate ALL equity days (not just trade days).
    for day in eq_index.index:
        # Apply every not-yet-applied batch dated on or before this day. A
        # plain per-day lookup would silently drop trades whose date is absent
        # from the equity curve (before its first row, or in a gap) and leave
        # the holdings wrong for every later day.
        while next_batch < len(pending) and pending[next_batch][0] <= day:
            trade_day, day_trades = pending[next_batch]
            if trade_day != day:
                carried_batches += 1
            _apply_trades(day_trades, holdings_shares, last_trade_px)
            next_batch += 1

        portfolio_value = float(eq_index.loc[day, "portfolio_value"])
        if not np.isfinite(portfolio_value) or portfolio_value <= 0:
            continue

        # Compute weights for ALL currently-held tickers
        for ticker, sh in holdings_shares.items():
            if sh == 0.0:
                continue

            px_today, price_source = _resolve_price(
                ticker, day, price_cache, universe_px, last_trade_px
            )
            if px_today is None:
                continue

            pos_value = abs(sh) * px_today
            weight = pos_value / portfolio_value

            if weight > THRESHOLD:
                flags.append({
                    "date": day,
                    "ticker": ticker,
                    "shares": sh,
                    "price_used": px_today,
                    "position_value": pos_value,
                    "portfolio_value": portfolio_value,
                    "weight": weight,
                    "threshold": THRESHOLD,
                    "price_source": price_source,
                })

    if carried_batches:
        print(
            f"Note: {carried_batches:,} trade date(s) missing from the equity curve "
            "were applied on the next available equity date."
        )

    flags_df = pd.DataFrame(flags, columns=_FLAG_COLUMNS)
    if not flags_df.empty:
        flags_df = flags_df.sort_values(["date", "weight"], ascending=[True, False])

    # round() before int(): plain truncation turns e.g. 0.29 * 100
    # (= 28.999999999999996) into 28 and mislabels the output files.
    pct_label = int(round(THRESHOLD * 100))
    out_csv = OUT_DIR / f"position_over_{pct_label}pct_flags.csv"
    out_parquet = OUT_DIR / f"position_over_{pct_label}pct_flags.parquet"
    flags_df.to_csv(out_csv, index=False)
    flags_df.to_parquet(out_parquet, index=False)

    print(f"✔ Flagged rows: {len(flags_df):,}")
    print(f"✔ Wrote: {out_csv}")
    print(f"✔ Wrote: {out_parquet}")

    if len(flags_df) > 0:
        print("\nTop 20 by weight:")
        print(flags_df.sort_values("weight", ascending=False).head(20).to_string(index=False))

        summary = (
            flags_df.groupby("ticker", as_index=False)
            .agg(
                first_date=("date", "min"),
                last_date=("date", "max"),
                max_weight=("weight", "max"),
                max_position_value=("position_value", "max"),
                n_days_flagged=("date", "nunique"),
            )
            .sort_values("max_weight", ascending=False)
        )
        summary_csv = OUT_DIR / f"position_over_{pct_label}pct_summary.csv"
        summary.to_csv(summary_csv, index=False)
        print(f"\n✔ Wrote summary: {summary_csv}")
        print("\nTop 20 tickers by max_weight:")
        print(summary.head(20).to_string(index=False))


# Run the concentration check only when executed as a script, not on import.
if __name__ == "__main__":
    main()


✔ Flagged rows: 2,627
✔ Wrote: 15d-position_too_big_concentration_flags\position_over_12pct_flags.csv
✔ Wrote: 15d-position_too_big_concentration_flags\position_over_12pct_flags.parquet

Top 20 by weight:
      date ticker   shares  price_used  position_value  portfolio_value   weight  threshold       price_source
2013-01-29   NFLX 271382.0       2.416      655658.912     3.581358e+06 0.183075       0.12 UNIVERSE_close_adj
2013-01-30   NFLX 271382.0       2.396      650231.272     3.561932e+06 0.182550       0.12 UNIVERSE_close_adj
2013-01-25   NFLX 271382.0       2.422      657287.204     3.611174e+06 0.182015       0.12 UNIVERSE_close_adj
2013-01-28   NFLX 271382.0       2.316      628520.712     3.577133e+06 0.175705       0.12 UNIVERSE_close_adj
2013-01-24   NFLX 271382.0       2.098      569359.436     3.521752e+06 0.161669       0.12 UNIVERSE_close_adj
2009-09-16    GGP 118381.0       2.792      330519.752     2.050077e+06 0.161223       0.12 UNIVERSE_close_adj
2011-01-12   NVDA 