In [1]:
#!/usr/bin/env python
"""
P&L concentration analysis for the momentum system.

This script reads the trade log produced by the trading engine,
reconstructs realized P&L per closed trade using FIFO,
and analyzes profit concentration by trade, ticker, and year.

Enhancements included (retained from the previous version of this script):
  - Per-SELL "trade" analytics:
      * start_date / end_date
      * time_in_trade_days (share-weighted FIFO holding time)
      * avg_cost_basis (share-weighted)
      * sale_price
      * n_entry_lots_used (how many distinct BUY lots were consumed by the SELL)
      * return_frac / return_pct
  - Top N biggest winning trades table + CSV
  - Descriptive statistics (all / winners / losers) including hold times + returns
  - Exit cause classification for biggest winners (Rank vs Below 100MA vs both)
  - Deduped view aggregating partial closes by (ticker, start_date)

NEW (added without removing existing functionality):
  - Robust mapping of realized SELLs back to the exact SELL trade-row via trade_id
    (avoids ambiguous merges and prevents ticker column collisions).
  - Loss analysis:
      * losses as % of position (loss / entry_cost)
      * losses as % of portfolio at the time (loss / portfolio_before at SELL)
      * tables + CSV outputs for worst losses and loss descriptive stats

Key bug fix:
  - Prevent pandas merge collisions that were renaming `ticker` to `ticker_x/ticker_y`
    and causing: KeyError: "['ticker'] not in index"
    We rename SELL-side ticker to `sell_ticker` before merging.

Outputs (existing + new):
  - 25_realized_trades_pnl.parquet / .csv
  - 25_pnl_by_year.csv
  - 25_pnl_by_ticker.csv
  - 25_top_trades.csv
  - 25_top_tickers.csv
  - 25_top_years.csv
  - 25_topN_winning_trades_with_holds.csv
  - 25_topN_winners_with_exit_cause.csv
  - 25_top10_unique_trades_aggregated.csv
  - 25_top10_unique_trades_with_exit_cause.csv
  - 25_trade_descriptive_stats_all.csv
  - 25_trade_descriptive_stats_winners.csv
  - 25_trade_descriptive_stats_losers.csv
  - 25_trade_stats_by_hold_bucket.csv
  - 25_worst_losses_by_dollars.csv               (NEW)
  - 25_worst_losses_by_position_pct.csv          (NEW)
  - 25_worst_losses_by_portfolio_pct.csv         (NEW)
  - 25_loss_descriptive_stats_losers.csv         (NEW)
"""

import os
import numpy as np
import pandas as pd
import warnings

# NOTE(review): this silences every warning category for the whole run,
# including pandas deprecation warnings; consider narrowing the filter.
warnings.filterwarnings("ignore")

# ============================================================
# CONFIG
# ============================================================

# Input files: the trade log (required), the equity curve (only read if the
# file exists), and the universe file used to look up MA100 exit features.
TRADES_FILE = "./13-trading_output_regression_insp500_spyfilter_cap15/13-trades_regression_insp500_spyfilter_cap15.parquet"
EQUITY_FILE = "./13-trading_output_regression_insp500_spyfilter_cap15/13-equity_curve_regression_insp500_spyfilter_cap15.parquet"  # optional
UNIVERSE_FILE = "./12-tradable_sp500_universe/12-tradable_sp500_universe.parquet"

# Every CSV/parquet output of this script is written under OUTPUT_DIR.
OUTPUT_DIR  = "./25-pnl_concentration_output"
TOP_WINNERS = 30  # top N winning trades to list
TOP_LOSERS  = 30  # top N losing trades to list (NEW)

# Create the output directory up front so later to_csv/to_parquet calls succeed.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("=== P&L CONCENTRATION ANALYSIS ===")

# ============================================================
# LOAD TRADES
# ============================================================

trades = pd.read_parquet(TRADES_FILE)

# --- schema normalization ---
# The working "date" column comes from exec_date when present, otherwise from
# an existing date column; either way it is parsed leniently (bad values -> NaT).
_date_source = next(
    (name for name in ("exec_date", "date") if name in trades.columns),
    None,
)
if _date_source is None:
    raise ValueError("Trades file must contain 'exec_date' or 'date'.")
trades["date"] = pd.to_datetime(trades[_date_source], errors="coerce")

required_cols = {"date", "ticker", "type", "shares", "price"}
missing = required_cols.difference(trades.columns)
if missing:
    raise ValueError(f"Trades file missing required columns: {missing}")

trades = trades.sort_values(by=["ticker", "date"]).reset_index(drop=True)

# --- unique id per trade-row, so realized SELLs can be mapped back to their log row ---
if "trade_id" not in trades.columns:
    trades["trade_id"] = np.arange(len(trades), dtype=np.int64)

# --- portfolio_before = cash_before + equity_before ---
# equity_before is inferred from equity_after and the trade notional:
#   SELL: equity_before = equity_after + trade_value
#   BUY : equity_before = equity_after - trade_value
if not {"cash_before", "equity_after"}.issubset(trades.columns):
    # Inputs unavailable: keep the columns so downstream code can rely on them.
    trades["trade_value"] = np.nan
    trades["equity_before"] = np.nan
    trades["portfolio_before"] = np.nan
else:
    trades["trade_value"] = (
        pd.to_numeric(trades["shares"], errors="coerce")
        * pd.to_numeric(trades["price"], errors="coerce")
    )
    side_u = trades["type"].astype(str).str.upper()

    _equity_after = pd.to_numeric(trades["equity_after"], errors="coerce")
    _trade_value = pd.to_numeric(trades["trade_value"], errors="coerce")
    trades["equity_before"] = np.where(
        side_u.eq("SELL"),
        _equity_after + _trade_value,
        _equity_after - _trade_value,
    )

    _cash_before = pd.to_numeric(trades["cash_before"], errors="coerce")
    trades["portfolio_before"] = _cash_before + pd.to_numeric(trades["equity_before"], errors="coerce")

print(f"Loaded trades: {len(trades):,}")
print("Trade columns:", list(trades.columns), "\n")

# ============================================================
# HELPERS: universe lookup for exit features, exit classification
# ============================================================

def load_universe_exit_features(universe_file: str, pairs: pd.DataFrame) -> pd.DataFrame:
    """
    Return universe rows (ma100 / above_ma100 / close_adj) relevant to `pairs`.

    `pairs` must provide the columns ['ticker', 'signal_date'].

    Only the needed columns are read from the parquet file; filtering is then
    done in memory. The filter keeps any row whose ticker appears in `pairs`
    AND whose date appears in `pairs`, so the result can be a superset of the
    exact (ticker, signal_date) combinations; callers narrow it by merging on
    both keys. The universe 'date' column is returned renamed to 'signal_date'.
    """
    wanted_columns = ["date", "ticker", "ma100", "above_ma100", "close_adj"]
    universe = pd.read_parquet(universe_file, columns=wanted_columns).copy()

    # Rows without a parseable date or without a ticker can never match.
    universe["date"] = pd.to_datetime(universe["date"], errors="coerce")
    universe = universe.dropna(subset=["date", "ticker"])

    wanted_tickers = pairs["ticker"].dropna().unique()
    wanted_dates = pd.to_datetime(pairs["signal_date"].dropna().unique(), errors="coerce")

    keep = universe["ticker"].isin(wanted_tickers) & universe["date"].isin(wanted_dates)
    selected = universe[keep].copy()

    return selected.rename(columns={"date": "signal_date"})


def classify_exit_cause(reason: str, above_ma100: object) -> str:
    """
    Classify why a position was exited.

    Args:
        reason: free-text exit reason from the trade log (may be None/NaN).
        above_ma100: boolean flag for the signal day (Python bool or
            numpy.bool_); anything else (None, NaN, ...) means "unknown".

    Returns:
        'Rank+Below_100MA' when the reason looks rank/selection related AND
            the flag is False,
        'Rank'             when only the reason matches,
        'Below_100MA'      when only the flag is False,
        'Unknown'          otherwise.
    """
    reason_text = str(reason).lower().strip() if pd.notna(reason) else ""

    # Substrings that mark a rank/selection-driven exit.
    rank_tokens = (
        "not_in_top",          # not_in_top_quintile etc
        "rebalance_down",
        "fell",                # fell_in_rank etc (if you ever add)
        "rank",
        "exit_rank",
    )
    is_rank_exit = any(tok in reason_text for tok in rank_tokens)

    # An identity test (`is False`) only matches the Python singleton and would
    # miss numpy.bool_(False), which is what pandas hands out for bool columns
    # in many access paths. NaN/None are not booleans, so they stay "unknown".
    is_below_ma = isinstance(above_ma100, (bool, np.bool_)) and not bool(above_ma100)

    if is_rank_exit and is_below_ma:
        return "Rank+Below_100MA"
    if is_rank_exit:
        return "Rank"
    if is_below_ma:
        return "Below_100MA"
    return "Unknown"


# ============================================================
# REALIZED P&L RECONSTRUCTION (FIFO)
# ============================================================

def compute_realized_pnl(trades_df: pd.DataFrame) -> pd.DataFrame:
    """
    Realized P&L per SELL using FIFO matching against prior BUYs.

    Rows are processed per ticker in date order. Every BUY opens a lot; every
    SELL consumes the oldest open lots first. Rows whose shares or price are
    non-positive or NaN, or whose date is missing, are skipped, as are SELLs
    that find no open lot. A SELL larger than the open inventory is matched
    only for the shares available and its exit value is scaled accordingly.

    Args:
        trades_df: trade log with columns 'ticker', 'date', 'type'
            ('BUY'/'SELL', case-insensitive), 'shares', 'price' and,
            optionally, 'trade_id'.

    Returns:
        DataFrame with one row per matched SELL (empty if nothing matched):
          - ticker, sell_trade_id (trade_id of the SELL row, NaN if absent)
          - shares_sold (matched shares), start_date (earliest matched lot),
            end_date (SELL date), exit_year
          - n_entry_lots_used: number of distinct BUY lots consumed
          - avg_cost_basis, sale_price
          - holding_days_wavg / time_in_trade_days: share-weighted hold time
          - entry_cost, exit_value, realized_pnl, return_frac, return_pct
    """
    from collections import deque  # O(1) removal of the oldest lot

    eps = 1e-8  # tolerance for treating a share quantity as exhausted
    records = []

    trades_df = trades_df.sort_values(["ticker", "date"]).reset_index(drop=True)
    has_trade_id = "trade_id" in trades_df.columns

    for ticker, tdf in trades_df.groupby("ticker", sort=False):
        open_lots = deque()  # FIFO queue: {shares, cost, date}

        for _, row in tdf.iterrows():
            side   = str(row["type"]).upper()
            shares = float(row["shares"])
            price  = float(row["price"])
            date   = row["date"]

            # `not (x > 0)` rejects NaN as well as non-positive values; a NaN
            # lot would otherwise turn every later cost for this ticker into NaN.
            if not (shares > 0 and price > 0) or pd.isna(date):
                continue

            if side == "BUY":
                open_lots.append({
                    "shares": shares,
                    "cost": shares * price,
                    "date": date,
                })

            elif side == "SELL":
                if not open_lots:
                    continue

                remaining = shares
                total_cost = 0.0

                entry_date_earliest = None
                hold_days_weighted_sum = 0.0
                matched_shares = 0.0

                n_entry_lots_used = 0
                touched_current_lot = False

                while remaining > eps and open_lots:
                    lot = open_lots[0]
                    take = min(remaining, lot["shares"])

                    # count distinct BUY lots used
                    if not touched_current_lot:
                        n_entry_lots_used += 1
                        touched_current_lot = True

                    # pro-rata share of the lot's remaining cost
                    cost_part = lot["cost"] * (take / lot["shares"])
                    total_cost += cost_part

                    lot_days = (date - lot["date"]).days
                    hold_days_weighted_sum += take * lot_days
                    matched_shares += take

                    entry_date_earliest = (
                        lot["date"] if entry_date_earliest is None
                        else min(entry_date_earliest, lot["date"])
                    )

                    # reduce lot
                    lot["shares"] -= take
                    lot["cost"]   -= cost_part
                    remaining     -= take

                    if lot["shares"] <= eps:
                        open_lots.popleft()
                        touched_current_lot = False  # next lot is new

                if matched_shares <= eps:
                    continue

                # If a SELL couldn't be fully matched, scale exit_value to matched fraction
                fill_ratio = matched_shares / shares
                exit_value = (shares * price) * fill_ratio

                realized_pnl = exit_value - total_cost
                holding_days_wavg = hold_days_weighted_sum / matched_shares

                avg_cost_basis = total_cost / matched_shares
                sale_price = price

                return_frac = realized_pnl / total_cost if total_cost != 0 else np.nan
                return_pct = 100.0 * return_frac if pd.notnull(return_frac) else np.nan

                sell_trade_id = row["trade_id"] if has_trade_id else np.nan

                records.append({
                    "ticker": ticker,
                    "sell_trade_id": sell_trade_id,   # <-- robust mapping
                    "shares_sold": matched_shares,

                    "start_date": entry_date_earliest,
                    "end_date": date,

                    "n_entry_lots_used": n_entry_lots_used,
                    "avg_cost_basis": avg_cost_basis,
                    "sale_price": sale_price,

                    # keep both names for convenience/back-compat
                    "holding_days_wavg": holding_days_wavg,
                    "time_in_trade_days": holding_days_wavg,

                    "entry_cost": total_cost,
                    "exit_value": exit_value,
                    "realized_pnl": realized_pnl,

                    "return_frac": return_frac,
                    "return_pct": return_pct,

                    "exit_year": date.year,
                })

    return pd.DataFrame(records)


realized = compute_realized_pnl(trades)

if realized.empty:
    raise RuntimeError("No realized P&L computed — no matched SELL trades found.")

# ============================================================
# CLEAN / ENSURE RETURN COLUMNS (robust)
# ============================================================

# entry_cost is the denominator for returns; rebuild it from
# exit_value - realized_pnl when it is absent.
if "entry_cost" not in realized.columns:
    if not {"exit_value", "realized_pnl"}.issubset(realized.columns):
        raise KeyError("Need 'entry_cost' or ('exit_value' and 'realized_pnl') to compute returns.")
    realized["entry_cost"] = realized["exit_value"] - realized["realized_pnl"]

# return_frac = realized_pnl / entry_cost, NaN where the cost is exactly zero.
if "return_frac" not in realized.columns:
    _cost = pd.to_numeric(realized["entry_cost"], errors="coerce")
    _pnl = pd.to_numeric(realized["realized_pnl"], errors="coerce")
    realized["return_frac"] = np.where(_cost != 0, _pnl / _cost, np.nan)

if "return_pct" not in realized.columns:
    realized["return_pct"] = 100.0 * realized["return_frac"]

# Coerce the analytic columns to numeric (unparseable values become NaN).
_numeric_columns = (
    "sell_trade_id", "entry_cost", "exit_value", "realized_pnl",
    "return_frac", "return_pct", "avg_cost_basis", "sale_price",
    "holding_days_wavg", "time_in_trade_days", "shares_sold", "n_entry_lots_used",
)
for _col in _numeric_columns:
    if _col not in realized.columns:
        continue
    realized[_col] = pd.to_numeric(realized[_col], errors="coerce")

# Infinities become NaN; returns are undefined for non-positive entry cost.
realized.replace([np.inf, -np.inf], np.nan, inplace=True)
_bad_cost = realized["entry_cost"] <= 0
realized.loc[_bad_cost, ["return_frac", "return_pct"]] = np.nan

print(f"Realized trades (SELLs with P&L): {len(realized):,}\n")

# ============================================================
# ATTACH SELL-LOG METADATA (reason, signal_date, portfolio_before, etc.)
# Fixes ticker merge collisions by renaming SELL-side ticker -> sell_ticker.
# ============================================================

# Build a SELL-only view of the trade log carrying the metadata we want to
# attach to each realized row.
sell_meta = trades.copy()
sell_meta["type"] = sell_meta["type"].astype(str).str.upper()
sell_meta = sell_meta[sell_meta["type"].eq("SELL")].copy()

# Ensure expected fields exist (older logs / variants)
for col in [
    "signal_date", "reason", "signal_close_adj", "exec_open_adj", "slope_rank_within_top",
    "cash_before", "cash_after", "equity_before", "equity_after",
    "portfolio_before", "portfolio_after", "num_positions_after",
    "shares", "price", "value", "date", "trade_id", "ticker"
]:
    if col not in sell_meta.columns:
        sell_meta[col] = np.nan

# Keep only the metadata columns and rename the ones that would otherwise
# collide with columns already present on `realized`.
sell_meta = sell_meta[[
    "trade_id", "ticker", "date",
    "signal_date", "reason",
    "signal_close_adj", "exec_open_adj", "slope_rank_within_top",
    "cash_before", "cash_after",
    "equity_before", "equity_after",
    "portfolio_before", "portfolio_after",
    "num_positions_after",
    "shares", "price", "value",
]].rename(columns={
    "trade_id": "sell_trade_id",
    "ticker": "sell_ticker",      # <-- CRITICAL: prevents ticker_x/ticker_y collisions
    "date": "exec_date",
    "shares": "sell_shares",
    "price": "sell_exec_price",
    "value": "sell_value",
})

sell_meta["signal_date"] = pd.to_datetime(sell_meta["signal_date"], errors="coerce")
sell_meta["exec_date"]   = pd.to_datetime(sell_meta["exec_date"], errors="coerce")

# Merge into realized using sell_trade_id (robust)
if "sell_trade_id" in realized.columns and realized["sell_trade_id"].notna().any():
    realized = realized.merge(sell_meta, on="sell_trade_id", how="left")
else:
    # Fallback (should not happen with current pipeline): match on ticker + date.
    realized["exec_date"] = pd.to_datetime(realized["end_date"], errors="coerce")
    # The id column on `realized` is missing or entirely NaN here. Drop it so
    # that sell_meta's own 'sell_trade_id' is not suffixed to
    # 'sell_trade_id_x'/'sell_trade_id_y' by the merge, which would break every
    # later reference to realized["sell_trade_id"].
    realized = realized.drop(columns=["sell_trade_id"], errors="ignore").merge(
        sell_meta,
        left_on=["ticker", "exec_date"],
        right_on=["sell_ticker", "exec_date"],
        how="left"
    )

# Sanity: if sell_ticker exists, it should match ticker most of the time
if "sell_ticker" in realized.columns:
    mismatch = (realized["sell_ticker"].notna()) & (realized["ticker"].notna()) & (realized["sell_ticker"] != realized["ticker"])
    if mismatch.any():
        print(f"WARNING: {mismatch.sum():,} realized rows have sell_ticker != ticker (check trade_id mapping).")

# ============================================================
# TOP N BIGGEST WINNING TRADES (with hold + rebalance stats)
# ============================================================

# Order all realized rows by P&L, best first; later sections rely on this order.
realized = realized.sort_values("realized_pnl", ascending=False).reset_index(drop=True)

_positive_pnl = realized["realized_pnl"] > 0
topN = realized.loc[_positive_pnl].head(TOP_WINNERS).copy()

show_cols = [
    "ticker",
    "start_date",
    "n_entry_lots_used",
    "avg_cost_basis",
    "sale_price",
    "end_date",
    "time_in_trade_days",
    "shares_sold",
    "realized_pnl",
    "return_pct",
]

# Back-fill any display column a log variant did not produce, so printing never fails.
for _absent in [name for name in show_cols if name not in topN.columns]:
    topN[_absent] = np.nan

# Dates are shown without a time component; hold time is rounded for display.
for _date_col in ("start_date", "end_date"):
    topN[_date_col] = pd.to_datetime(topN[_date_col], errors="coerce").dt.date
topN["time_in_trade_days"] = pd.to_numeric(topN["time_in_trade_days"], errors="coerce").round(2)

print(f"=== TOP {TOP_WINNERS} BIGGEST WINNING TRADES (per SELL exec (OPEN)) ===")
print(topN[show_cols].to_string(index=False, float_format="{:,.2f}".format))
print()

_top_winners_csv = os.path.join(OUTPUT_DIR, f"25_top{TOP_WINNERS}_winning_trades_with_holds.csv")
topN.to_csv(_top_winners_csv, index=False)

# ============================================================
# EXIT CAUSE FOR TOP WINNERS (Rank vs Below 100MA)
# Uses SELL signal_date + universe above_ma100
# ============================================================

topN_exit = topN.copy()

# The SELL metadata columns normally arrive via the sell_meta merge; create
# empty ones if a variant lacks them.
for _meta_col in ("signal_date", "reason", "signal_close_adj", "slope_rank_within_top"):
    if _meta_col not in topN_exit.columns:
        topN_exit[_meta_col] = np.nan

topN_exit["signal_date"] = pd.to_datetime(topN_exit["signal_date"], errors="coerce")

# Look up the MA100 fields in the universe for each distinct (ticker, signal_date).
pairs = topN_exit[["ticker", "signal_date"]].dropna().drop_duplicates().copy()
u_exit = load_universe_exit_features(UNIVERSE_FILE, pairs)

_ma_fields = u_exit[["ticker", "signal_date", "ma100", "above_ma100"]]
topN_exit = topN_exit.merge(_ma_fields, on=["ticker", "signal_date"], how="left")


def _winner_exit_cause(row):
    """Classify one top-winner row from its reason text and above_ma100 flag."""
    return classify_exit_cause(row.get("reason", np.nan), row.get("above_ma100", np.nan))


topN_exit["exit_cause"] = topN_exit.apply(_winner_exit_cause, axis=1)

# Inverse of above_ma100, left as NaN where the flag itself is missing.
_above = topN_exit["above_ma100"]
topN_exit["below_ma100"] = np.where(_above.isna(), np.nan, ~_above.astype(bool))

exit_cols = [
    "ticker", "start_date", "end_date",
    "realized_pnl", "return_pct",
    "reason", "exit_cause",
    "signal_date", "signal_close_adj", "ma100", "above_ma100",
    "slope_rank_within_top",
]

for _absent in [name for name in exit_cols if name not in topN_exit.columns]:
    topN_exit[_absent] = np.nan

print(f"=== EXIT CAUSE FOR TOP {TOP_WINNERS} WINNERS (per SELL exec (OPEN)) ===")
print(topN_exit[exit_cols].to_string(index=False, float_format="{:,.2f}".format))
print()

_exit_cause_csv = os.path.join(OUTPUT_DIR, f"25_top{TOP_WINNERS}_winners_with_exit_cause.csv")
topN_exit.to_csv(_exit_cause_csv, index=False)

print("=== Exit cause counts (Top winners) ===")
_cause_counts = topN_exit["exit_cause"].value_counts(dropna=False)
print(_cause_counts.to_string())
print()

# ============================================================
# DEDUPED VIEW: Aggregate partial closes by entry (avoid visual double-counting)
# ============================================================

# Collapse partial closes into one row per (ticker, start_date).
#
# `realized` is ordered by realized_pnl (descending) at this point, so the rows
# are first put in chronological order of exit: the "last" sell_trade_id of a
# group must be the final SELL of that entry (it is used below to look up the
# final exit's signal_date/reason), not the SELL with the smallest P&L.
agg = (
    realized.sort_values("end_date", kind="stable")
    .groupby(["ticker", "start_date"], as_index=False)
    .agg({
        "end_date": "max",
        "shares_sold": "sum",
        "n_entry_lots_used": "sum",
        "entry_cost": "sum",
        "exit_value": "sum",
        "realized_pnl": "sum",
        "time_in_trade_days": "max",
        "sell_trade_id": "last",  # SELL id of the chronologically final exit
    })
)

# Per-share and return figures recomputed from the summed dollar amounts.
agg["avg_cost_basis"] = agg["entry_cost"] / agg["shares_sold"]
agg["sale_price"] = agg["exit_value"] / agg["shares_sold"]
agg["return_frac"] = agg["realized_pnl"] / agg["entry_cost"]
agg["return_pct"] = 100.0 * agg["return_frac"]

agg_top10 = agg.sort_values("realized_pnl", ascending=False).head(10).copy()

print("=== TOP 10 UNIQUE TRADES (aggregated by entry date) ===")
print(
    agg_top10[
        ["ticker", "start_date", "end_date",
         "shares_sold", "avg_cost_basis", "sale_price",
         "realized_pnl", "return_pct"]
    ].to_string(index=False, float_format=lambda x: f"{x:,.2f}")
)
print()

agg_top10.to_csv(os.path.join(OUTPUT_DIR, "25_top10_unique_trades_aggregated.csv"), index=False)

# ============================================================
# EXIT CAUSE FOR TOP 10 UNIQUE TRADES (final exit)
# ============================================================

agg_exit = agg_top10.copy()

# Attach the SELL metadata (signal_date, reason, ...) of the representative
# sell_trade_id kept for each aggregated trade.
_sell_fields = sell_meta[[
    "sell_trade_id", "signal_date", "reason", "signal_close_adj",
    "slope_rank_within_top"
]]
agg_exit = agg_exit.merge(_sell_fields, on="sell_trade_id", how="left")

agg_exit["signal_date"] = pd.to_datetime(agg_exit["signal_date"], errors="coerce")

# Universe lookup of the MA100 fields for the distinct (ticker, signal_date) pairs.
pairs2 = agg_exit[["ticker", "signal_date"]].dropna().drop_duplicates().copy()
u_exit2 = load_universe_exit_features(UNIVERSE_FILE, pairs2)

_ma_fields2 = u_exit2[["ticker", "signal_date", "ma100", "above_ma100"]]
agg_exit = agg_exit.merge(_ma_fields2, on=["ticker", "signal_date"], how="left")


def _unique_trade_exit_cause(row):
    """Classify one aggregated trade from its reason text and above_ma100 flag."""
    return classify_exit_cause(row.get("reason", np.nan), row.get("above_ma100", np.nan))


agg_exit["exit_cause"] = agg_exit.apply(_unique_trade_exit_cause, axis=1)

_agg_exit_cols = [
    "ticker", "start_date", "end_date", "realized_pnl", "return_pct",
    "reason", "exit_cause", "signal_date", "signal_close_adj", "ma100", "above_ma100",
]

print("=== EXIT CAUSE FOR TOP 10 UNIQUE TRADES (final exit) ===")
print(agg_exit[_agg_exit_cols].to_string(index=False, float_format="{:,.2f}".format))
print()

_agg_exit_csv = os.path.join(OUTPUT_DIR, "25_top10_unique_trades_with_exit_cause.csv")
agg_exit.to_csv(_agg_exit_csv, index=False)

# ============================================================
# LOSS ANALYSIS (NEW): losses as % position and % portfolio at the time
# ============================================================

# Position-level fraction is the same as return_frac (pnl / entry_cost)
_pnl_num = pd.to_numeric(realized["realized_pnl"], errors="coerce")
_is_loss = realized["realized_pnl"] < 0
_has_portfolio = "portfolio_before" in realized.columns

# P&L relative to the position is simply return_frac (pnl / entry_cost).
realized["pnl_frac_of_position"] = realized["return_frac"]

# P&L relative to the portfolio value recorded before the SELL (NaN if zero/absent).
if _has_portfolio:
    _portfolio_num = pd.to_numeric(realized["portfolio_before"], errors="coerce")
    realized["pnl_frac_of_portfolio"] = np.where(
        _portfolio_num != 0,
        _pnl_num / _portfolio_num,
        np.nan
    )
else:
    realized["pnl_frac_of_portfolio"] = np.nan

# Loss magnitudes as positive fractions, populated only for losing rows.
realized["loss_frac_of_position"] = np.where(
    _is_loss,
    -_pnl_num / pd.to_numeric(realized["entry_cost"], errors="coerce"),
    np.nan
)

if _has_portfolio:
    realized["loss_frac_of_portfolio"] = np.where(
        _is_loss,
        -_pnl_num / pd.to_numeric(realized["portfolio_before"], errors="coerce"),
        np.nan
    )
else:
    realized["loss_frac_of_portfolio"] = np.nan

# Same quantities expressed in percent.
realized["loss_pct_of_position"] = 100.0 * realized["loss_frac_of_position"]
realized["loss_pct_of_portfolio"] = 100.0 * realized["loss_frac_of_portfolio"]

losers = realized[realized["realized_pnl"] < 0].copy()

print("=== LOSS ANALYSIS (per realized SELL) ===")
print(f"Losing trades: {len(losers):,} / {len(realized):,}")

# Three rankings of the losers: by dollars, by % of position, by % of portfolio.
worst_dollars = losers.sort_values("realized_pnl", ascending=True).head(TOP_LOSERS).copy()
worst_pospct = losers.sort_values("loss_frac_of_position", ascending=False).head(TOP_LOSERS).copy()
if losers["loss_frac_of_portfolio"].notna().any():
    worst_portpct = losers.sort_values("loss_frac_of_portfolio", ascending=False).head(TOP_LOSERS).copy()
else:
    # No portfolio-relative figures available: keep an empty frame with the same columns.
    worst_portpct = losers.head(0).copy()

loss_cols = [
    "ticker", "start_date", "end_date",
    "realized_pnl", "entry_cost",
    "loss_pct_of_position", "loss_pct_of_portfolio",
    "portfolio_before", "reason", "signal_date"
]
for _table in (worst_dollars, worst_pospct, worst_portpct):
    for _absent in [name for name in loss_cols if name not in _table.columns]:
        _table[_absent] = np.nan
    for _date_col in ("start_date", "end_date"):
        _table[_date_col] = pd.to_datetime(_table[_date_col], errors="coerce").dt.date

print("\n--- Worst losses by $ PnL ---")
print(worst_dollars[loss_cols].to_string(index=False, float_format="{:,.2f}".format))

print("\n--- Worst losses by % of position ---")
print(worst_pospct[loss_cols].to_string(index=False, float_format="{:,.2f}".format))

print("\n--- Worst losses by % of portfolio (at SELL) ---")
if len(worst_portpct) > 0:
    print(worst_portpct[loss_cols].to_string(index=False, float_format="{:,.2f}".format))
else:
    print("(portfolio_before missing or all NaN; cannot compute portfolio-relative losses)")

print()

worst_dollars.to_csv(os.path.join(OUTPUT_DIR, "25_worst_losses_by_dollars.csv"), index=False)
worst_pospct.to_csv(os.path.join(OUTPUT_DIR, "25_worst_losses_by_position_pct.csv"), index=False)
worst_portpct.to_csv(os.path.join(OUTPUT_DIR, "25_worst_losses_by_portfolio_pct.csv"), index=False)

# Distribution of the loss metrics across losing rows only.
loss_desc_cols = ["realized_pnl", "loss_frac_of_position", "loss_frac_of_portfolio", "entry_cost", "portfolio_before", "holding_days_wavg"]
for _absent in [name for name in loss_desc_cols if name not in losers.columns]:
    losers[_absent] = np.nan

_loss_percentiles = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]
loss_desc = losers[loss_desc_cols].describe(percentiles=_loss_percentiles).T
print("=== Loss descriptive stats (losers only) ===")
print(loss_desc.to_string(float_format="{:,.6f}".format))
print()

loss_desc.to_csv(os.path.join(OUTPUT_DIR, "25_loss_descriptive_stats_losers.csv"))

# ============================================================
# TRADE DESCRIPTIVE STATISTICS (HOLD TIMES + RETURNS)
# ============================================================

def safe_div(a, b):
    """Return a / b, or NaN when the divisor is 0, 0.0 or None."""
    if b in (0, 0.0, None):
        return np.nan
    return a / b

realized["is_win"] = realized["realized_pnl"] > 0
# Losers are rows with strictly negative P&L (same definition as the loss
# analysis); break-even rows are neither winners nor losers.
_is_loss_row = realized["realized_pnl"] < 0

# Core summary metrics
n = len(realized)
win_rate = realized["is_win"].mean()
avg_pnl = realized["realized_pnl"].mean()
med_pnl = realized["realized_pnl"].median()

gross_gains = realized.loc[realized["realized_pnl"] > 0, "realized_pnl"].sum()
gross_losses = realized.loc[realized["realized_pnl"] < 0, "realized_pnl"].sum()  # negative
# safe_div already yields NaN when there are no losses (zero divisor).
profit_factor = safe_div(gross_gains, abs(gross_losses))

avg_win = realized.loc[realized["realized_pnl"] > 0, "realized_pnl"].mean()
avg_loss = realized.loc[realized["realized_pnl"] < 0, "realized_pnl"].mean()  # negative
payoff_ratio = safe_div(avg_win, abs(avg_loss)) if pd.notnull(avg_win) and pd.notnull(avg_loss) else np.nan

avg_ret = realized["return_frac"].mean()
med_ret = realized["return_frac"].median()

# Holding time stats (share-weighted)
avg_hold = realized["holding_days_wavg"].mean()
med_hold = realized["holding_days_wavg"].median()

print("=== Trade Descriptive Statistics (per realized SELL) ===")
print(f"Trades closed (SELL records): {n:,}")
print(f"Win rate: {win_rate:.2%}")
print(f"Avg PnL: {avg_pnl:,.2f} | Median PnL: {med_pnl:,.2f}")
print(f"Gross gains: {gross_gains:,.2f} | Gross losses: {gross_losses:,.2f}")
print(f"Profit factor (gains/abs(losses)): {profit_factor:,.3f}")
print(f"Avg win: {avg_win:,.2f} | Avg loss: {avg_loss:,.2f} | Payoff ratio: {payoff_ratio:,.3f}")
print(f"Avg return: {avg_ret:.4f} | Median return: {med_ret:.4f}")
print(f"Avg hold (wavg days): {avg_hold:.2f} | Median hold (wavg days): {med_hold:.2f}")
print()

# Quantile tables
desc_cols = ["realized_pnl", "return_frac", "holding_days_wavg", "entry_cost", "shares_sold", "n_entry_lots_used"]
quantiles = [0.01, 0.05, 0.10, 0.25, 0.50, 0.75, 0.90, 0.95, 0.99]

for c in desc_cols:
    if c not in realized.columns:
        realized[c] = np.nan

trade_desc = realized[desc_cols].describe(percentiles=quantiles).T
print("=== Descriptive table (all trades) ===")
print(trade_desc.to_string(float_format=lambda x: f"{x:,.6f}"))
print()

# Winners vs losers
win_desc = realized.loc[realized["is_win"], desc_cols].describe(percentiles=quantiles).T
loss_desc2 = realized.loc[_is_loss_row, desc_cols].describe(percentiles=quantiles).T

print("=== Descriptive table (winners only) ===")
print(win_desc.to_string(float_format=lambda x: f"{x:,.6f}"))
print()

print("=== Descriptive table (losers only) ===")
print(loss_desc2.to_string(float_format=lambda x: f"{x:,.6f}"))
print()

# Hold-time buckets: left-closed intervals, e.g. "1-5" covers 1 <= days < 5.
bins = [0, 1, 5, 10, 20, 40, 80, 160, 320, np.inf]
labels = ["0-1", "1-5", "5-10", "10-20", "20-40", "40-80", "80-160", "160-320", "320+"]

realized["hold_bucket"] = pd.cut(realized["holding_days_wavg"], bins=bins, labels=labels, right=False)
# observed=False is spelled out so that empty buckets stay in the table
# regardless of the pandas version's default for categorical groupers.
bucket = realized.groupby("hold_bucket", observed=False).agg(
    trades=("realized_pnl", "count"),
    win_rate=("is_win", "mean"),
    avg_return=("return_frac", "mean"),
    med_return=("return_frac", "median"),
    avg_pnl=("realized_pnl", "mean"),
    sum_pnl=("realized_pnl", "sum"),
).reset_index()

print("=== By holding-period bucket ===")
print(bucket.to_string(index=False, float_format=lambda x: f"{x:,.4f}"))
print()

# Save stats
trade_desc.to_csv(os.path.join(OUTPUT_DIR, "25_trade_descriptive_stats_all.csv"))
win_desc.to_csv(os.path.join(OUTPUT_DIR, "25_trade_descriptive_stats_winners.csv"))
loss_desc2.to_csv(os.path.join(OUTPUT_DIR, "25_trade_descriptive_stats_losers.csv"))
bucket.to_csv(os.path.join(OUTPUT_DIR, "25_trade_stats_by_hold_bucket.csv"), index=False)

# ============================================================
# AGGREGATE TOTALS
# ============================================================

# Net, gross-positive and gross-negative realized P&L across all realized rows.
_all_pnl = realized["realized_pnl"]
total_net_pnl = _all_pnl.sum()
total_pos_pnl = _all_pnl[_all_pnl > 0].sum()
total_neg_pnl = _all_pnl[_all_pnl < 0].sum()

print(f"Total net realized P&L : {total_net_pnl:,.2f}")
print(f"Total gains            : {total_pos_pnl:,.2f}")
print(f"Total losses           : {total_neg_pnl:,.2f}\n")

# ============================================================
# CONCENTRATION BY INDIVIDUAL TRADE
# ============================================================

# Winning rows ranked from largest to smallest P&L.
winners = (
    realized.loc[realized["realized_pnl"] > 0]
    .sort_values("realized_pnl", ascending=False)
    .reset_index(drop=True)
)

# Running total of wins and its share of all gains (NaN when there are no gains).
winners["cum_pnl"] = winners["realized_pnl"].cumsum()
if total_pos_pnl:
    winners["cum_frac_of_wins"] = winners["cum_pnl"] / total_pos_pnl
else:
    winners["cum_frac_of_wins"] = np.nan

# Size of the top decile of winners, never less than one trade.
top10pct_n = max(1, int(len(winners) * 0.10))

_win_pnl = winners["realized_pnl"]
_top5_sum = _win_pnl.head(5).sum()
_top10_sum = _win_pnl.head(10).sum()
_top_decile_sum = _win_pnl.head(top10pct_n).sum()

print("=== Concentration by Individual Trade ===")
print(f"Winning trades: {len(winners):,}")
print(f"Top 5 wins  : {_top5_sum:,.2f}")
print(f"Top 10 wins : {_top10_sum:,.2f}")
print(f"Top 10% wins ({top10pct_n} trades): {_top_decile_sum:,.2f}\n")

# ============================================================
# CONCENTRATION BY TICKER
# ============================================================

# Total realized P&L per ticker, largest first.
_ticker_totals = realized.groupby("ticker")["realized_pnl"].sum()
pnl_by_ticker = _ticker_totals.sort_values(ascending=False).to_frame("total_realized_pnl")

# Each ticker's share of the net total (NaN when the net total is zero).
if total_net_pnl:
    pnl_by_ticker["fraction_of_net"] = pnl_by_ticker["total_realized_pnl"] / total_net_pnl
else:
    pnl_by_ticker["fraction_of_net"] = np.nan

top_tickers = pnl_by_ticker.head(20)

print("=== Top 20 Tickers by Realized P&L ===")
print(top_tickers.to_string(float_format="{:,.2f}".format))
print()

# ============================================================
# CONCENTRATION BY YEAR
# ============================================================

# Total realized P&L per exit year, in the grouped (year) order.
_year_totals = realized.groupby("exit_year")["realized_pnl"].sum()
pnl_by_year = _year_totals.to_frame("total_realized_pnl")

# Each year's share of the net total (NaN when the net total is zero).
if total_net_pnl:
    pnl_by_year["fraction_of_net"] = pnl_by_year["total_realized_pnl"] / total_net_pnl
else:
    pnl_by_year["fraction_of_net"] = np.nan

# Ten best years by P&L; not printed here, only written out later.
top_years = pnl_by_year.sort_values("total_realized_pnl", ascending=False).head(10)

print("=== P&L by Year ===")
print(pnl_by_year.to_string(float_format="{:,.2f}".format))
print()

# ============================================================
# OPTIONAL: EQUITY CURVE SANITY CHECK
# ============================================================

# Optional check: print the first and last point of the equity curve when the
# file exists and has the expected columns. It must never abort the run,
# because the main outputs are only saved after this section.
if os.path.exists(EQUITY_FILE):
    eq = pd.read_parquet(EQUITY_FILE)
    if {"date", "portfolio_value"} <= set(eq.columns):
        eq["date"] = pd.to_datetime(eq["date"], errors="coerce")
        # Rows with unparseable dates are dropped so NaT cannot be reported
        # as the start or end of the curve.
        eq = eq.dropna(subset=["date"]).sort_values("date")
        print("=== Equity Curve Sanity Check ===")
        if eq.empty:
            # Indexing iloc[0] on an empty frame would raise IndexError.
            print("(equity curve has no rows with a valid date; skipping)")
        else:
            print("Start:", eq["date"].iloc[0].date(), f"{eq['portfolio_value'].iloc[0]:,.2f}")
            print("End:  ", eq["date"].iloc[-1].date(), f"{eq['portfolio_value'].iloc[-1]:,.2f}")
        print()

# ============================================================
# SAVE OUTPUTS
# ============================================================

# Full realized-trade table, as parquet and as CSV.
realized.to_parquet(os.path.join(OUTPUT_DIR, "25_realized_trades_pnl.parquet"), index=False)

# (frame, file name, keep index) in the order the files are written. The
# grouped tables keep their index because it carries the ticker / year labels.
_csv_outputs = (
    (realized, "25_realized_trades_pnl.csv", False),
    (pnl_by_year, "25_pnl_by_year.csv", True),
    (pnl_by_ticker, "25_pnl_by_ticker.csv", True),
    (winners.head(50), "25_top_trades.csv", False),
    (top_tickers, "25_top_tickers.csv", True),
    (top_years, "25_top_years.csv", True),
)
for _frame, _file_name, _keep_index in _csv_outputs:
    _frame.to_csv(os.path.join(OUTPUT_DIR, _file_name), index=_keep_index)

print("=== DONE ===")


=== P&L CONCENTRATION ANALYSIS ===
Loaded trades: 3,020
Trade columns: ['signal_date', 'exec_date', 'signal_close_adj', 'exec_open_adj', 'ticker', 'type', 'shares', 'price', 'value', 'reason', 'slope_rank_within_top', 'spy_above_200dma', 'cash_before', 'cash_after', 'equity_after', 'portfolio_after', 'num_positions_after', 'date', 'trade_id', 'trade_value', 'equity_before', 'portfolio_before'] 

Realized trades (SELLs with P&L): 1,658

=== TOP 30 BIGGEST WINNING TRADES (per SELL exec (OPEN)) ===
ticker start_date  n_entry_lots_used  avg_cost_basis  sale_price   end_date  time_in_trade_days  shares_sold  realized_pnl  return_pct
   WDC 2025-07-10                  1           64.84      135.90 2025-10-02               84.00    10,017.00    711,869.34      109.61
   IVZ 2020-11-12                  1           12.22       22.47 2021-05-27              196.00    65,251.00    668,778.85       83.87
  NVDA 2019-11-07                  1            5.25       13.44 2020-11-12              371.0

In [2]:
#!/usr/bin/env python
import os
import numpy as np
import pandas as pd

TRADES_FILE   = "./13-trading_output_regression_insp500_spyfilter_cap15/13-trades_regression_insp500_spyfilter_cap15.parquet"
REALIZED_FILE = "./25-pnl_concentration_output/25_realized_trades_pnl.parquet"
# Single source of truth for the output path, so the file written and the
# "Saved:" message can never disagree.
EDGE_OUTPUT_FILE = "./25-pnl_concentration_output/edge_per_dollar_traded.csv"


def compute_edge_per_dollar(trades: pd.DataFrame, realized: pd.DataFrame) -> dict:
    """Return net realized P&L per gross dollar traded.

    Parameters
    ----------
    trades:
        Trade log. Gross notional is taken from the ``value`` column when it
        exists, otherwise from ``shares * price``. Non-numeric entries are
        coerced to NaN and ignored in the total.
    realized:
        Realized-trade table with a ``realized_pnl`` column; non-numeric
        entries are coerced to NaN and skipped by the sum.

    Returns
    -------
    dict
        Keys, in order: ``net_realized_pnl``, ``gross_dollars_traded``,
        ``edge_per_dollar``, ``edge_bps_per_dollar``,
        ``edge_bps_per_roundtrip_dollar``. The edge values are NaN when no
        dollars were traded.

    Raises
    ------
    KeyError
        If ``realized`` has no ``realized_pnl`` column, or ``trades`` has
        neither ``value`` nor both ``shares`` and ``price``.
    """
    # --- trade notional (gross dollars traded) ---
    if "value" in trades.columns:
        notional = pd.to_numeric(trades["value"], errors="coerce")
    else:
        notional = (
            pd.to_numeric(trades["shares"], errors="coerce")
            * pd.to_numeric(trades["price"], errors="coerce")
        )
    # Absolute value so that signed notionals add up to turnover instead of
    # cancelling out.
    gross_dollars_traded = float(np.nansum(np.abs(notional.values)))

    # --- net realized pnl ---
    net_realized_pnl = float(
        pd.to_numeric(realized["realized_pnl"], errors="coerce").sum()
    )

    # Guard the division: with zero turnover the edge is undefined.
    edge_per_dollar = (
        net_realized_pnl / gross_dollars_traded if gross_dollars_traded > 0 else np.nan
    )
    edge_bps = edge_per_dollar * 10_000

    # "Per round-trip $" approximation: gross turnover counts both the buy and
    # the sell leg, so round-trip dollars ~= gross / 2. Dividing P&L by half the
    # turnover is the same as doubling the per-traded-dollar edge.
    edge_bps_roundtrip = edge_bps * 2

    return {
        "net_realized_pnl": net_realized_pnl,
        "gross_dollars_traded": gross_dollars_traded,
        "edge_per_dollar": edge_per_dollar,
        "edge_bps_per_dollar": edge_bps,
        "edge_bps_per_roundtrip_dollar": edge_bps_roundtrip,
    }


# Runs when executed as a script or as a notebook cell (where __name__ is
# "__main__"). Statements stay at module level so the same globals remain
# available afterwards.
if __name__ == "__main__":
    trades = pd.read_parquet(TRADES_FILE)
    # Output of the FIFO reconstruction step.
    realized = pd.read_parquet(REALIZED_FILE)

    edge_stats = compute_edge_per_dollar(trades, realized)
    net_realized_pnl = edge_stats["net_realized_pnl"]
    gross_dollars_traded = edge_stats["gross_dollars_traded"]
    edge_per_dollar = edge_stats["edge_per_dollar"]
    edge_bps = edge_stats["edge_bps_per_dollar"]
    edge_bps_roundtrip = edge_stats["edge_bps_per_roundtrip_dollar"]

    print("=== EDGE PER DOLLAR TRADED ===")
    print(f"Net realized P&L:        {net_realized_pnl:,.2f}")
    print(f"Gross $ traded:          {gross_dollars_traded:,.2f}")
    print(f"Edge per $ traded:       {edge_per_dollar:.8f}")
    print(f"Edge (bps, per $ traded): {edge_bps:,.3f} bps")
    print(f"Edge (bps, per round-trip $): {edge_bps_roundtrip:,.3f} bps")

    # One-row summary table; column order follows the dict's key order.
    out = pd.DataFrame([edge_stats])

    out.to_csv(EDGE_OUTPUT_FILE, index=False)
    print(f"Saved: {EDGE_OUTPUT_FILE}")


=== EDGE PER DOLLAR TRADED ===
Net realized P&L:        18,766,491.89
Gross $ traded:          1,292,292,054.29
Edge per $ traded:       0.01452187
Edge (bps, per $ traded): 145.219 bps
Edge (bps, per round-trip $): 290.437 bps
Saved: ./25-pnl_concentration_output/edge_per_dollar_traded.csv
