In [10]:
#!/usr/bin/env python
"""
Monthly cash-balance table for a trading system.

Goal:
  Produce a Year x Month table where each cell is the *month-end net cash balance*.

Interpretation (per your request):
  "Cashflow" = the net cash balance at month-end (not the monthly change).

Data source priority:
  1) Equity curve file (if it contains a cash column)
  2) Trade log (uses cash_after; takes last cash value observed up to each month-end)

Outputs:
  - 25a_monthly_cash_balance_long.csv
  - 25a_monthly_cash_balance_pivot.csv
  - (optional) 25a_monthly_cash_balance_pivot.xlsx
"""

import os
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

# ============================================================
# CONFIG
# ============================================================

# Inputs: the trade log is required only when the equity curve is missing
# or has no recognised cash column; the equity curve itself is optional.
TRADES_FILE = "./13-trading_output_regression_insp500_spyfilter_cap15/13-trades_regression_insp500_spyfilter_cap15.parquet"
EQUITY_FILE = "./13-trading_output_regression_insp500_spyfilter_cap15/13-equity_curve_regression_insp500_spyfilter_cap15.parquet"  # optional

# All CSV/XLSX results land here; the directory is created on first run.
OUTPUT_DIR  = "./25a-cashflow_monthly_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Column names that may hold the cash balance in the equity curve.
# They are probed in this order and the first one present is used.
EQUITY_CASH_COL_CANDIDATES = [
    "cash",
    "cash_balance",
    "cash_value",
    "cash_usd",
    "net_cash",
    "cash_after",
]

# Month number (1-12) -> three-letter label used as pivot column headers.
MONTH_LABELS = dict(enumerate(
    ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
     "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
    start=1,
))

print("=== MONTHLY CASH BALANCE TABLE ===")

# ============================================================
# LOAD + BUILD A CASH TIME SERIES
# ============================================================

def _last_cash_per_day(frame):
    """Reduce a (date, cash_balance) frame to one row per calendar day.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain a datetime ``date`` column and a ``cash_balance`` column.

    Returns
    -------
    pd.DataFrame
        Columns ``date`` and ``cash_balance`` only, sorted by ``date`` with a
        fresh 0..n-1 index. Rows whose date is missing or whose cash value is
        not numeric are removed; for each calendar day only the last row
        (in date order, ties broken by input order) is kept.
    """
    series = frame[["date", "cash_balance"]].copy()
    series["cash_balance"] = pd.to_numeric(series["cash_balance"], errors="coerce")
    series = series.dropna(subset=["date", "cash_balance"])
    # A stable sort is required: rows that share a timestamp (e.g. several
    # trades stamped with the same date) must keep their input order,
    # otherwise "last row of the day" would be arbitrary.
    series = series.sort_values("date", kind="mergesort")
    is_last_of_day = ~series["date"].dt.normalize().duplicated(keep="last")
    return series.loc[is_last_of_day].reset_index(drop=True)


cash_ts = None

# --- Try equity curve first (best if it has daily cash) ---
if os.path.exists(EQUITY_FILE):
    eq = pd.read_parquet(EQUITY_FILE)
    # normalize date column
    if "date" in eq.columns:
        eq["date"] = pd.to_datetime(eq["date"])
    elif "exec_date" in eq.columns:
        eq["date"] = pd.to_datetime(eq["exec_date"])
    elif isinstance(eq.index, pd.DatetimeIndex):
        # Dates stored as the index: promote them to a column and drop the
        # index so "date" is not both an index level and a column label.
        eq = eq.assign(date=eq.index).reset_index(drop=True)
    else:
        eq["date"] = pd.to_datetime(eq.iloc[:, 0], errors="coerce")  # last resort

    # detect cash column: first candidate present in the file wins
    cash_col = next((c for c in EQUITY_CASH_COL_CANDIDATES if c in eq.columns), None)

    if cash_col is not None:
        eq = eq.sort_values("date", kind="mergesort")
        cash_ts = _last_cash_per_day(
            eq[["date", cash_col]].rename(columns={cash_col: "cash_balance"})
        )
        print(f"Using EQUITY_FILE for cash series (column='{cash_col}'): {len(cash_ts):,} rows")
    else:
        print("EQUITY_FILE found but no recognized cash column. Falling back to TRADES_FILE.")
else:
    print("EQUITY_FILE not found. Using TRADES_FILE.")

# --- Fallback: use trades cash_after (last observed cash up to month-end) ---
if cash_ts is None:
    trades = pd.read_parquet(TRADES_FILE)

    # normalize date
    if "exec_date" in trades.columns:
        trades["date"] = pd.to_datetime(trades["exec_date"])
    elif "date" in trades.columns:
        trades["date"] = pd.to_datetime(trades["date"])
    else:
        raise ValueError("Trades file must contain 'exec_date' or 'date'.")

    if "cash_after" not in trades.columns:
        raise ValueError("Trades file must contain 'cash_after' to compute cash balance by month.")

    # Stable sort keeps same-timestamp trades in file order.
    trades = trades.sort_values("date", kind="mergesort").reset_index(drop=True)

    # If multiple trades per day, take the last cash_after of that day
    cash_ts = _last_cash_per_day(
        trades[["date", "cash_after"]].rename(columns={"cash_after": "cash_balance"})
    )

    print(f"Using TRADES_FILE for cash series (cash_after): {len(cash_ts):,} rows")

if cash_ts.empty:
    raise RuntimeError("Cash time series is empty after loading/cleaning.")

# ============================================================
# MONTH-END CASH BALANCE (carry last known cash to month end)
# ============================================================

# Calendar-day span of the cash series (time of day stripped).
start_date = cash_ts["date"].min().normalize()
end_date   = cash_ts["date"].max().normalize()

# month-end calendar: only month-ends inside [start_date, end_date] are
# generated, so a trailing partial month does not get a row.
# An offset object is used instead of the "M" alias, which newer pandas
# releases deprecate; the object form behaves the same on every version.
month_ends = pd.date_range(start=start_date, end=end_date, freq=pd.offsets.MonthEnd())
month_ends_df = pd.DataFrame({"month_end": month_ends})

# merge_asof to get last known cash on or before each month_end.
# Stable sort so rows sharing a timestamp keep their existing order.
cash_ts_sorted = (
    cash_ts.dropna(subset=["date"])
    .sort_values("date", kind="mergesort")[["date", "cash_balance"]]
    .copy()
)

# month_end values sit at midnight, while "date" may carry a time of day.
# Matching on the raw timestamp would skip an observation made *on* the
# month-end day itself, so the as-of key is the calendar day. The original
# timestamp is still returned in "asof_date".
asof_source = cash_ts_sorted.rename(columns={"date": "asof_date"})
asof_source["asof_day"] = asof_source["asof_date"].dt.normalize()

monthly = pd.merge_asof(
    month_ends_df.sort_values("month_end"),
    asof_source,
    left_on="month_end",
    right_on="asof_day",
    direction="backward",
).drop(columns=["asof_day"])

monthly["year"] = monthly["month_end"].dt.year
monthly["month"] = monthly["month_end"].dt.month
monthly["month_name"] = monthly["month"].map(MONTH_LABELS)

# Any month_end with no cash observation on or before it has NaN cash_balance;
# those rows are dropped.
monthly = monthly.dropna(subset=["cash_balance"]).copy()

# ============================================================
# PIVOT: rows = year, cols = months
# ============================================================

# Calendar order for the columns (Jan..Dec), taken from the label map.
month_order = list(MONTH_LABELS.values())

# One cell per (year, month); "last" keeps the final value if a pair repeats.
year_by_month = monthly.pivot_table(
    index="year",
    columns="month_name",
    values="cash_balance",
    aggfunc="last",
)

# Reindexing forces Jan..Dec order; months with no data become NaN columns.
pivot = year_by_month.reindex(columns=month_order).sort_index()

print("\n=== MONTH-END NET CASH BALANCE (Year x Month) ===")
# Thousands separators, two decimals.
print(pivot.to_string(float_format="{:,.2f}".format))

# ============================================================
# SAVE OUTPUTS
# ============================================================

# Build each destination once so the writes and the summary cannot drift apart.
long_csv_path   = os.path.join(OUTPUT_DIR, "25a_monthly_cash_balance_long.csv")
pivot_csv_path  = os.path.join(OUTPUT_DIR, "25a_monthly_cash_balance_pivot.csv")
pivot_xlsx_path = os.path.join(OUTPUT_DIR, "25a_monthly_cash_balance_pivot.xlsx")

# Long format: one row per month-end.
monthly_out = monthly[["month_end", "year", "month", "month_name", "cash_balance"]].copy()
monthly_out.to_csv(long_csv_path, index=False)
pivot.to_csv(pivot_csv_path)
saved_paths = [long_csv_path, pivot_csv_path]

# Optional Excel (nice for quick viewing). This stays best-effort: a failure
# must not abort the run, but it is reported instead of silently swallowed.
excel_skip_reason = None
try:
    pivot.to_excel(pivot_xlsx_path)
except Exception as exc:
    excel_skip_reason = f"{type(exc).__name__}: {exc}"
else:
    saved_paths.append(pivot_xlsx_path)

# List only the files that were actually written.
print("\nSaved:")
for path in saved_paths:
    print(f"  → {path}")
if excel_skip_reason is not None:
    print(f"  (Excel export skipped: {excel_skip_reason})")
print("\n=== DONE ===")


=== MONTHLY CASH BALANCE TABLE ===
Using EQUITY_FILE for cash series (column='cash'): 6,790 rows

=== MONTH-END NET CASH BALANCE (Year x Month) ===
month_name          Jan          Feb          Mar           Apr          May          Jun           Jul           Aug           Sep           Oct           Nov          Dec
year                                                                                                                                                                        
1999         215,084.24   257,680.02   198,863.67    194,379.98   254,778.15   188,206.64    173,954.91    242,236.92    247,921.52    267,770.36    148,788.61   161,306.05
2000          71,439.78   214,022.31   213,808.90    142,809.63   104,164.61   292,862.26    189,021.12    327,622.85    239,144.22    439,192.93    524,492.63   616,738.48
2001         651,053.33   651,053.33   651,053.33    651,053.33   651,053.33   651,053.33    651,053.33    651,053.33    651,053.33    651,053.33    651,053.33 

In [12]:
import pandas as pd

equity = pd.read_parquet("./13-trading_output_regression_insp500_spyfilter_cap15/13-equity_curve_regression_insp500_spyfilter_cap15.parquet")

# The cash-share diagnostics only run when the equity curve has a "cash" column.
if "cash" in equity.columns:
    # Cash as a fraction of portfolio value, row by row.
    equity["cash_pct"] = equity["cash"].div(equity["portfolio_value"])

    # Whole-sample summary, printed as percentages.
    overall = equity["cash_pct"].agg(["mean", "median", "min", "max"])
    print(f"Average cash %: {overall['mean']*100:.1f}%")
    print(f"Median cash %: {overall['median']*100:.1f}%")
    print(f"Min cash %: {overall['min']*100:.1f}%")
    print(f"Max cash %: {overall['max']*100:.1f}%")

    # By year: mean cash share per calendar year.
    equity["date"] = pd.to_datetime(equity["date"])
    equity["year"] = equity["date"].dt.year
    yearly_share = equity.groupby("year")["cash_pct"].mean()
    print("\nCash % by year:")
    print(yearly_share.map(lambda share: f"{share*100:.1f}%"))

Average cash %: 49.7%
Median cash %: 42.9%
Min cash %: 0.4%
Max cash %: 100.0%

Cash % by year:
year
1999    45.3%
2000    41.2%
2001    99.9%
2002    93.5%
2003    60.8%
2004    45.5%
2005    40.7%
2006    40.2%
2007    42.7%
2008    95.4%
2009    67.5%
2010    45.0%
2011    53.7%
2012    33.4%
2013    37.0%
2014    34.6%
2015    48.4%
2016    46.0%
2017    42.9%
2018    37.1%
2019    43.0%
2020    34.7%
2021    30.0%
2022    72.9%
2023    37.4%
2024    31.3%
2025    42.6%
2026    41.7%
Name: cash_pct, dtype: object
