In [1]:
# ===============================================================
#      VOL-ADJ TREND ANALYSIS  –  SINGLE-FILE VERSION
# ===============================================================

# ───────────────────────────────────────────────────────────────
#  0 · IMPORTS  (all in one place)
# ───────────────────────────────────────────────────────────────
import pandas as pd
import numpy as np
import xlsxwriter
import logging
from io import BytesIO
import ipywidgets as widgets
from IPython.display import display, clear_output
from ipyfilechooser import FileChooser
from typing import List, Dict, Optional

# ───────────────────────────────────────────────────────────────
#  1 · CSV LOADER + RF DETECTOR
# ───────────────────────────────────────────────────────────────
def load_csv(path: str) -> pd.DataFrame:
    """Read a CSV into a DataFrame and verify that it has a 'Date' column.

    Parameters
    ----------
    path
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pd.DataFrame
        The parsed frame, unmodified.

    Raises
    ------
    ValueError
        If the parsed frame has no column named exactly ``Date``.
    """
    frame = pd.read_csv(path)
    has_date_column = "Date" in frame.columns
    if has_date_column:
        return frame
    raise ValueError("CSV must contain a 'Date' column.")

def identify_risk_free_fund(df: pd.DataFrame) -> str:
    """Return the name of the numeric column with the smallest standard deviation.

    The 'Date' column (if present) is ignored, and so is every non-numeric
    column: text labels or a Period-typed helper column would otherwise make
    ``DataFrame.std`` raise instead of being skipped.

    Parameters
    ----------
    df
        Frame of periodic returns, one column per fund.

    Returns
    -------
    str
        Label of the lowest-volatility column (population std, NaNs skipped).

    Raises
    ------
    ValueError
        If the frame has no numeric columns besides 'Date'.
    """
    returns = (df.drop(columns="Date", errors="ignore")
                 .select_dtypes(include="number"))
    if returns.shape[1] == 0:
        raise ValueError(
            "No numeric return columns available to identify the risk-free fund."
        )
    stdevs = returns.std(skipna=True, ddof=0)
    return stdevs.idxmin()

## 2. Select Funds

In [2]:
# ===============================================================
# 2 · SELECT_FUNDS  (restored ≤ 3-missing-months rule)
# ===============================================================
def select_funds(
        df: pd.DataFrame,
        rf_col: str,
        fund_columns: list[str],
        in_sdate, in_edate,
        out_sdate, out_edate,
        selection_mode: str = "all",
        random_n: int | None = None
) -> list[str]:
    """
    Eligible funds:
      • ≤ 3 months missing inside combined in/out window
      • no run of > 6 consecutive NaN months inside that window
    """

    # ---- single coercion --------------------------------------------
    if not pd.api.types.is_datetime64_any_dtype(df["Date"]):
        df = df.copy()
        df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
        df.dropna(subset=["Date"], inplace=True)

    df["Month"] = df["Date"].dt.to_period("M")

    span = pd.period_range(pd.Period(in_sdate, "M"),
                           pd.Period(out_edate, "M"), freq="M")

    eligible = []
    for f in fund_columns:
        # monthly coverage mask over the analysis span
        m_ok = (df.groupby("Month")[f]
                  .apply(lambda col: col.notna().any()))
        mask = m_ok.reindex(span, fill_value=False).to_numpy()

        # ----- missing-month tolerance ≤ 3 ----------------------------
        if (~mask).sum() > 3:           # ← restored original tolerance
            continue

        # ----- gap test  (run length > 6) -----------------------------
        gap = np.diff(np.flatnonzero(np.r_[True, mask, True])).max() - 1
        if gap > 6:
            continue

        eligible.append(f)

    # ---- selection modes --------------------------------------------
    if selection_mode == "all" or random_n is None:
        return eligible
    if selection_mode == "random":
        if random_n > len(eligible):
            raise ValueError(f"Sample N {random_n} > eligible {len(eligible)}")
        return list(np.random.choice(eligible, random_n, replace=False))
    raise ValueError("Unsupported selection_mode")


## 3. Weight Prep

In [3]:
# ───────────────────────────────────────────────────────────────
#  3 · WEIGHT PREP
# ───────────────────────────────────────────────────────────────
def prepare_weights(selected: list[str],
                    custom: Dict[str, int] | None) -> tuple[Dict[str, float], np.ndarray]:
    if not custom:
        w = {f: 1/len(selected) for f in selected}
    else:
        missing = [f for f in selected if f not in custom]
        if missing:
            raise ValueError(f"Missing weights for {missing}")
        w = {f: pct/100 for f, pct in custom.items()}
        if abs(sum(w.values()) - 1) > 1e-6:
            raise ValueError("Custom weights must sum to 100.")
    vec = np.array([w[f] for f in selected])
    return w, vec

## 4. Analysis (In-Sample & Out-of-Sample)
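This cell defines the statistics helpers and the `run_analysis` function. The risk-free column and the fund selection are determined beforehand (sections 1 and 2) and passed in as arguments. `run_analysis` then:
- Converts the 'Date' column to datetime if needed.
- Builds the in-sample and out-of-sample date masks.
- Drops selected funds that have missing values in either window (gaps are not filled) and re-normalises the weights.
- Computes in-sample scaling factors and applies them in- and out-of-sample.
- Computes individual fund stats and portfolio stats (equal-weight and user-weight), plus optional index stats.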

In [4]:
# ===============================================================
# 4 · CORE STATS  +  RUN_ANALYSIS  (helpers included, weight fix)
# ===============================================================

# Number of periods per year; the helpers below use it to annualise
# returns (exponent) and volatility (square-root factor).
M_PER_YEAR = 12           # constant used across helpers

# ---------- helpers --------------------------------------------
def _ensure_dt(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy whose Date column is datetime64[ns]."""
    if pd.api.types.is_datetime64_any_dtype(df["Date"]):
        return df
    df = df.copy()
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
    df.dropna(subset=["Date"], inplace=True)
    return df


def _ann_return(s: pd.Series) -> float:
    """Geometric annualised return of a series of periodic returns.

    NaNs are dropped first; an empty series yields NaN.
    """
    clean = s.dropna()
    n_obs = len(clean)
    if n_obs == 0:
        return np.nan
    growth = (1 + clean).prod()
    return growth ** (M_PER_YEAR / n_obs) - 1


def _ann_vol(s: pd.Series) -> float:
    """Annualised population standard deviation (ddof=0) of periodic returns.

    NaNs are dropped first; an empty series yields NaN.
    """
    clean = s.dropna()
    if not len(clean):
        return np.nan
    return clean.std(ddof=0) * np.sqrt(M_PER_YEAR)


def _sharpe(s: pd.Series, rf: pd.Series) -> float:
    """Annualised excess return divided by annualised excess volatility.

    Excess returns are ``s - rf`` (index-aligned, NaNs dropped).
    A zero volatility yields NaN rather than a division error.
    """
    excess = (s - rf).dropna()
    ann_vol = _ann_vol(excess)
    if not ann_vol:          # exactly zero → ratio undefined
        return np.nan
    return _ann_return(excess) / ann_vol


def _sortino(s: pd.Series, rf: pd.Series) -> float:
    """Annualised excess return divided by annualised downside volatility.

    Downside volatility is the population std (ddof=0) of the negative
    excess returns only. With no negative excess returns, or a zero
    downside volatility, the result is NaN.
    """
    excess = (s - rf).dropna()
    losses = excess[excess < 0]
    if len(losses):
        downside = losses.std(ddof=0) * np.sqrt(M_PER_YEAR)
    else:
        downside = np.nan
    if not downside:         # exactly zero → ratio undefined
        return np.nan
    return _ann_return(excess) / downside


def _max_dd(s: pd.Series) -> float:
    nav = (1 + s).dropna().cumprod()
    return ((nav / nav.cummax()) - 1).min() if len(nav) else np.nan


def _stats(series: pd.Series, rf_series: pd.Series) -> tuple[float, ...]:
    """Bundle the five summary statistics for one return series.

    Returns, in order: annualised return, annualised volatility, Sharpe
    ratio, Sortino ratio, maximum drawdown. The two ratios are computed
    against ``rf_series``.
    """
    metrics = [_ann_return(series), _ann_vol(series)]
    metrics.extend(ratio(series, rf_series) for ratio in (_sharpe, _sortino))
    metrics.append(_max_dd(series))
    return tuple(metrics)


# ---------- main ------------------------------------------------
def run_analysis(
    df: pd.DataFrame,
    selected: list[str],
    w_vec: np.ndarray,
    w_dict: dict[str, float] | None,
    rf_col: str,
    in_start: str,
    in_end: str,
    out_start: str,
    out_end: str,
    target_vol: float = 0.25,
    monthly_cost: float = 0.0033,
    indices_list: Optional[list[str]] = None,
):
    """
    Volatility-scale the selected funds in-sample and report in- and
    out-of-sample statistics for each fund and for two portfolios.

    Parameters
    ----------
    df : frame with a 'Date' column, the risk-free column and one return
        column per fund / index.
    selected : candidate fund columns. Funds with any NaN in either window,
        and any column listed in ``indices_list``, are removed.
    w_vec : ignored on input; the weight vector is rebuilt after funds are
        dropped (kept for interface compatibility).
    w_dict : fractional weights keyed by fund, re-normalised over the
        surviving funds. None means equal weights.
    rf_col : name of the risk-free return column.
    in_start, in_end, out_start, out_end : window bounds; each is parsed
        with ``pd.to_datetime`` and rolled forward to month-end.
        NOTE(review): rows are matched on Date between those month-end
        stamps, which presumably assumes month-end dated data — confirm.
    target_vol : annualised volatility each fund is scaled to, using its
        in-sample volatility.
    monthly_cost : subtracted from every scaled monthly return.
    indices_list : optional benchmark columns reported under "index_stats".

    Returns
    -------
    dict with keys: selected_funds, indices_list, fund_weights, ew_weights,
    in_sample_stats, out_sample_stats, in_ew_stats, out_ew_stats,
    in_user_stats, out_user_stats, dropped and (when indices are given)
    index_stats. Each stats entry is the 5-tuple produced by ``_stats``.

    Raises
    ------
    ValueError
        If no fund survives the filters, or the surviving funds' weights
        do not sum to a positive number.
    """
    df = _ensure_dt(df)
    indices = list(indices_list or [])

    # ---- date masks --------------------------------------------------
    in_s, in_e, out_s, out_e = (
        pd.to_datetime(bound) + pd.offsets.MonthEnd(0)
        for bound in (in_start, in_end, out_start, out_end)
    )
    m_in  = df["Date"].between(in_s,  in_e)
    m_out = df["Date"].between(out_s, out_e)

    in_df,  out_df = df.loc[m_in, selected], df.loc[m_out, selected]
    in_rf,  out_rf = df.loc[m_in, rf_col],   df.loc[m_out, rf_col]

    # ---- drop funds with any NaNs in either window ------------------
    good = [f for f in selected
            if in_df[f].notna().all() and out_df[f].notna().all()]
    good_set = set(good)
    dropped = [f for f in selected if f not in good_set]   # input order kept
    if dropped:
        logging.warning("Dropped funds: %s", dropped)

    # index columns must never be treated as investable funds
    selected = [f for f in good if f not in indices]
    if not selected:
        raise ValueError(
            "No funds left to analyse after removing incomplete funds and indices."
        )

    in_df, out_df = in_df[selected], out_df[selected]

    # ---- rebuild weights over the surviving funds -------------------
    n_funds = len(selected)
    ew_dict = {f: 1 / n_funds for f in selected}
    if w_dict is None:                      # equal-weight path
        w_dict = dict(ew_dict)
    else:                                   # manual path → rescale
        total = sum(w_dict[f] for f in selected)
        if not total > 0:
            raise ValueError("Weights of the remaining funds must sum to a positive number.")
        w_dict = {f: w_dict[f] / total for f in selected}
    w_vec  = np.array([w_dict[f] for f in selected])
    ew_vec = np.full(n_funds, 1 / n_funds)

    # ---- scaling ----------------------------------------------------
    # The scale factor comes from in-sample volatility only and is reused
    # out-of-sample; funds with zero (or undefined) volatility are left
    # unscaled. Returns are floored at -100 %.
    vols  = in_df.apply(_ann_vol)
    scale = np.where(vols > 0, target_vol / vols, 1.0)
    in_sc  = (in_df  * scale - monthly_cost).clip(lower=-1)
    out_sc = (out_df * scale - monthly_cost).clip(lower=-1)

    # ---- stats ------------------------------------------------------
    # Per-fund stats use the same (series, rf_series) call as the
    # portfolio and index stats below.
    in_stat  = {f: _stats(in_sc[f],  in_rf)  for f in selected}
    out_stat = {f: _stats(out_sc[f], out_rf) for f in selected}

    results = {
        "selected_funds": selected,
        "indices_list":   indices,
        "fund_weights":   w_dict,
        "ew_weights":     ew_dict,
        "in_sample_stats":  in_stat,
        "out_sample_stats": out_stat,
        "in_ew_stats":     _stats(in_sc.dot(ew_vec), in_rf),
        "out_ew_stats":    _stats(out_sc.dot(ew_vec), out_rf),
        "in_user_stats":   _stats(in_sc.dot(w_vec),  in_rf),
        "out_user_stats":  _stats(out_sc.dot(w_vec), out_rf),
        "dropped":         dropped,
    }

    # ---- optional index stats (raw, unscaled returns) ---------------
    if indices:
        results["index_stats"] = {
            col: {
                "in_sample":  _stats(df.loc[m_in,  col], in_rf),
                "out_sample": _stats(df.loc[m_out, col], out_rf),
            }
            for col in indices
        }

    return results


## 5. Excel Export
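Creates an Excel workbook with a single "Summary" sheet containing in-sample and out-of-sample statistics for the equal-weight and user-weight portfolios, for each selected fund, and for any index columns.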

In [5]:
# ───────────────────────────────────────────────────────────────
#  5 · EXPORT  (NaN-safe, weight-format fix, self-healing index section)
# ───────────────────────────────────────────────────────────────
def export_to_excel(results, full_df, fname,
                    in_start, in_end, out_start, out_end):
    """
    Write the analysis summary to a one-sheet Excel workbook at ``fname``.

    The sheet holds a title, the two date windows, a header row, the
    equal-weight and user-weight portfolio rows, one row per selected fund
    and, when ``results["indices_list"]`` is non-empty, one row per index.
    If ``results`` carries no "index_stats", they are computed here from
    ``full_df`` using the risk-free column found by
    ``identify_risk_free_fund``.

    Returns, volatilities and drawdowns are written as percentages; NaN or
    infinite values become empty cells.
    """
    buffer = BytesIO()
    workbook = xlsxwriter.Workbook(buffer, {"in_memory": True})
    sheet = workbook.add_worksheet("Summary")

    bold = workbook.add_format({"bold": True})
    int0 = workbook.add_format({"num_format": "0"})
    num2 = workbook.add_format({"num_format": "0.00"})
    red  = workbook.add_format({"num_format": "0.00", "font_color": "red"})

    # Cell formats for the ten stat columns: 4 plain + drawdown in red, twice.
    stat_formats = [num2, num2, num2, num2, red] * 2

    def _blank_if_invalid(value):
        """Excel cannot store NaN/inf, so those become empty strings."""
        if pd.isna(value) or not np.isfinite(value):
            return ""
        return value

    def _as_percent(stats):
        """Scale return, volatility and drawdown to percent; ratios stay as-is."""
        return [stats[0]*100, stats[1]*100, stats[2], stats[3], stats[4]*100]

    def _write_stat_row(r, label, weight, stats_in, stats_out, emphasise=False):
        """Write one name / weight / in-sample / out-of-sample row."""
        sheet.write(r, 0, label, bold if emphasise else None)
        if weight != "":
            sheet.write(r, 1, weight*100, int0)
        else:
            sheet.write(r, 1, "", None)
        values = _as_percent(stats_in) + _as_percent(stats_out)
        for offset, (value, fmt) in enumerate(zip(values, stat_formats)):
            sheet.write(r, 2 + offset, _blank_if_invalid(value), fmt)

    # ─── header rows ──────────────────────────────────────────
    sheet.write_row(0, 0, ["Vol-Adj Trend Analysis"], bold)
    sheet.write_row(1, 0, [f"In:  {in_start} → {in_end}"])
    sheet.write_row(2, 0, [f"Out: {out_start} → {out_end}"])

    header = ["Name", "Weight %",
              "R (IN)%", "V (IN)%", "Sharpe", "Sortino", "MDD (IN)%",
              "R (OUT)%", "V (OUT)%", "Sharpe", "Sortino", "MDD (OUT)%"]
    sheet.write_row(4, 0, header, bold)
    row = 5

    # ─── portfolio rows ──────────────────────────────────────
    _write_stat_row(row, "Equal-Weight", 1,
                    results["in_ew_stats"], results["out_ew_stats"], True)
    row += 1
    _write_stat_row(row, "User-Weight", 1,
                    results["in_user_stats"], results["out_user_stats"], True)
    row += 2

    # ─── fund rows ───────────────────────────────────────────
    sheet.write(row, 0, "Funds", bold)
    row += 1
    for fund in results["selected_funds"]:
        _write_stat_row(row, fund, results["fund_weights"][fund],
                        results["in_sample_stats"][fund],
                        results["out_sample_stats"][fund])
        row += 1

    # ─── index rows (compute if necessary) ───────────────────
    if results.get("indices_list"):
        index_stats = results.get("index_stats")
        if not index_stats:
            # Build the stats on the fly; Date must be datetime first.
            frame = full_df.copy()
            if not pd.api.types.is_datetime64_any_dtype(frame["Date"]):
                frame["Date"] = pd.to_datetime(frame["Date"], errors="coerce")
                frame.dropna(subset=["Date"], inplace=True)

            rf_name = identify_risk_free_fund(frame)
            in_lo, in_hi, out_lo, out_hi = [
                pd.to_datetime(bound) + pd.offsets.MonthEnd(0)
                for bound in (in_start, in_end, out_start, out_end)
            ]
            in_rows  = frame["Date"].between(in_lo,  in_hi)
            out_rows = frame["Date"].between(out_lo, out_hi)

            index_stats = {}
            for col in results["indices_list"]:
                index_stats[col] = {
                    "in_sample":  _stats(frame.loc[in_rows,  col],
                                         frame.loc[in_rows,  rf_name]),
                    "out_sample": _stats(frame.loc[out_rows, col],
                                         frame.loc[out_rows, rf_name]),
                }

        sheet.write(row, 0, "INDEX", bold)
        row += 1
        for index_name, pair in index_stats.items():
            _write_stat_row(row, index_name, "",
                            pair["in_sample"], pair["out_sample"], True)
            row += 1

    # ─── save ────────────────────────────────────────────────
    workbook.close()
    with open(fname, "wb") as handle:
        handle.write(buffer.getvalue())

    logging.info("Workbook saved → %s", fname)

## 6. Run Parameters, Widgets & User Inputs
Here we define some IPython widgets for in-sample/out-of-sample dates, target volatility, monthly cost, etc.

### Using This Notebook
1. Run all cells.
2. Call `demo_run()` in a new cell to see a quick example with dummy data.
3. To use your own data, load it into a DataFrame (make sure it has a 'Date' column and decimal returns in other columns), then call `run_analysis()` and `export_to_excel()`.
4. For interactive selection, do:
   ```python
   display(ui_inputs)
   ```
   Then wire the `apply_button` to a callback function that reads the widget values and runs `run_analysis()`.
5. For custom weights, call:
   ```python
   my_weights = get_custom_weights(selected_funds)
   ```
   Then pass `my_weights` into your logic.


In [6]:
# ===============================================================
#            STREAMLINED ANALYSIS UI  (phase-2 clean)
# ===============================================================

# ---------- session store ----------
# Shared state for the UI callbacks: "df" holds the loaded DataFrame and
# "rf" the detected risk-free column name (both set by the load callback).
# NOTE(review): "sel" and "cweights" are only ever reset to None in this
# cell — presumably reserved for a cached selection / custom weights.
session = {"df": None, "rf": None, "sel": None, "cweights": None}

# ---------- 1 · DATA LOAD ----------
# Source switch: load the CSV from a local file or from a URL.
src = widgets.ToggleButtons(
    options=[("Local", "local"), ("URL", "url")],
    description="Source:"
)

# Input widgets for the two sources, the load button, and the output
# area that receives the load status message.
chooser = FileChooser()
url_box = widgets.Text(placeholder="https://…/file.csv", layout={"width":"70%"})
load_btn = widgets.Button(description="Load CSV", button_style="success")
load_out = widgets.Output()

def _toggle_src(c):
    """Show the file chooser for the local source and the URL box for the URL source."""
    # (chooser display, url_box display) per source value
    visibility = {"local": ("block", "none"), "url": ("none", "block")}
    chooser_display, url_display = visibility.get(c["new"], ("none", "none"))
    chooser.layout.display = chooser_display
    url_box.layout.display = url_display

# React to later changes and apply the initial state once.
src.observe(_toggle_src, names="value")
_toggle_src({"new": src.value})

def _load(_):
    """Load-button callback: read the CSV, parse dates, detect the risk-free column.

    On success the session store is refreshed with the new frame; on any
    failure the error is shown in the load output and the stored frame is
    cleared.
    """
    with load_out:
        clear_output()
        try:
            if src.value == "local":
                path = chooser.selected
            else:
                path = url_box.value.strip()
            if not path:
                raise ValueError("choose file / URL")
            is_csv_url = path.lower().endswith(".csv")
            if src.value == "url" and not is_csv_url:
                raise ValueError("URL must end with .csv")

            frame = load_csv(path)
            # single coercion: unparseable dates become NaT
            frame["Date"] = pd.to_datetime(frame["Date"], errors="coerce")
            rf_name = identify_risk_free_fund(frame)
            session.update(df=frame, rf=rf_name, sel=None, cweights=None)
            print(f"✅ Loaded {len(frame):,} rows × {frame.shape[1]} cols | RF → {rf_name}")
        except Exception as err:
            print("❌", err)
            session["df"] = None

load_btn.on_click(_load)

# ---------- 2 · PARAMS ------------
# Number of right-most data columns to treat as indices rather than funds.
index_cnt = widgets.BoundedIntText(0, min=0, max=10, description="# Indices:")
# In-sample / out-of-sample window bounds, entered as text and parsed
# with pd.to_datetime by the callbacks.
in_start,in_end  = widgets.Text("2005-07"), widgets.Text("2008-06")
out_start,out_end= widgets.Text("2008-07"), widgets.Text("2009-06")
for w,lbl in [(in_start,"In Start:"),(in_end,"In End:"),
              (out_start,"Out Start:"),(out_end,"Out End:")]:
    w.description = lbl
# Passed to run_analysis as target_vol and monthly_cost.
target_vol   = widgets.FloatText(0.25,  description="Target Vol:")
monthly_cost = widgets.FloatText(0.0033, description="Monthly Cost:")

# ---------- 3 · SELECTION ----------
# Fund-selection mode: all eligible funds, a random sample, or a manual
# pick with per-fund weights.
mode_dd = widgets.Dropdown(
    options=[("All", "all"), ("Random", "random"), ("Manual", "manual")],
    value="all",
    description="Mode:"
)
# Sample size for "random" mode.
rand_n   = widgets.BoundedIntText(5, min=2, max=100, description="Sample N:")
# Container for the manual-mode checkbox/weight rows and the running
# total of the entered weights.
fund_table, total_lbl = widgets.VBox([]), widgets.Label("Total = 0 %")

def _toggle_sel(_=None):
    """Show only the controls that belong to the chosen selection mode."""
    mode = mode_dd.value
    rand_n.layout.display = "block" if mode == "random" else "none"
    manual_display = "block" if mode == "manual" else "none"
    fund_table.layout.display = manual_display
    total_lbl.layout.display = manual_display

# React to later changes and apply the initial state once.
mode_dd.observe(_toggle_sel, names="value")
_toggle_sel()

# ---------- helpers ---------------
def _eligible_pool():
    """Return the funds eligible for the date windows currently entered.

    Index columns (the right-most ``index_cnt`` data columns) are excluded
    from the candidates. Returns an empty list, after printing a message,
    when no data is loaded or a date cannot be parsed.
    """
    df, rf = session["df"], session["rf"]
    if df is None:
        print("⚠️ data not loaded")
        return []

    # ---- date parse guard -----------------------------------
    try:
        in_s, in_e, out_s, out_e = [
            pd.to_datetime(box.value) + pd.offsets.MonthEnd(0)
            for box in (in_start, in_end, out_start, out_end)
        ]
    except Exception:
        print("❌ invalid dates")
        return []

    # ---- split data columns into indices and fund candidates ----
    # data_cols already excludes the risk-free column, so the right-most
    # idx_n entries are the index columns.
    idx_n      = index_cnt.value
    data_cols  = [c for c in df.columns if c not in ("Date", rf, "Month")]
    indices    = data_cols[-idx_n:] if idx_n else []
    candidates = [c for c in data_cols if c not in indices]

    # ---- run select_funds ----------------------------------
    return select_funds(df, rf, candidates, in_s, in_e, out_s, out_e, "all")

def _build_manual(*_):
    """(Re)build the manual-selection table: one checkbox + weight box per eligible fund.

    Does nothing unless manual mode is active and data is loaded. The
    table is always cleared first, so rows from a previous date window
    never survive when the new window has no eligible funds.
    """
    if mode_dd.value!="manual" or session["df"] is None: return

    fund_table.children = []                # reset before anything else
    total_lbl.value = "Total = 0 %"
    valid = _eligible_pool()
    if not valid:
        print("❌ No eligible funds"); return

    def _update_total(*_):
        # Sum the weights of the ticked rows only.
        tot = sum(r.children[1].value for r in fund_table.children
                  if r.children[0].value)
        total_lbl.value = f"Total = {tot} %"

    rows = []
    for f in valid:
        cb = widgets.Checkbox(description=f, layout={"width":"200px"})
        wt = widgets.BoundedIntText(0, min=0, max=100,
                                    layout={"width":"60px"}, disabled=True)
        def _toggle(ch, box=wt):           # box bound per row via default arg
            box.disabled = not ch["new"]
            if box.disabled: box.value = 0
            _update_total()
        cb.observe(_toggle, names="value")
        wt.observe(_update_total, names="value")
        rows.append(widgets.HBox([cb, wt]))

    # Assign once instead of growing the children tuple row by row.
    fund_table.children = rows
    _update_total()

# Rebuild the table when manual mode is chosen or a date bound changes.
mode_dd.observe(lambda ch: _build_manual() if ch["new"]=="manual" else None,
                names="value")
for w in (in_start,in_end,out_start,out_end): w.observe(_build_manual,names="value")

# ---------- 4 · RUN ---------------
# Run button and the bordered, scrollable output area that receives the
# messages printed by the run callback.
run_btn = widgets.Button(description="Run Analysis", button_style="success")
run_out = widgets.Output(layout={"border":"1px solid #999",
                                 "height":"340px","overflow_y":"auto"})

def _run(_):
    """Run-button callback: pick funds, run the analysis and export the workbook.

    Reads every parameter from the widgets, resolves the fund selection
    for the active mode, calls ``run_analysis`` and writes the result to
    ``IS_<in_start>_<out_start>.xlsx``. Progress and problems are printed
    into the run output area.
    """
    with run_out:
        clear_output()
        df, rf = session["df"], session["rf"]
        if df is None:
            print("⚠️ Load data first")
            return

        # indices: the right-most idx_n data columns (rf already excluded)
        idx_n     = index_cnt.value
        data_cols = [c for c in df.columns if c not in ("Date", rf, "Month")]
        indices   = data_cols[-idx_n:] if idx_n else []

        # pool + selection
        pool = _eligible_pool()
        if not pool:
            print("❌ No eligible funds")
            return

        mode = mode_dd.value
        if mode == "all":
            sel, custom = pool, None
        elif mode == "random":
            if rand_n.value > len(pool):
                print("⚠️ Sample N too big")
                return
            # .tolist() yields plain Python strings instead of numpy scalars
            sel = np.random.choice(pool, rand_n.value, replace=False).tolist()
            custom = None
        else:
            sel, custom = [], {}
            if not fund_table.children:
                _build_manual()
            for row in fund_table.children:
                cb, wt = row.children
                if cb.value:
                    sel.append(cb.description)
                    custom[cb.description] = wt.value
            if sum(custom.values()) != 100:
                print("⚠️ Weights ≠ 100")
                return

        w_dict, w_vec = prepare_weights(sel, custom)

        res = run_analysis(df, sel, w_vec, w_dict, rf,
                           in_start.value, in_end.value,
                           out_start.value, out_end.value,
                           target_vol.value, monthly_cost.value,
                           indices)

        # Report the funds actually analysed, i.e. after run_analysis
        # has dropped incomplete ones.
        print("✅ analysis complete |", len(res["selected_funds"]), "funds")
        if res["dropped"]:
            print("⚠️ Dropped:", res["dropped"])
        if indices:
            print("📊 Indices:", indices)

        fname = f"IS_{in_start.value}_{out_start.value}.xlsx"
        export_to_excel(res, df, fname,
                        in_start.value, in_end.value,
                        out_start.value, out_end.value)
        print("Workbook saved as", fname)

run_btn.on_click(_run)

# ---------- DISPLAY --------------
# Assemble the full UI top to bottom: data load, parameters, fund
# selection, then the run button with its output area.
display(widgets.VBox([
    widgets.HTML("<h4>1. Load data</h4>"),
    src, chooser, url_box, load_btn, load_out,
    widgets.HTML("<hr><h4>2. Parameters</h4>"),
    widgets.HBox([index_cnt]),
    widgets.HBox([in_start,in_end,out_start,out_end]),
    widgets.HBox([target_vol,monthly_cost]),
    widgets.HTML("<hr><h4>3. Fund selection</h4>"),
    widgets.HBox([mode_dd,rand_n]),
    fund_table, total_lbl,
    widgets.HTML("<hr>"),
    run_btn,
    run_out
]))


VBox(children=(HTML(value='<h4>1. Load data</h4>'), ToggleButtons(description='Source:', options=(('Local', 'l…