In [None]:
import numpy as np
import scipy.stats as st

RISK_FREE = 0.043  # 4.3% annual

def portfolio_return(weights, mu):
    """Expected portfolio return: the dot product of `weights` and `mu`.

    The result is on whatever horizon `mu` is expressed in (the notebook
    passes annualised means).
    """
    expected = np.dot(weights, mu)
    return expected

def portfolio_vol(weights, cov):
    """Portfolio standard deviation, sqrt(w' Σ w).

    The result is on the same horizon as `cov` (the notebook passes an
    annualised covariance matrix).
    """
    variance = weights @ cov @ weights
    return np.sqrt(variance)

def sharpe_ratio(weights, mu, cov):
    """Excess return over RISK_FREE per unit of volatility: (E[R] - Rf) / σ."""
    excess = portfolio_return(weights, mu) - RISK_FREE
    return excess / portfolio_vol(weights, cov)

def cvar_calc(weights, mu, cov, alpha=0.95):
    """
    Parametric CVaR (expected shortfall) assuming Normal returns.

      CVaR = -(mean - σ * φ(z_α) / (1 - α)),   z_α = norm.ppf(α)

    The sign convention makes a larger value mean a larger expected tail loss.
    `alpha` is the confidence level; alpha = 1 divides by zero.
    """
    mean = portfolio_return(weights, mu)
    sigma = portfolio_vol(weights, cov)
    density = st.norm.pdf(st.norm.ppf(alpha))   # φ evaluated at the α-quantile
    expected_shortfall = sigma * density / (1 - alpha)
    return -(mean - expected_shortfall)


In [None]:
from cvxopt import matrix, solvers

def mean_variance_opt(mu, cov, target_return):
    """
    Long-only minimum-variance portfolio subject to a return floor.

    Solve:
      minimize w^T Σ w
      s.t.     μ^T w >= target_return
               sum(w) = 1
               w >= 0

    Parameters
    ----------
    mu : array-like of length n
        Expected returns (NumPy array, list or pandas Series).
    cov : array-like of shape (n, n)
        Covariance matrix (NumPy array or pandas DataFrame).
    target_return : float
        Lower bound on the portfolio's expected return.

    Returns
    -------
    numpy.ndarray of length n
        The weights reported by the solver.

    Warns
    -----
    RuntimeWarning
        If the solver does not report an "optimal" status (e.g. the return
        floor is unattainable); the returned weights are then unreliable.
    """
    import warnings

    # Coerce to plain float arrays so pandas inputs work and cvxopt gets doubles.
    mu = np.asarray(mu, dtype=float).reshape(-1)
    cov = np.asarray(cov, dtype=float)
    n = mu.size

    # cvxopt minimises 0.5 x^T P x + q^T x, so P = 2Σ yields the objective w^T Σ w.
    P = matrix(2.0 * cov)
    q = matrix(np.zeros(n))

    # Inequalities G w <= h:  -I w <= 0  (w >= 0)  and  -μ^T w <= -target  (μ^T w >= target).
    G = matrix(np.vstack([-np.eye(n), -mu.reshape(1, n)]))
    h = matrix(np.hstack([np.zeros(n), -float(target_return)]))

    # Single equality: fully invested. The return constraint is deliberately NOT
    # repeated here; stacking μ^T into A would force μ^T w == target exactly.
    A = matrix(np.ones((1, n)))
    b = matrix([1.0])

    sol = solvers.qp(P, q, G, h, A, b)
    if sol['status'] != 'optimal':
        warnings.warn(
            f"mean_variance_opt: solver status is {sol['status']!r} "
            f"for target_return={target_return}; weights may be unreliable",
            RuntimeWarning,
        )
    return np.array(sol['x']).flatten()

In [2]:
import os, json, requests, time, itertools, math
import pandas as pd, numpy as np

# Alpha Vantage access: the key comes from the environment, never from the notebook.
API_KEY = os.getenv("ALPHA_VANTAGE_KEY")
BASE = "https://www.alphavantage.co/query"

# 25-ticker "mini-universe": 20 blue-chip stocks followed by 5 broad ETFs.
UNIVERSE = [
    "AAPL", "MSFT", "AMZN", "GOOG", "TSLA", "NVDA", "BRK.B", "JNJ", "V", "JPM",
    "UNH", "HD", "PG", "MA", "XOM", "CVX", "PFE", "MRK", "KO", "WMT",
] + [
    "SPY", "QQQ", "VTI", "IWM", "AGG",
]


 ## Fetch weekly‐adjusted prices (cache locally)

In [None]:
from pathlib import Path
CACHE = Path("av_cache")
CACHE.mkdir(exist_ok=True)

def av_get(function, symbol):
    """Fetch one Alpha Vantage endpoint for `symbol`, with an on-disk JSON cache.

    A cached response in CACHE/"<function>_<symbol>.json" is returned without
    touching the network. Otherwise the API is queried, the call is throttled,
    and the response is cached only if it is not an error payload.

    Parameters
    ----------
    function : str
        Alpha Vantage `function` query parameter (e.g. "OVERVIEW").
    symbol : str
        Ticker symbol.

    Returns
    -------
    The decoded JSON payload.

    Raises
    ------
    RuntimeError
        If no API key is configured, or the service answers with an
        error / rate-limit message instead of data.
    requests.HTTPError
        On a non-2xx HTTP status.
    """
    fp = CACHE / f"{function}_{symbol}.json"
    if fp.exists():
        return json.loads(fp.read_text())

    if not API_KEY:
        raise RuntimeError("ALPHA_VANTAGE_KEY is not set; cannot query Alpha Vantage")

    # Let requests build and escape the query string; the timeout keeps a stalled
    # connection from hanging the notebook.
    r = requests.get(
        BASE,
        params={"function": function, "symbol": symbol, "apikey": API_KEY},
        timeout=30,
    )
    r.raise_for_status()
    time.sleep(12)                       # AV free tier = 5 calls/min
    payload = r.json()

    # Alpha Vantage reports bad requests and rate limiting inside an HTTP-200 JSON
    # body (top-level keys such as the ones below, per its API docs). Caching such
    # a body would poison every later run, so refuse it before writing.
    if isinstance(payload, dict):
        for flag in ("Error Message", "Note", "Information"):
            if flag in payload:
                raise RuntimeError(f"Alpha Vantage {function}/{symbol}: {payload[flag]}")

    fp.write_text(r.text)
    return payload

def weekly_series(sym):
    """Weekly split/dividend-adjusted closing prices for `sym`.

    Reads the TIME_SERIES_WEEKLY_ADJUSTED payload via `av_get` and keeps the
    "5. adjusted close" field. The raw "4. close" field is not adjusted for
    splits or dividends, so returns and momentum computed from it show spurious
    jumps around corporate actions.

    Returns
    -------
    pandas.DataFrame
        One float column named "close", indexed by the payload's date strings
        and sorted ascending.

    Raises
    ------
    KeyError
        If the payload has no "Weekly Adjusted Time Series" section or no
        adjusted-close field.
    """
    js = av_get("TIME_SERIES_WEEKLY_ADJUSTED", sym)
    raw = pd.DataFrame(js["Weekly Adjusted Time Series"]).T
    df = (raw[["5. adjusted close"]]
            .astype(float)
            .rename(columns={"5. adjusted close": "close"}))
    return df.sort_index()

def overview(sym):
    """Return the Alpha Vantage "OVERVIEW" payload for `sym` (fetched through av_get)."""
    payload = av_get("OVERVIEW", sym)
    return payload


## Unified Price Table and Momentum 

In [None]:
# Collect the weekly close series per ticker; a ticker whose download or parsing
# fails is reported and left out of the table.
price_dfs = {}
for sym in UNIVERSE:
    try:
        closes = weekly_series(sym)["close"]
    except Exception as e:
        print("skip", sym, e)
    else:
        price_dfs[sym] = closes

# One column per ticker; weeks with no price for any ticker are removed.
prices = pd.concat(price_dfs, axis=1).dropna(how="all")

# Weekly simple returns, keeping only weeks where every ticker has a value.
rets = prices.pct_change().dropna()
μ = 52 * rets.mean()                             # annualised mean
Σ = 52 * rets.cov()                              # annualised cov

# Momentum: trailing 26-week (6m) and 52-week (12m) return as of the last row
mom_6, mom_12 = (prices.pct_change(lag).iloc[-1] for lag in (26, 52))


## Fundamental Data 

In [None]:
def _to_float(value):
    """Parse a numeric field; NaN when it is missing or not a number (e.g. "None", "-")."""
    try:
        return float(value)
    except (TypeError, ValueError):
        return np.nan


# One row of fundamentals per ticker. Unparseable fields become NaN instead of
# discarding the whole ticker, so `fund` keeps a row for every ticker whose
# overview could be fetched.
fund_rows = []
for sym in UNIVERSE:
    try:
        o = overview(sym)
        cap = _to_float(o.get("MarketCapitalization"))
        fund_rows.append({
          "ticker": sym,
          "beta":         _to_float(o.get("Beta")),
          "divYield":     _to_float(o.get("DividendYield")),
          # log of a missing / non-positive cap is undefined: record NaN rather
          # than a fake value of 0 (= a $1 company)
          "logCap":       math.log(cap) if cap > 0 else np.nan,
          "sector":       o.get("Sector", "N/A")
        })
    except Exception as e:
        # report the failure instead of silently dropping the ticker
        print("skip", sym, e)

# Explicit columns keep set_index working even if every ticker was skipped.
fund = pd.DataFrame(
    fund_rows, columns=["ticker", "beta", "divYield", "logCap", "sector"]
).set_index("ticker")


## Building Training Data via Simulations

In [None]:
from sklearn.model_selection import train_test_split

def simulate_portfolios(num=2000, size=20, seed=0, universe=None):
    """Draw random long-only portfolios.

    Each portfolio holds `size` distinct tickers sampled without replacement and
    weights drawn from a flat Dirichlet distribution (non-negative, summing to 1).

    Parameters
    ----------
    num : int
        Number of portfolios to generate.
    size : int
        Number of holdings per portfolio; must not exceed the universe size.
    seed : int
        Seed for the random generator; the default 0 reproduces the original
        fixed-seed behaviour.
    universe : sequence of str, optional
        Tickers to sample from; defaults to the module-level UNIVERSE.

    Returns
    -------
    list of dict
        One {ticker: weight} mapping per portfolio.
    """
    tickers = UNIVERSE if universe is None else list(universe)
    rng = np.random.default_rng(seed)
    sims = []
    for _ in range(num):
        # draw order (tickers, then weights) is kept so seed 0 yields the same stream
        basket = rng.choice(tickers, size=size, replace=False)
        w0     = rng.dirichlet(np.ones(size))
        sims.append(dict(zip(basket, w0)))
    return sims

def label_asset(port, asset, target_return=0.13):
    """Binary label: does re-optimising with `asset` added beat the current portfolio?

    Parameters
    ----------
    port : dict
        Current holdings as {ticker: weight}.
    asset : str
        Candidate ticker to add to the basket.
    target_return : float
        Return floor handed to `mean_variance_opt`. Defaults to 0.13, the value
        that used to be hard-coded, so two-argument calls behave as before.

    Returns
    -------
    int
        1 if the Sharpe ratio of the optimised basket (holdings + asset) exceeds
        the Sharpe ratio of `port` at its given weights by more than 1e-4, else 0.
    """
    held = list(port.keys())
    basket = held + [asset]
    base_w = np.asarray(list(port.values()), dtype=float)

    # Plain arrays: the optimiser reshapes `mu`, which a pandas Series does not support.
    mu_all = μ[basket].values
    cov_all = Σ.loc[basket, basket].values

    w_opt = mean_variance_opt(mu_all, cov_all, target_return)
    # The candidate is the last entry of the basket; drop it for the baseline.
    base = sharpe_ratio(base_w, mu_all[:-1], cov_all[:-1, :-1])
    new = sharpe_ratio(w_opt, mu_all, cov_all)
    return int(new > base + 1e-4)

# --- build dataset ---
records = []
TARGETS = [0.08, 0.13, 0.18, 0.20, 0.25]  # 8%, 13%, 18%, 20%, 25%

for port in simulate_portfolios():
    held = set(port)
    base_tickers = list(port.keys())
    base_w = np.array(list(port.values()))

    # compute base Sharpe/CVaR once per portfolio
    base_sh = sharpe_ratio(
        base_w,
        μ[base_tickers].values,
        Σ.loc[base_tickers, base_tickers].values
    )
    base_cv = cvar_calc(
        base_w,
        μ[base_tickers].values,
        Σ.loc[base_tickers, base_tickers].values
    )

    for target in TARGETS:
        # compute the global optimum for this target
        # note: mean_variance_opt solves for w* s.t. sum(w*)=1, w*>=0, μ·w*>=target
        w_star = mean_variance_opt(
            mu=μ.values,
            cov=Σ.values,
            target_return=target
        )

        # map w_star back to ticker list
        # assume UNIVERSAL ordering same as μ.index
        # so w_star[i] corresponds to μ.index[i]
        # heldIdx = [i for i,sym in enumerate(μ.index) if sym in base_tickers]

        for cand in UNIVERSE:
            if cand in held:
                continue

            # marginal tilt of ε=1%
            eps = 0.01
            # take eps from the largest weight in w_star
            donor = np.argmax(w_star)
            w_pert = w_star.copy()
            w_pert[donor] = max(0, w_pert[donor] - eps)
            idx = list(μ.index).index(cand)
            w_pert[idx] += eps

            # compute perturbed metrics
            pert_sh = sharpe_ratio(
                w_pert,
                μ.values,
                Σ.values
            )
            pert_cv = cvar_calc(
                w_pert,
                μ.values,
                Σ.values
            )

            rec = {
                "ticker":      cand,
                "targetReturn": target,
                "deltaSharpe":  pert_sh - base_sh,
                "deltaCvar":    base_cv - pert_cv,
                "mom6":         mom_6[cand],
                "mom12":        mom_12[cand],
                **fund.loc[cand].to_dict(),
                "label":        label_asset(port, cand, target)
            }
            records.append(rec)

df = pd.DataFrame(records).dropna()
# include 'targetReturn' among features
feature_cols = [
    "deltaSharpe","deltaCvar","mom6","mom12",
    "beta","divYield","logCap","targetReturn"
]
X = df[feature_cols]
y = df["label"]

# train/val/test split as before
from sklearn.model_selection import train_test_split
# 70% train; the held-out 30% is then halved into validation and test (15% each).
# Both cuts are stratified on the label and use the same fixed seed.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3, stratify=y, random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5, stratify=y_temp, random_state=42,
)


## Run Grid Search with Early Stopping

In [None]:
import xgboost as xgb
import numpy as np
from sklearn.model_selection import StratifiedKFold, ParameterGrid
from sklearn.metrics import roc_auc_score

rng = np.random.RandomState(42)

# handle class imbalance: weight positives by the negative/positive ratio
pos_weight = (len(y_train) - y_train.sum()) / y_train.sum()

param_grid = {
    "max_depth":        [3, 4, 5],
    "learning_rate":    [0.03, 0.06, 0.1],
    "n_estimators":     [400, 700, 1000],   # will be cut by early stopping
    "subsample":        [0.8, 1.0],
    "colsample_bytree": [0.8, 1.0],
    "min_child_weight": [1, 3, 5],
    "gamma":            [0, 1]
}

best_auc, best_params, best_model = -1, None, None

# Exhaustive search: one fit per grid point, scored by AUC on the validation split
# (the same split that drives early stopping).
for params in ParameterGrid(param_grid):
    clf = xgb.XGBClassifier(
        objective="binary:logistic",
        tree_method="hist",
        eval_metric="auc",
        random_state=42,
        scale_pos_weight=float(pos_weight),
        # configured on the estimator: newer xgboost releases no longer accept
        # early_stopping_rounds as a fit() argument
        early_stopping_rounds=50,
        **params
    )

    clf.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    auc = roc_auc_score(y_val, clf.predict_proba(X_val)[:, 1])
    if auc > best_auc:
        best_auc, best_params, best_model = auc, params, clf
        # XGBClassifier exposes `best_iteration` (no trailing underscore)
        print(f"New best AUC {auc:.3f} with {params}  (iters={clf.best_iteration})")

print("Best validation AUC:", best_auc)
best_params

## Evaluate on Test Set 

In [None]:
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report

# Positive-class scores for the validation and test splits
proba_val = best_model.predict_proba(X_val)[:, 1]
proba_test = best_model.predict_proba(X_test)[:, 1]

# Threshold-free ranking quality on the test split
auc_test = roc_auc_score(y_test, proba_test)
ap_test = average_precision_score(y_test, proba_test)
print(f"Test AUC: {auc_test:.3f} | PR-AUC: {ap_test:.3f}")

# Decision threshold taken from validation only: the 70th percentile of the
# validation scores. (This is a quantile cut, not a Youden-J search; maximising
# tpr - fpr over the validation ROC curve would be the alternative.)
thr = np.quantile(proba_val, 0.7)
pred_test = (proba_test >= thr).astype(int)
print(classification_report(y_test, pred_test, digits=3))

## Save the Model

In [None]:
import joblib, pathlib

# Single source of truth for the output location, so the confirmation message
# always names the file that was actually written.
MODEL_PATH = "../src/lib/recommend_model.json"
best_model.save_model(MODEL_PATH)

# (If you’re keeping a Python-only workflow, you could also do:)
# import joblib
# joblib.dump(best_model, "../src/lib/models/recommend_model.joblib")

print(f"Saved to {MODEL_PATH}")