In [None]:
# stage 1 training and prediction
import pandas as pd
import numpy as np
import xgboost as xgb
import shap
import ta
import joblib
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import time

# ------------------------------------------------------------------------
# 0. Load and align data
# ------------------------------------------------------------------------
# Both CSVs are read with their first column as a parsed date index.
factors = pd.read_csv("aligned_factors.csv", index_col=0, parse_dates=True)
returns = pd.read_csv("daily_returns_10ETFs.csv", index_col=0, parse_dates=True)

# Keep only the dates present in both tables so the two frames share one index.
dates = factors.index.intersection(returns.index)
factors, returns = factors.loc[dates], returns.loc[dates]

# ------------------------------------------------------------------------
# 1. Compute technical indicators and lagged features per ETF
# ------------------------------------------------------------------------
all_tech_features = []

for etf in returns.columns:
    etf_returns = returns[etf]
    # Cumulative growth path rebuilt from the daily returns; it stands in for
    # a close-price series when calling the `ta` indicator functions.
    close = (1 + etf_returns).cumprod()

    # Column insertion order matters downstream, so the dict is filled in the
    # same order the features are meant to appear.
    columns = {
        f'{etf}_SMA_5': ta.trend.sma_indicator(close, window=5),
        f'{etf}_EMA_12': ta.trend.ema_indicator(close, window=12),
        f'{etf}_RSI_7': ta.momentum.rsi(close, window=7),
        f'{etf}_MACD': ta.trend.macd_diff(close),
        # No real high/low series is available here: a +/-1% band around the
        # synthetic close is used instead.
        f'{etf}_ATR': ta.volatility.average_true_range(
            high=close * 1.01, low=close * 0.99, close=close, window=10
        ),
        f'{etf}_Vol_5': etf_returns.rolling(window=5).std(),
        f'{etf}_Mom_3': etf_returns.rolling(window=3).mean(),
    }

    # Lagged returns (shifted so only past information is used)
    columns.update(
        {f'{etf}_LagRet_{lag}': etf_returns.shift(lag) for lag in (1, 2, 3)}
    )

    tech_df = pd.DataFrame(columns, index=returns.index)
    all_tech_features.append(tech_df)

# One wide frame holding the technical columns of every ETF side by side.
technical_features = pd.concat(all_tech_features, axis=1)

# ------------------------------------------------------------------------
# 2. Create lagged factor features
# ------------------------------------------------------------------------
# For each of the five factors add its 1-, 2- and 3-row lag as
# `<factor>_lag_<n>`, grouped factor by factor.
factors = factors.assign(**{
    f'{factor}_lag_{lag}': factors[factor].shift(lag)
    for factor in ('Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA')
    for lag in (1, 2, 3)
})

# Drop rows with NA values arising from lagging
factors = factors.dropna()

# ------------------------------------------------------------------------
# 3. Combine factors, technical features, and VIX change
# ------------------------------------------------------------------------
combined = pd.concat([factors, technical_features], axis=1)
features = combined.dropna()  # keep only dates where every feature is defined
vix = pd.read_csv("VIX_History.csv", index_col=0, parse_dates=True)

# Put the VIX close on the feature dates (carrying the last known value over
# gaps), then use the previous row's percentage change as the feature.
# Rows where that change is undefined are set to 0.
vix_aligned = vix['CLOSE'].reindex(features.index).ffill()
vix_change = vix_aligned.pct_change(fill_method=None)
features['VIX'] = vix_change.shift(1).fillna(0)

# Target: the return of the following row (next trading day) for each ETF.
# The last row has no successor and is dropped; features are trimmed to match.
next_day_returns = returns.shift(-1)
target_returns = next_day_returns.loc[features.index].dropna()
features = features.loc[target_returns.index]

# ------------------------------------------------------------------------
# 4. Define rolling window parameters
# ------------------------------------------------------------------------
# All values are in calendar years and are measured relative to the first
# year of each test window.
train_years = 12       # the training slice begins this many years before the test year
valid_years = 1        # length of the validation slice that precedes the test year
test_years = 1         # length of the test/prediction slice
retrain_frequency = 1  # step (in years) between consecutive windows
start_year = 2009      # first test year
end_year = 2024        # last year covered by a test window

# Generic (ETF-agnostic) feature names used for the SHAP importance ranking:
# the raw factors, their three lags, the technical indicators, the lagged
# returns and the VIX change.
_FACTOR_NAMES = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']
all_generic_features = (
    _FACTOR_NAMES
    + [f'{name}_lag_{lag}' for name in _FACTOR_NAMES for lag in (1, 2, 3)]
    + ['SMA_5', 'EMA_12', 'RSI_7', 'MACD']
    + ['Vol_5', 'Mom_3']
    + [f'LagRet_{lag}' for lag in (1, 2, 3)]
    + ['VIX']
)

# ------------------------------------------------------------------------
# 5. Compute generic feature importance via SHAP
#    (aggregated across ETFs, using only the initial training window)
# ------------------------------------------------------------------------
shap_importances = pd.DataFrame(0.0, index=all_generic_features, columns=['SHAP_Value'])

# Fixed window for the ranking: it spans from `train_years` years before
# `start_year` up to the end of the year preceding the first validation year.
base_train_start = pd.Timestamp(start_year - train_years, 1, 1)
base_train_end   = pd.Timestamp(start_year - valid_years - 1, 12, 31)

for etf in returns.columns:
    print(f"Computing SHAP importances for ETF: {etf}")

    # Resolve every generic feature to the ONE concrete column it stands for:
    #   * shared columns (factors, factor lags, VIX) carry the generic name itself;
    #   * ETF-specific columns are named '<etf>_<generic>'.
    # Exact name matching is required here: a substring test would let 'SMB'
    # also swallow 'SMB_lag_1..3', and would never pick up the shared 'VIX'
    # column because it does not contain the ETF ticker.
    generic_to_col = {}
    for gen_feat in all_generic_features:
        if gen_feat in features.columns:
            generic_to_col[gen_feat] = gen_feat
        elif f'{etf}_{gen_feat}' in features.columns:
            generic_to_col[gen_feat] = f'{etf}_{gen_feat}'

    # Keep the column order of `features` for the model input.
    wanted_cols = set(generic_to_col.values())
    etf_cols = [col for col in features.columns if col in wanted_cols]

    X_base = features.loc[base_train_start:base_train_end, etf_cols]
    y_base = target_returns[etf].loc[base_train_start:base_train_end]
    if X_base.empty:
        raise ValueError(
            f"No data between {base_train_start.date()} and {base_train_end.date()} "
            f"to compute SHAP importances for {etf}."
        )

    # Fit a quick model to compute SHAP
    # NOTE(review): device='cuda' needs a CUDA-enabled xgboost build and a GPU.
    model_base = xgb.XGBRegressor(
        objective='reg:squarederror',
        tree_method='hist',
        random_state=42,
        device='cuda'
    ).fit(X_base, y_base)

    explainer_base = shap.Explainer(model_base)
    shap_vals = explainer_base(X_base)

    # Mean absolute SHAP value per input column, then credit each generic
    # feature with the value of its own column only.
    mean_abs_shap = np.abs(shap_vals.values).mean(axis=0)
    for gen_feat, col in generic_to_col.items():
        shap_importances.loc[gen_feat, 'SHAP_Value'] += mean_abs_shap[X_base.columns.get_loc(col)]

# Average importance across ETFs and select top N
shap_importances /= len(returns.columns)
top_generic_features = (
    shap_importances.sort_values('SHAP_Value', ascending=False)
                    .head(10)
                    .index
                    .tolist()
)

# ------------------------------------------------------------------------
# 6. Retrain models using the selected generic features in rolling windows
# ------------------------------------------------------------------------
all_predictions = []

# The selection does not depend on the ETF being modelled, so it is built once:
# every column whose name contains one of the top generic features (i.e. the
# matching technical columns of ALL ETFs) plus every factor column.
selected_features = [
    f for f in features.columns
    if any(gen in f for gen in top_generic_features) or f in factors.columns
]

# Hyper-parameter grid shared by every ETF/window.
param_grid = {
    'n_estimators': [200, 400],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.03, 0.05],
    'subsample': [0.7, 0.8],
    'colsample_bytree': [0.7, 0.8]
}

for etf in returns.columns:
    print(f"\n==== Training models for ETF: {etf} ====")

    # `year` is the first year of the test slice of each window.
    for year in range(start_year, end_year - test_years + 2, retrain_frequency):
        print(f"\nTraining window starting {year}")
        start_time = time.time()

        # Define periods (train -> validation -> test, back to back)
        train_start = pd.Timestamp(year - train_years, 1, 1)
        train_end   = pd.Timestamp(year - valid_years - 1, 12, 31)
        valid_start = pd.Timestamp(year - valid_years, 1, 1)
        valid_end   = pd.Timestamp(year - 1, 12, 31)
        test_start  = pd.Timestamp(year, 1, 1)
        test_end    = pd.Timestamp(year + test_years - 1, 12, 31)

        # Extract data
        X_train = features.loc[train_start:train_end, selected_features]
        y_train = target_returns[etf].loc[train_start:train_end]
        X_valid = features.loc[valid_start:valid_end, selected_features]
        y_valid = target_returns[etf].loc[valid_start:valid_end]
        X_test  = features.loc[test_start:test_end, selected_features]
        y_test  = target_returns[etf].loc[test_start:test_end]

        # A window that falls outside the available data cannot be fitted or
        # scored; skip it instead of aborting the whole run before anything
        # has been written to disk.
        if min(len(X_train), len(X_valid), len(X_test)) == 0:
            print(f"Skipping window {year}: no data in train/validation/test period")
            continue

        # Base model with early stopping. `eval_metric` and
        # `early_stopping_rounds` are estimator parameters: passing them to
        # fit() is deprecated since xgboost 1.6 and rejected by recent releases.
        base_model = xgb.XGBRegressor(
            objective='reg:squarederror',
            tree_method='hist',
            random_state=42,
            n_jobs=4,
            eval_metric='rmse',
            early_stopping_rounds=50
        )

        tscv = TimeSeriesSplit(n_splits=3)

        grid_search = GridSearchCV(
            base_model,
            param_grid,
            cv=tscv,
            scoring='neg_mean_squared_error',
            verbose=0,
            n_jobs=4
        )

        # Fit with early stopping on the explicit validation set; the extra
        # keyword arguments are forwarded to XGBRegressor.fit for every CV
        # fit and for the final refit.
        grid_search.fit(
            X_train,
            y_train,
            eval_set=[(X_valid, y_valid)],
            verbose=False
        )

        best_model = grid_search.best_estimator_

        # Predict on the test period
        preds = best_model.predict(X_test)

        # Compute evaluation metrics
        mse  = mean_squared_error(y_test, preds)
        rmse = np.sqrt(mse)
        mae  = mean_absolute_error(y_test, preds)
        r2   = r2_score(y_test, preds)
        # Share of days on which prediction and realised return have the same sign
        dir_acc = np.mean((np.sign(y_test) == np.sign(preds)).astype(int))

        print(f"MSE: {mse:.6f}  RMSE: {rmse:.6f}  MAE: {mae:.6f}  "
              f"R²: {r2:.6f}  DirAcc: {dir_acc:.2%}")

        # Save the model for reproducibility
        joblib.dump(best_model, f"best_model_{etf}_{year}.joblib")

        # Save predictions
        preds_df = pd.DataFrame({
            'Date': X_test.index,
            'ETF': etf,
            'Year': year,
            'Actual_Return': y_test,
            'Predicted_Return': preds
        }).reset_index(drop=True)

        # Compute SHAP values on the test set
        explainer_test = shap.Explainer(best_model)
        shap_vals_test = explainer_test(X_test)

        # Name the index 'Date' before resetting it, so the merge key exists
        # whatever name (or no name) the CSV gave to the date index.
        shap_df = pd.DataFrame(
            shap_vals_test.values,
            columns=[f'SHAP_{col}' for col in X_test.columns],
            index=X_test.index
        ).rename_axis('Date').reset_index()

        # Merge SHAP values with predictions
        preds_df = preds_df.merge(shap_df, on='Date', how='left')

        all_predictions.append(preds_df)

        print(f"Window processed in {time.time() - start_time:.2f} seconds")

# Concatenate and save all predictions and SHAP values
final_predictions_df = pd.concat(all_predictions, ignore_index=True)
final_predictions_df.to_csv("stage1_predictions_with_shap_10ETFs.csv", index=False)

print("Stage 1 completed and data saved for Stage 2.")


In [None]:
# Notebook display: show the generic features picked by the Stage 1 SHAP ranking.
top_generic_features

In [None]:
import pandas as pd
import numpy as np
import json

# Load the Stage 1 output (predictions, realised targets and SHAP columns).
stage1_df = pd.read_csv("stage1_predictions_with_shap_10ETFs.csv", parse_dates=['Date'])
etfs = stage1_df['ETF'].unique()

# One row per date in the aggregated, wide-format table.
dates = sorted(stage1_df['Date'].unique())
aggregated_data = pd.DataFrame({'Date': dates})

# Pivot tables (Date x ETF) for efficient cross-sectional computations
predicted_returns = stage1_df.pivot(index='Date', columns='ETF', values='Predicted_Return')
actual_returns = stage1_df.pivot(index='Date', columns='ETF', values='Actual_Return')

# ETF-specific volatility (rolling 5-day window).
# Stage 1 stores the NEXT-day return under each date (its target is
# `returns.shift(-1)`), so `Actual_Return` at date t is only known at t+1.
# Shift by one row first so the volatility stamped at t uses returns realised
# up to and including t and carries no look-ahead information.
volatility = actual_returns.shift(1).rolling(window=5).std()

# Copy the per-ETF series into the wide table, matching on date.
for etf in etfs:
    aggregated_data[f'Predicted_Return_{etf}'] = aggregated_data['Date'].map(predicted_returns[etf])
    aggregated_data[f'Actual_Return_{etf}'] = aggregated_data['Date'].map(actual_returns[etf])
    aggregated_data[f'Volatility_{etf}'] = aggregated_data['Date'].map(volatility[etf])

# Top generic features from Stage 1.
# NOTE: this is a hard-coded copy of a `top_generic_features` result; it is not
# read from Stage 1 at runtime and has to be updated by hand if that changes.
generic_shap_features = [
    'Vol_5', 'SMB_lag_2', 'Mom_3', 'HML_lag_2', 'SMA_5',
    'LagRet_1', 'EMA_12', 'SMB', 'RSI_7', 'CMA_lag_1',
]

# For every generic feature, collect the SHAP columns that end in
# '_<feature>' and reduce them to one daily mean and one daily std
# (taken across the per-ETF rows of a date, then averaged over the columns).
shap_columns = [col for col in stage1_df.columns if col.startswith('SHAP_')]
rows_by_date = stage1_df.groupby('Date')
shap_aggregated_features = {}

for feature in generic_shap_features:
    suffix = f'_{feature}'
    matching_cols = [col for col in shap_columns if col.endswith(suffix)]

    if not matching_cols:
        print(f"Warning: No matches found for SHAP feature: {feature}")
        continue

    per_date = rows_by_date[matching_cols]
    shap_aggregated_features[f'Avg_SHAP_{feature}'] = per_date.mean().mean(axis=1)
    shap_aggregated_features[f'Std_SHAP_{feature}'] = per_date.std().mean(axis=1)

# Turn the date-indexed series into a frame with a 'Date' column ...
shap_aggregated_df = pd.DataFrame(shap_aggregated_features).reset_index()

# ... and attach it to the wide table, keeping every date of the latter.
aggregated_data = aggregated_data.merge(shap_aggregated_df, on='Date', how='left')

# Additional cross-sectional signals for richer Stage 2 observations:
# mean/std of the predicted returns across ETFs and the mean ETF volatility.
# The values are assigned positionally, i.e. the pivot tables are expected to
# list the same dates, in the same order, as `aggregated_data`.
cross_sectional_signals = {
    'CrossSec_Mean_PredRet': predicted_returns.mean(axis=1),
    'CrossSec_Std_PredRet': predicted_returns.std(axis=1),
    'CrossSec_Mean_Volatility': volatility.mean(axis=1),
}
for signal_name, signal in cross_sectional_signals.items():
    aggregated_data[signal_name] = signal.values

# Percentile rank of each ETF's predicted return within its date
ranked_preds = predicted_returns.rank(axis=1, pct=True)
date_column = aggregated_data['Date']
for etf in etfs:
    aggregated_data[f'Rank_PredRet_{etf}'] = date_column.map(ranked_preds[etf])

# Missing values: forward-fill only the SHAP and cross-sectional columns.
shap_and_crosssec_cols = [
    col for col in aggregated_data.columns
    if 'SHAP' in col or 'CrossSec' in col
]
filled = aggregated_data[shap_and_crosssec_cols].ffill()
aggregated_data[shap_and_crosssec_cols] = filled

# The rolling volatility is undefined for the first rows; drop every row in
# which any ETF volatility is missing.
vol_cols = [f'Volatility_{etf}' for etf in etfs]
aggregated_data = aggregated_data.dropna(subset=vol_cols)

# Sanity check: refuse to write an empty dataset.
if aggregated_data.empty:
    raise ValueError("Aggregated dataset is empty after preprocessing. Verify your input data.")

# Quick diagnostics
print("Aggregated DataFrame shape:", aggregated_data.shape)
print("Aggregated DataFrame summary stats:")
print(aggregated_data.describe().transpose())

# Persist the observations consumed by Stage 2
aggregated_data.to_csv("stage2_rl_observations_optimized_10ETFs.csv", index=False)
print("Optimized Stage 2 RL dataset successfully saved.")


In [None]:
# import os
# import json
# import random
# from dataclasses import dataclass
# from typing import Dict, List, Tuple
# 
# import numpy as np
# import pandas as pd
# from sklearn.model_selection import ParameterSampler
# from sklearn.preprocessing import StandardScaler
# from scipy.stats import ttest_1samp
# 
# import torch
# from stable_baselines3 import PPO
# from stable_baselines3.common.env_util import make_vec_env
# import gymnasium as gym
# from gymnasium import spaces
# 
# # ---------------------------------------------------------------------
# # Configuration dataclass
# # ---------------------------------------------------------------------
# @dataclass
# class TrainingConfig:
#     train_window_days: int = 252 * 10      # 10 years for training
#     validation_window_days: int = 126      # ~6 months for validation
#     prediction_window_days: int = 126      # ~6 months for prediction
#     lookback_period: int = 10              # lookback for observations
#     rebalance_period: int = 10             # rebalance every 10 days
#     n_iter_tuning: int = 8                 # number of hyperparameter samples
#     tuning_timesteps: int = 5_000          # timesteps for each tune
#     incremental_timesteps: int = 3_000     # PPO training step size
#     max_timesteps: int = 30_000            # maximum PPO timesteps
#     patience: int = 3                      # early stopping patience
#     policy_arch: Tuple[int, int] = (256, 256)  # network architecture
#     num_iterations: int = 3                # number of outer iterations (seeds)
#     base_seed: int = 42                    # base random seed
#     default_risk_coeff: float = 0.5        # default risk coefficient
# 
# # ---------------------------------------------------------------------
# # Seed-setting utility
# # ---------------------------------------------------------------------
# def set_global_seed(seed: int) -> None:
#     np.random.seed(seed)
#     random.seed(seed)
#     torch.manual_seed(seed)
# 
# # ---------------------------------------------------------------------
# # Feature engineering
# # ---------------------------------------------------------------------
# def add_stable_features(df: pd.DataFrame, etf_list: List[str]) -> pd.DataFrame:
#     data = df.copy()
#     for etf in etf_list:
#         price_col = f'Price_{etf}'
#         data[f'Volatility_{etf}'] = data[price_col].pct_change().rolling(20).std()
#         data[f'Momentum_5d_{etf}'] = data[price_col].pct_change(periods=5)
#         data[f'Momentum_10d_{etf}'] = data[price_col].pct_change(periods=10)
#         data[f'Momentum_20d_{etf}'] = data[price_col].pct_change(periods=20)
#         data[f'MA_5d_{etf}'] = data[price_col].rolling(5).mean()
#         data[f'MA_20d_{etf}'] = data[price_col].rolling(20).mean()
#         data[f'MA_Crossover_{etf}'] = data[f'MA_5d_{etf}'] - data[f'MA_20d_{etf}']
#     data.dropna(inplace=True)
#     return data
# 
# def filter_features(df: pd.DataFrame,
#                     include_predicted_returns: bool = True,
#                     include_shap_metrics: bool = True) -> pd.DataFrame:
#     df_filtered = df.copy()
#     if not include_predicted_returns:
#         pred_cols = [c for c in df_filtered.columns if 'Predicted_Return' in c]
#         df_filtered.drop(columns=pred_cols, inplace=True)
#     if not include_shap_metrics:
#         shap_cols = [c for c in df_filtered.columns if 'SHAP' in c]
#         df_filtered.drop(columns=shap_cols, inplace=True)
#     return df_filtered
# 
# # ---------------------------------------------------------------------
# # Custom Gym environment
# # ---------------------------------------------------------------------
# class PortfolioEnv(gym.Env):
#     metadata = {'render_modes': []}
# 
#     def __init__(self, data: pd.DataFrame, etf_list: List[str],
#                  reward_type: str = 'mean_cvar', risk_coefficient: float = 0.5,
#                  rebalance_period: int = 21, lookback_period: int = 21):
#         super().__init__()
#         self.data = data.reset_index(drop=True)
#         self.etf_list = etf_list
#         self.reward_type = reward_type
#         self.risk_coefficient = risk_coefficient
#         self.rebalance_period = rebalance_period
#         self.lookback_period = lookback_period
#         self.action_space = spaces.Box(low=-1.0, high=1.0,
#                                        shape=(len(etf_list),), dtype=np.float32)
#         self.feature_cols = [c for c in data.columns
#                              if c != 'Date' and not c.startswith('Actual_Return')]
#         self.num_features_per_day = len(self.feature_cols)
#         self.observation_space = spaces.Box(
#             low=-np.inf, high=np.inf,
#             shape=(self.num_features_per_day * lookback_period,),
#             dtype=np.float32
#         )
#         self.current_step = lookback_period
#         self.cumulative_wealth = 1.0
#         self.current_weights = np.array([1.0 / len(etf_list)] * len(etf_list))
# 
#     def reset(self, seed=None, options=None):
#         super().reset(seed=seed)
#         if seed is not None:
#             np.random.seed(seed)
#         self.current_step = self.lookback_period
#         self.cumulative_wealth = 1.0
#         self.current_weights = np.array([1.0 / len(self.etf_list)] * len(self.etf_list))
#         return self._get_obs(), {}
# 
#     def _get_obs(self):
#         obs_window = self.data.iloc[self.current_step - self.lookback_period : self.current_step]
#         return obs_window[self.feature_cols].values.flatten().astype(np.float32)
# 
#     def calculate_reward(self, portfolio_return, asset_returns):
#         if self.reward_type == 'cumulative_return':
#             return self.cumulative_wealth - 1.0
#         elif self.reward_type == 'log_wealth':
#             return np.log(self.cumulative_wealth)
#         elif self.reward_type == 'mean_var':
#             return portfolio_return - self.risk_coefficient * np.var(asset_returns)
#         elif self.reward_type == 'mean_cvar':
#             alpha = 0.05
#             var = np.percentile(asset_returns, 100 * alpha)
#             cvar = np.mean(asset_returns[asset_returns <= var])
#             return portfolio_return - self.risk_coefficient * cvar
#         else:
#             raise ValueError(f"Invalid reward type: {self.reward_type}")
# 
#     def step(self, action):
#         next_step = self.current_step + 1
#         # Rebalance on schedule
#         if self.current_step % self.rebalance_period == 0:
#             desired_long, desired_short = 1.20, 0.20
#             clip_bounds = (-0.2, 0.8)
#             raw = action.copy()
#             long_w = np.maximum(raw, 0)
#             short_w = np.abs(np.minimum(raw, 0))
#             has_longs, has_shorts = (np.sum(long_w) > 0), (np.sum(short_w) > 0)
#             if has_longs and has_shorts:
#                 norm_long = desired_long * long_w / np.sum(long_w)
#                 norm_short = desired_short * short_w / np.sum(short_w)
#             elif has_longs and not has_shorts:
#                 norm_long = long_w / np.sum(long_w); norm_short = np.zeros_like(short_w)
#             elif not has_longs and has_shorts:
#                 n = len(raw)
#                 norm_long = np.ones(n) / n; norm_short = np.zeros(n)
#             else:
#                 n = len(raw)
#                 norm_long = np.ones(n) / n; norm_short = np.zeros(n)
#             combined = norm_long - norm_short
#             clipped = np.clip(combined, clip_bounds[0], clip_bounds[1])
#             long_c = np.maximum(clipped, 0.0)
#             short_c = np.abs(np.minimum(clipped, 0.0))
#             has_long_c, has_short_c = (np.sum(long_c) > 0), (np.sum(short_c) > 0)
#             if has_long_c and has_short_c:
#                 final_long = desired_long * long_c / np.sum(long_c)
#                 final_short = desired_short * short_c / np.sum(short_c)
#             elif has_long_c and not has_short_c:
#                 final_long = long_c / np.sum(long_c); final_short = np.zeros_like(short_c)
#             else:
#                 n = len(raw)
#                 final_long = np.ones(n) / n; final_short = np.zeros(n)
#             self.current_weights = final_long - final_short
#         else:
#             # Passive reweighting between rebalances
#             returns_today = np.array([self.data.loc[self.current_step, f'Actual_Return_{etf}'] for etf in self.etf_list])
#             self.current_weights *= (1 + returns_today)
#             self.current_weights /= np.sum(self.current_weights)
# 
#         if next_step >= len(self.data):
#             terminated = True; reward = 0.0
#         else:
#             asset_returns = np.array([
#                 self.data.loc[next_step, f'Actual_Return_{etf}']
#                 for etf in self.etf_list
#             ])
#             portfolio_return = np.dot(self.current_weights, asset_returns)
#             self.cumulative_wealth *= (1 + portfolio_return)
#             reward = self.calculate_reward(portfolio_return, asset_returns)
#             terminated = next_step >= len(self.data) - 1
#         self.current_step += 1
#         return self._get_obs(), reward, terminated, False, {}
# 
# # ---------------------------------------------------------------------
# # Hyperparameter tuning function
# # ---------------------------------------------------------------------
# def validate_and_tune(train_data: pd.DataFrame, val_data: pd.DataFrame,
#                       etf_list: List[str], cfg: TrainingConfig) -> Dict[str, float]:
#     param_dist = {
#         'learning_rate': [3e-4, 1e-4],
#         'n_steps': [20, 40],
#         'batch_size': [10, 20],
#         'gamma': [0.95, 0.98],
#         'risk_coefficient': [0.1, 0.5, 1.0],
#         'seed': [cfg.base_seed, cfg.base_seed + 17, cfg.base_seed + 42],
#     }
#     sampled_params = list(ParameterSampler(param_dist, n_iter=cfg.n_iter_tuning,
#                                            random_state=cfg.base_seed))
#     best_reward = -np.inf; best_params = None
#     for params in sampled_params:
#         seed = params.pop('seed')
#         risk_coeff = params.pop('risk_coefficient', cfg.default_risk_coeff)
#         set_global_seed(seed)
#         env = make_vec_env(lambda: PortfolioEnv(train_data, etf_list,
#                                                 'mean_cvar', risk_coeff,
#                                                 cfg.rebalance_period,
#                                                 cfg.lookback_period),
#                            n_envs=1, seed=seed)
#         model = PPO('MlpPolicy', env, ent_coef=0.01, clip_range=0.2,
#                     seed=seed, **params, verbose=0)
#         model.learn(total_timesteps=cfg.tuning_timesteps)
#         # Evaluate on validation
#         val_env = PortfolioEnv(val_data, etf_list, 'mean_cvar', risk_coeff,
#                                cfg.rebalance_period, cfg.lookback_period)
#         obs, _ = val_env.reset(seed=seed)
#         done = False; total_reward = 0.0
#         while not done:
#             action, _ = model.predict(obs, deterministic=True)
#             obs, reward, done, _, _ = val_env.step(action)
#             total_reward += reward
#         if total_reward > best_reward:
#             best_reward = total_reward
#             best_params = params.copy()
#             best_params['risk_coefficient'] = risk_coeff
#             best_params['seed'] = seed
#     return best_params
# 
# # ---------------------------------------------------------------------
# # Training and prediction function
# # ---------------------------------------------------------------------
# def train_and_predict(train_df: pd.DataFrame, val_df: pd.DataFrame,
#                       pred_df: pd.DataFrame, etf_list: List[str],
#                       cfg: TrainingConfig, best_params: Dict[str, float],
#                       model_path: str) -> Tuple[List[List[float]], List[pd.Timestamp]]:
#     risk_coeff = best_params.pop('risk_coefficient')
#     seed = best_params.pop('seed')
#     set_global_seed(seed)
#     env_train = make_vec_env(lambda: PortfolioEnv(train_df, etf_list,
#                                                   'mean_cvar', risk_coeff,
#                                                   cfg.rebalance_period,
#                                                   cfg.lookback_period),
#                              n_envs=1, seed=seed)
#     policy_kwargs = dict(net_arch=list(cfg.policy_arch))
#     model = PPO('MlpPolicy', env_train,
#                 policy_kwargs=policy_kwargs,
#                 ent_coef=0.01,
#                 clip_range=0.2,
#                 seed=seed,
#                 **best_params,
#                 verbose=0)
#     best_val_reward = -np.inf; no_improve = 0
#     # Early stopping loop
#     for step in range(0, cfg.max_timesteps, cfg.incremental_timesteps):
#         model.learn(total_timesteps=cfg.incremental_timesteps)
#         # Evaluate on validation set
#         val_env = PortfolioEnv(val_df, etf_list, 'mean_cvar', risk_coeff,
#                                cfg.rebalance_period, cfg.lookback_period)
#         obs, _ = val_env.reset(seed=seed)
#         done = False; val_reward = 0.0
#         while not done:
#             action, _ = model.predict(obs, deterministic=True)
#             obs, reward, done, _, _ = val_env.step(action)
#             val_reward += reward
#         if val_reward > best_val_reward:
#             best_val_reward = val_reward
#             no_improve = 0
#             model.save(model_path)
#         else:
#             no_improve += 1
#             if no_improve >= cfg.patience:
#                 break
#     # Load best model and predict on pred_df
#     best_model = PPO.load(model_path)
#     env_pred = PortfolioEnv(pred_df, etf_list, 'mean_cvar', risk_coeff,
#                             cfg.rebalance_period, cfg.lookback_period)
#     obs, _ = env_pred.reset()
#     done = False
#     weights_list: List[List[float]] = []
#     dates_list: List[pd.Timestamp] = []
#     action = np.zeros(len(etf_list), dtype=np.float32)
#     while not done:
#         if env_pred.current_step >= cfg.lookback_period and (
#             env_pred.current_step % cfg.rebalance_period == 0
#         ):
#             action, _ = best_model.predict(obs, deterministic=True)
#             # Normalization logic (same as env)
#             desired_long, desired_short = 1.20, 0.20
#             clip_bounds = (-0.2, 0.8)
#             raw = action.copy()
#             long_w = np.maximum(raw, 0.0)
#             short_w = np.abs(np.minimum(raw, 0.0))
#             has_longs, has_shorts = np.sum(long_w) > 0, np.sum(short_w) > 0
#             if has_longs and has_shorts:
#                 norm_long = desired_long * long_w / np.sum(long_w)
#                 norm_short = desired_short * short_w / np.sum(short_w)
#             elif has_longs and not has_shorts:
#                 norm_long = long_w / np.sum(long_w); norm_short = np.zeros_like(short_w)
#             elif not has_longs and has_shorts:
#                 n = len(raw); norm_long = np.ones(n)/n; norm_short = np.zeros(n)
#             else:
#                 n = len(raw); norm_long = np.ones(n)/n; norm_short = np.zeros(n)
#             combined = norm_long - norm_short
#             clipped = np.clip(combined, clip_bounds[0], clip_bounds[1])
#             long_c = np.maximum(clipped, 0.0); short_c = np.abs(np.minimum(clipped, 0.0))
#             has_long_c, has_short_c = np.sum(long_c) > 0, np.sum(short_c) > 0
#             if has_long_c and has_short_c:
#                 final_long = desired_long * long_c / np.sum(long_c)
#                 final_short = desired_short * short_c / np.sum(short_c)
#             elif has_long_c and not has_short_c:
#                 final_long = long_c / np.sum(long_c); final_short = np.zeros_like(short_c)
#             else:
#                 n = len(raw); final_long = np.ones(n)/n; final_short = np.zeros(n)
#             final_w = final_long - final_short
#             weights_list.append(final_w.tolist())
#             dates_list.append(env_pred.data.loc[env_pred.current_step, 'Date'])
#         obs, _, done, _, _ = env_pred.step(action)
#     return weights_list, dates_list
# 
# # ---------------------------------------------------------------------
# # Data loading and overall training loop
# # ---------------------------------------------------------------------
# cfg = TrainingConfig()
# 
# # Load your prepared Stage‑2 dataset and price data
# data = pd.read_csv('stage2_rl_observations_optimized_10ETFs.csv', parse_dates=['Date'])
# price_data = pd.read_csv('stock_prices_10ETFs.csv')
# price_data['Date'] = pd.to_datetime(price_data['Date'], utc=True).dt.tz_localize(None)
# price_cols = {col: f'Price_{col}' for col in price_data.columns if col != 'Date'}
# price_data.rename(columns=price_cols, inplace=True)
# 
# merged_data = pd.merge(data, price_data, on='Date', how='inner').reset_index(drop=True)
# if len(merged_data) != len(data):
#     print("Warning: data length mismatch after merge.")
# 
# etf_list = ['XLB','XLE','XLF','XLI','XLK','XLP','XLY','XLV','XLU']
# feature_data = add_stable_features(merged_data, etf_list)
# feature_data = filter_features(feature_data, include_predicted_returns=True, include_shap_metrics=True)
# 
# # Rolling windows
# total_len = len(feature_data)
# start_indices = range(0,
#                       total_len - (cfg.train_window_days + cfg.validation_window_days + cfg.prediction_window_days),
#                       cfg.prediction_window_days)
# 
# # Prepare directory for outputs
# output_dir = 'stage2_iterations'
# os.makedirs(output_dir, exist_ok=True)
# 
# # Collect metrics for all iterations
# summary_records = []
# 
# for iter_num in range(cfg.num_iterations):
#     iter_seed = cfg.base_seed + iter_num
#     iter_dir = os.path.join(output_dir, f'iteration_{iter_num:02d}')
#     os.makedirs(iter_dir, exist_ok=True)
# 
#     previous_model_path = None  # Path for incremental retraining within iterations
#     iter_returns = []
#     print(f"\nStarting iteration {iter_num+1}/{cfg.num_iterations} (Seed: {iter_seed}) at {time.strftime('%Y-%m-%d %H:%M:%S')}")
#     for idx, start_idx in enumerate(start_indices):
#         window_start_time = time.time()
#         print(f"  - Starting window {idx+1}/{len(start_indices)} at {time.strftime('%Y-%m-%d %H:%M:%S')}")
# 
#         train_start = start_idx
#         train_end = train_start + cfg.train_window_days
#         val_start = train_end
#         val_end = val_start + cfg.validation_window_days
#         pred_start = val_end
#         pred_end = pred_start + cfg.prediction_window_days
# 
#         train_df = feature_data.iloc[train_start:train_end].reset_index(drop=True)
#         val_df   = feature_data.iloc[val_start:val_end].reset_index(drop=True)
#         pred_df  = feature_data.iloc[pred_start:pred_end].reset_index(drop=True)
# 
#         feature_cols = [c for c in train_df.columns if c != 'Date' and not c.startswith('Actual_Return')]
#         scaler = StandardScaler()
#         scaler.fit(train_df[feature_cols])
# 
#         train_scaled = train_df.copy()
#         train_scaled[feature_cols] = scaler.transform(train_df[feature_cols])
#         val_scaled = val_df.copy()
#         val_scaled[feature_cols] = scaler.transform(val_df[feature_cols])
#         pred_scaled = pred_df.copy()
#         pred_scaled[feature_cols] = scaler.transform(pred_df[feature_cols])
# 
#         # Hyperparameter tuning for first window
#         if idx == 0:
#             best_params = validate_and_tune(train_scaled, val_scaled, etf_list, cfg)
#         else:
#             best_params = best_params.copy()
# 
#         window_dir = os.path.join(iter_dir, f'window_{idx:02d}')
#         os.makedirs(window_dir, exist_ok=True)
#         model_path = os.path.join(window_dir, 'best_ppo.zip')
# 
#         # Incremental Retraining Logic
#         risk_coeff = best_params.pop('risk_coefficient', cfg.default_risk_coeff)
#         seed = best_params.pop('seed', iter_seed)
#         set_global_seed(seed)
# 
#         env_train = make_vec_env(lambda: PortfolioEnv(train_scaled, etf_list, 'mean_cvar', risk_coeff, cfg.rebalance_period, cfg.lookback_period), n_envs=1, seed=seed)
#         policy_kwargs = dict(net_arch=list(cfg.policy_arch))
# 
#         if previous_model_path and os.path.exists(previous_model_path):
#             model = PPO.load(previous_model_path, env=env_train)
#             model.set_env(env_train)
#         else:
#             model = PPO('MlpPolicy', env_train, policy_kwargs=policy_kwargs, ent_coef=0.01, clip_range=0.2, seed=seed, **best_params, verbose=0)
# 
#         best_val_reward = -np.inf
#         no_improve = 0
#         training_log = []
# 
#         for step in range(0, cfg.max_timesteps, cfg.incremental_timesteps):
#             model.learn(total_timesteps=cfg.incremental_timesteps)
# 
#             val_env = PortfolioEnv(val_scaled, etf_list, 'mean_cvar', risk_coeff, cfg.rebalance_period, cfg.lookback_period)
#             obs, _ = val_env.reset(seed=seed)
#             done, val_reward = False, 0.0
# 
#             while not done:
#                 action, _ = model.predict(obs, deterministic=True)
#                 obs, reward, done, _, _ = val_env.step(action)
#                 val_reward += reward
# 
#             training_log.append({'training_step': step + cfg.incremental_timesteps, 'validation_reward': val_reward})
# 
#             if val_reward > best_val_reward:
#                 best_val_reward = val_reward
#                 no_improve = 0
#                 model.save(model_path)
#             else:
#                 no_improve += 1
#                 if no_improve >= cfg.patience:
#                     break
# 
#         # Save training logs to CSV
#         pd.DataFrame(training_log).to_csv(os.path.join(window_dir, 'training_validation_log.csv'), index=False)
# 
#         previous_model_path = model_path
# 
#         # Prediction step:
#         best_model = PPO.load(model_path)
#         env_pred = PortfolioEnv(pred_scaled, etf_list, 'mean_cvar', risk_coeff, cfg.rebalance_period, cfg.lookback_period)
#         obs, _ = env_pred.reset()
#         done = False
#         weights_list, dates_list = [], []
# 
#         action = np.zeros(len(etf_list), dtype=np.float32)
#         while not done:
#             if env_pred.current_step >= cfg.lookback_period and (
#                 env_pred.current_step % cfg.rebalance_period == 0
#             ):
#                 action, _ = best_model.predict(obs, deterministic=True)
# 
#                 desired_long, desired_short = 1.20, 0.20
#                 clip_bounds = (-0.2, 0.8)
#                 raw = action.copy()
#                 long_w = np.maximum(raw, 0.0)
#                 short_w = np.abs(np.minimum(raw, 0.0))
#                 has_longs, has_shorts = np.sum(long_w) > 0, np.sum(short_w) > 0
# 
#                 if has_longs and has_shorts:
#                     norm_long = desired_long * long_w / np.sum(long_w)
#                     norm_short = desired_short * short_w / np.sum(short_w)
#                 elif has_longs and not has_shorts:
#                     norm_long = long_w / np.sum(long_w); norm_short = np.zeros_like(short_w)
#                 elif not has_longs and has_shorts:
#                     n = len(raw); norm_long = np.ones(n)/n; norm_short = np.zeros(n)
#                 else:
#                     n = len(raw); norm_long = np.ones(n)/n; norm_short = np.zeros(n)
# 
#                 combined = norm_long - norm_short
#                 clipped = np.clip(combined, clip_bounds[0], clip_bounds[1])
# 
#                 long_c = np.maximum(clipped, 0.0); short_c = np.abs(np.minimum(clipped, 0.0))
#                 has_long_c, has_short_c = np.sum(long_c) > 0, np.sum(short_c) > 0
#                 if has_long_c and has_short_c:
#                     final_long = desired_long * long_c / np.sum(long_c)
#                     final_short = desired_short * short_c / np.sum(short_c)
#                 elif has_long_c and not has_short_c:
#                     final_long = long_c / np.sum(long_c); final_short = np.zeros_like(short_c)
#                 else:
#                     n = len(raw); final_long = np.ones(n)/n; final_short = np.zeros(n)
# 
#                 final_w = final_long - final_short
#                 weights_list.append(final_w.tolist())
#                 dates_list.append(env_pred.data.loc[env_pred.current_step, 'Date'])
# 
#             obs, _, done, _, _ = env_pred.step(action)
# 
#         # Save predicted weights to CSV
#         weights_df = pd.DataFrame(weights_list, columns=etf_list)
#         weights_df.insert(0, 'Date', dates_list)
#         weights_df.to_csv(os.path.join(window_dir, 'weights.csv'), index=False)
# 
#         # Compute cumulative return
#         cum_wealth = 1.0
#         returns_log = []
# 
#         for t, w in zip(dates_list, weights_list):
#             step_idx = pred_scaled[pred_scaled['Date'] == t].index[0]
#             asset_returns = np.array([
#                 pred_scaled.loc[step_idx + 1, f'Actual_Return_{etf}']
#                 for etf in etf_list
#             ])
#             port_ret = np.dot(w, asset_returns)
#             cum_wealth *= (1 + port_ret)
#             returns_log.append({'Date': t, 'Portfolio_Return': port_ret, 'Cumulative_Wealth': cum_wealth})
# 
#         iter_returns.append(cum_wealth - 1.0)
# 
#         # Save returns log to CSV
#         pd.DataFrame(returns_log).to_csv(os.path.join(window_dir, 'returns_log.csv'), index=False)
#         window_end_time = time.time()
#         elapsed_window_time = window_end_time - window_start_time
#         print(f"  - Completed window {idx+1}/{len(start_indices)} in {elapsed_window_time/60:.2f} minutes.")
# 
# 
#     mean_ret = np.mean(iter_returns)
#     std_ret  = np.std(iter_returns, ddof=1)
#     sharpe   = (mean_ret / std_ret) * np.sqrt(len(iter_returns)) if std_ret != 0 else np.nan
#     summary_records.append({'iteration': iter_num, 'seed': iter_seed,
#                             'mean_return': mean_ret, 'sharpe': sharpe})
# 
# # Save overall summary and significance results:
# summary_df = pd.DataFrame(summary_records)
# summary_df.to_csv(os.path.join(output_dir, 'iterations_summary.csv'), index=False)
# 
# t_stat, p_val = ttest_1samp(summary_df['mean_return'], 0.0)
# with open(os.path.join(output_dir, 't_test_result.csv'), 'w') as f:
#     f.write(f"t-statistic,{t_stat}\np-value,{p_val}\n")
# 
# print(summary_df)
# print(f"Overall t-statistic={t_stat:.3f}, p-value={p_val:.3f}")
# 


In [28]:
# fine tune to use change‑based actions
import os
import json
import random
from dataclasses import dataclass
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterSampler
from sklearn.preprocessing import StandardScaler
from scipy.stats import ttest_1samp

import torch
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
import gymnasium as gym
import time
from gymnasium import spaces
import gc
# ---------------------------------------------------------------------
# Configuration dataclass
# ---------------------------------------------------------------------
@dataclass
class TrainingConfig:
    """Default settings for the rolling-window PPO training/prediction run.

    Window lengths are expressed in rows of the feature table. The "years"
    quoted below assume roughly 252 trading rows per year - TODO confirm
    against the input data frequency.
    """
    train_window_days: int = 252 * 7      # 1764 rows (~7 years at 252 rows/year) for training
    validation_window_days: int = 252      # 252 rows (~1 year) for validation
    prediction_window_days: int = 252      # 252 rows (~1 year) for prediction
    lookback_period: int = 21              # rows of history flattened into each observation
    rebalance_period: int = 21             # rebalance when current_step % 21 == 0
    n_iter_tuning: int = 20                # number of hyperparameter samples
    tuning_timesteps: int = 10_000          # PPO timesteps spent on each sampled configuration
    incremental_timesteps: int = 10_000     # PPO timesteps between validation checks
    max_timesteps: int = 50_000            # upper bound on PPO timesteps per window
    patience: int = 3                      # non-improving validation checks before stopping early
    policy_arch: Tuple[int, int] = (256, 256)  # hidden layer sizes passed as net_arch
    num_iterations: int = 10                # number of outer iterations (seed = base_seed + iteration)
    base_seed: int = 42                    # base random seed
    default_risk_coeff: float = 0.5        # fallback when tuning returns no risk_coefficient
    desired_long: float = 1.0       # target gross long exposure (1.0 = fully invested, no leverage)
    desired_short: float = 0.0      # target gross short exposure (0.0 = no short selling)
    weight_bounds: Tuple[float, float] = (0.0, 1.0)  # per-asset clip bounds; [0, 1] forbids shorts
    lambda_hhi: float = 0.1                # reward penalty on weight concentration (sum of squared weights)
    lambda_turnover: float = 0.005         # reward penalty per unit of turnover
    transaction_cost_rate: float = 0.0     # cost per unit of turnover subtracted from the reward
    model_retrain: bool  = False           # False: warm-start each window from the previous window's saved model

# ---------------------------------------------------------------------
# Seed-setting utility
# ---------------------------------------------------------------------
def set_global_seed(seed: int) -> None:
    """Seed NumPy's global RNG, Python's ``random`` module and PyTorch."""
    for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
        seed_fn(seed)

# ---------------------------------------------------------------------
# Feature engineering
# ---------------------------------------------------------------------
def add_stable_features(df: pd.DataFrame, etf_list: List[str]) -> pd.DataFrame:
    """Return a copy of ``df`` enriched with per-ETF price-derived features.

    For every ticker in ``etf_list`` the column ``Price_<ticker>`` is used to
    derive a 20-row rolling volatility of simple returns, 5/10/20-row
    momentum, 5- and 20-row moving averages and their difference. Rows that
    contain a NaN in any column are dropped from the result; the input frame
    is left untouched and the surviving rows keep their original index labels.
    """
    enriched = df.copy()
    for ticker in etf_list:
        prices = enriched[f'Price_{ticker}']

        enriched[f'Volatility_{ticker}'] = prices.pct_change().rolling(20).std()

        for horizon in (5, 10, 20):
            enriched[f'Momentum_{horizon}d_{ticker}'] = prices.pct_change(periods=horizon)

        fast_ma = prices.rolling(5).mean()
        slow_ma = prices.rolling(20).mean()
        enriched[f'MA_5d_{ticker}'] = fast_ma
        enriched[f'MA_20d_{ticker}'] = slow_ma
        enriched[f'MA_Crossover_{ticker}'] = fast_ma - slow_ma

    return enriched.dropna()

def filter_features(df: pd.DataFrame,
                    include_predicted_returns: bool = True,
                    include_shap_metrics: bool = True) -> pd.DataFrame:
    """Return a copy of ``df`` without the feature groups that are switched off.

    Columns whose name contains ``'Predicted_Return'`` are removed when
    ``include_predicted_returns`` is False, and columns whose name contains
    ``'SHAP'`` are removed when ``include_shap_metrics`` is False. The input
    frame is never modified.
    """
    unwanted_markers = []
    if not include_predicted_returns:
        unwanted_markers.append('Predicted_Return')
    if not include_shap_metrics:
        unwanted_markers.append('SHAP')

    # Boolean mask keeps column order and copes with duplicated labels.
    keep_mask = [
        not any(marker in column for marker in unwanted_markers)
        for column in df.columns
    ]
    return df.loc[:, keep_mask].copy()

# ---------------------------------------------------------------------
# Custom Gym environment
# ---------------------------------------------------------------------
class PortfolioEnv(gym.Env):
    """Gymnasium environment that allocates a portfolio across ``etf_list``.

    Each step advances one row of ``data``. On rows where
    ``current_step % rebalance_period == 0`` the action is turned into new
    portfolio weights; on all other rows the weights drift with the realised
    asset returns. The observation is the last ``lookback_period`` rows of
    every feature column (everything except ``Date`` and ``Actual_Return*``),
    flattened to a 1-D float32 vector.

    Parameters
    ----------
    data : DataFrame with a ``Date`` column, one ``Actual_Return_<etf>``
        column per ETF, and the feature columns.
    etf_list : tickers, in the order used for weights and actions.
    reward_type : one of ``'cumulative_return'``, ``'log_wealth'``,
        ``'mean_var'``, ``'mean_cvar'``.
    risk_coefficient : multiplier on the risk term of the reward.
    rebalance_period, lookback_period : see above.
    weight_bounds : ``(low, high)`` clip applied to each weight.
    desired_long, desired_short : gross long / short exposure targets used
        when a weight vector has both long and short positions.
    use_baseline, baseline_fn : when both are set, the action scales the
        weights returned by ``baseline_fn(date)`` (``w * (1 + action)``);
        otherwise the action is added to the current weights.
    transaction_cost_rate : cost per unit turnover, subtracted from reward.
    lambda_turnover, lambda_hhi : reward penalties on turnover and on the
        sum of squared weights.
    """

    metadata = {'render_modes': []}

    def __init__(self, data, etf_list, reward_type='mean_cvar',
                 risk_coefficient=0.5, rebalance_period=21,
                 lookback_period=21, weight_bounds=(0.0, 1.0),
                 desired_long=1.0, desired_short=0.0,
                 use_baseline=False, baseline_fn=None,
                 transaction_cost_rate=0.0,
                 lambda_turnover=0.001,
                 lambda_hhi=0.1):
        super().__init__()
        self.data = data.reset_index(drop=True)
        self.etf_list = etf_list
        self.reward_type = reward_type
        self.risk_coefficient = risk_coefficient
        self.rebalance_period = rebalance_period
        self.lookback_period = lookback_period
        self.weight_bounds = weight_bounds
        self.desired_long = desired_long
        self.desired_short = desired_short
        self.use_baseline = use_baseline
        self.baseline_fn = baseline_fn
        self.transaction_cost_rate = transaction_cost_rate
        self.lambda_turnover = lambda_turnover
        self.lambda_hhi = lambda_hhi

        self.action_space = spaces.Box(low=-1.0, high=1.0,
                                       shape=(len(etf_list),), dtype=np.float32)
        self.feature_cols = [c for c in data.columns
                             if c != 'Date' and not c.startswith('Actual_Return')]
        self.num_features_per_day = len(self.feature_cols)
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf,
            shape=(self.num_features_per_day * lookback_period,),
            dtype=np.float32
        )

        self.current_step = lookback_period
        self.cumulative_wealth = 1.0
        self.current_weights = self._equal_weights()

    def _equal_weights(self):
        """Return a fresh equal-weight vector over ``etf_list``."""
        n_assets = len(self.etf_list)
        return np.array([1.0 / n_assets] * n_assets)

    def _asset_returns_at(self, step):
        """Return the ``Actual_Return_<etf>`` values of row ``step`` as an array."""
        return np.array([
            self.data.loc[step, f"Actual_Return_{etf}"]
            for etf in self.etf_list
        ])

    def _scale_long_short(self, weights):
        """Rescale ``weights`` to the configured long/short exposure.

        With both long and short positions the long side is scaled to
        ``desired_long`` and the short side to ``desired_short``. A long-only
        vector is normalised to sum to 1, a short-only vector to sum to -1,
        and an all-zero (or NaN) vector falls back to equal weights.
        """
        long_part = np.maximum(weights, 0.0)
        short_part = np.abs(np.minimum(weights, 0.0))
        long_total = long_part.sum()
        short_total = short_part.sum()
        has_long = long_total > 0
        has_short = short_total > 0

        if has_long and has_short:
            return (self.desired_long * long_part / long_total
                    - self.desired_short * short_part / short_total)
        if has_long:
            return long_part / long_total
        if has_short:
            return -short_part / short_total
        return np.ones(len(weights)) / len(weights)

    def reset(self, seed=None, options=None):
        """Restart the episode at ``lookback_period`` with equal weights."""
        super().reset(seed=seed)
        if seed is not None:
            # Also seeds NumPy's global RNG (kept for reproducibility of callers
            # that rely on it).
            np.random.seed(seed)
        self.current_step = self.lookback_period
        self.cumulative_wealth = 1.0
        self.current_weights = self._equal_weights()
        return self._get_obs(), {}

    def _get_obs(self):
        """Return the flattened feature window ending just before ``current_step``."""
        obs_window = self.data.iloc[self.current_step - self.lookback_period : self.current_step]
        obs_values = obs_window[self.feature_cols].values.flatten().astype(np.float32)

        # Non-finite features are replaced by 0 so the policy never sees NaN/inf.
        if np.isnan(obs_values).any() or np.isinf(obs_values).any():
            obs_values = np.nan_to_num(obs_values, nan=0.0, posinf=0.0, neginf=0.0)
        return obs_values

    def calculate_reward(self, portfolio_return, asset_returns, turnover):
        """Return the step reward: base reward minus turnover and HHI penalties.

        ``portfolio_return`` is sanitised (non-finite -> 0) and clipped to
        [-0.5, 0.5] before use. A non-finite final reward is replaced by -1.0.

        Raises
        ------
        ValueError
            If ``reward_type`` is not one of the supported names.
        """
        hhi = np.sum(np.square(np.abs(self.current_weights)))

        if np.isnan(portfolio_return) or np.isinf(portfolio_return):
            portfolio_return = 0.0

        portfolio_return = np.clip(portfolio_return, -0.5, 0.5)

        if self.reward_type == 'cumulative_return':
            base_reward = portfolio_return
        elif self.reward_type == 'log_wealth':
            base_reward = np.log(max(1 + portfolio_return, 1e-8))
        elif self.reward_type == 'mean_var':
            base_reward = portfolio_return - self.risk_coefficient * np.var(asset_returns)
        elif self.reward_type == 'mean_cvar':
            # NOTE(review): the tail statistic is taken over the cross-section
            # of this step's asset returns, and because the tail mean is
            # usually negative, subtracting it *raises* the reward when the
            # worst assets fall further. Confirm this is the intended sign.
            alpha = 0.05
            var = np.percentile(asset_returns, 100 * alpha)
            cvar = np.mean(asset_returns[asset_returns <= var])
            base_reward = portfolio_return - self.risk_coefficient * cvar
        else:
            raise ValueError(f"Invalid reward type: {self.reward_type}")

        reward = base_reward \
                 - self.lambda_turnover * turnover \
                 - self.lambda_hhi * hhi

        if np.isnan(reward) or np.isinf(reward):
            reward = -1.0

        return reward

    def step(self, action):
        """Apply ``action`` (on rebalance rows) and advance one row.

        Returns ``(observation, reward, terminated, truncated, info)`` with
        ``truncated`` always False and ``info`` always empty. The reward is
        based on the asset returns of the *next* row; the episode terminates
        once that row is the last one in ``data``.
        """
        next_step = self.current_step + 1
        prev_weights = self.current_weights.copy()

        if self.current_step % self.rebalance_period == 0:
            if self.use_baseline and self.baseline_fn is not None:
                current_date = self.data.loc[self.current_step, 'Date']
                raw = self.baseline_fn(current_date) * (1.0 + action)
            else:
                raw = self.current_weights + action

            # Normalise, clip to the per-asset bounds, then normalise again
            # because clipping can break the exposure targets.
            scaled = self._scale_long_short(raw)
            clipped = np.clip(scaled, self.weight_bounds[0], self.weight_bounds[1])
            self.current_weights = self._scale_long_short(clipped)

            turnover = np.sum(np.abs(self.current_weights - prev_weights))
        else:
            # Passive drift between rebalances. Non-finite returns are treated
            # as 0 so a single bad row cannot turn every later weight into NaN.
            returns_today = np.nan_to_num(
                self._asset_returns_at(self.current_step),
                nan=0.0, posinf=0.0, neginf=0.0
            )
            drifted = self.current_weights * (1.0 + returns_today)
            gross = np.sum(np.abs(drifted))
            # NOTE(review): renormalising to gross exposure 1 changes leverage
            # when desired_long + desired_short != 1.
            if gross > 0:
                self.current_weights = drifted / gross
            # gross == 0 would be a 0/0 division; keep the previous weights.
            turnover = 0.0

        if next_step >= len(self.data):
            # No next row to earn a return on.
            reward = 0.0
            terminated = True
        else:
            asset_returns = self._asset_returns_at(next_step)
            portfolio_return = np.dot(self.current_weights, asset_returns)
            portfolio_return = np.nan_to_num(portfolio_return, nan=0.0, posinf=0.0, neginf=0.0)

            self.cumulative_wealth *= (1.0 + portfolio_return)
            reward = self.calculate_reward(portfolio_return, asset_returns, turnover)
            reward -= self.transaction_cost_rate * turnover
            terminated = next_step >= len(self.data) - 1

        self.current_step += 1
        return self._get_obs(), reward, terminated, False, {}

# ---------------------------------------------------------------------
# Hyperparameter tuning function
# ---------------------------------------------------------------------
def equal_weight_baseline(date):
    """Return equal weights over the module-level ``etf_list``; ``date`` is ignored."""
    n_assets = len(etf_list)
    return np.full(n_assets, 1.0) / n_assets

def validate_and_tune(train_data: pd.DataFrame, val_data: pd.DataFrame,
                      etf_list: List[str], cfg: TrainingConfig,
                      random_seed: int) -> Dict[str, float]:
    """Randomly search PPO/environment hyperparameters and return the best set.

    ``cfg.n_iter_tuning`` configurations are sampled (reproducibly, from
    ``random_seed``). Each one trains a fresh PPO agent on ``train_data`` for
    ``cfg.tuning_timesteps`` and is scored by the total reward of one
    deterministic episode on ``val_data``.

    Returns
    -------
    dict
        The PPO keyword arguments of the best configuration plus the keys
        ``'risk_coefficient'``, ``'lambda_turnover'``, ``'lambda_hhi'`` and
        ``'seed'``. ``None`` if no configuration produced a reward above
        ``-inf`` (e.g. ``cfg.n_iter_tuning == 0``).
    """
    param_dist = {
        'learning_rate': [5e-4, 1e-5, 5e-5],
        'n_steps': [20, 40],
        'batch_size': [10, 20],
        'gamma': [0.95, 0.98],
        'risk_coefficient': [0.1, 0.5, 1.0, 5.0],
        'lambda_turnover': [0.005, 0.01, 0.05],
        'lambda_hhi': [0.5, 1, 5],
        'seed': [random_seed, random_seed + 11, random_seed + 23]
    }

    # random_state makes the sampled configurations reproducible per seed.
    sampled_params = list(ParameterSampler(
        param_dist, n_iter=cfg.n_iter_tuning, random_state=random_seed
    ))

    best_reward = -np.inf
    best_params = None

    for params in sampled_params:
        # Split environment/seed settings from the kwargs forwarded to PPO.
        seed = params.pop('seed')
        risk_coeff = params.pop('risk_coefficient', cfg.default_risk_coeff)
        lambda_turnover = params.pop('lambda_turnover', cfg.lambda_turnover)
        lambda_hhi = params.pop('lambda_hhi', cfg.lambda_hhi)

        set_global_seed(seed)

        # Use the same exposure constraints as the training loop so the
        # hyperparameters are selected under the conditions they will run in.
        env_kwargs = dict(
            use_baseline=True, baseline_fn=equal_weight_baseline,
            transaction_cost_rate=0.0005,
            desired_long=cfg.desired_long,
            desired_short=cfg.desired_short,
            weight_bounds=cfg.weight_bounds,
            lambda_turnover=lambda_turnover,
            lambda_hhi=lambda_hhi
        )

        def build_env(frame, risk_coeff=risk_coeff, env_kwargs=env_kwargs):
            return PortfolioEnv(
                frame, etf_list, 'mean_cvar', risk_coeff,
                cfg.rebalance_period, cfg.lookback_period,
                **env_kwargs
            )

        env = make_vec_env(lambda: build_env(train_data), n_envs=1, seed=seed)
        try:
            model = PPO('MlpPolicy', env, ent_coef=0.01, clip_range=0.2, seed=seed, **params, verbose=0)
            model.learn(total_timesteps=cfg.tuning_timesteps)

            # Score with one deterministic episode on the validation window.
            val_env = build_env(val_data)
            obs, _ = val_env.reset(seed=seed)
            done, total_reward = False, 0.0

            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, _, _ = val_env.step(action)
                total_reward += reward
        finally:
            # Release the per-trial vectorised environment.
            env.close()

        if total_reward > best_reward:
            best_reward = total_reward
            best_params = params.copy()
            best_params.update({
                'risk_coefficient': risk_coeff,
                'lambda_turnover': lambda_turnover,
                'lambda_hhi': lambda_hhi,
                'seed': seed
            })

    return best_params

# ---------------------------------------------------------------------
# Training and prediction function
# ---------------------------------------------------------------------
def train_and_predict(train_df: pd.DataFrame, val_df: pd.DataFrame,
                      pred_df: pd.DataFrame, etf_list: List[str],
                      cfg: TrainingConfig, best_params: Dict[str, float],
                      model_path: str) -> Tuple[List[List[float]], List[pd.Timestamp]]:
    """Train PPO with validation-based early stopping, then predict weights.

    The agent is trained on ``train_df`` in chunks of
    ``cfg.incremental_timesteps`` (up to ``cfg.max_timesteps``). After each
    chunk one deterministic episode is run on ``val_df``; the model is saved
    to ``model_path`` whenever the validation reward improves, and training
    stops after ``cfg.patience`` checks without improvement. The saved model
    is then run over ``pred_df``.

    Parameters
    ----------
    best_params : must contain ``'risk_coefficient'`` and ``'seed'``;
        optional ``'lambda_turnover'`` / ``'lambda_hhi'`` override the values
        in ``cfg``. All remaining items are forwarded to ``PPO``. The dict is
        not modified.

    Returns
    -------
    (weights_list, dates_list)
        One weight vector per rebalance in the prediction window, and the
        ``Date`` of the row the environment is on right after that rebalance.

    Raises
    ------
    KeyError
        If ``best_params`` lacks ``'risk_coefficient'`` or ``'seed'``.
    """
    # Copy first: popping from the caller's dict would break the next call
    # that reuses the same tuned parameters.
    ppo_params = dict(best_params)
    risk_coeff = ppo_params.pop('risk_coefficient')
    seed = ppo_params.pop('seed')
    # These are environment settings, not PPO keyword arguments.
    lambda_turnover = ppo_params.pop('lambda_turnover', cfg.lambda_turnover)
    lambda_hhi = ppo_params.pop('lambda_hhi', cfg.lambda_hhi)
    set_global_seed(seed)

    shared_env_kwargs = dict(
        reward_type='mean_cvar',
        risk_coefficient=risk_coeff,
        rebalance_period=cfg.rebalance_period,
        lookback_period=cfg.lookback_period,
        transaction_cost_rate=cfg.transaction_cost_rate,
        desired_long=cfg.desired_long,
        desired_short=cfg.desired_short,
        weight_bounds=cfg.weight_bounds,
        lambda_turnover=lambda_turnover,
        lambda_hhi=lambda_hhi
    )

    def build_env(frame, use_baseline):
        return PortfolioEnv(
            frame, etf_list,
            use_baseline=use_baseline,
            baseline_fn=equal_weight_baseline if use_baseline else None,
            **shared_env_kwargs
        )

    env_train = make_vec_env(lambda: build_env(train_df, True), n_envs=1, seed=seed)

    policy_kwargs = dict(net_arch=list(cfg.policy_arch))
    model = PPO(
        'MlpPolicy', env_train,
        policy_kwargs=policy_kwargs,
        ent_coef=0.01,
        clip_range=0.2,
        seed=seed,
        verbose=0,
        **ppo_params
    )

    best_val_reward = -np.inf
    no_improve = 0

    try:
        # Train in chunks; keep the checkpoint with the best validation reward.
        for step in range(0, cfg.max_timesteps, cfg.incremental_timesteps):
            model.learn(total_timesteps=cfg.incremental_timesteps)

            val_env = build_env(val_df, True)
            obs, _ = val_env.reset(seed=seed)
            done = False
            val_reward = 0.0

            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, _, _ = val_env.step(action)
                val_reward += reward

            if val_reward > best_val_reward:
                best_val_reward = val_reward
                no_improve = 0
                model.save(model_path)
            else:
                no_improve += 1
                if no_improve >= cfg.patience:
                    break
    finally:
        env_train.close()

    best_model = PPO.load(model_path)

    # NOTE(review): training/validation scale the equal-weight baseline
    # (use_baseline=True) while prediction adds the action to the current
    # weights (use_baseline=False) - confirm this mismatch is intended.
    env_pred = build_env(pred_df, False)

    obs, _ = env_pred.reset()
    done = False
    weights_list, dates_list = [], []
    hold_action = np.zeros(len(etf_list), dtype=np.float32)

    while not done:
        is_rebalance_step = (
            env_pred.current_step >= cfg.lookback_period
            and env_pred.current_step % cfg.rebalance_period == 0
        )
        if is_rebalance_step:
            action, _ = best_model.predict(obs, deterministic=True)
            obs, _, done, _, _ = env_pred.step(action)

            # Recorded after stepping, so the weights are the post-rebalance
            # ones and the date is that of the row the env has moved on to.
            weights_list.append(env_pred.current_weights.tolist())
            dates_list.append(env_pred.data.loc[env_pred.current_step, 'Date'])
        else:
            # The action is ignored by the environment on non-rebalance rows.
            obs, _, done, _, _ = env_pred.step(hold_action)

    return weights_list, dates_list


# ---------------------------------------------------------------------
# Data loading and overall training loop
# ---------------------------------------------------------------------
cfg = TrainingConfig(model_retrain=False)

# Load the prepared Stage-2 observations and the raw price data.
data = pd.read_csv('stage2_rl_observations_optimized_10ETFs.csv', parse_dates=['Date'])
price_data = pd.read_csv('stock_prices_10ETFs.csv')
# Parse as UTC, then drop the timezone so the dates match `data` on merge.
price_data['Date'] = pd.to_datetime(price_data['Date'], utc=True).dt.tz_localize(None)
price_cols = {col: f'Price_{col}' for col in price_data.columns if col != 'Date'}
price_data.rename(columns=price_cols, inplace=True)

merged_data = pd.merge(data, price_data, on='Date', how='inner').reset_index(drop=True)
if len(merged_data) != len(data):
    print("Warning: data length mismatch after merge.")

etf_list = ['XLB','XLE','XLF','XLI','XLK','XLP','XLY','XLV','XLU']
feature_data = add_stable_features(merged_data, etf_list)
feature_data = filter_features(feature_data, include_predicted_returns=True, include_shap_metrics=True)

# Rolling windows: each window is train + validation + prediction rows.
total_len = len(feature_data)

# Consecutive windows start `prediction_window_days - rebalance_period` rows
# apart, so successive prediction windows overlap by `rebalance_period` rows.
window_step = cfg.prediction_window_days - cfg.rebalance_period
if window_step <= 0:
    # A non-positive step would make the loop below run forever.
    raise ValueError(
        "prediction_window_days must be larger than rebalance_period "
        f"(got {cfg.prediction_window_days} and {cfg.rebalance_period})"
    )

start_indices = []
current_start = 0

while True:
    train_start = current_start
    train_end = train_start + cfg.train_window_days
    val_end = train_end + cfg.validation_window_days
    pred_end = val_end + cfg.prediction_window_days

    # Stop once a full window no longer fits in the data.
    if pred_end > total_len:
        break

    start_indices.append(current_start)
    current_start += window_step

# Prepare directory for outputs
output_dir = 'stage2_iterations'
os.makedirs(output_dir, exist_ok=True)

# Collect metrics for all iterations
summary_records = []
for iter_num in range(cfg.num_iterations):
	
    iter_seed = cfg.base_seed + iter_num
    set_global_seed(iter_seed)
    tuned_seed = iter_seed
    iter_dir = os.path.join(output_dir, f'iteration_{iter_num:02d}')
    os.makedirs(iter_dir, exist_ok=True)
    
    previous_model_path = None
    iter_returns = []
    print(f"\nStarting iteration {iter_num+1}/{cfg.num_iterations} (Seed: {tuned_seed}) at {time.strftime('%Y-%m-%d %H:%M:%S')}")
	
    for idx, start_idx in enumerate(start_indices):
        window_start_time = time.time()
        print(f"  - Starting window {idx+1}/{len(start_indices)} at {time.strftime('%Y-%m-%d %H:%M:%S')}")

        train_start = start_idx
        train_end = train_start + cfg.train_window_days
        val_start = train_end
        val_end = val_start + cfg.validation_window_days
        pred_start = val_end
        pred_end = pred_start + cfg.prediction_window_days

        train_df = feature_data.iloc[train_start:train_end].reset_index(drop=True)
        val_df = feature_data.iloc[val_start:val_end].reset_index(drop=True)
        pred_df = feature_data.iloc[pred_start:pred_end].reset_index(drop=True)
        
        train_df.ffill(inplace=True)
        train_df.bfill(inplace=True)
        
        val_df.ffill(inplace=True)
        val_df.bfill(inplace=True)
        
        pred_df.ffill(inplace=True)
        pred_df.bfill(inplace=True)
        

        feature_cols = [c for c in train_df.columns if c != 'Date' and not c.startswith('Actual_Return')]
        scaler = StandardScaler()
        scaler.fit(train_df[feature_cols])
		
        scale = scaler.scale_
        scale[scale < 1e-8] = 1.0
        scaler.scale_ = scale
        
        train_scaled = train_df.copy()
        train_scaled[feature_cols] = scaler.transform(train_df[feature_cols])
        val_scaled = val_df.copy()
        val_scaled[feature_cols] = scaler.transform(val_df[feature_cols])
        pred_scaled = pred_df.copy()
        pred_scaled[feature_cols] = scaler.transform(pred_df[feature_cols])

        # Explicit seed handling
        if idx == 0:
            best_params = validate_and_tune(
                train_scaled, val_scaled, etf_list, cfg, random_seed=iter_seed
            )
            
            # Fetch tuned seed explicitly from best_params
            tuned_seed = best_params.get('seed', iter_seed)
        else:
            # Keep explicitly using previously found best_params
            best_params = best_params.copy()

        seed = tuned_seed
        set_global_seed(seed)

        window_dir = os.path.join(iter_dir, f'window_{idx:02d}')
        os.makedirs(window_dir, exist_ok=True)
        model_path = os.path.join(window_dir, 'best_ppo.zip')

        # Use tuned parameters explicitly
        risk_coeff = best_params.get('risk_coefficient', cfg.default_risk_coeff)
        lambda_turnover = best_params.get('lambda_turnover', cfg.lambda_turnover)
        lambda_hhi = best_params.get('lambda_hhi', cfg.lambda_hhi)
        

        env_train = make_vec_env(lambda: PortfolioEnv(
                train_scaled, etf_list, 'mean_cvar', risk_coeff, cfg.rebalance_period, cfg.lookback_period,
                use_baseline=True,
                baseline_fn=equal_weight_baseline,
                transaction_cost_rate=0.0005,
                desired_long=cfg.desired_long,
                desired_short=cfg.desired_short,
                weight_bounds=cfg.weight_bounds,
                lambda_turnover=lambda_turnover,
                lambda_hhi=lambda_hhi
            ), n_envs=1, seed=seed)

        policy_kwargs = dict(net_arch=list(cfg.policy_arch))

        if previous_model_path and os.path.exists(previous_model_path) and not cfg.model_retrain:
            print(f'load the exising model from {previous_model_path} and retrain')
            model = PPO.load(previous_model_path, env=env_train)
            model.set_env(env_train)
        else:
            print(f'triam new model and saved under {model_path}')
            model = PPO('MlpPolicy', env_train, policy_kwargs=policy_kwargs,
                ent_coef=0.01,
                clip_range=0.2,
                seed=seed,
                learning_rate=best_params.get('learning_rate', 1e-4),
                n_steps=best_params.get('n_steps', 20),
                batch_size=best_params.get('batch_size', 10),
                gamma=best_params.get('gamma', 0.98),
                verbose=0)
            
        best_val_reward = -np.inf
        no_improve = 0
        training_log = []

        for step in range(0, cfg.max_timesteps, cfg.incremental_timesteps):
            model.learn(total_timesteps=cfg.incremental_timesteps)

            val_env = PortfolioEnv(val_scaled, etf_list, 'mean_cvar', risk_coeff,
                                   cfg.rebalance_period, cfg.lookback_period,
                                   use_baseline=True,
                                   baseline_fn=equal_weight_baseline,
                                   transaction_cost_rate=0.0005, 			
								   desired_long=cfg.desired_long,
								   desired_short=cfg.desired_short,
								   weight_bounds=cfg.weight_bounds, lambda_turnover=lambda_turnover,
                lambda_hhi=lambda_hhi)
            obs, _ = val_env.reset(seed=seed)
            done, val_reward = False, 0.0

            while not done:
                action, _ = model.predict(obs, deterministic=True)
                obs, reward, done, _, _ = val_env.step(action)
                val_reward += reward

            training_log.append({'training_step': step + cfg.incremental_timesteps, 'validation_reward': val_reward})

            if val_reward > best_val_reward:
                best_val_reward = val_reward
                no_improve = 0
                model.save(model_path)
            else:
                no_improve += 1
                if no_improve >= cfg.patience:
                    break

        pd.DataFrame(training_log).to_csv(os.path.join(window_dir, 'training_validation_log.csv'), index=False)
        previous_model_path = model_path

        best_model = PPO.load(model_path)
        env_pred = PortfolioEnv(
            pred_scaled, etf_list,
            reward_type='mean_cvar',
            risk_coefficient=risk_coeff,
            rebalance_period=cfg.rebalance_period,
            lookback_period=cfg.lookback_period,
            use_baseline=False,
            transaction_cost_rate=0.0005,
			desired_long=cfg.desired_long,
		    desired_short=cfg.desired_short,
		    weight_bounds=cfg.weight_bounds, lambda_turnover=lambda_turnover,
                lambda_hhi=lambda_hhi
        )

        obs, _ = env_pred.reset()
        done = False
        weights_list, dates_list = [], []

        while not done:
            if env_pred.current_step >= cfg.lookback_period and (
                env_pred.current_step % cfg.rebalance_period == 0
            ):
                action, _ = best_model.predict(obs, deterministic=True)
                obs, _, done, _, _ = env_pred.step(action)

                weights_list.append(env_pred.current_weights.tolist())
                dates_list.append(env_pred.data.loc[env_pred.current_step, 'Date'])
            else:
                obs, _, done, _, _ = env_pred.step(np.zeros(len(etf_list), dtype=np.float32))

        weights_df = pd.DataFrame(weights_list, columns=etf_list)
        weights_df.insert(0, 'Date', dates_list)
        weights_df.to_csv(os.path.join(window_dir, 'weights.csv'), index=False)

        cum_wealth = 1.0
        returns_log = []

        for t, w in zip(dates_list, weights_list):
            step_idx = pred_scaled[pred_scaled['Date'] == t].index[0]
            asset_returns = np.array([
                pred_scaled.loc[step_idx + 1, f'Actual_Return_{etf}']
                for etf in etf_list
            ])
            port_ret = np.dot(w, asset_returns)
            cum_wealth *= (1 + port_ret)
            returns_log.append({'Date': t, 'Portfolio_Return': port_ret, 'Cumulative_Wealth': cum_wealth})

        iter_returns.append(cum_wealth - 1.0)

        pd.DataFrame(returns_log).to_csv(os.path.join(window_dir, 'returns_log.csv'), index=False)
        window_end_time = time.time()
        elapsed_window_time = window_end_time - window_start_time
        print(f"  - Completed window {idx+1}/{len(start_indices)} in {elapsed_window_time/60:.2f} minutes.")

    mean_ret = np.mean(iter_returns)
    std_ret = np.std(iter_returns, ddof=1)
    sharpe = (mean_ret / std_ret) * np.sqrt(len(iter_returns)) if std_ret != 0 else np.nan
    summary_records.append({
        'iteration': iter_num,
        'seed': iter_seed,
        'mean_return': mean_ret,
        'sharpe': sharpe
    })
    gc.collect()
    torch.cuda.empty_cache()  # Only if you're using GPU explicitly

summary_df = pd.DataFrame(summary_records)
summary_df.to_csv(os.path.join(output_dir, 'iterations_summary.csv'), index=False)

t_stat, p_val = ttest_1samp(summary_df['mean_return'], 0.0)
with open(os.path.join(output_dir, 't_test_result.csv'), 'w') as f:
    f.write(f"t-statistic,{t_stat}\np-value,{p_val}\n")

print(summary_df)
print(f"Overall t-statistic={t_stat:.3f}, p-value={p_val:.3f}")

gc.collect()
torch.cuda.empty_cache()  # Only if you're using GPU explicitly






Starting iteration 1/10 (Seed: 42) at 2025-08-01 08:51:58
  - Starting window 1/8 at 2025-08-01 08:51:58
triam new model and saved under stage2_iterations\iteration_00\window_00\best_ppo.zip
  - Completed window 1/8 in 20.50 minutes.
  - Starting window 2/8 at 2025-08-01 09:12:28
load the exising model from stage2_iterations\iteration_00\window_00\best_ppo.zip and retrain
  - Completed window 2/8 in 4.88 minutes.
  - Starting window 3/8 at 2025-08-01 09:17:21
load the exising model from stage2_iterations\iteration_00\window_01\best_ppo.zip and retrain
  - Completed window 3/8 in 3.91 minutes.
  - Starting window 4/8 at 2025-08-01 09:21:16
load the exising model from stage2_iterations\iteration_00\window_02\best_ppo.zip and retrain
  - Completed window 4/8 in 4.90 minutes.
  - Starting window 5/8 at 2025-08-01 09:26:10
load the exising model from stage2_iterations\iteration_00\window_03\best_ppo.zip and retrain
  - Completed window 5/8 in 3.87 minutes.
  - Starting window 6/8 at 2025-0

In [23]:
jun chenprevious_model_path and os.path.exists(previous_model_path) and not cfg.model_retrain

False

In [25]:
cfg.model_retrain

True

In [13]:
reprevious_model_path

'stage2_iterations\\iteration_01\\window_15\\best_ppo.zip'

In [29]:
import pandas as pd
import glob
import os

# Gather every per-window weights.csv written by the stage-2 iterations into a
# single table keyed by (Iteration, Window, Date).

# Configuration
output_dir = 'stage2_iterations'  # Adjust if your path is different
pattern = os.path.join(output_dir, 'iteration_*', 'window_*', 'weights.csv')

# glob returns matches in filesystem order; sorting makes the run reproducible.
files = sorted(glob.glob(pattern))

# One DataFrame per weights file, tagged with its iteration and window number.
all_weights = []

for file_path in files:
    # Directory layout is .../iteration_XX/window_YY/weights.csv. os.path.split
    # copes with either path separator, unlike splitting on os.sep.
    window_dir = os.path.dirname(file_path)
    iteration_dir = os.path.dirname(window_dir)
    iteration = int(os.path.basename(iteration_dir).split('_')[1])
    window = int(os.path.basename(window_dir).split('_')[1])

    # Load weights file
    df = pd.read_csv(file_path, parse_dates=['Date'])

    # Add columns for iteration and window (Iteration ends up first)
    df.insert(0, 'Window', window)
    df.insert(0, 'Iteration', iteration)

    all_weights.append(df)

if all_weights:
    # Concatenate, order deterministically, and persist.
    combined_df = pd.concat(all_weights, ignore_index=True)
    combined_df.sort_values(['Iteration', 'Window', 'Date'], inplace=True)

    combined_path = os.path.join(output_dir, 'combined_weights.csv')
    combined_df.to_csv(combined_path, index=False)

    print(f"Combined weights saved to: {combined_path}")
else:
    # pd.concat raises ValueError on an empty list; report the situation instead
    # of failing with an unrelated-looking error.
    combined_df = pd.DataFrame()
    print(f"No weight files matched {pattern}; nothing was combined.")


Combined weights saved to: stage2_iterations\combined_weights.csv


In [None]:
# start of stage 2 training
import pandas as pd
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from sklearn.model_selection import ParameterGrid

import gymnasium as gym
from gymnasium import spaces
import numpy as np
import time
import os
import torch
import random
import json
from stable_baselines3.common.utils import set_random_seed

SEED = 42
def set_global_seed(seed):
    """Seed NumPy, Python's random module, PyTorch and Stable-Baselines3 with *seed*."""
    # Same call order as before: numpy, random, torch, then the SB3 helper.
    seeders = (np.random.seed, random.seed, torch.manual_seed, set_random_seed)
    for apply_seed in seeders:
        apply_seed(seed)

set_global_seed(SEED)

class PortfolioEnv(gym.Env):
    """Gymnasium environment for periodic long/short portfolio rebalancing.

    The observation is the flattened window of the last ``lookback_period`` rows
    of every feature column (all columns except ``Date`` and ``Actual_Return_*``).
    The action is one raw score per asset in [-1, 1]. On steps where
    ``current_step % rebalance_period == 0`` the scores are converted to
    long/short weights; on all other steps the weights drift with the asset
    returns of the current row and are renormalised to sum to one.

    Parameters
    ----------
    data : DataFrame with a ``Date`` column, feature columns and one
        ``Actual_Return_<ticker>`` column per entry of ``etf_list``.
    etf_list : list of tickers; defines the action dimension and weight order.
    reward_type : 'cumulative_return', 'log_wealth', 'mean_var' or 'mean_cvar'.
    risk_coefficient : multiplier of the risk term in the mean_var/mean_cvar rewards.
    rebalance_period : the policy action is applied every this many steps.
    lookback_period : number of past rows in each observation; also the first step.
    desired_long, desired_short : gross long / short exposure used when the
        action contains both positive and negative scores (defaults 1.20 / 0.20).
    weight_bounds : (low, high) clip applied to individual weights between the
        two normalisation passes (default (-0.2, 0.8)).
    """

    def __init__(self, data, etf_list, reward_type='mean_cvar', risk_coefficient=0.5, rebalance_period=21, lookback_period=21,
                 desired_long=1.20, desired_short=0.20, weight_bounds=(-0.2, 0.8)):
        super().__init__()

        self.data = data.reset_index(drop=True)
        self.etf_list = etf_list
        self.reward_type = reward_type
        self.risk_coefficient = risk_coefficient
        self.rebalance_period = rebalance_period
        self.lookback_period = lookback_period
        self.desired_long = desired_long
        self.desired_short = desired_short
        self.weight_bounds = tuple(weight_bounds)
        self.action_space = spaces.Box(low=-1, high=1, shape=(len(etf_list),), dtype=np.float32)

        # Feature columns exclude Date and the realised returns, which are used
        # only for the reward and for weight drift.
        self.feature_cols = [col for col in data.columns if col not in ['Date'] and not col.startswith('Actual_Return')]
        self.num_features_per_day = len(self.feature_cols)

        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf,
            shape=(self.num_features_per_day * self.lookback_period,),
            dtype=np.float32
        )

        self.current_step = self.lookback_period
        self.done = False
        self.cumulative_wealth = 1.0
        self.current_weights = np.array([1.0 / len(etf_list)] * len(etf_list))

    def reset(self, seed=None, options=None):
        """Restart the episode at ``lookback_period`` with equal weights and wealth 1.0.

        Returns the first observation and an empty info dict.
        """
        super().reset(seed=seed)
        if seed is not None:
            self.seed(seed)
        self.current_step = self.lookback_period
        self.done = False
        self.cumulative_wealth = 1.0
        self.current_weights = np.array([1.0 / len(self.etf_list)] * len(self.etf_list))
        return self._get_obs(), {}

    def step(self, action):
        """Advance one row.

        Returns ``(observation, reward, terminated, truncated, info)``;
        ``truncated`` is always False and ``info`` is always empty. The reward is
        based on the portfolio return of the *next* row under the weights in
        force after this step's rebalance/drift.
        """
        next_step = self.current_step + 1

        if self.current_step % self.rebalance_period == 0:
            # Rebalance step: turn the raw action into long/short weights.
            self.current_weights = self._long_short_weights(action)
        else:
            # Holding step: the action is ignored and weights drift with the
            # returns of the current row.
            returns_today = self._asset_returns(self.current_step)
            self.current_weights *= (1 + returns_today)
            self.current_weights /= np.sum(self.current_weights)

        if next_step >= len(self.data):
            terminated = True
            reward = 0.0
        else:
            returns = self._asset_returns(next_step)
            portfolio_return = np.dot(self.current_weights, returns)
            self.cumulative_wealth *= (1 + portfolio_return)
            reward = self.calculate_reward(portfolio_return, returns)
            # The episode ends once the next row is the last row of the data.
            terminated = next_step >= len(self.data) - 1

        self.current_step += 1

        return self._get_obs(), reward, terminated, False, {}

    def _asset_returns(self, row):
        """Return the ``Actual_Return_*`` values of ``row`` in ``etf_list`` order."""
        return np.array([self.data.loc[row, f'Actual_Return_{etf}'] for etf in self.etf_list])

    def _scale_exposures(self, longs, shorts):
        """Scale non-negative long/short magnitudes to the target exposures.

        * both sides present -> longs sum to ``desired_long``, shorts to ``desired_short``
        * longs only         -> longs sum to 1.0, no shorts
        * otherwise          -> equal-weight long-only fallback
        """
        long_total = np.sum(longs)
        short_total = np.sum(shorts)
        if long_total > 0 and short_total > 0:
            return (self.desired_long * longs / long_total,
                    self.desired_short * shorts / short_total)
        if long_total > 0:
            return longs / long_total, np.zeros_like(shorts)
        num_assets = len(longs)
        return np.ones(num_assets) / num_assets, np.zeros(num_assets)

    def _long_short_weights(self, action):
        """Convert a raw action into final weights: normalise, clip, normalise again.

        The second normalisation restores the target exposures after clipping, so
        an individual weight can end up outside ``weight_bounds`` again (e.g. a
        single long position clipped to the upper bound is rescaled to the full
        long exposure).
        """
        raw_weights = np.array(action)  # copy; the caller's array is never mutated

        longs, shorts = self._scale_exposures(
            np.maximum(raw_weights, 0), np.abs(np.minimum(raw_weights, 0))
        )
        clipped = np.clip(longs - shorts, self.weight_bounds[0], self.weight_bounds[1])

        longs, shorts = self._scale_exposures(
            np.maximum(clipped, 0), np.abs(np.minimum(clipped, 0))
        )
        return longs - shorts

    def _get_obs(self):
        """Return the flattened float32 window of the last ``lookback_period`` feature rows."""
        obs_window = self.data.iloc[self.current_step - self.lookback_period:self.current_step]
        obs_window = obs_window[self.feature_cols].values.flatten().astype(np.float32)
        return obs_window

    def calculate_reward(self, portfolio_return, asset_returns):
        """Compute the step reward for the configured ``reward_type``.

        Raises ValueError for an unknown reward type.
        """
        if self.reward_type == 'cumulative_return':
            return self.cumulative_wealth - 1.0
        elif self.reward_type == 'log_wealth':
            return np.log(self.cumulative_wealth)
        elif self.reward_type == 'mean_var':
            # NOTE(review): the variance is taken across the assets' returns of a
            # single row, not across time or of the portfolio - confirm intent.
            return portfolio_return - self.risk_coefficient * np.var(asset_returns)
        elif self.reward_type == 'mean_cvar':
            alpha = 0.05
            # Cross-sectional tail: mean of the asset returns at or below the
            # 5th percentile of this row's asset returns.
            var = np.percentile(asset_returns, 100 * alpha)
            cvar = np.mean(asset_returns[asset_returns <= var])
            # NOTE(review): cvar is a tail *return* (presumably negative on most
            # days), so subtracting it raises the reward when the tail is worse -
            # confirm the intended sign.
            return portfolio_return - self.risk_coefficient * cvar
        else:
            raise ValueError('Invalid reward type')

    def seed(self, seed=None):
        """Seed NumPy's global random number generator."""
        np.random.seed(seed)

import pandas as pd
import numpy as np

def add_stable_features(df, etf_list):
    """Return a copy of *df* with price-derived features added for each ticker.

    For every ticker in *etf_list* the column ``Price_<ticker>`` is used to add
    20-day return volatility, 5/10/20-day momentum, 5- and 20-day moving
    averages, and their difference (5-day minus 20-day). Rows containing any NaN
    afterwards are dropped. The input frame is not modified.
    """
    enriched = df.copy()
    momentum_horizons = (5, 10, 20)
    ma_windows = (5, 20)

    for ticker in etf_list:
        prices = enriched[f'Price_{ticker}']

        # Rolling 20-day standard deviation of daily percentage changes.
        enriched[f'Volatility_{ticker}'] = prices.pct_change().rolling(20).std()

        # Percentage change over each momentum horizon.
        for horizon in momentum_horizons:
            enriched[f'Momentum_{horizon}d_{ticker}'] = prices.pct_change(periods=horizon)

        # Simple moving averages of the price.
        for window in ma_windows:
            enriched[f'MA_{window}d_{ticker}'] = prices.rolling(window).mean()

        # Short-minus-long moving-average spread.
        enriched[f'MA_Crossover_{ticker}'] = (
            enriched[f'MA_5d_{ticker}'] - enriched[f'MA_20d_{ticker}']
        )

    # The rolling/shifted features leave NaNs in the leading rows; drop every row
    # that has a NaN in any column.
    return enriched.dropna()

def filter_features(df, include_predicted_returns=True, include_shap_metrics=True):
    """Return a copy of *df*, optionally without predicted-return and/or SHAP columns.

    A column counts as a predicted-return column when its name contains
    ``'Predicted_Return'`` and as a SHAP column when its name contains ``'SHAP'``.
    Each exclusion that is applied is reported with a printed list of the removed
    columns. The input frame is not modified.
    """
    result = df.copy()

    # Predicted returns are removed first, so a column matching both substrings
    # is reported under the predicted-return message only.
    if not include_predicted_returns:
        predicted_cols = [name for name in result.columns if 'Predicted_Return' in name]
        result = result.drop(columns=predicted_cols)
        print(f"Excluded predicted return columns: {predicted_cols}")

    if not include_shap_metrics:
        shap_cols = [name for name in result.columns if 'SHAP' in name]
        result = result.drop(columns=shap_cols)
        print(f"Excluded SHAP-related columns: {shap_cols}")

    return result

# ETFs (order defines the action / weight order everywhere below)
etf_list = ['XLB', 'XLE', 'XLF', 'XLI', 'XLK', 'XLP', 'XLY', 'XLV', 'XLU']

# Alternative universe (individual stocks), kept for reference:
# etf_list = ['BA', 'AMGN', 'DIS', 'NKE', 'HON', 'MMM', 'CAT', 'KO', 'PG', 'AXP',
#             'JPM', 'MCD', 'HD', 'AAPL', 'CSCO', 'IBM', 'MSFT', 'TRV', 'UNH',
#             'CVX', 'JNJ', 'MRK', 'AMZN', 'WMT', 'INTC', 'VZ']

# Hyperparameter tuning
param_grid = {
    'learning_rate': [1e-4, 5e-5],
    'n_steps': [20, 40],
    'batch_size': [5, 10],
    'gamma': [0.98, 0.99]
}
consolidated_file = 'stage2_rl_observations_optimized_10ETFs.csv'
reward_type = 'mean_cvar'

# Stage-1 output (observations) and raw prices.
data = pd.read_csv(consolidated_file, parse_dates=['Date'])
price_data = pd.read_csv('stock_prices_10ETFs.csv')

# Parse the price dates as UTC, then drop the timezone so they can be merged
# with the timezone-naive dates of the observation file.
# NOTE(review): if the price file stores local-time offsets, converting to UTC
# shifts the clock time before the tz is dropped - confirm the merge keys match.
price_data['Date'] = pd.to_datetime(price_data['Date'], utc=True)
price_data['Date'] = price_data['Date'].dt.tz_localize(None)

# Rename price columns to 'Price_{ticker}' (the prefix add_stable_features expects)
price_cols = {col: f'Price_{col}' for col in price_data.columns if col != 'Date'}
price_data.rename(columns=price_cols, inplace=True)

# Merge datasets on Date; the inner join keeps only dates present in both files
merged_data = pd.merge(data, price_data, on='Date', how='inner')
merged_data.reset_index(drop=True, inplace=True)
# Check if merge was successful
if len(merged_data) != len(data):
    print(f"Warning: Data length mismatch after merging (Original: {len(data)}, Merged: {len(merged_data)}).")
else:
    print("Merged successfully with aligned dates.")

# Adding the rolling features drops the leading rows that contain NaNs, so the
# result is shorter than `data`; the index is reset so that row labels equal
# row positions.
data_with_features_raw = add_stable_features(merged_data, etf_list)
data_with_features_raw.reset_index(drop=True, inplace=True)


# Both flags True keeps every column. For the price-only benchmark set both to
# False (drops the predicted-return and SHAP columns).
data_with_features = filter_features(data_with_features_raw,
                                 include_predicted_returns=True,
                                 include_shap_metrics=True)
################### override data to use SHAP only
# data_with_features = data
################### END override

# Rolling window lengths, in rows:
train_window_days = 252 * 10
validation_window_days = 126
prediction_window_days = 126
lookback_period = 10
rebalance_period = 10

# Window starts are sized from the frame that is actually sliced in the main
# loop. Using len(data) here over-counts, because data_with_features lost rows
# in the merge/dropna above, and the last window(s) would index past its end.
start_indices = range(
    0,
    len(data_with_features) - (train_window_days + validation_window_days + prediction_window_days),
    prediction_window_days,
)
all_weights = []
model_path = 'ppo_single_train_best_model_10ETFs.zip'

from sklearn.model_selection import ParameterSampler

def validate_and_tune(train_data, val_data, reward_type, rebalance_period=10, lookback_period=10, n_iter=8, timesteps=5000):
    """Random-search PPO settings and return the best one on the validation data.

    ``n_iter`` candidates are drawn from a fixed search space (sampling is seeded
    with the module-level ``SEED``). Each candidate trains a fresh PPO agent on
    ``train_data`` for ``timesteps`` steps and is scored by the summed reward of
    one deterministic episode on ``val_data``. The winning settings (PPO keyword
    arguments plus ``risk_coefficient`` and ``seed``) are written to
    ``best_params.json`` in the working directory and returned; the result is
    ``None`` if no candidate scored above ``-inf``.
    """
    risk_choices = [0.1, 0.5, 1.0] if reward_type in ['mean_var', 'mean_cvar'] else [0.5]
    # Narrow and meaningful parameter distribution
    search_space = {
        'learning_rate': [3e-4, 1e-4],
        'n_steps': [20, 40],
        'batch_size': [10, 20],
        'gamma': [0.95, 0.98],
        'risk_coefficient': risk_choices,
        'seed': [42, 100, 2024, 12345, 579]
    }
    candidates = list(ParameterSampler(search_space, n_iter=n_iter, random_state=SEED))

    winner, winner_score = None, -np.inf

    for candidate in candidates:
        # Split the sampled dict into env/seed settings and pure PPO kwargs.
        trial_seed = candidate['seed']
        trial_risk = candidate.get('risk_coefficient', 0.5)
        ppo_kwargs = {
            key: value for key, value in candidate.items()
            if key not in ('seed', 'risk_coefficient')
        }

        set_global_seed(trial_seed)
        train_env = make_vec_env(
            lambda: PortfolioEnv(train_data, etf_list, reward_type, trial_risk, rebalance_period, lookback_period),
            n_envs=1,
            seed=trial_seed,
        )
        agent = PPO(
            'MlpPolicy', train_env,
            ent_coef=0.01,    # entropy bonus to encourage exploration
            clip_range=0.2,
            seed=trial_seed,
            **ppo_kwargs,
            verbose=0,
        )
        agent.learn(total_timesteps=timesteps)

        # Score: total reward of one deterministic pass over the validation data.
        holdout_env = PortfolioEnv(val_data, etf_list, reward_type, trial_risk, rebalance_period, lookback_period)
        obs, _ = holdout_env.reset(seed=trial_seed)
        episode_reward = 0
        finished = False
        while not finished:
            action, _ = agent.predict(obs, deterministic=True)
            obs, step_reward, finished, _, _ = holdout_env.step(action)
            episode_reward += step_reward

        if episode_reward > winner_score:
            winner_score = episode_reward
            winner = {**ppo_kwargs, 'risk_coefficient': trial_risk, 'seed': trial_seed}

    # Persist the result so a later step can reload it.
    with open('best_params.json', 'w') as f:
        json.dump(winner, f)
    return winner

def scale_data(df, feature_cols, scaler):
    """Return a frame shaped like *df* whose *feature_cols* were passed through *scaler*.

    ``scaler.transform`` is applied to ``df[feature_cols]``; all other columns
    (e.g. Date, Actual_Return_*) are carried over unchanged. The index and the
    column order of *df* are preserved, and *df* itself is not modified.
    """
    transformed = pd.DataFrame(
        scaler.transform(df[feature_cols]),
        index=df.index,
        columns=feature_cols,
    )

    # Copy the pass-through columns by value (positional, not index-aligned).
    passthrough = [name for name in df.columns if name not in feature_cols]
    for name in passthrough:
        transformed[name] = df[name].values

    # Restore the caller's column order.
    return transformed[df.columns]

# Main execution
from sklearn.preprocessing import StandardScaler
# Walk-forward run: for every window start, fit the scaler and tune/train PPO on
# the training slice, early-stop on the validation slice, then record the
# portfolio weights chosen on each rebalance step of the prediction slice.
for idx, start_idx in enumerate(start_indices):
    start_time = time.time()

    # Row positions of the three contiguous slices: train | validation | prediction
    train_start_idx = start_idx
    train_end_idx = train_start_idx + train_window_days

    val_start_idx = train_end_idx
    val_end_idx = val_start_idx + validation_window_days

    pred_start_idx = val_end_idx
    pred_end_idx = pred_start_idx + prediction_window_days

    # Corresponding dates (relies on data_with_features having a 0..n-1 index)
    train_start_date = data_with_features.loc[train_start_idx, 'Date']
    train_end_date = data_with_features.loc[train_end_idx - 1, 'Date']

    val_start_date = data_with_features.loc[val_start_idx, 'Date']
    val_end_date = data_with_features.loc[val_end_idx - 1, 'Date']

    pred_start_date = data_with_features.loc[pred_start_idx, 'Date']
    pred_end_date = data_with_features.loc[pred_end_idx - 1, 'Date']

    print(f"Training period: {train_start_date.date()} to {train_end_date.date()}")
    print(f"Validation period: {val_start_date.date()} to {val_end_date.date()}")
    print(f"Prediction period: {pred_start_date.date()} to {pred_end_date.date()}")

    train_data = data_with_features.iloc[train_start_idx:train_end_idx].reset_index(drop=True)
    val_data = data_with_features.iloc[val_start_idx:val_end_idx].reset_index(drop=True)
    pred_data = data_with_features.iloc[pred_start_idx:pred_end_idx].reset_index(drop=True)

    feature_cols = [col for col in train_data.columns if col != 'Date' and not col.startswith('Actual_Return')]

    # The scaler is fitted on the training slice only and reused for val/pred.
    scaler = StandardScaler()
    scaler.fit(train_data[feature_cols])

    train_data_scaled = scale_data(train_data, feature_cols, scaler)
    val_data_scaled = scale_data(val_data, feature_cols, scaler)
    pred_data_scaled = scale_data(pred_data, feature_cols, scaler)

    print("Starting hyperparameter tuning...")
    best_params = validate_and_tune(train_data_scaled, val_data_scaled, reward_type)
    print(f"Best parameters: {best_params}")

    incremental_timesteps = 3000
    max_timesteps = 30000
    patience = 3

    best_val_reward = -np.inf
    no_improve_steps = 0

    policy_kwargs = dict(net_arch=[256, 256])

    # Reload the tuned settings from the file validate_and_tune just wrote; what
    # remains after the two pops is passed straight to PPO as keyword arguments.
    with open('best_params.json', 'r') as f:
        best_params = json.load(f)

    risk_coeff = best_params.pop('risk_coefficient')
    seed = best_params.pop('seed')

    set_global_seed(seed)
    env = make_vec_env(lambda: PortfolioEnv(train_data_scaled, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period), n_envs=1, seed=seed)

    # Always train a fresh model for each window (no warm start from the
    # previous window's checkpoint).
    model = PPO('MlpPolicy', env,
                    policy_kwargs=policy_kwargs,
                    ent_coef=0.01,
                    clip_range=0.2,
                    seed=seed,
                    **best_params, verbose=0)
    print("Starting model training with early stopping...")

    for step in range(0, max_timesteps, incremental_timesteps):
        model.learn(total_timesteps=incremental_timesteps)

        # Evaluate one deterministic episode on the validation slice
        val_env = PortfolioEnv(val_data_scaled, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period)
        val_obs, _ = val_env.reset()
        val_done = False
        val_total_reward = 0.0

        while not val_done:
            val_action, _ = model.predict(val_obs, deterministic=True)
            val_obs, val_reward, val_done, _, _ = val_env.step(val_action)
            val_total_reward += val_reward

        print(f"Step: {step + incremental_timesteps}, Validation Total Reward: {val_total_reward:.4f}")

        # Early stopping check: save on strict improvement, stop after
        # `patience` evaluations without one.
        if val_total_reward > best_val_reward:
            best_val_reward = val_total_reward
            no_improve_steps = 0
            model.save(model_path)
            print(f"Improved validation reward; model saved at step {step + incremental_timesteps}")
        else:
            no_improve_steps += 1
            print(f"No improvement ({no_improve_steps}/{patience})")

            if no_improve_steps >= patience:
                print("Early stopping explicitly triggered.")
                break

    # Predict with the checkpoint that scored best on validation
    model = PPO.load(model_path)
    print("Loaded the best PPO model explicitly for prediction.")

    # The prediction env runs on the prediction slice alone; its first
    # `lookback_period` rows only serve as the initial observation window.
    pred_env = PortfolioEnv(pred_data_scaled, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period)

    obs, _ = pred_env.reset()
    done = False

    # Placeholder action for non-rebalance steps (the env ignores it there).
    action = np.zeros(len(etf_list), dtype=np.float32)

    while not done:
        is_rebalance = (
            pred_env.current_step >= lookback_period
            and pred_env.current_step % pred_env.rebalance_period == 0
        )

        if is_rebalance:
            action, _ = model.predict(obs, deterministic=True)
            # Read the date from the env's own data, before step() advances
            # current_step. The row index belongs to the prediction slice, so
            # looking it up in a differently-offset frame records wrong dates.
            rebalance_date = pred_env.data.loc[pred_env.current_step, 'Date']

        obs, _, done, _, _ = pred_env.step(action)

        if is_rebalance:
            # step() has just applied the long/short normalisation to `action`;
            # recording the env's weights keeps a single source of truth instead
            # of repeating that logic here.
            all_weights.append([rebalance_date] + pred_env.current_weights.tolist())

    end_time = time.time()
    print(f"Elapsed time: {end_time - start_time:.4f} seconds")

columns = ['Date'] + etf_list
weights_df = pd.DataFrame(all_weights, columns=columns)
weights_df.to_csv('ppo_multi_year_weights_10ETFs.csv', index=False)
print("Saved predictions to ppo_multi_year_weights_10ETFs.csv")


In [None]:
############################## This is start to run 25 iterations ##############################
########################################################################################################################

In [None]:
# ITERATION - final variable: 128/20 - retrain - 50kx30k sample - mean cvar - deterministic false with 50 - 7 yr train by 21 day test
# start of stage 2 training
import pandas as pd
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from sklearn.model_selection import ParameterGrid

import gymnasium as gym
from gymnasium import spaces
import numpy as np
import time
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import time
import os
import torch
import random
import json
from stable_baselines3.common.utils import set_random_seed


SEED = 42
def set_global_seed(seed):
    """Seed NumPy, Python's ``random``, PyTorch and Stable-Baselines3 with ``seed``."""
    # The four seeding entry points are applied in the same order as before.
    seeders = (np.random.seed, random.seed, torch.manual_seed, set_random_seed)
    for apply_seed in seeders:
        apply_seed(seed)

set_global_seed(SEED)

class PortfolioEnv(gym.Env):
    """Daily-step portfolio environment with periodic 120/20 long-short rebalancing.

    Every call to ``step`` advances one row of ``data``. On steps where
    ``current_step % rebalance_period == 0`` the action is converted into
    target weights; on every other step the action is ignored and the current
    weights drift with that row's ``Actual_Return_<etf>`` values. The reward
    is always computed from the *next* row's actual returns.

    Args:
        data: DataFrame holding one ``Actual_Return_<etf>`` column per entry
            of ``etf_list`` plus the feature columns. A ``Date`` column, if
            present, is excluded from the observation.
        etf_list: Asset tickers; fixes the action dimension and weight order.
        reward_type: ``'cumulative_return'``, ``'log_wealth'``, ``'mean_var'``
            or ``'mean_cvar'``.
        risk_coefficient: Multiplier of the risk term in the ``mean_var`` and
            ``mean_cvar`` rewards.
        rebalance_period: Rebalance whenever the step index is a multiple of
            this value.
        lookback_period: Number of past rows flattened into one observation;
            also the step index at which every episode starts.
    """

    # Exposure targets and per-asset clip bounds used on rebalance steps.
    LONG_EXPOSURE = 1.20    # 120% gross long
    SHORT_EXPOSURE = 0.20   # 20% gross short
    WEIGHT_BOUNDS = (-0.2, 0.8)

    def __init__(self, data, etf_list, reward_type='mean_cvar', risk_coefficient=0.5, rebalance_period=21, lookback_period=21):
        super().__init__()

        self.data = data.reset_index(drop=True)
        self.etf_list = etf_list
        self.reward_type = reward_type
        self.risk_coefficient = risk_coefficient
        self.rebalance_period = rebalance_period
        self.lookback_period = lookback_period
        self.action_space = spaces.Box(low=-1, high=1, shape=(len(etf_list),), dtype=np.float32)

        # Features exclude Date and the realised returns, which are only used
        # for weight drift and for the reward.
        self.feature_cols = [col for col in data.columns if col not in ['Date'] and not col.startswith('Actual_Return')]
        self.num_features_per_day = len(self.feature_cols)
        # Column names are built once instead of on every step.
        self._return_cols = [f'Actual_Return_{etf}' for etf in etf_list]

        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf,
            shape=(self.num_features_per_day * self.lookback_period,),
            dtype=np.float32
        )

        self.current_step = self.lookback_period
        self.done = False
        self.cumulative_wealth = 1.0
        self.current_weights = np.array([1.0 / len(etf_list)] * len(etf_list))

    def reset(self, seed=None, options=None):
        """Restart the episode at ``lookback_period`` with equal weights and unit wealth.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        if seed is not None:
            self.seed(seed)
        self.current_step = self.lookback_period
        self.done = False
        self.cumulative_wealth = 1.0
        self.current_weights = np.array([1.0 / len(self.etf_list)] * len(self.etf_list))
        return self._get_obs(), {}

    def _scale_exposures(self, weights):
        """Rescale signed ``weights`` to the long/short exposure targets.

        * longs and shorts present: longs sum to ``LONG_EXPOSURE`` and shorts
          to ``SHORT_EXPOSURE``;
        * longs only: longs sum to 1 (100% long);
        * otherwise (shorts only, all zeros, or NaN sums): equal-weight
          long-only.
        """
        long_part = np.maximum(weights, 0)
        short_part = np.abs(np.minimum(weights, 0))
        long_total = np.sum(long_part)
        short_total = np.sum(short_part)

        if long_total > 0 and short_total > 0:
            return (self.LONG_EXPOSURE * long_part / long_total
                    - self.SHORT_EXPOSURE * short_part / short_total)
        if long_total > 0:
            return long_part / long_total

        num_assets = len(weights)
        return np.ones(num_assets) / num_assets

    def _action_to_weights(self, action):
        """Map a raw action to portfolio weights: scale, clip, then scale again.

        The second scaling restores the exposure targets after clipping, so an
        individual weight can end up outside ``WEIGHT_BOUNDS`` again (e.g. a
        single long position is clipped to 0.8 and rescaled to 1.2).
        """
        raw_weights = np.array(action, copy=True)
        low, high = self.WEIGHT_BOUNDS
        clipped = np.clip(self._scale_exposures(raw_weights), low, high)
        return self._scale_exposures(clipped)

    def _asset_returns(self, row):
        """Return the realised per-asset returns stored at ``row``, in ``etf_list`` order."""
        return np.array([self.data.loc[row, col] for col in self._return_cols])

    def step(self, action):
        """Advance one row.

        Returns:
            ``(observation, reward, terminated, truncated, info)``;
            ``truncated`` is always ``False`` and ``info`` is empty.
        """
        next_step = self.current_step + 1

        if self.current_step % self.rebalance_period == 0:
            # Rebalance day: the action decides the new weights.
            self.current_weights = self._action_to_weights(action)
        else:
            # Non-rebalance day: the action is ignored and weights drift with
            # the current row's realised returns, then are renormalised.
            returns_today = self._asset_returns(self.current_step)
            self.current_weights *= (1 + returns_today)
            self.current_weights /= np.sum(self.current_weights)

        if next_step >= len(self.data):
            terminated = True
            reward = 0.0
        else:
            returns = self._asset_returns(next_step)
            portfolio_return = np.dot(self.current_weights, returns)
            self.cumulative_wealth *= (1 + portfolio_return)
            reward = self.calculate_reward(portfolio_return, returns)
            terminated = next_step >= len(self.data) - 1

        self.current_step += 1

        return self._get_obs(), reward, terminated, False, {}

    def _get_obs(self):
        """Return the ``lookback_period`` rows before ``current_step`` as a flat float32 vector."""
        obs_window = self.data.iloc[self.current_step - self.lookback_period:self.current_step]
        obs_window = obs_window[self.feature_cols].values.flatten().astype(np.float32)
        return obs_window

    def calculate_reward(self, portfolio_return, asset_returns):
        """Compute the step reward for the configured ``reward_type``.

        Args:
            portfolio_return: Weighted return of the portfolio for the step.
            asset_returns: Array of the individual asset returns for the step.

        Raises:
            ValueError: If ``reward_type`` is not one of the supported names.
        """
        if self.reward_type == 'cumulative_return':
            return self.cumulative_wealth - 1.0
        elif self.reward_type == 'log_wealth':
            return np.log(self.cumulative_wealth)
        elif self.reward_type == 'mean_var':
            return portfolio_return - self.risk_coefficient * np.var(asset_returns)
        elif self.reward_type == 'mean_cvar':
            alpha = 0.05
            var = np.percentile(asset_returns, 100 * alpha)
            # Mean of the cross-sectional asset returns at or below their 5th
            # percentile. It does not depend on the portfolio weights, and when
            # this tail mean is negative the subtraction below *raises* the
            # reward. NOTE(review): confirm this is the intended risk penalty.
            cvar = np.mean(asset_returns[asset_returns <= var])
            return portfolio_return - self.risk_coefficient * cvar
        else:
            raise ValueError('Invalid reward type')

    def seed(self, seed=None):
        """Seed NumPy's global random generator."""
        np.random.seed(seed)

import pandas as pd
import numpy as np

def add_stable_features(df, etf_list):
    """Return a copy of ``df`` extended with price-based features per ticker.

    For every ticker in ``etf_list`` the ``Price_<ticker>`` column is used to
    derive 20-day return volatility, 5/10/20-day momentum (percentage change),
    5- and 20-day moving averages and their difference. Rows containing any
    NaN are removed from the result; the input frame is left untouched.
    """
    enriched = df.copy()
    momentum_horizons = (5, 10, 20)

    for ticker in etf_list:
        prices = enriched[f'Price_{ticker}']

        # Standard deviation of daily returns over a 20-row window.
        enriched[f'Volatility_{ticker}'] = prices.pct_change().rolling(20).std()

        # Percentage change over each momentum horizon.
        for horizon in momentum_horizons:
            enriched[f'Momentum_{horizon}d_{ticker}'] = prices.pct_change(periods=horizon)

        # Short and long moving averages plus their spread.
        fast_ma = prices.rolling(5).mean()
        slow_ma = prices.rolling(20).mean()
        enriched[f'MA_5d_{ticker}'] = fast_ma
        enriched[f'MA_20d_{ticker}'] = slow_ma
        enriched[f'MA_Crossover_{ticker}'] = fast_ma - slow_ma

    # The rolling windows leave NaNs in the leading rows; drop them.
    return enriched.dropna()

def filter_features(df, include_predicted_returns=True, include_shap_metrics=True):
    """Return a copy of ``df`` with optional feature families removed.

    Columns whose name contains ``'Predicted_Return'`` are dropped when
    ``include_predicted_returns`` is false, and columns whose name contains
    ``'SHAP'`` are dropped when ``include_shap_metrics`` is false. Each removal
    prints the list of excluded columns.
    """
    trimmed = df.copy()

    if not include_predicted_returns:
        predicted_cols = [name for name in trimmed.columns if 'Predicted_Return' in name]
        trimmed = trimmed.drop(columns=predicted_cols)
        print(f"Excluded predicted return columns: {predicted_cols}")

    if not include_shap_metrics:
        shap_cols = [name for name in trimmed.columns if 'SHAP' in name]
        trimmed = trimmed.drop(columns=shap_cols)
        print(f"Excluded SHAP-related columns: {shap_cols}")

    return trimmed

# ETFs
etf_list = ['XLB', 'XLE', 'XLF', 'XLI', 'XLK', 'XLP', 'XLY', 'XLV', 'XLU']
# Alternative ticker universe, kept for reference:
# etf_list = ['BA', 'AMGN', 'DIS', 'NKE', 'HON', 'MMM', 'CAT', 'KO', 'PG',
#             'AXP', 'JPM', 'MCD', 'HD', 'AAPL', 'CSCO', 'IBM', 'MSFT', 'TRV',
#             'UNH', 'CVX', 'JNJ', 'MRK', 'AMZN', 'WMT', 'INTC', 'VZ']

# Hyperparameter tuning
param_grid = {
    'learning_rate': [1e-4, 3e-4, 5e-4],
    'gamma': [0.90, 0.95, 0.98],
    'clip_range': [0.1, 0.2, 0.25],
    'gae_lambda': [0.8, 0.9, 0.95]
}
consolidated_file = 'stage2_rl_observations_optimized_DIA_ETF.csv'
reward_type = 'mean_cvar'

# Stage 2 observations and raw prices.
data = pd.read_csv(consolidated_file, parse_dates=['Date'])
price_data = pd.read_csv('stock_prices_DIA_ETF.csv')

# Parse the price dates as UTC, then drop the timezone so they compare equal
# to the timezone-naive dates of the observation file.
price_data['Date'] = pd.to_datetime(price_data['Date'], utc=True)
price_data['Date'] = price_data['Date'].dt.tz_localize(None)

# Rename price columns to 'Price_{ticker}' (the names add_stable_features reads).
price_cols = {col: f'Price_{col}' for col in price_data.columns if col != 'Date'}
price_data.rename(columns=price_cols, inplace=True)

# Merge datasets on Date
merged_data = pd.merge(data, price_data, on='Date', how='inner')
merged_data.reset_index(drop=True, inplace=True)
# An inner merge silently drops dates missing on either side; report it.
if len(merged_data) != len(data):
    print(f"Warning: Data length mismatch after merging (Original: {len(data)}, Merged: {len(merged_data)}).")
else:
    print("Merged successfully with aligned dates.")

data_with_features_raw = add_stable_features(merged_data, etf_list)
data_with_features_raw.reset_index(drop=True, inplace=True)

# Both predicted returns and SHAP metrics are kept here; set a flag to False
# to run a benchmark without that feature family.
data_with_features = filter_features(data_with_features_raw,
                                     include_predicted_returns=True,
                                     include_shap_metrics=True)
################### override data to use SHAP only
# data_with_features = data
################### END override

# Rolling window lengths (in rows / trading days):
train_window_days = 252 * 7
validation_window_days = 252
prediction_window_days = 252
lookback_period = 21
rebalance_period = 21

# The windows are sliced from data_with_features, which is shorter than `data`
# (rows are lost in the inner merge and in the NaN drop of the rolling
# features). Sizing the range from it keeps every window inside the frame.
start_indices = range(
    0,
    len(data_with_features) - (train_window_days + validation_window_days + prediction_window_days),
    prediction_window_days,
)
all_weights = []

from sklearn.model_selection import ParameterSampler
def _validation_total_reward(model, val_env, num_samples=100):
    """Roll ``model`` through ``val_env`` once and return the summed reward.

    The environment only reads the action on steps where
    ``current_step % rebalance_period == 0``, so the (expensive) averaging of
    ``num_samples`` stochastic policy draws is done on those steps only; the
    previous action is passed through on all other steps.
    """
    obs, _ = val_env.reset()
    done, total_reward = False, 0
    action = np.zeros(len(val_env.etf_list), dtype=np.float32)

    while not done:
        if val_env.current_step % val_env.rebalance_period == 0:
            action_samples = [
                model.predict(obs, deterministic=False)[0]
                for _ in range(num_samples)
            ]
            action = np.mean(action_samples, axis=0)

        obs, reward, done, _, _ = val_env.step(action)
        total_reward += reward

    return total_reward


def validate_and_tune(train_data, val_data, reward_type, rebalance_period=10, lookback_period=10, n_iter=8, timesteps=5000):
    """Random-search PPO hyperparameters and return the best configuration.

    ``n_iter`` configurations are sampled (fixed ``random_state=42``). For each
    one a PPO model is trained on ``train_data`` for ``timesteps`` steps and
    scored by its total reward over one pass through ``val_data``.

    Args:
        train_data: Frame used to build the training environment.
        val_data: Frame used to build the validation environment.
        reward_type: Reward name forwarded to ``PortfolioEnv``.
        rebalance_period: Rebalance period of both environments.
        lookback_period: Observation window of both environments.
        n_iter: Number of sampled configurations.
        timesteps: Training budget per configuration.

    Returns:
        Dict with the sampled PPO keyword arguments plus ``'risk_coefficient'``
        of the best configuration, or ``None`` when no configuration produced
        a total reward greater than ``-inf`` (for example when rewards are NaN).
    """
    best_reward, best_params = -np.inf, None

    # Narrow and meaningful parameter distribution
    param_dist = {
        'learning_rate': [3e-4, 1e-4],
        'n_steps': [20, 40],
        'batch_size': [10, 20],
        'gamma': [0.95, 0.98],
        'risk_coefficient': [0.1, 0.5, 1.0] if reward_type in ['mean_var', 'mean_cvar'] else [0.5],
    }

    sampled_params = list(ParameterSampler(param_dist, n_iter=n_iter, random_state=42))

    for params in sampled_params:
        # risk_coefficient configures the environment, not PPO itself.
        risk_coeff = params.pop('risk_coefficient', 0.5)

        env = make_vec_env(lambda: PortfolioEnv(train_data, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period), n_envs=1)
        model = PPO('MlpPolicy', env,
                    ent_coef=0.01,    # entropy bonus to encourage exploration
                    clip_range=0.2,
                    **params, verbose=0)
        model.learn(total_timesteps=timesteps)

        val_env = PortfolioEnv(val_data, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period)
        total_reward = _validation_total_reward(model, val_env, num_samples=100)

        # NOTE(review): the reward itself depends on risk_coefficient, so
        # totals from candidates with different coefficients are compared on
        # different scales — confirm this is intended.
        if total_reward > best_reward:
            best_reward = total_reward
            best_params = {**params, 'risk_coefficient': risk_coeff}

    return best_params

def scale_data(df, feature_cols, scaler):
    """Apply an already-fitted ``scaler`` to ``feature_cols`` of ``df``.

    The remaining columns (e.g. Date, Actual_Return_*) are carried over
    unchanged, and the result keeps the index and column order of ``df``.
    """
    transformed = pd.DataFrame(
        scaler.transform(df[feature_cols]),
        columns=feature_cols,
        index=df.index,
    )

    # Copy across everything the scaler did not touch.
    passthrough = [name for name in df.columns if name not in feature_cols]
    for name in passthrough:
        transformed[name] = df[name].values

    # Restore the original column order.
    return transformed[df.columns]

# Main execution
# For each iteration, walk the rolling windows: tune hyperparameters, train PPO
# with early stopping on the validation window, then record the rebalance
# weights produced over the prediction window.
from sklearn.preprocessing import StandardScaler
import os
iterations = 10
all_weights_iterations = []
num_samples = 100  # stochastic policy draws averaged into one rebalance action

for iteration in range(iterations):
    print(f"\n==== Starting Iteration {iteration + 1}/{iterations} ====")
    # One checkpoint per iteration; later windows warm-start from it.
    model_path = f"ppo_train_best_model_iteration_{iteration}.zip"
    for idx, start_idx in enumerate(start_indices):
        start_time = time.time()

        # Consecutive, non-overlapping train / validation / prediction windows.
        train_start_idx = start_idx
        train_end_idx = train_start_idx + train_window_days

        val_start_idx = train_end_idx
        val_end_idx = val_start_idx + validation_window_days

        pred_start_idx = val_end_idx
        pred_end_idx = pred_start_idx + prediction_window_days

        # Corresponding dates (end indices are exclusive, hence the -1).
        train_start_date = data_with_features.loc[train_start_idx, 'Date']
        train_end_date = data_with_features.loc[train_end_idx - 1, 'Date']

        val_start_date = data_with_features.loc[val_start_idx, 'Date']
        val_end_date = data_with_features.loc[val_end_idx - 1, 'Date']

        pred_start_date = data_with_features.loc[pred_start_idx, 'Date']
        pred_end_date = data_with_features.loc[pred_end_idx - 1, 'Date']

        print(f"Training period: {train_start_date.date()} to {train_end_date.date()}")
        print(f"Validation period: {val_start_date.date()} to {val_end_date.date()}")
        print(f"Prediction period: {pred_start_date.date()} to {pred_end_date.date()}")

        train_data = data_with_features.iloc[train_start_idx:train_end_idx].reset_index(drop=True)
        val_data = data_with_features.iloc[val_start_idx:val_end_idx].reset_index(drop=True)
        pred_data = data_with_features.iloc[pred_start_idx:pred_end_idx].reset_index(drop=True)

        feature_cols = [col for col in train_data.columns if col != 'Date' and not col.startswith('Actual_Return')]

        # Fit the scaler on the training window only, then apply it everywhere.
        scaler = StandardScaler()
        scaler.fit(train_data[feature_cols])

        train_data_scaled = scale_data(train_data, feature_cols, scaler)
        val_data_scaled = scale_data(val_data, feature_cols, scaler)
        pred_data_scaled = scale_data(pred_data, feature_cols, scaler)

        print("Starting hyperparameter tuning...")
        # Tune with the same rebalance/lookback settings used for the final
        # training below (the function's own defaults are 10/10).
        best_params = validate_and_tune(
            train_data_scaled, val_data_scaled, reward_type,
            rebalance_period=rebalance_period,
            lookback_period=lookback_period,
        )
        print(f"Best parameters: {best_params}")

        incremental_timesteps = 5000
        max_timesteps = 30000
        patience = 3

        best_val_reward = -np.inf
        no_improve_steps = 0

        # risk_coefficient configures the environment, not PPO.
        risk_coeff = best_params.pop('risk_coefficient', 0.5)
        policy_kwargs = dict(net_arch=[256, 256])

        env = make_vec_env(lambda: PortfolioEnv(train_data_scaled, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period), n_envs=1)

        # Warm-start from the previous window's checkpoint when there is one;
        # in that case the freshly tuned PPO hyperparameters are not applied.
        if idx > 0 and os.path.exists(model_path):
            print(f"Loading previous model from {model_path}...")
            model = PPO.load(model_path, env=env)
            model.set_env(env)
        else:
            print("Initializing new PPO model...")
            model = PPO('MlpPolicy', env,
                        policy_kwargs=policy_kwargs,
                        ent_coef=0.01,
                        clip_range=0.2,
                        **best_params, verbose=0)

        print("Starting model training with early stopping...")

        for step in range(0, max_timesteps, incremental_timesteps):
            model.learn(total_timesteps=incremental_timesteps)

            # Evaluate on a fresh validation environment.
            val_env = PortfolioEnv(val_data_scaled, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period)
            val_obs, _ = val_env.reset()
            val_done = False
            val_total_reward = 0.0
            val_action = np.zeros(len(etf_list), dtype=np.float32)

            while not val_done:
                # The environment only reads the action on rebalance steps, so
                # the averaged stochastic action is computed on those steps only.
                if val_env.current_step % val_env.rebalance_period == 0:
                    value_action_samples = [
                        model.predict(val_obs, deterministic=False)[0]
                        for _ in range(num_samples)
                    ]
                    val_action = np.mean(value_action_samples, axis=0)

                val_obs, val_reward, val_done, _, _ = val_env.step(val_action)
                val_total_reward += val_reward

            print(f"Step: {step + incremental_timesteps}, Validation Total Reward: {val_total_reward:.4f}")

            # Early stopping check
            if val_total_reward > best_val_reward:
                best_val_reward = val_total_reward
                no_improve_steps = 0
                model.save(model_path)
                print(f"Improved validation reward; model saved at step {step + incremental_timesteps}")
            else:
                no_improve_steps += 1
                print(f"No improvement ({no_improve_steps}/{patience})")

                if no_improve_steps >= patience:
                    print("Early stopping explicitly triggered.")
                    break

        # Continue with the best checkpoint, not the last training state.
        model = PPO.load(model_path)

        print("Loaded the best PPO model explicitly for prediction.")

        pred_env = PortfolioEnv(pred_data_scaled, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period)

        obs, _ = pred_env.reset()
        done = False

        action = np.zeros(len(etf_list), dtype=np.float32)

        while not done:
            is_rebalance_step = (
                pred_env.current_step >= lookback_period
                and pred_env.current_step % pred_env.rebalance_period == 0
            )

            if is_rebalance_step:
                # Average several stochastic draws into one action.
                action_samples = [
                    model.predict(obs, deterministic=False)[0]
                    for _ in range(num_samples)
                ]
                action = np.mean(action_samples, axis=0)

                # Read the date from the frame the environment steps through,
                # so the row index matches pred_env.current_step.
                rebalance_date = pred_data_scaled.loc[pred_env.current_step, 'Date']

            obs, _, done, _, _ = pred_env.step(action)

            if is_rebalance_step:
                # On a rebalance step the environment stores the 120/20
                # weights it derived from the action; record exactly those
                # instead of re-deriving them here.
                final_weights = pred_env.current_weights
                all_weights_iterations.append([iteration + 1, rebalance_date] + final_weights.tolist())

        end_time = time.time()
        print(f"Iteration {iteration + 1}, start index {start_idx} completed in {end_time - start_time:.4f} seconds")

columns = ['Iteration', 'Date'] + etf_list
weights_df = pd.DataFrame(all_weights_iterations, columns=columns)
weights_df.to_csv('ppo_allocations_multiple_iterations_DIA_ETF.csv', index=False)
print("Saved all iterations' allocations to ppo_allocations_multiple_iterations_DIA_ETF.csv")



In [None]:
# Stage 2 PPO training with recommended enhancements
# ==================================================

import os
import json
import time
from datetime import timedelta
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterSampler

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.vec_env import SubprocVecEnv
import gym
from gym import spaces
import torch
import random

# -------------------------------------------------------------------
# Utility functions and seeding
# -------------------------------------------------------------------
SEED = 42

def set_global_seed(seed):
    """Seed NumPy, Python's ``random`` and PyTorch so runs are repeatable."""
    # Same three seeding calls as before, in the same order.
    for apply_seed in (np.random.seed, random.seed, torch.manual_seed):
        apply_seed(seed)

set_global_seed(SEED)


# -------------------------------------------------------------------
# Environment definition with softmax normalisation and Mean‑CVaR reward
# -------------------------------------------------------------------
class PortfolioEnv(gym.Env):
    """
    Custom Gym environment for long-only portfolio allocation.

    Observations are flattened windows of the feature columns. Actions are one
    real number per asset (bounded to [-10, 10] by the action space) that are
    turned into weights via softmax on rebalance steps. A reward is returned on
    every step, computed from the next row's actual returns.

    NOTE(review): this cell imports ``gym`` while the earlier Stage 2 cell
    imports ``gymnasium`` — confirm which package the installed
    stable-baselines3 expects.
    """
    def __init__(self, data, etf_list, reward_type='mean_cvar',
                 risk_coefficient=1.0, rebalance_period=21,
                 lookback_period=60):
        super().__init__()
        self.data = data.reset_index(drop=True)
        self.etf_list = etf_list
        self.reward_type = reward_type
        self.risk_coefficient = risk_coefficient
        self.rebalance_period = rebalance_period
        self.lookback_period = lookback_period

        n_assets = len(etf_list)

        # One action component per asset, bounded to [-10, 10].
        self.action_space = spaces.Box(low=-10, high=10, shape=(n_assets,),
                                       dtype=np.float32)

        # Everything except Date and the realised returns is an input feature.
        self.feature_cols = [
            name for name in data.columns
            if name != 'Date' and not name.startswith('Actual_Return')
        ]
        obs_length = len(self.feature_cols) * lookback_period
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf,
                                            shape=(obs_length,),
                                            dtype=np.float32)

        # Episodes start once a full lookback window is available.
        self.current_step = self.lookback_period
        self.current_weights = np.full(n_assets, 1 / n_assets, dtype=float)
        self.cumulative_wealth = 1.0

    def reset(self, seed=None, options=None):
        """Restart at ``lookback_period`` with equal weights and unit wealth."""
        super().reset(seed=seed)
        if seed is not None:
            np.random.seed(seed)
        n_assets = len(self.etf_list)
        self.current_step = self.lookback_period
        self.current_weights = np.full(n_assets, 1 / n_assets, dtype=float)
        self.cumulative_wealth = 1.0
        return self._get_obs(), {}

    def _get_obs(self):
        """Flatten the ``lookback_period`` rows before ``current_step`` into float32."""
        first_row = self.current_step - self.lookback_period
        recent = self.data.iloc[first_row:self.current_step]
        return recent[self.feature_cols].values.flatten().astype(np.float32)

    def _action_to_weights(self, action):
        """
        Turn a raw action vector into non-negative weights summing to one.

        A softmax with temperature 1.0 is used; the maximum is subtracted
        before exponentiating so large action values do not overflow.
        """
        temperature = 1.0
        logits = action / temperature
        unnormalised = np.exp(logits - np.max(logits))
        return unnormalised / unnormalised.sum()

    def calculate_reward(self, portfolio_return, asset_returns):
        """Return the step reward for the configured ``reward_type``.

        Raises ValueError for an unsupported ``reward_type``.
        """
        kind = self.reward_type
        if kind == 'cumulative_return':
            return self.cumulative_wealth - 1.0
        if kind == 'log_wealth':
            return np.log(self.cumulative_wealth)
        if kind == 'mean_var':
            return portfolio_return - self.risk_coefficient * np.var(asset_returns)
        if kind == 'mean_cvar':
            alpha = 0.05
            cutoff = np.percentile(asset_returns, 100*alpha)
            # Mean of the asset returns at or below the 5th-percentile cutoff.
            tail_mean = np.mean(asset_returns[asset_returns <= cutoff])
            return portfolio_return - self.risk_coefficient * tail_mean
        raise ValueError(f"Unknown reward_type {self.reward_type}")

    def step(self, action):
        """Advance one row: update the weights, then score the next row's returns.

        Returns ``(observation, reward, done, False, {})``.
        """
        upcoming = self.current_step + 1

        if self.current_step % self.rebalance_period != 0:
            # Between rebalances the action is ignored; weights drift with the
            # current row's realised returns and are renormalised.
            drift = np.array([
                self.data.loc[self.current_step, f'Actual_Return_{ticker}']
                for ticker in self.etf_list
            ])
            self.current_weights *= (1 + drift)
            self.current_weights /= self.current_weights.sum()
        else:
            # Rebalance step: the action sets the new weights.
            self.current_weights = self._action_to_weights(action)

        if upcoming < len(self.data):
            next_returns = np.array([
                self.data.loc[upcoming, f'Actual_Return_{ticker}']
                for ticker in self.etf_list
            ])
            realised = float(np.dot(self.current_weights, next_returns))
            self.cumulative_wealth *= (1 + realised)
            reward = self.calculate_reward(realised, next_returns)
            finished = (upcoming >= len(self.data) - 1)
        else:
            # No further row to score against.
            finished = True
            reward = 0.0

        self.current_step += 1
        return self._get_obs(), reward, finished, False, {}

# -------------------------------------------------------------------
# Data preparation (Step 1)
# -------------------------------------------------------------------
# Input files: Stage 2 RL observations (predicted returns, SHAP, etc.) and prices.
stage2_file = 'stage2_rl_observations_optimized_10ETFs.csv'
price_file = 'stock_prices_10ETFs.csv'

stage2 = pd.read_csv(stage2_file, parse_dates=['Date'])
prices = pd.read_csv(price_file)
# Parse price dates as UTC, then strip the timezone before merging.
prices['Date'] = pd.to_datetime(prices['Date'], utc=True).dt.tz_localize(None)

# Prefix every price column with 'Price_' and join the two tables on Date.
prices = prices.rename(
    columns={name: f'Price_{name}' for name in prices.columns if name != 'Date'}
)
data_merged = stage2.merge(prices, on='Date', how='inner')

# Compute technical indicators (volatility, momentum, moving averages)
# as outlined in the methodology (20‑day volatility, 5/10/20‑day momentum,
# 5‑ and 20‑day moving averages and crossover).


def add_features(df, etfs):
    """Return a copy of ``df`` extended with rolling technical features.

    For every ticker in ``etfs`` the column ``Price_<ticker>`` is read and the
    following columns are appended, in this order:

    * ``Volatility_<ticker>``   - 20-row rolling std of simple returns
    * ``Momentum_{5,10,20}d_<ticker>`` - rolling sum of simple returns
    * ``MA_5d_<ticker>`` / ``MA_20d_<ticker>`` - rolling mean of the price
    * ``MA_Crossover_<ticker>`` - 5-row mean minus 20-row mean

    Rows containing a NaN in any column (including the rolling warm-up rows)
    are dropped from the result.  The input frame is not modified.
    """
    enriched = df.copy()
    for ticker in etfs:
        px = enriched[f'Price_{ticker}']
        simple_ret = px.pct_change()

        enriched[f'Volatility_{ticker}'] = simple_ret.rolling(20).std()
        for span in (5, 10, 20):
            enriched[f'Momentum_{span}d_{ticker}'] = simple_ret.rolling(span).sum()

        fast_ma = px.rolling(5).mean()
        slow_ma = px.rolling(20).mean()
        enriched[f'MA_5d_{ticker}'] = fast_ma
        enriched[f'MA_20d_{ticker}'] = slow_ma
        enriched[f'MA_Crossover_{ticker}'] = fast_ma - slow_ma
    return enriched.dropna()

# Build the technical features for the sector ETFs used by the RL stage.
# NOTE(review): nine tickers are listed here although the input file names say
# "10ETFs" -- confirm the tenth ETF is excluded on purpose.
data_with_features = add_features(data_merged, ['XLB','XLE','XLF','XLI','XLK','XLP','XLY','XLV','XLU'])

# Optionally, filter out predicted returns or SHAP metrics; here we include both
# because they are key inputs in Step 4’s observation space.
def filter_features(df, include_predicted_returns=True, include_shap_metrics=True):
    """Return a copy of ``df`` with optional column families removed.

    Columns whose name contains ``'Predicted_Return'`` are dropped when
    ``include_predicted_returns`` is false; columns whose name contains
    ``'SHAP'`` are dropped when ``include_shap_metrics`` is false.  With both
    flags true the result is an unchanged copy.  ``df`` itself is never
    modified.
    """
    unwanted_tags = []
    if not include_predicted_returns:
        unwanted_tags.append('Predicted_Return')
    if not include_shap_metrics:
        unwanted_tags.append('SHAP')

    doomed = [name for name in df.columns
              if any(tag in name for tag in unwanted_tags)]
    return df.drop(columns=doomed)

# Both flags are True, so no Predicted_Return / SHAP column is removed here;
# the call only returns a copy of the frame.
data_with_features = filter_features(data_with_features, True, True)

# -------------------------------------------------------------------
# Rolling window splits (Stage 2 Initial Training, Validation, Test)
# -------------------------------------------------------------------
# Use 10 years for training, 2 years for validation, 3 years for test
# (expressed in rows, at 252 trading days per year).
train_days = 252*10
val_days   = 252*2
test_days  = 252*3

lookback  = 60         # 60-day lookback (recommended)
rebalance = 21         # monthly rebalance (21 trading days)

# In a real implementation you would loop over many start dates; here we take the first one
# NOTE(review): iloc slicing does not raise when the frame has fewer than
# train_days + val_days + test_days (= 3780) rows; the later splits are then
# silently shortened or empty -- confirm the data is long enough.
# reset_index(drop=True) gives every split a 0-based integer index, which the
# environment uses for row lookups by step number.
start_idx = 0
train_data = data_with_features.iloc[start_idx:start_idx+train_days].reset_index(drop=True)
val_data   = data_with_features.iloc[start_idx+train_days:
                                     start_idx+train_days+val_days].reset_index(drop=True)
test_data  = data_with_features.iloc[start_idx+train_days+val_days:
                                     start_idx+train_days+val_days+test_days].reset_index(drop=True)

# Standardise features
# 'Date' and the realised 'Actual_Return*' columns are excluded so they stay
# in their original units; the scaler statistics come from the training split
# only.
feature_cols = [c for c in train_data.columns if c not in ['Date']
                and not c.startswith('Actual_Return')]
scaler = StandardScaler().fit(train_data[feature_cols])

def scale(df):
    """Standardise the feature columns of ``df`` with the fitted ``scaler``.

    Columns listed in the module-level ``feature_cols`` are replaced by their
    transformed values; every other column is carried over unchanged.  The
    result keeps the index and the column order of ``df``.
    """
    scaled_part = pd.DataFrame(
        scaler.transform(df[feature_cols]),
        index=df.index,
        columns=feature_cols,
    )
    untouched = [name for name in df.columns if name not in feature_cols]
    combined = pd.concat([scaled_part, df[untouched]], axis=1)
    # Restore the caller's column order.
    return combined[df.columns]

# Apply the scaler fitted on the training split to all three splits.
train_scaled = scale(train_data)
val_scaled   = scale(val_data)
test_scaled  = scale(test_data)

# -------------------------------------------------------------------
# PPO Training with improved hyper‑parameters (Step 5)
# -------------------------------------------------------------------
def linear_schedule(initial_value, final_value):
    """Build a schedule that interpolates linearly between two values.

    The returned callable takes ``progress_remaining`` and evaluates
    ``final_value + progress_remaining * (initial_value - final_value)``:
    ``final_value`` at 0 and (up to rounding) ``initial_value`` at 1.
    """
    span = initial_value - final_value

    def schedule(progress_remaining):
        return final_value + progress_remaining * span

    return schedule

# Use a vectorised environment with 10 parallel instances for faster training
n_envs = 10
def make_env():
    """Create one PortfolioEnv over the scaled training split.

    Uses the nine sector tickers, the 'mean_cvar' reward with a risk
    coefficient of 1.0, and the module-level ``rebalance`` / ``lookback``
    settings.
    """
    return PortfolioEnv(train_scaled, 
                        ['XLB','XLE','XLF','XLI','XLK','XLP','XLY','XLV','XLU'],
                        reward_type='mean_cvar',
                        risk_coefficient=1.0,
                        rebalance_period=rebalance,
                        lookback_period=lookback)

# The factory itself (not an environment instance) is passed n_envs times;
# the vectorised wrapper is asked to start its workers with 'spawn'.
vec_env = SubprocVecEnv([make_env for _ in range(n_envs)], start_method='spawn')

# PPO hyper-parameters inspired by recent research.
# n_steps collects roughly 3 years of daily data per environment: 252 * 3 = 756
# With n_envs = 10 each rollout holds 756 * 10 = 7560 transitions, which the
# batch size of 1260 splits into exactly 6 minibatches.
n_steps = 252 * 3
ppo_model = PPO(
    policy='MlpPolicy',
    env=vec_env,
    learning_rate=linear_schedule(3e-4, 1e-5),
    n_steps=n_steps,
    batch_size=1260,           # 252 * 5
    n_epochs=16,
    gamma=0.9,                 # lower discount to focus on near-term returns
    gae_lambda=0.9,
    clip_range=0.25,
    policy_kwargs=dict(
        net_arch=[64, 64],
        activation_fn=torch.nn.Tanh,
        log_std_init=-1.0
    ),
    ent_coef=0.01,
    seed=SEED,
    verbose=1
)

# Train for 7.5 million timesteps (≈600 episodes × 10 envs × 252×5 steps)
total_timesteps = int(7.5e6)
ppo_model.learn(total_timesteps=total_timesteps)

# Save the trained model (this runs unconditionally)
ppo_model.save('ppo_stage2_best_model.zip')

# -------------------------------------------------------------------
# Validation and early stopping (Step 6)
# -------------------------------------------------------------------
# After training, evaluate on the validation set without updating parameters
# Single deterministic pass over the validation split; the summed reward is
# only printed.
# NOTE(review): no early-stopping logic is implemented here despite the
# section title.
val_env = PortfolioEnv(val_scaled,
                       ['XLB','XLE','XLF','XLI','XLK','XLP','XLY','XLV','XLU'],
                       reward_type='mean_cvar',
                       risk_coefficient=1.0,
                       rebalance_period=rebalance,
                       lookback_period=lookback)
obs, _ = val_env.reset(seed=SEED)
done = False
val_reward = 0.0
while not done:
    # deterministic=True: use the policy's deterministic action, no sampling
    action, _ = ppo_model.predict(obs, deterministic=True)
    # step() returns (obs, reward, done, truncated, info); the last two are unused
    obs, reward, done, _, _ = val_env.step(action)
    val_reward += reward
print(f"Total validation reward: {val_reward:.4f}")

# If necessary, you can adjust hyperparameters and re‑train based on validation performance.

# -------------------------------------------------------------------
# Out‑of‑sample testing (2022–2024) and performance metrics (Step 6)
# -------------------------------------------------------------------
test_env = PortfolioEnv(test_scaled,
                        ['XLB','XLE','XLF','XLI','XLK','XLP','XLY','XLV','XLU'],
                        reward_type='mean_cvar',
                        risk_coefficient=1.0,
                        rebalance_period=rebalance,
                        lookback_period=lookback)

# Seed the reset the same way as the validation rollout so that the
# out-of-sample evaluation is reproducible.
obs, _ = test_env.reset(seed=SEED)
done = False
rebalance_dates = []
weights_history = []
while not done:
    # produce an action every step; env will apply it only on rebalance dates
    action, _ = ppo_model.predict(obs, deterministic=True)
    obs, _, done, _, _ = test_env.step(action)
    # step() applies the action when its *pre-increment* current_step is a
    # multiple of the rebalance period and only then advances current_step.
    # The step that was just processed is therefore current_step - 1; testing
    # the post-increment value would capture weights that have drifted for a
    # whole period instead of the freshly chosen allocation.
    applied_step = test_env.current_step - 1
    if applied_step % rebalance == 0:
        date = test_scaled.loc[applied_step, 'Date']
        rebalance_dates.append(date)
        weights_history.append([date] + test_env.current_weights.tolist())

# Save monthly weights to CSV
weights_df = pd.DataFrame(weights_history, columns=['Date'] + ['XLB','XLE','XLF','XLI','XLK','XLP','XLY','XLV','XLU'])
weights_df.to_csv('ppo_stage2_weights.csv', index=False)

# Compute drifted daily returns and compare to equal weights
# (similar to your existing evaluation code)
def compute_returns(weights, price_df, horizon_days=None):
    """Turn rebalance weights into daily RL and equal-weight portfolio returns.

    On every date present in ``weights`` the RL portfolio is reset to that row
    and the benchmark is reset to equal weights; on all other dates both
    portfolios drift with that day's realised returns.  The return booked for
    a day uses the weights held at the end of the previous day, except on the
    first day, where the day's own weights are used.

    Args:
        weights: DataFrame with a ``Date`` column plus one column per ETF.
        price_df: Price DataFrame indexed by date.  A ``Price_`` prefix in the
            column names is ignored.  The caller's frame is not modified.
        horizon_days: Calendar days of returns kept after the last rebalance
            date.  Defaults to the module-level ``rebalance`` setting.

    Returns:
        DataFrame indexed by date with float columns ``RL`` and ``Equal``.

    Raises:
        ValueError: If ``weights`` has no rows or shares no ETF column with
            ``price_df``.
    """
    if horizon_days is None:
        horizon_days = rebalance

    # Strip the prefix on a renamed copy instead of assigning to
    # price_df.columns, which would silently alter the caller's DataFrame.
    price_df = price_df.rename(columns=lambda c: c.replace('Price_', ''))

    common = [c for c in weights.columns if c in price_df.columns]
    if not common:
        raise ValueError("No common ETFs found between weights and prices DataFrames.")
    if weights.empty:
        raise ValueError("weights DataFrame contains no rebalance rows.")

    daily_returns = price_df[common].pct_change().dropna()
    # Keep only the shared tickers, in the same order as the return columns,
    # so each weight vector lines up with the return vector.
    weights = weights.set_index('Date')[common]
    start = weights.index.min()
    # NOTE(review): the horizon is added as calendar days even though the
    # default comes from a trading-day count, so the final holding period is
    # shorter than the earlier ones -- confirm this is intended.
    end = weights.index.max() + pd.Timedelta(days=horizon_days)
    daily_returns = daily_returns.loc[start:end]

    eq_weight = np.full(len(common), 1.0 / len(common))
    cur_w = weights.iloc[0].to_numpy(dtype=float)
    cur_eq = eq_weight
    prev_w = None
    prev_eq = None
    rows = []

    for d, rets in zip(daily_returns.index, daily_returns.to_numpy(dtype=float)):
        if d in weights.index:
            cur_w = weights.loc[d].to_numpy(dtype=float)
            cur_eq = eq_weight
        else:
            cur_w = cur_w * (1 + rets)
            cur_w = cur_w / cur_w.sum()
            cur_eq = cur_eq * (1 + rets)
            cur_eq = cur_eq / cur_eq.sum()

        # Carry yesterday's weights forward explicitly rather than shifting a
        # whole DataFrame on every iteration (which made the loop quadratic).
        held_w = cur_w if prev_w is None else prev_w
        held_eq = cur_eq if prev_eq is None else prev_eq
        rows.append((np.dot(held_w, rets), np.dot(held_eq, rets)))
        prev_w, prev_eq = cur_w, cur_eq

    returns_df = pd.DataFrame(rows, index=daily_returns.index,
                              columns=['RL', 'Equal'], dtype=float)
    return returns_df.dropna()

# Compute test returns
# prices still carries the 'Price_' column prefix added during data
# preparation; set_index returns a new frame indexed by Date.
test_returns = compute_returns(weights_df, prices.set_index('Date'))
# Compound the daily returns into a total return over the test window.
cum_rl    = (1 + test_returns['RL']).prod() - 1
cum_equal = (1 + test_returns['Equal']).prod() - 1
print(f"Out‑of‑sample cumulative return (RL):    {cum_rl:.4%}")
print(f"Out‑of‑sample cumulative return (Equal): {cum_equal:.4%}")

# You can also compute annualised return, volatility, Sharpe ratio and max drawdown
def performance_metrics(returns, freq=252):
    """Summarise a series of periodic simple returns.

    Args:
        returns: Series (or other 1-D sequence) of simple returns; object
            dtype is coerced to float.
        freq: Number of periods per year used for annualisation.

    Returns:
        Tuple ``(ann_return, ann_vol, sharpe, max_dd)``: geometric annualised
        return, annualised standard deviation, their ratio (no risk-free rate
        is subtracted; NaN when the volatility is zero) and the most negative
        peak-to-trough drawdown, measured against a starting wealth of 1.
        All four are NaN for empty input.
    """
    returns = pd.Series(returns).astype(float)
    if returns.empty:
        # Avoid the division by len(returns) == 0 below.
        return np.nan, np.nan, np.nan, np.nan

    ann_return = (1 + returns).prod() ** (freq / len(returns)) - 1
    ann_vol = returns.std() * np.sqrt(freq)
    sharpe = ann_return / ann_vol if ann_vol != 0 else np.nan

    cum_pnl = (1 + returns).cumprod()
    # The wealth curve starts at 1 before the first return, so the running
    # peak may never be below 1; otherwise a loss on the very first day(s)
    # would not register as a drawdown.
    running_peak = cum_pnl.cummax().clip(lower=1.0)
    max_dd = (cum_pnl / running_peak - 1).min()
    return ann_return, ann_vol, sharpe, max_dd

# Annualised statistics for the RL strategy and the equal-weight benchmark.
# The volatilities (rl_vol, eq_vol) are computed but not printed.
rl_ann, rl_vol, rl_sharpe, rl_dd = performance_metrics(test_returns['RL'])
eq_ann, eq_vol, eq_sharpe, eq_dd = performance_metrics(test_returns['Equal'])
print(f"RL annualised return:    {rl_ann:.4%}, Sharpe: {rl_sharpe:.3f}, Max Drawdown: {rl_dd:.4%}")
print(f"Equal annualised return: {eq_ann:.4%}, Sharpe: {eq_sharpe:.3f}, Max Drawdown: {eq_dd:.4%}")


In [None]:
# -------------------------------------------------------------------
# Validation and early stopping (Step 6)
# -------------------------------------------------------------------
# After training, evaluate on the validation set without updating parameters
# Single deterministic pass over the validation split; the summed reward is
# only printed.
# NOTE(review): no early-stopping logic is implemented here despite the
# section title.
val_env = PortfolioEnv(val_scaled,
                       ['XLB','XLE','XLF','XLI','XLK','XLP','XLY','XLV','XLU'],
                       reward_type='mean_cvar',
                       risk_coefficient=1.0,
                       rebalance_period=rebalance,
                       lookback_period=lookback)
obs, _ = val_env.reset(seed=SEED)
done = False
val_reward = 0.0
while not done:
    # deterministic=True: use the policy's deterministic action, no sampling
    action, _ = ppo_model.predict(obs, deterministic=True)
    # step() returns (obs, reward, done, truncated, info); the last two are unused
    obs, reward, done, _, _ = val_env.step(action)
    val_reward += reward
print(f"Total validation reward: {val_reward:.4f}")

# If necessary, you can adjust hyperparameters and re‑train based on validation performance.

# -------------------------------------------------------------------
# Out‑of‑sample testing (2022–2024) and performance metrics (Step 6)
# -------------------------------------------------------------------
test_env = PortfolioEnv(test_scaled,
                        ['XLB','XLE','XLF','XLI','XLK','XLP','XLY','XLV','XLU'],
                        reward_type='mean_cvar',
                        risk_coefficient=1.0,
                        rebalance_period=rebalance,
                        lookback_period=lookback)

# Seed the reset the same way as the validation rollout so that the
# out-of-sample evaluation is reproducible.
obs, _ = test_env.reset(seed=SEED)
done = False
rebalance_dates = []
weights_history = []
while not done:
    # produce an action every step; env will apply it only on rebalance dates
    action, _ = ppo_model.predict(obs, deterministic=True)
    obs, _, done, _, _ = test_env.step(action)
    # step() applies the action when its *pre-increment* current_step is a
    # multiple of the rebalance period and only then advances current_step.
    # The step that was just processed is therefore current_step - 1; testing
    # the post-increment value would capture weights that have drifted for a
    # whole period instead of the freshly chosen allocation.
    applied_step = test_env.current_step - 1
    if applied_step % rebalance == 0:
        date = test_scaled.loc[applied_step, 'Date']
        rebalance_dates.append(date)
        weights_history.append([date] + test_env.current_weights.tolist())

# Save monthly weights to CSV
weights_df = pd.DataFrame(weights_history, columns=['Date'] + ['XLB','XLE','XLF','XLI','XLK','XLP','XLY','XLV','XLU'])
weights_df.to_csv('ppo_stage2_weights.csv', index=False)

# Compute drifted daily returns and compare to equal weights
# (similar to your existing evaluation code)
def compute_returns(weights, price_df, horizon_days=None):
    """Turn rebalance weights into daily RL and equal-weight portfolio returns.

    On every date present in ``weights`` the RL portfolio is reset to that row
    and the benchmark is reset to equal weights; on all other dates both
    portfolios drift with that day's realised returns.  The return booked for
    a day uses the weights held at the end of the previous day, except on the
    first day, where the day's own weights are used.

    Args:
        weights: DataFrame with a ``Date`` column plus one column per ETF.
        price_df: Price DataFrame indexed by date.  A ``Price_`` prefix in the
            column names is ignored.  The caller's frame is not modified.
        horizon_days: Calendar days of returns kept after the last rebalance
            date.  Defaults to the module-level ``rebalance`` setting.

    Returns:
        DataFrame indexed by date with float columns ``RL`` and ``Equal``.

    Raises:
        ValueError: If ``weights`` has no rows or shares no ETF column with
            ``price_df``.
    """
    if horizon_days is None:
        horizon_days = rebalance

    # Strip the prefix on a renamed copy instead of assigning to
    # price_df.columns, which would silently alter the caller's DataFrame.
    price_df = price_df.rename(columns=lambda c: c.replace('Price_', ''))

    common = [c for c in weights.columns if c in price_df.columns]
    if not common:
        raise ValueError("No common ETFs found between weights and prices DataFrames.")
    if weights.empty:
        raise ValueError("weights DataFrame contains no rebalance rows.")

    daily_returns = price_df[common].pct_change().dropna()
    # Keep only the shared tickers, in the same order as the return columns,
    # so each weight vector lines up with the return vector.
    weights = weights.set_index('Date')[common]
    start = weights.index.min()
    # NOTE(review): the horizon is added as calendar days even though the
    # default comes from a trading-day count, so the final holding period is
    # shorter than the earlier ones -- confirm this is intended.
    end = weights.index.max() + pd.Timedelta(days=horizon_days)
    daily_returns = daily_returns.loc[start:end]

    eq_weight = np.full(len(common), 1.0 / len(common))
    cur_w = weights.iloc[0].to_numpy(dtype=float)
    cur_eq = eq_weight
    prev_w = None
    prev_eq = None
    rows = []

    for d, rets in zip(daily_returns.index, daily_returns.to_numpy(dtype=float)):
        if d in weights.index:
            cur_w = weights.loc[d].to_numpy(dtype=float)
            cur_eq = eq_weight
        else:
            cur_w = cur_w * (1 + rets)
            cur_w = cur_w / cur_w.sum()
            cur_eq = cur_eq * (1 + rets)
            cur_eq = cur_eq / cur_eq.sum()

        # Carry yesterday's weights forward explicitly rather than shifting a
        # whole DataFrame on every iteration (which made the loop quadratic).
        held_w = cur_w if prev_w is None else prev_w
        held_eq = cur_eq if prev_eq is None else prev_eq
        rows.append((np.dot(held_w, rets), np.dot(held_eq, rets)))
        prev_w, prev_eq = cur_w, cur_eq

    returns_df = pd.DataFrame(rows, index=daily_returns.index,
                              columns=['RL', 'Equal'], dtype=float)
    return returns_df.dropna()

# Compute test returns
# prices still carries the 'Price_' column prefix added during data
# preparation; set_index returns a new frame indexed by Date.
test_returns = compute_returns(weights_df, prices.set_index('Date'))
# Compound the daily returns into a total return over the test window.
cum_rl    = (1 + test_returns['RL']).prod() - 1
cum_equal = (1 + test_returns['Equal']).prod() - 1
print(f"Out‑of‑sample cumulative return (RL):    {cum_rl:.4%}")
print(f"Out‑of‑sample cumulative return (Equal): {cum_equal:.4%}")

# You can also compute annualised return, volatility, Sharpe ratio and max drawdown
def performance_metrics(returns, freq=252):
    """Summarise a series of periodic simple returns.

    Args:
        returns: Series (or other 1-D sequence) of simple returns; object
            dtype is coerced to float.
        freq: Number of periods per year used for annualisation.

    Returns:
        Tuple ``(ann_return, ann_vol, sharpe, max_dd)``: geometric annualised
        return, annualised standard deviation, their ratio (no risk-free rate
        is subtracted; NaN when the volatility is zero) and the most negative
        peak-to-trough drawdown, measured against a starting wealth of 1.
        All four are NaN for empty input.
    """
    returns = pd.Series(returns).astype(float)
    if returns.empty:
        # Avoid the division by len(returns) == 0 below.
        return np.nan, np.nan, np.nan, np.nan

    ann_return = (1 + returns).prod() ** (freq / len(returns)) - 1
    ann_vol = returns.std() * np.sqrt(freq)
    sharpe = ann_return / ann_vol if ann_vol != 0 else np.nan

    cum_pnl = (1 + returns).cumprod()
    # The wealth curve starts at 1 before the first return, so the running
    # peak may never be below 1; otherwise a loss on the very first day(s)
    # would not register as a drawdown.
    running_peak = cum_pnl.cummax().clip(lower=1.0)
    max_dd = (cum_pnl / running_peak - 1).min()
    return ann_return, ann_vol, sharpe, max_dd

# Annualised statistics for the RL strategy and the equal-weight benchmark.
# The volatilities (rl_vol, eq_vol) are computed but not printed.
rl_ann, rl_vol, rl_sharpe, rl_dd = performance_metrics(test_returns['RL'])
eq_ann, eq_vol, eq_sharpe, eq_dd = performance_metrics(test_returns['Equal'])
print(f"RL annualised return:    {rl_ann:.4%}, Sharpe: {rl_sharpe:.3f}, Max Drawdown: {rl_dd:.4%}")
print(f"Equal annualised return: {eq_ann:.4%}, Sharpe: {eq_sharpe:.3f}, Max Drawdown: {eq_dd:.4%}")

In [None]:
import pandas as pd
import numpy as np

# Load the rebalance weights and the daily ETF returns, both indexed by Date
port_wts = pd.read_csv('ppo_allocations_multiple_iterations_DIA_ETF.csv', parse_dates=['Date'], index_col='Date')
daily_returns = pd.read_csv('daily_returns_10ETFs.csv', parse_dates=['Date'], index_col='Date')

common_tickers = [col for col in port_wts.columns if col in daily_returns.columns]
daily_returns = daily_returns[common_tickers]
# Restrict the weights to the same tickers, in the same order, so that every
# weight vector lines up with the return vector it is multiplied with.
# NOTE(review): if the weights file holds tickers missing from the returns
# file, the remaining weights no longer sum to 1 on rebalance dates.
rebalance_weights = port_wts[common_tickers]

# Keep returns from the first rebalance date to 21 calendar days after the last
start_date, end_date = port_wts.index.min(), port_wts.index.max() + pd.Timedelta(days=21)
daily_returns = daily_returns.loc[start_date:end_date]

# Initialize drifted weights with the first available rebalance weights
initial_weights = rebalance_weights.loc[start_date].values

equal_weight = np.array([1.0 / len(common_tickers)] * len(common_tickers))

current_weights = initial_weights
current_equal_weights = equal_weight
# End-of-previous-day weights; None until the first day has been processed
previous_weights = None
previous_equal_weights = None

drifted_rows = []
equal_rows = []
return_rows = []

for current_date in daily_returns.index:
    day_return = daily_returns.loc[current_date].values

    if current_date in rebalance_weights.index:
        # Rebalance date: assign new weights and reset the benchmark
        current_weights = rebalance_weights.loc[current_date].values
        current_equal_weights = equal_weight
    else:
        # Drift the weights with the current day's realised returns
        drifted_wts_numerator = current_weights * (1 + day_return)
        current_weights = drifted_wts_numerator / np.sum(drifted_wts_numerator)

        equal_drifted_numerator = current_equal_weights * (1 + day_return)
        current_equal_weights = equal_drifted_numerator / np.sum(equal_drifted_numerator)

    # The day's return is earned on the weights held at the end of the
    # previous day; on the first day the day's own weights are used.  Carrying
    # the previous vectors forward avoids re-shifting both full DataFrames on
    # every iteration.
    if previous_weights is None:
        held_weights, held_equal_weights = current_weights, current_equal_weights
    else:
        held_weights, held_equal_weights = previous_weights, previous_equal_weights
    return_rows.append((np.dot(held_weights, day_return),
                        np.dot(held_equal_weights, day_return)))

    drifted_rows.append(current_weights)
    equal_rows.append(current_equal_weights)
    previous_weights, previous_equal_weights = current_weights, current_equal_weights

# Assemble the results as float DataFrames (the per-cell .loc assignments used
# before produced object-dtype frames)
drifted_weights = pd.DataFrame(drifted_rows, index=daily_returns.index, columns=common_tickers, dtype=float)
equal_weights = pd.DataFrame(equal_rows, index=daily_returns.index, columns=common_tickers, dtype=float)
returns_df = pd.DataFrame(return_rows, index=daily_returns.index,
                          columns=['Optimal_Portfolio_Return', 'Equal_Weight_Return'], dtype=float)
shifted_drifted_weights = drifted_weights.shift(1)
shifted_equal_weights = equal_weights.shift(1)

# Quick visual check
print("Drifted weights (head):\n", drifted_weights.head())
print("\nPortfolio returns (head):\n", returns_df.head())

# Save the results
drifted_weights.to_csv('drifted_weights_corrected.csv')
equal_weights.to_csv('equal_weights.csv')
returns_df.to_csv('portfolio_returns_combined.csv')

