In [None]:
# stage 1 training and prediction 
import pandas as pd
import numpy as np
import xgboost as xgb
import shap
import ta
import joblib
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import time
# Load data
factors = pd.read_csv("aligned_factors.csv", index_col=0, parse_dates=True)
returns = pd.read_csv("daily_returns_10ETFs.csv", index_col=0, parse_dates=True)

# Align dates
dates = factors.index.intersection(returns.index)
factors = factors.loc[dates]
returns = returns.loc[dates]

# Compute expanded generic technical indicators
all_tech_features = []

for etf in returns.columns:
    close = (1 + returns[etf]).cumprod()
    etf_tech_features = pd.DataFrame(index=returns.index)

    etf_tech_features[f'{etf}_SMA_5'] = ta.trend.sma_indicator(close, window=5)
    etf_tech_features[f'{etf}_SMA_20'] = ta.trend.sma_indicator(close, window=20)
    etf_tech_features[f'{etf}_SMA_50'] = ta.trend.sma_indicator(close, window=50)
    etf_tech_features[f'{etf}_EMA_12'] = ta.trend.ema_indicator(close, window=12)
    etf_tech_features[f'{etf}_EMA_26'] = ta.trend.ema_indicator(close, window=26)
    etf_tech_features[f'{etf}_EMA_50'] = ta.trend.ema_indicator(close, window=50)
    etf_tech_features[f'{etf}_RSI_7'] = ta.momentum.rsi(close, window=7)
    etf_tech_features[f'{etf}_RSI_14'] = ta.momentum.rsi(close, window=14)
    etf_tech_features[f'{etf}_MACD'] = ta.trend.macd_diff(close)
    etf_tech_features[f'{etf}_ATR'] = ta.volatility.average_true_range(high=close*1.01, low=close*0.99, close=close, window=14)
    etf_tech_features[f'{etf}_Volatility_5'] = returns[etf].rolling(window=5).std()
    etf_tech_features[f'{etf}_Volatility_20'] = returns[etf].rolling(window=20).std()
    etf_tech_features[f'{etf}_Volatility_50'] = returns[etf].rolling(window=50).std()
    etf_tech_features[f'{etf}_Momentum_3'] = returns[etf].rolling(window=3).mean()
    etf_tech_features[f'{etf}_Momentum_10'] = returns[etf].rolling(window=10).mean()

    all_tech_features.append(etf_tech_features)

# Concatenate all ETF technical features at once to prevent DataFrame fragmentation
technical_features = pd.concat(all_tech_features, axis=1)

# Combine original factors with technical indicators
features = pd.concat([factors, technical_features], axis=1).dropna()

# Shift target by 1 day for next-day prediction
target_returns = returns.shift(-1).loc[features.index].dropna()
features = features.loc[target_returns.index]

# Define rolling window parameters
train_years = 10          # Length of training data in years
valid_years = 2           # Length of validation data in years
test_years = 1            # Length of testing/prediction data in years (configurable)
retrain_frequency = 1     # Retrain model every N years (configurable)
start_year = 2009
end_year = 2024

# Generic feature names
all_generic_features = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA',
                        'SMA_5', 'SMA_20', 'SMA_50', 'EMA_12', 'EMA_26', 'EMA_50',
                        'RSI_7', 'RSI_14', 'MACD', 'ATR', 'Volatility_5', 'Volatility_20', 'Volatility_50',
                        'Momentum_3', 'Momentum_10']

# Step 1: Determine top N generic important features using aggregated SHAP across all ETFs
shap_importances = pd.DataFrame(0.0, index=all_generic_features, columns=['SHAP_Value'])

for etf in returns.columns:
    print(f"Computing SHAP for ETF: {etf}")
    train_start = pd.Timestamp(2009 - train_years, 1, 1)
    train_end = pd.Timestamp(2009 - valid_years - 1, 12, 31)
    
    etf_features = [col for col in features.columns if etf in col or col in factors.columns]
    X_train = features.loc[train_start:train_end, etf_features]
    y_train = target_returns[etf].loc[train_start:train_end]

    model = xgb.XGBRegressor(tree_method='hist', device='cuda').fit(X_train, y_train)
    explainer = shap.Explainer(model)
    shap_values = explainer(X_train)

    for generic in all_generic_features:
        cols = [col for col in X_train.columns if generic in col]
        if cols:
            idx = [X_train.columns.get_loc(c) for c in cols]
            shap_importances.loc[generic] += np.mean(np.abs(shap_values.values[:, idx]))

shap_importances /= len(returns.columns)
common_generic_features = shap_importances.sort_values('SHAP_Value', ascending=False).head(10).index.tolist()

# Step 2: Retrain models using selected top generic important features
all_predictions = []
for etf in returns.columns:  # Adjust this slice for all ETFs
    
    print(f"Processing ETF: {etf}")
    selected_features = [f for f in features.columns if any(generic in f for generic in common_generic_features) or f in factors.columns]

    # for year in range(2009, 2010):  # Adjust range for all years
    year = start_year
    while year <= end_year - test_years + 1:
        print(f"\nTraining window starting year: {year}")
        start_time = time.time()
        train_start = pd.Timestamp(year - train_years, 1, 1)
        train_end = pd.Timestamp(year - valid_years - 1, 12, 31)
        valid_start = pd.Timestamp(year - valid_years, 1, 1)
        valid_end = pd.Timestamp(year - 1, 12, 31)
        test_start = pd.Timestamp(year, 1, 1)
        test_end = pd.Timestamp(year + test_years - 1, 12, 31)

        X_train = features.loc[train_start:train_end, selected_features]
        y_train = target_returns[etf].loc[train_start:train_end]

        X_valid = features.loc[valid_start:valid_end, selected_features]
        y_valid = target_returns[etf].loc[valid_start:valid_end]

        X_test = features.loc[test_start:test_end, selected_features]
        y_test = target_returns[etf].loc[test_start:test_end]

        model = xgb.XGBRegressor(
            tree_method='hist',
            device='cuda',
            objective='reg:squarederror',
            random_state=42,
            n_jobs=4
        )

        params = {
            'n_estimators': [100, 200],
            'max_depth': [3, 4, 5],
            'learning_rate': [0.01, 0.03, 0.05],
            'subsample': [0.7, 0.8],
            'colsample_bytree': [0.7, 0.8]
        }

        grid_search = GridSearchCV(model, params, cv=TimeSeriesSplit(3), scoring='neg_mean_squared_error', verbose=1, n_jobs=4)
        grid_search.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=False)

        best_model = grid_search.best_estimator_
       
        # Convert test data explicitly to DMatrix for GPU
        dtest = xgb.DMatrix(X_test, enable_categorical=False)
        
        # Make predictions using the best_model explicitly
        preds = best_model.get_booster().predict(dtest)


        mse = mean_squared_error(y_test, preds)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, preds)
        r2 = r2_score(y_test, preds)
        directional_accuracy = np.mean((np.sign(y_test) == np.sign(preds)).astype(int))

        print(f"MSE: {mse:.6f}, RMSE: {rmse:.6f}, MAE: {mae:.6f}, R2: {r2:.6f}, Directional Accuracy: {directional_accuracy:.2%}")
        joblib.dump(best_model, f"best_model_{etf}_{year}.joblib")

        # Save daily predictions
        predictions_df = pd.DataFrame({'Date': X_test.index, 'ETF': etf, 'Year': year, 
                                       'Actual_Return': y_test, 'Predicted_Return': preds})

        # SHAP values explicitly aligned
        explainer_test = shap.Explainer(best_model)
        shap_values_test = explainer_test(X_test)
        
        # Create SHAP DataFrame explicitly with 'Date' to ensure correct merging
        shap_df = pd.DataFrame(
            shap_values_test.values,
            columns=[f'SHAP_{col}' for col in X_test.columns],
            index=X_test.index
        ).reset_index().rename(columns={'index': 'Date'})
        
        # Ensure predictions_df has 'Date' column explicitly for merging
        predictions_df = predictions_df.reset_index(drop=True)
        
        # Merge explicitly by 'Date' to align SHAP values correctly
        predictions_df = pd.merge(predictions_df, shap_df, on='Date', how='left')
        
        # Append explicitly for each ETF-year combination
        all_predictions.append(predictions_df)
        year += retrain_frequency
        end_time = time.time()
        print(f"Elapsed time: {end_time - start_time:.4f} seconds")

final_predictions_df = pd.concat(all_predictions).reset_index(drop=True)
final_predictions_df.to_csv("stage1_predictions_with_shap.csv", index=False)

print("Stage 1 completed and data saved for Stage 2.")


In [None]:

import pandas as pd
import numpy as np

# Load stage 1 data explicitly
stage1_df = pd.read_csv("stage1_predictions_with_shap_DIA_ETF.csv", parse_dates=['Date'])
# stage1_df = pd.read_csv("stage1_predictions_with_shap.csv", parse_dates=['Date'])
etfs = stage1_df['ETF'].unique()

# Initialize DataFrame for daily aggregated data explicitly
dates = sorted(stage1_df['Date'].unique())
aggregated_data = pd.DataFrame({'Date': dates})

# ETF-specific predicted returns, actual returns, and volatility
for etf in etfs:
    etf_data = stage1_df[stage1_df['ETF'] == etf].set_index('Date').sort_index()

    aggregated_data[f'Predicted_Return_{etf}'] = aggregated_data['Date'].map(etf_data['Predicted_Return'])
    aggregated_data[f'Actual_Return_{etf}'] = aggregated_data['Date'].map(etf_data['Actual_Return'])

    # Explicit rolling 5-day volatility calculation
    volatility = etf_data['Actual_Return'].rolling(window=5).std()
    aggregated_data[f'Volatility_{etf}'] = aggregated_data['Date'].map(volatility)

# Define explicitly generic SHAP features to aggregate across ETFs
# generic_shap_features = [
#     'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 
#     'SMA_5', 'SMA_20', 'SMA_50',
#     'EMA_12', 'EMA_26', 'EMA_50',
#     'RSI_7', 'RSI_14', 'MACD', 'ATR',
#     'Volatility_5', 'Volatility_20', 'Volatility_50',
#     'Momentum_3', 'Momentum_10'
# ]
generic_shap_features = ['Mkt-RF',
 'Volatility_50',
 'HML',
 'Momentum_3',
 'Volatility_5',
 'Volatility_20',
 'RMW',
 'SMB',
 'CMA',
 'RSI_7']

# Aggregate SHAP values explicitly across ETFs by generic feature
shap_aggregated = {'Date': dates}
shap_df_list = []

for feature in generic_shap_features:
    # Adjust matching explicitly for 'SHAP_{ETF}_{feature}' format
    matching_shap_cols = [col for col in stage1_df.columns if col.startswith('SHAP_') and col.endswith(f'_{feature}')]
    
    if matching_shap_cols:
        # Compute daily mean explicitly across selected SHAP columns
        daily_shap_mean = stage1_df.groupby('Date')[matching_shap_cols].mean().mean(axis=1)
        shap_df_list.append(daily_shap_mean.rename(f'Avg_SHAP_{feature}'))
    else:
        print(f"Warning: No matches found explicitly for feature: {feature}")

# Concatenate aggregated SHAP features explicitly, ensuring alignment
shap_aggregated_df = pd.concat(shap_df_list, axis=1).reset_index()

# Explicit merge with ETF-specific metrics on Date to ensure alignment
aggregated_data = pd.merge(aggregated_data, shap_aggregated_df, on='Date', how='left')

# Explicit handling of missing values
aggregated_data.sort_values('Date', inplace=True)
aggregated_data.fillna(method='ffill', inplace=True)

# Remove rows explicitly if initial volatility calculations have NaNs
vol_cols = [f'Volatility_{etf}' for etf in etfs]
aggregated_data.dropna(subset=vol_cols, inplace=True)

# Check if aggregated_data is empty before saving explicitly
if aggregated_data.empty:
    print("Warning: aggregated_data is empty after processing. Please verify input data and alignment explicitly.")
else:
    aggregated_data.to_csv("stage2_rl_observations_optimized_DIA_ETF.csv", index=False)
    print(f"Optimized Stage 2 RL dataset created with shape: {aggregated_data.shape}")


In [None]:
# start of stage 2 training
import pandas as pd
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from sklearn.model_selection import ParameterGrid

import gymnasium as gym
from gymnasium import spaces
import numpy as np
import time
import os
class PortfolioEnv(gym.Env):
    def __init__(self, data, etf_list, reward_type='mean_cvar', risk_coefficient=0.5, rebalance_period=21, lookback_period=21):
        super().__init__()

        self.data = data.reset_index(drop=True)
        self.etf_list = etf_list
        self.reward_type = reward_type
        self.risk_coefficient = risk_coefficient
        self.rebalance_period = rebalance_period
        self.lookback_period = lookback_period
        self.action_space = spaces.Box(low=-1, high=1, shape=(len(etf_list),), dtype=np.float32)

        # Explicitly select feature columns (excluding Date and returns used only for calculating reward)
        self.feature_cols = [col for col in data.columns if col not in ['Date'] and not col.startswith('Actual_Return')]
        self.num_features_per_day = len(self.feature_cols)

        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf,
            shape=(self.num_features_per_day * self.lookback_period,),
            dtype=np.float32
        )

        self.current_step = self.lookback_period
        self.done = False
        self.cumulative_wealth = 1.0
        self.current_weights = np.array([1.0 / len(etf_list)] * len(etf_list))

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        if seed is not None:
            self.seed(seed)
        self.current_step = self.lookback_period
        self.done = False
        self.cumulative_wealth = 1.0
        self.current_weights = np.array([1.0 / len(self.etf_list)] * len(self.etf_list))
        return self._get_obs(), {}

    def step(self, action):
        next_step = self.current_step + 1

        if self.current_step % self.rebalance_period == 0:
            # v2 long short
            desired_long = 1.20  # 120% long exposure explicitly
            desired_short = 0.20  # 20% short exposure explicitly
            clip_bounds = (-0.2, 0.8)

            raw_weights = action.copy()

            # Separate explicitly positive (long) and negative (short) actions
            long_weights = np.maximum(raw_weights, 0)
            short_weights = np.abs(np.minimum(raw_weights, 0))

            has_longs = np.sum(long_weights) > 0
            has_shorts = np.sum(short_weights) > 0

            if has_longs and has_shorts:
                # Normal 120/20 explicitly0
                normalized_long = desired_long * long_weights / np.sum(long_weights)
                normalized_short = desired_short * short_weights / np.sum(short_weights)
            elif has_longs and not has_shorts:
                # Only long explicitly: default realistically to 100% long
                normalized_long = long_weights / np.sum(long_weights)
                normalized_short = np.zeros_like(short_weights)
            elif not has_longs and has_shorts:
                # Only short explicitly (unrealistic), fallback clearly to equal-weight long-only
                num_assets = len(raw_weights)
                normalized_long = np.ones(num_assets) / num_assets
                normalized_short = np.zeros(num_assets)
            else:
                # All zeros explicitly: fallback explicitly to equal-weight long-only
                num_assets = len(raw_weights)
                normalized_long = np.ones(num_assets) / num_assets
                normalized_short = np.zeros(num_assets)

            # Apply explicit clipping
            combined_weights = normalized_long - normalized_short
            clipped_weights = np.clip(combined_weights, clip_bounds[0], clip_bounds[1])

            # Re-separate explicitly after clipping
            long_clipped = np.maximum(clipped_weights, 0)
            short_clipped = np.abs(np.minimum(clipped_weights, 0))

            has_long_clipped = np.sum(long_clipped) > 0
            has_short_clipped = np.sum(short_clipped) > 0

            # Final explicit normalization after clipping
            if has_long_clipped and has_short_clipped:
                final_long = desired_long * long_clipped / np.sum(long_clipped)
                final_short = desired_short * short_clipped / np.sum(short_clipped)
            elif has_long_clipped and not has_short_clipped:
                final_long = long_clipped / np.sum(long_clipped)  # exactly 100% long
                final_short = np.zeros_like(short_clipped)
            else:
                # Realistic fallback explicitly: equal-weight long-only
                num_assets = len(raw_weights)
                final_long = np.ones(num_assets) / num_assets
                final_short = np.zeros(num_assets)

            final_weights = final_long - final_short
            self.current_weights = final_weights
            
            # v1 softmax normalization
            
            # temperature = 0.5  # Explicitly lower for higher concentration (try 0.2 to 0.8)
            # scaled_action = action / temperature
            # self.current_weights = np.exp(scaled_action) / np.sum(np.exp(scaled_action))

        else:
            returns_today = np.array([self.data.loc[self.current_step, f'Actual_Return_{etf}'] for etf in self.etf_list])
            self.current_weights *= (1 + returns_today)
            self.current_weights /= np.sum(self.current_weights)

        if next_step >= len(self.data):
            terminated = True
            reward = 0.0
        else:
            returns = np.array([self.data.loc[next_step, f'Actual_Return_{etf}'] for etf in self.etf_list])
            portfolio_return = np.dot(self.current_weights, returns)
            self.cumulative_wealth *= (1 + portfolio_return)
            reward = self.calculate_reward(portfolio_return, returns)
            terminated = next_step >= len(self.data) - 1

        self.current_step += 1

        return self._get_obs(), reward, terminated, False, {}

        # def _get_obs(self):
        #     obs_window = self.data.iloc[self.current_step - self.lookback_period:self.current_step]
        #     obs_window = obs_window.drop(columns=['Date']).values.flatten().astype(np.float32)
        #     return obs_window

    def _get_obs(self):
        obs_window = self.data.iloc[self.current_step - self.lookback_period:self.current_step]
        obs_window = obs_window[self.feature_cols].values.flatten().astype(np.float32)
        return obs_window

    def calculate_reward(self, portfolio_return, asset_returns):
        if self.reward_type == 'cumulative_return':
            return self.cumulative_wealth - 1.0
        elif self.reward_type == 'log_wealth':
            return np.log(self.cumulative_wealth)
        elif self.reward_type == 'mean_var':
            return portfolio_return - self.risk_coefficient * np.var(asset_returns)
        elif self.reward_type == 'mean_cvar':
            alpha = 0.05
            var = np.percentile(asset_returns, 100 * alpha)
            cvar = np.mean(asset_returns[asset_returns <= var])
            return portfolio_return - self.risk_coefficient * cvar
        else:
            raise ValueError('Invalid reward type')

    def seed(self, seed=None):
        np.random.seed(seed)

import pandas as pd
import numpy as np

def add_stable_features(df, etf_list):
    data = df.copy()

    for etf in etf_list:
        price_col = f'Price_{etf}'

        # Volatility (20-day)
        data[f'Volatility_{etf}'] = data[price_col].pct_change().rolling(20).std()

        # Momentum indicators (returns over 5, 10, 20 days)
        data[f'Momentum_5d_{etf}'] = data[price_col].pct_change(periods=5)
        data[f'Momentum_10d_{etf}'] = data[price_col].pct_change(periods=10)
        data[f'Momentum_20d_{etf}'] = data[price_col].pct_change(periods=20)

        # Moving averages (5-day and 20-day)
        data[f'MA_5d_{etf}'] = data[price_col].rolling(5).mean()
        data[f'MA_20d_{etf}'] = data[price_col].rolling(20).mean()

        # Moving average crossover (5-day MA - 20-day MA)
        data[f'MA_Crossover_{etf}'] = data[f'MA_5d_{etf}'] - data[f'MA_20d_{etf}']

    # Drop NaN values due to rolling calculations
    data.dropna(inplace=True)

    return data

def filter_features(df, include_predicted_returns=True, include_shap_metrics=True):
    df_filtered = df.copy()

    # Explicit patterns to identify columns
    predicted_return_pattern = 'Predicted_Return'
    shap_metric_pattern = 'SHAP'

    # Exclude Predicted Returns explicitly if requested
    if not include_predicted_returns:
        predicted_cols = [col for col in df_filtered.columns if predicted_return_pattern in col]
        df_filtered.drop(columns=predicted_cols, inplace=True)
        print(f"Excluded predicted return columns: {predicted_cols}")

    # Exclude SHAP-related metrics explicitly if requested
    if not include_shap_metrics:
        shap_cols = [col for col in df_filtered.columns if shap_metric_pattern in col]
        df_filtered.drop(columns=shap_cols, inplace=True)
        print(f"Excluded SHAP-related columns: {shap_cols}")

    return df_filtered

# ETFs
# etf_list = ['XLB', 'XLE', 'XLF', 'XLI', 'XLK', 'XLP', 'XLY', 'XLV', 'XLU']

etf_list = ['BA',
'AMGN',
'DIS',
'NKE',
'HON',
'MMM',
'CAT',
'KO',
'PG',
'AXP',
'JPM',
'MCD',
'HD',
'AAPL',
'CSCO',
'IBM',
'MSFT',
'TRV',
'UNH',
'CVX',
'JNJ',
'MRK',
'AMZN',
'WMT',
'INTC',
'VZ']
# Hyperparameter tuning
param_grid = {
    'learning_rate': [1e-4, 5e-5],
    'n_steps': [20, 40],
    'batch_size': [5, 10],
    'gamma': [0.98, 0.99]
}
consolidated_file = 'stage2_rl_observations_optimized_DIA_ETF.csv'
reward_type = 'mean_cvar'
# data = pd.read_csv(consolidated_file, parse_dates=['Date'])
# data = data.sort_values('Date').reset_index(drop=True)

data = pd.read_csv('stage2_rl_observations_optimized_DIA_ETF.csv', parse_dates=['Date'])
price_data = pd.read_csv('stock_prices_DIA_ETF.csv')
# price_data = pd.read_csv('stock_prices_10ETFs.csv')
# Convert the Date column in price data, handling the timezone correctly
price_data['Date'] = pd.to_datetime(price_data['Date'], utc=True)
price_data['Date'] = price_data['Date'].dt.tz_localize(None)

# Rename price columns explicitly to 'price_{ticker}'
price_cols = {col: f'Price_{col}' for col in price_data.columns if col != 'Date'}
price_data.rename(columns=price_cols, inplace=True)

# Merge datasets on Date
merged_data = pd.merge(data, price_data, on='Date', how='inner')
merged_data.reset_index(drop=True, inplace=True)
# Check if merge was successful
if len(merged_data) != len(data):
    print(f"Warning: Data length mismatch after merging (Original: {len(data)}, Merged: {len(merged_data)}).")
else:
    print("Merged successfully with aligned dates.")

data_with_features_raw = add_stable_features(merged_data, etf_list)
data_with_features_raw.reset_index(drop=True, inplace=True)


# Usage Example clearly for benchmark (only price metrics, no predicted return or SHAP):
data_with_features = filter_features(data_with_features_raw, 
                                 include_predicted_returns=True, 
                                 include_shap_metrics=True)
################### override data to use SHAP only
# data_with_features = data
################### END override 

# Define your rolling window lengths clearly:
train_window_days = 252 * 7
validation_window_days = 252
prediction_window_days = 252
lookback_period = 21
rebalance_period = 21

start_indices = range(0, len(data) - (train_window_days + validation_window_days + prediction_window_days), prediction_window_days)
all_weights = []
model_path = 'ppo_single_train_best_model_DIA_ETF.zip'

from sklearn.model_selection import ParameterSampler
def validate_and_tune(train_data, val_data, reward_type, rebalance_period=10, lookback_period=10, n_iter=8, timesteps=5000):
    best_reward, best_params = -np.inf, None

    # Narrow and meaningful parameter distribution
    param_dist = {
        'learning_rate': [3e-4, 1e-4],
        'n_steps': [20, 40],
        'batch_size': [10, 20],
        'gamma': [0.95, 0.98],
        'risk_coefficient': [0.1, 0.5, 1.0] if reward_type in ['mean_var', 'mean_cvar'] else [0.5],
    }

    sampled_params = list(ParameterSampler(param_dist, n_iter=n_iter, random_state=42))

    for params in sampled_params:
        risk_coeff = params.pop('risk_coefficient', 0.5)

        env = make_vec_env(lambda: PortfolioEnv(train_data, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period), n_envs=1)
        model = PPO('MlpPolicy', env,
                    ent_coef=0.01,    # explicitly encourages exploration
                    clip_range=0.2,
                    **params, verbose=0)
        model.learn(total_timesteps=timesteps)

        val_env = PortfolioEnv(val_data, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period)
        obs, _ = val_env.reset()
        done, total_reward = False, 0

        while not done:
            num_samples = 50  # Recommended starting point
            action_samples = []
        
            for _ in range(num_samples):
                sampled_action, _ = model.predict(obs, deterministic=False)  # obs directly
                action_samples.append(sampled_action)
        
            action = np.mean(action_samples, axis=0)
        
            obs, reward, done, _, _ = val_env.step(action)
            total_reward += reward

        if total_reward > best_reward:
            best_reward = total_reward
            best_params = {**params, 'risk_coefficient': risk_coeff}

    return best_params

def scale_data(df, feature_cols, scaler):
    scaled_features = scaler.transform(df[feature_cols])
    scaled_df = pd.DataFrame(scaled_features, columns=feature_cols, index=df.index)

    # Re-add columns that were not scaled (e.g., Date, Actual_Return_*)
    for col in df.columns:
        if col not in feature_cols:
            scaled_df[col] = df[col].values

    # Keep original column order
    scaled_df = scaled_df[df.columns]
    return scaled_df

# Main execution
from sklearn.preprocessing import StandardScaler
for idx, start_idx in enumerate(start_indices):
    # for start_idx in range(0, 252*2, 252):
    start_time = time.time()

    # Explicit indices for training, validation, and prediction datasets
    train_start_idx = start_idx
    train_end_idx = train_start_idx + train_window_days

    val_start_idx = train_end_idx
    val_end_idx = val_start_idx + validation_window_days

    pred_start_idx = val_end_idx
    pred_end_idx = pred_start_idx + prediction_window_days

    # Corresponding dates explicitly
    train_start_date = data_with_features.loc[train_start_idx, 'Date']
    train_end_date = data_with_features.loc[train_end_idx - 1, 'Date']

    val_start_date = data_with_features.loc[val_start_idx, 'Date']
    val_end_date = data_with_features.loc[val_end_idx - 1, 'Date']

    pred_start_date = data_with_features.loc[pred_start_idx, 'Date']
    pred_end_date = data_with_features.loc[pred_end_idx - 1, 'Date']

    # Clearly print ranges for clarity
    print(f"Training period: {train_start_date.date()} to {train_end_date.date()}")
    print(f"Validation period: {val_start_date.date()} to {val_end_date.date()}")
    print(f"Prediction period: {pred_start_date.date()} to {pred_end_date.date()}")

    # Explicitly subset data accordingly
    train_data = data_with_features.iloc[train_start_idx:train_end_idx].reset_index(drop=True)
    val_data = data_with_features.iloc[val_start_idx:val_end_idx].reset_index(drop=True)
    pred_data = data_with_features.iloc[pred_start_idx:pred_end_idx].reset_index(drop=True)

    feature_cols = [col for col in train_data.columns if col != 'Date' and not col.startswith('Actual_Return')]

    scaler = StandardScaler()
    scaler.fit(train_data[feature_cols])

    train_data_scaled = scale_data(train_data, feature_cols, scaler)
    val_data_scaled = scale_data(val_data, feature_cols, scaler)
    pred_data_scaled = scale_data(pred_data, feature_cols, scaler)

    print("Starting hyperparameter tuning...")
    best_params = validate_and_tune(train_data_scaled, val_data_scaled, reward_type)
    print(f"Best parameters: {best_params}")

    incremental_timesteps = 5000
    max_timesteps = 30000
    patience = 3
    
    best_val_reward = -np.inf
    no_improve_steps = 0

    risk_coeff = best_params.pop('risk_coefficient',0.5)
    policy_kwargs = dict(net_arch=[256, 256])

    env = make_vec_env(lambda: PortfolioEnv(train_data_scaled, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period), n_envs=1)
    
    # Load previous model if exists
    if idx > 0 and os.path.exists(model_path):
        print(f"Loading previous model from {model_path}...")
        model = PPO.load(model_path, env=env)
        model.set_env(env)
    else:
        print("Initializing new PPO model...")
        model = PPO('MlpPolicy', env,
                    policy_kwargs=policy_kwargs,
                    ent_coef=0.01,
                    clip_range=0.2,
                    **best_params, verbose=0)
     # always retrain
    # model = PPO('MlpPolicy', env,
    #             policy_kwargs=policy_kwargs,
    #             ent_coef=0.01,    # explicitly encourages exploration
    #             clip_range=0.2,
    #             **best_params, verbose=0)
    # print("Starting model training...")
    # model.learn(total_timesteps=20000)
    print("Starting model training with early stopping...")

    for step in range(0, max_timesteps, incremental_timesteps):
        model.learn(total_timesteps=incremental_timesteps)
    
        # Evaluate on validation environment
        val_env = PortfolioEnv(val_data_scaled, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period)
        val_obs, _ = val_env.reset()
        val_done = False
        val_total_reward = 0.0
    
        while not val_done:
            # val_action, _ = model.predict(val_obs, deterministic=True)
            num_samples = 50  # Recommended
            value_action_samples = []
    
            for _ in range(num_samples):
                value_sampled_action, _ = model.predict(val_obs, deterministic=False)
                value_action_samples.append(value_sampled_action)
        
            val_action = np.mean(value_action_samples, axis=0)    
            
            val_obs, val_reward, val_done, _, _ = val_env.step(val_action)
            val_total_reward += val_reward
    
        print(f"Step: {step + incremental_timesteps}, Validation Total Reward: {val_total_reward:.4f}")
    
        # Early stopping check
        if val_total_reward > best_val_reward:
            best_val_reward = val_total_reward
            no_improve_steps = 0
            # model.save("best_ppo_model.zip")
            model.save(model_path)
            print(f"Improved validation reward; model saved at step {step + incremental_timesteps}")
        else:
            no_improve_steps += 1
            print(f"No improvement ({no_improve_steps}/{patience})")
    
            if no_improve_steps >= patience:
                print("Early stopping explicitly triggered.")
                break
    
    # Load the best model explicitly
    model = PPO.load(model_path)
    print("Loaded the best PPO model explicitly for prediction.")



    # Ensure historical context explicitly available in prediction
    full_data = pd.concat([train_data_scaled, val_data_scaled, pred_data_scaled])
    pred_data_with_history = full_data[full_data['Date'] >= (pred_start_date - pd.Timedelta(days=lookback_period))].reset_index(drop=True)

    pred_env = PortfolioEnv(pred_data_scaled, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period)
    # pred_env = PortfolioEnv(pred_data_with_history, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period)

    obs, info = pred_env.reset()
    done = False

    action = np.zeros(len(etf_list), dtype=np.float32)

    while not done:
        if pred_env.current_step >= lookback_period and pred_env.current_step % pred_env.rebalance_period == 0:
            # obs_for_agent = pred_data_with_history.drop(columns=['Date']).iloc[pred_env.current_step - lookback_period:pred_env.current_step].values.flatten().astype(np.float32)
            # action, _ = model.predict(obs_for_agent, deterministic=True)

            # v1 normalize weight
            # action, _ = model.predict(obs, deterministic=True)
            # use determinstic = FALSE       
            # num_samples = 50  # Recommended
            # action_samples = []
            # for _ in range(num_samples):
            #     sampled_action, _ = model.predict(obs, deterministic=False)
            #     action_samples.append(sampled_action)
            # action = np.mean(action_samples, axis=0)    
            # 
            # temperature = 0.5
            # scaled_action = action / temperature
            # weights = np.exp(scaled_action) / np.sum(np.exp(scaled_action))
            # rebalance_date = pred_data_with_history.loc[pred_env.current_step, 'Date']
            # all_weights.append([rebalance_date] + weights.tolist())


            # v2 long short normalization
            # action, _ = model.predict(obs, deterministic=True)
            
            # uncomment this for predictopm
            num_samples = 50  # Recommended
            action_samples = []

            for _ in range(num_samples):
                sampled_action, _ = model.predict(obs, deterministic=False)
                action_samples.append(sampled_action)

            action = np.mean(action_samples, axis=0)    

            # Explicitly apply your new 120/20 normalization logic (to match environment step)
            desired_long = 1.20  # Explicitly 120% long exposure
            desired_short = 0.20  # Explicitly 20% short exposure
            clip_bounds = (-0.2, 0.8)

            raw_weights = action.copy()

            # Separate explicitly positive (long) and negative (short) actions
            long_weights = np.maximum(raw_weights, 0)
            short_weights = np.abs(np.minimum(raw_weights, 0))

            has_longs = np.sum(long_weights) > 0
            has_shorts = np.sum(short_weights) > 0

            if has_longs and has_shorts:
                normalized_long = desired_long * long_weights / np.sum(long_weights)
                normalized_short = desired_short * short_weights / np.sum(short_weights)
            elif has_longs and not has_shorts:
                normalized_long = long_weights / np.sum(long_weights)
                normalized_short = np.zeros_like(short_weights)
            elif not has_longs and has_shorts:
                num_assets = len(raw_weights)
                normalized_long = np.ones(num_assets) / num_assets
                normalized_short = np.zeros(num_assets)
            else:
                num_assets = len(raw_weights)
                normalized_long = np.ones(num_assets) / num_assets
                normalized_short = np.zeros(num_assets)

            combined_weights = normalized_long - normalized_short
            clipped_weights = np.clip(combined_weights, clip_bounds[0], clip_bounds[1])

            # Re-separate after clipping explicitly
            long_clipped = np.maximum(clipped_weights, 0)
            short_clipped = np.abs(np.minimum(clipped_weights, 0))

            has_long_clipped = np.sum(long_clipped) > 0
            has_short_clipped = np.sum(short_clipped) > 0

            if has_long_clipped and has_short_clipped:
                final_long = desired_long * long_clipped / np.sum(long_clipped)
                final_short = desired_short * short_clipped / np.sum(short_clipped)
            elif has_long_clipped and not has_short_clipped:
                final_long = long_clipped / np.sum(long_clipped)
                final_short = np.zeros_like(short_clipped)
            else:
                num_assets = len(raw_weights)
                final_long = np.ones(num_assets) / num_assets
                final_short = np.zeros(num_assets)

            final_weights = final_long - final_short

            rebalance_date = pred_data_with_history.loc[pred_env.current_step, 'Date']
            all_weights.append([rebalance_date] + final_weights.tolist())

        obs, _, done, _, _ = pred_env.step(action)

    end_time = time.time()
    print(f"Elapsed time: {end_time - start_time:.4f} seconds")

columns = ['Date'] + etf_list
weights_df = pd.DataFrame(all_weights, columns=columns)
weights_df.to_csv('ppo_multi_year_weights_DIA_ETF.csv', index=False)
print("Saved predictions to ppo_multi_year_weights_DIA_ETF.csv")


In [None]:
columns = ['Date'] + etf_list
weights_df = pd.DataFrame(all_weights, columns=columns)
weights_df.to_csv('ppo_multi_year_weights.csv', index=False)
print("Saved predictions to ppo_multi_year_weights.csv")


In [None]:
############################## This is start to run 25 iterations ##############################
########################################################################################################################

In [2]:
# ITERATION - final variable: 128/20 - retrain - 50kx30k sample - mean cvar - determinstic false with 50 - 7 yr train by 21 day test
# start of stage 2 training
import pandas as pd
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from sklearn.model_selection import ParameterGrid

import gymnasium as gym
from gymnasium import spaces
import numpy as np
import time
class PortfolioEnv(gym.Env):
    def __init__(self, data, etf_list, reward_type='mean_cvar', risk_coefficient=0.5, rebalance_period=21, lookback_period=21):
        super().__init__()

        self.data = data.reset_index(drop=True)
        self.etf_list = etf_list
        self.reward_type = reward_type
        self.risk_coefficient = risk_coefficient
        self.rebalance_period = rebalance_period
        self.lookback_period = lookback_period
        self.action_space = spaces.Box(low=-1, high=1, shape=(len(etf_list),), dtype=np.float32)

        # Explicitly select feature columns (excluding Date and returns used only for calculating reward)
        self.feature_cols = [col for col in data.columns if col not in ['Date'] and not col.startswith('Actual_Return')]
        self.num_features_per_day = len(self.feature_cols)

        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf,
            shape=(self.num_features_per_day * self.lookback_period,),
            dtype=np.float32
        )

        self.current_step = self.lookback_period
        self.done = False
        self.cumulative_wealth = 1.0
        self.current_weights = np.array([1.0 / len(etf_list)] * len(etf_list))

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        if seed is not None:
            self.seed(seed)
        self.current_step = self.lookback_period
        self.done = False
        self.cumulative_wealth = 1.0
        self.current_weights = np.array([1.0 / len(self.etf_list)] * len(self.etf_list))
        return self._get_obs(), {}

    def step(self, action):
        next_step = self.current_step + 1

        if self.current_step % self.rebalance_period == 0:
            # v2 long short
            desired_long = 1.20  # 120% long exposure explicitly
            desired_short = 0.20  # 20% short exposure explicitly
            clip_bounds = (-0.2, 0.8)

            raw_weights = action.copy()

            # Separate explicitly positive (long) and negative (short) actions
            long_weights = np.maximum(raw_weights, 0)
            short_weights = np.abs(np.minimum(raw_weights, 0))

            has_longs = np.sum(long_weights) > 0
            has_shorts = np.sum(short_weights) > 0

            if has_longs and has_shorts:
                # Normal 120/20 explicitly0
                normalized_long = desired_long * long_weights / np.sum(long_weights)
                normalized_short = desired_short * short_weights / np.sum(short_weights)
            elif has_longs and not has_shorts:
                # Only long explicitly: default realistically to 100% long
                normalized_long = long_weights / np.sum(long_weights)
                normalized_short = np.zeros_like(short_weights)
            elif not has_longs and has_shorts:
                # Only short explicitly (unrealistic), fallback clearly to equal-weight long-only
                num_assets = len(raw_weights)
                normalized_long = np.ones(num_assets) / num_assets
                normalized_short = np.zeros(num_assets)
            else:
                # All zeros explicitly: fallback explicitly to equal-weight long-only
                num_assets = len(raw_weights)
                normalized_long = np.ones(num_assets) / num_assets
                normalized_short = np.zeros(num_assets)

            # Apply explicit clipping
            combined_weights = normalized_long - normalized_short
            clipped_weights = np.clip(combined_weights, clip_bounds[0], clip_bounds[1])

            # Re-separate explicitly after clipping
            long_clipped = np.maximum(clipped_weights, 0)
            short_clipped = np.abs(np.minimum(clipped_weights, 0))

            has_long_clipped = np.sum(long_clipped) > 0
            has_short_clipped = np.sum(short_clipped) > 0

            # Final explicit normalization after clipping
            if has_long_clipped and has_short_clipped:
                final_long = desired_long * long_clipped / np.sum(long_clipped)
                final_short = desired_short * short_clipped / np.sum(short_clipped)
            elif has_long_clipped and not has_short_clipped:
                final_long = long_clipped / np.sum(long_clipped)  # exactly 100% long
                final_short = np.zeros_like(short_clipped)
            else:
                # Realistic fallback explicitly: equal-weight long-only
                num_assets = len(raw_weights)
                final_long = np.ones(num_assets) / num_assets
                final_short = np.zeros(num_assets)

            final_weights = final_long - final_short
            self.current_weights = final_weights
            # v1 softmax normalization
            # 
            # temperature = 0.5  # Explicitly lower for higher concentration (try 0.2 to 0.8)
            # scaled_action = action / temperature
            # self.current_weights = np.exp(scaled_action) / np.sum(np.exp(scaled_action))

        else:
            returns_today = np.array([self.data.loc[self.current_step, f'Actual_Return_{etf}'] for etf in self.etf_list])
            self.current_weights *= (1 + returns_today)
            self.current_weights /= np.sum(self.current_weights)

        if next_step >= len(self.data):
            terminated = True
            reward = 0.0
        else:
            returns = np.array([self.data.loc[next_step, f'Actual_Return_{etf}'] for etf in self.etf_list])
            portfolio_return = np.dot(self.current_weights, returns)
            self.cumulative_wealth *= (1 + portfolio_return)
            reward = self.calculate_reward(portfolio_return, returns)
            terminated = next_step >= len(self.data) - 1

        self.current_step += 1

        return self._get_obs(), reward, terminated, False, {}

        # def _get_obs(self):
        #     obs_window = self.data.iloc[self.current_step - self.lookback_period:self.current_step]
        #     obs_window = obs_window.drop(columns=['Date']).values.flatten().astype(np.float32)
        #     return obs_window

    def _get_obs(self):
        obs_window = self.data.iloc[self.current_step - self.lookback_period:self.current_step]
        obs_window = obs_window[self.feature_cols].values.flatten().astype(np.float32)
        return obs_window

    def calculate_reward(self, portfolio_return, asset_returns):
        if self.reward_type == 'cumulative_return':
            return self.cumulative_wealth - 1.0
        elif self.reward_type == 'log_wealth':
            return np.log(self.cumulative_wealth)
        elif self.reward_type == 'mean_var':
            return portfolio_return - self.risk_coefficient * np.var(asset_returns)
        elif self.reward_type == 'mean_cvar':
            alpha = 0.05
            var = np.percentile(asset_returns, 100 * alpha)
            cvar = np.mean(asset_returns[asset_returns <= var])
            return portfolio_return - self.risk_coefficient * cvar
        else:
            raise ValueError('Invalid reward type')

    def seed(self, seed=None):
        np.random.seed(seed)

import pandas as pd
import numpy as np

def add_stable_features(df, etf_list):
    data = df.copy()

    for etf in etf_list:
        price_col = f'Price_{etf}'

        # Volatility (20-day)
        data[f'Volatility_{etf}'] = data[price_col].pct_change().rolling(20).std()

        # Momentum indicators (returns over 5, 10, 20 days)
        data[f'Momentum_5d_{etf}'] = data[price_col].pct_change(periods=5)
        data[f'Momentum_10d_{etf}'] = data[price_col].pct_change(periods=10)
        data[f'Momentum_20d_{etf}'] = data[price_col].pct_change(periods=20)

        # Moving averages (5-day and 20-day)
        data[f'MA_5d_{etf}'] = data[price_col].rolling(5).mean()
        data[f'MA_20d_{etf}'] = data[price_col].rolling(20).mean()

        # Moving average crossover (5-day MA - 20-day MA)
        data[f'MA_Crossover_{etf}'] = data[f'MA_5d_{etf}'] - data[f'MA_20d_{etf}']

    # Drop NaN values due to rolling calculations
    data.dropna(inplace=True)

    return data

def filter_features(df, include_predicted_returns=True, include_shap_metrics=True):
    df_filtered = df.copy()

    # Explicit patterns to identify columns
    predicted_return_pattern = 'Predicted_Return'
    shap_metric_pattern = 'SHAP'

    # Exclude Predicted Returns explicitly if requested
    if not include_predicted_returns:
        predicted_cols = [col for col in df_filtered.columns if predicted_return_pattern in col]
        df_filtered.drop(columns=predicted_cols, inplace=True)
        print(f"Excluded predicted return columns: {predicted_cols}")

    # Exclude SHAP-related metrics explicitly if requested
    if not include_shap_metrics:
        shap_cols = [col for col in df_filtered.columns if shap_metric_pattern in col]
        df_filtered.drop(columns=shap_cols, inplace=True)
        print(f"Excluded SHAP-related columns: {shap_cols}")

    return df_filtered

# ETFs
# etf_list = ['XLB', 'XLE', 'XLF', 'XLI', 'XLK', 'XLP', 'XLY', 'XLV', 'XLU']
etf_list = ['BA',
'AMGN',
'DIS',
'NKE',
'HON',
'MMM',
'CAT',
'KO',
'PG',
'AXP',
'JPM',
'MCD',
'HD',
'AAPL',
'CSCO',
'IBM',
'MSFT',
'TRV',
'UNH',
'CVX',
'JNJ',
'MRK',
'AMZN',
'WMT',
'INTC',
'VZ']
# Hyperparameter tuning
param_grid = {
    'learning_rate': [1e-4, 5e-5],
    'n_steps': [20, 40],
    'batch_size': [5, 10],
    'gamma': [0.98, 0.99]
}
consolidated_file = 'stage2_rl_observations_optimized_DIA_ETF.csv'
reward_type = 'mean_cvar'
# data = pd.read_csv(consolidated_file, parse_dates=['Date'])
# data = data.sort_values('Date').reset_index(drop=True)

data = pd.read_csv('stage2_rl_observations_optimized_DIA_ETF.csv', parse_dates=['Date'])
price_data = pd.read_csv('stock_prices_DIA_ETF.csv')

# Convert the Date column in price data, handling the timezone correctly
price_data['Date'] = pd.to_datetime(price_data['Date'], utc=True)
price_data['Date'] = price_data['Date'].dt.tz_localize(None)

# Rename price columns explicitly to 'price_{ticker}'
price_cols = {col: f'Price_{col}' for col in price_data.columns if col != 'Date'}
price_data.rename(columns=price_cols, inplace=True)

# Merge datasets on Date
merged_data = pd.merge(data, price_data, on='Date', how='inner')
merged_data.reset_index(drop=True, inplace=True)
# Check if merge was successful
if len(merged_data) != len(data):
    print(f"Warning: Data length mismatch after merging (Original: {len(data)}, Merged: {len(merged_data)}).")
else:
    print("Merged successfully with aligned dates.")

data_with_features_raw = add_stable_features(merged_data, etf_list)
data_with_features_raw.reset_index(drop=True, inplace=True)


# Usage Example clearly for benchmark (only price metrics, no predicted return or SHAP):
data_with_features = filter_features(data_with_features_raw, 
                                 include_predicted_returns=True, 
                                 include_shap_metrics=True)
################### override data to use SHAP only
# data_with_features = data
################### END override 

# Define your rolling window lengths clearly:
train_window_days = 252 * 7
validation_window_days = 252
prediction_window_days = 252
lookback_period = 21
rebalance_period = 21

start_indices = range(0, len(data) - (train_window_days + validation_window_days + prediction_window_days), prediction_window_days)
all_weights = []

from sklearn.model_selection import ParameterSampler
def validate_and_tune(train_data, val_data, reward_type, rebalance_period=10, lookback_period=10, n_iter=8, timesteps=5000):
    best_reward, best_params = -np.inf, None

    # Narrow and meaningful parameter distribution
    param_dist = {
        'learning_rate': [3e-4, 1e-4],
        'n_steps': [20, 40],
        'batch_size': [10, 20],
        'gamma': [0.95, 0.98],
        'risk_coefficient': [0.1, 0.5, 1.0] if reward_type in ['mean_var', 'mean_cvar'] else [0.5],
    }

    sampled_params = list(ParameterSampler(param_dist, n_iter=n_iter, random_state=42))

    for params in sampled_params:
        risk_coeff = params.pop('risk_coefficient', 0.5)

        env = make_vec_env(lambda: PortfolioEnv(train_data, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period), n_envs=1)
        model = PPO('MlpPolicy', env,
                    ent_coef=0.01,    # explicitly encourages exploration
                    clip_range=0.2,
                    **params, verbose=0)
        model.learn(total_timesteps=timesteps)

        val_env = PortfolioEnv(val_data, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period)
        obs, _ = val_env.reset()
        done, total_reward = False, 0
        
        # while not done:
        #     action, _ = model.predict(obs, deterministic=True)
        #     obs, reward, done, _, _ = val_env.step(action)
        #     total_reward += reward
        
        while not done:
            num_samples = 50  # Recommended starting point
            action_samples = []
        
            for _ in range(num_samples):
                sampled_action, _ = model.predict(obs, deterministic=False)  # obs directly
                action_samples.append(sampled_action)
        
            action = np.mean(action_samples, axis=0)
        
            obs, reward, done, _, _ = val_env.step(action)
            total_reward += reward

        if total_reward > best_reward:
            best_reward = total_reward
            best_params = {**params, 'risk_coefficient': risk_coeff}

    return best_params

def scale_data(df, feature_cols, scaler):
    scaled_features = scaler.transform(df[feature_cols])
    scaled_df = pd.DataFrame(scaled_features, columns=feature_cols, index=df.index)

    # Re-add columns that were not scaled (e.g., Date, Actual_Return_*)
    for col in df.columns:
        if col not in feature_cols:
            scaled_df[col] = df[col].values

    # Keep original column order
    scaled_df = scaled_df[df.columns]
    return scaled_df

# Main execution
from sklearn.preprocessing import StandardScaler

iterations = 25
all_weights_iterations = []

for iteration in range(iterations):
    print(f"\n==== Starting Iteration {iteration + 1}/{iterations} ====")
    model_path = f"ppo_train_best_model_iteration_{iteration}.zip"
    for start_idx in start_indices:
        # for start_idx in range(0, 252*2, 252):
        start_time = time.time()
    
        # Explicit indices for training, validation, and prediction datasets
        train_start_idx = start_idx
        train_end_idx = train_start_idx + train_window_days
    
        val_start_idx = train_end_idx
        val_end_idx = val_start_idx + validation_window_days
    
        pred_start_idx = val_end_idx
        pred_end_idx = pred_start_idx + prediction_window_days
    
        # Corresponding dates explicitly
        train_start_date = data_with_features.loc[train_start_idx, 'Date']
        train_end_date = data_with_features.loc[train_end_idx - 1, 'Date']
    
        val_start_date = data_with_features.loc[val_start_idx, 'Date']
        val_end_date = data_with_features.loc[val_end_idx - 1, 'Date']
    
        pred_start_date = data_with_features.loc[pred_start_idx, 'Date']
        pred_end_date = data_with_features.loc[pred_end_idx - 1, 'Date']
    
        # Clearly print ranges for clarity
        print(f"Training period: {train_start_date.date()} to {train_end_date.date()}")
        print(f"Validation period: {val_start_date.date()} to {val_end_date.date()}")
        print(f"Prediction period: {pred_start_date.date()} to {pred_end_date.date()}")
    
        # Explicitly subset data accordingly
        train_data = data_with_features.iloc[train_start_idx:train_end_idx].reset_index(drop=True)
        val_data = data_with_features.iloc[val_start_idx:val_end_idx].reset_index(drop=True)
        pred_data = data_with_features.iloc[pred_start_idx:pred_end_idx].reset_index(drop=True)
    
        feature_cols = [col for col in train_data.columns if col != 'Date' and not col.startswith('Actual_Return')]
    
        scaler = StandardScaler()
        scaler.fit(train_data[feature_cols])
    
        train_data_scaled = scale_data(train_data, feature_cols, scaler)
        val_data_scaled = scale_data(val_data, feature_cols, scaler)
        pred_data_scaled = scale_data(pred_data, feature_cols, scaler)
    
        print("Starting hyperparameter tuning...")
        best_params = validate_and_tune(train_data_scaled, val_data_scaled, reward_type)
        print(f"Best parameters: {best_params}")
    
        incremental_timesteps = 5000
        max_timesteps = 30000
        patience = 3
        
        best_val_reward = -np.inf
        no_improve_steps = 0
    
        risk_coeff = best_params.pop('risk_coefficient',0.5)
        policy_kwargs = dict(net_arch=[256, 256])
    
        env = make_vec_env(lambda: PortfolioEnv(train_data_scaled, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period), n_envs=1)
        
        # Load previous model if exists
        if idx > 0 and os.path.exists(model_path):
            print(f"Loading previous model from {model_path}...")
            model = PPO.load(model_path, env=env)
            model.set_env(env)
        else:
            print("Initializing new PPO model...")
            model = PPO('MlpPolicy', env,
                        policy_kwargs=policy_kwargs,
                        ent_coef=0.01,
                        clip_range=0.2,
                        **best_params, verbose=0)
         # always retrain
        # model = PPO('MlpPolicy', env,
        #             policy_kwargs=policy_kwargs,
        #             ent_coef=0.01,    # explicitly encourages exploration
        #             clip_range=0.2,
        #             **best_params, verbose=0)
        # print("Starting model training...")
        # model.learn(total_timesteps=20000)
        print("Starting model training with early stopping...")
        
        # model = PPO('MlpPolicy', env,
        #             policy_kwargs=policy_kwargs,
        #             ent_coef=0.01,    # explicitly encourages exploration
        #             clip_range=0.2,
        #             **best_params, verbose=0)
        # print("Starting model training...")
        # model.learn(total_timesteps=20000)
    
        for step in range(0, max_timesteps, incremental_timesteps):
            model.learn(total_timesteps=incremental_timesteps)
        
            # Evaluate on validation environment
            val_env = PortfolioEnv(val_data_scaled, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period)
            val_obs, _ = val_env.reset()
            val_done = False
            val_total_reward = 0.0
        
            while not val_done:
                # val_action, _ = model.predict(val_obs, deterministic=True)
                # val_obs, val_reward, val_done, _, _ = val_env.step(val_action)
                # val_total_reward += val_reward
                
                num_samples = 50  # Recommended
                value_action_samples = []
        
                for _ in range(num_samples):
                    value_sampled_action, _ = model.predict(val_obs, deterministic=False)
                    value_action_samples.append(value_sampled_action)
            
                val_action = np.mean(value_action_samples, axis=0)    
                
                val_obs, val_reward, val_done, _, _ = val_env.step(val_action)
                val_total_reward += val_reward
        
            print(f"Step: {step + incremental_timesteps}, Validation Total Reward: {val_total_reward:.4f}")
        
            # Early stopping check
            if val_total_reward > best_val_reward:
                best_val_reward = val_total_reward
                no_improve_steps = 0
                # model.save("best_ppo_model.zip")
                model.save(model_path)
                print(f"Improved validation reward; model saved at step {step + incremental_timesteps}")
            else:
                no_improve_steps += 1
                print(f"No improvement ({no_improve_steps}/{patience})")
        
                if no_improve_steps >= patience:
                    print("Early stopping explicitly triggered.")
                    break
        
        # Load the best model explicitly
        # model = PPO.load("best_ppo_model.zip")
        model = PPO.load(model_path)
        
        print("Loaded the best PPO model explicitly for prediction.")
    
    
    
        # Ensure historical context explicitly available in prediction
        full_data = pd.concat([train_data_scaled, val_data_scaled, pred_data_scaled])
        pred_data_with_history = full_data[full_data['Date'] >= (pred_start_date - pd.Timedelta(days=lookback_period))].reset_index(drop=True)
    
        pred_env = PortfolioEnv(pred_data_scaled, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period)
        # pred_env = PortfolioEnv(pred_data_with_history, etf_list, reward_type, risk_coeff, rebalance_period, lookback_period)
    
        obs, info = pred_env.reset()
        done = False
    
        action = np.zeros(len(etf_list), dtype=np.float32)
    
        while not done:
            if pred_env.current_step >= lookback_period and pred_env.current_step % pred_env.rebalance_period == 0:
                # obs_for_agent = pred_data_with_history.drop(columns=['Date']).iloc[pred_env.current_step - lookback_period:pred_env.current_step].values.flatten().astype(np.float32)
                # action, _ = model.predict(obs_for_agent, deterministic=True)
    
                # v1 normalize weight
                # action, _ = model.predict(obs, deterministic=True)
                
                # num_samples = 50  # Recommended
                # action_samples = []
                # 
                # for _ in range(num_samples):
                #     sampled_action, _ = model.predict(obs, deterministic=False)
                #     action_samples.append(sampled_action)
                # 
                # action = np.mean(action_samples, axis=0)    
                # 
                # temperature = 0.5
                # scaled_action = action / temperature
                # weights = np.exp(scaled_action) / np.sum(np.exp(scaled_action))
                # rebalance_date = pred_data_with_history.loc[pred_env.current_step, 'Date']
                # all_weights.append([rebalance_date] + weights.tolist())
    
    
                # v2 long short normalization
                # action, _ = model.predict(obs, deterministic=True)
                
                num_samples = 50  # Recommended
                action_samples = []

                for _ in range(num_samples):
                    sampled_action, _ = model.predict(obs, deterministic=False)
                    action_samples.append(sampled_action)

                action = np.mean(action_samples, axis=0)    

                # Explicitly apply your new 120/20 normalization logic (to match environment step)
                desired_long = 1.20  # Explicitly 120% long exposure
                desired_short = 0.20  # Explicitly 20% short exposure
                clip_bounds = (-0.2, 0.8)

                raw_weights = action.copy()

                # Separate explicitly positive (long) and negative (short) actions
                long_weights = np.maximum(raw_weights, 0)
                short_weights = np.abs(np.minimum(raw_weights, 0))

                has_longs = np.sum(long_weights) > 0
                has_shorts = np.sum(short_weights) > 0

                if has_longs and has_shorts:
                    normalized_long = desired_long * long_weights / np.sum(long_weights)
                    normalized_short = desired_short * short_weights / np.sum(short_weights)
                elif has_longs and not has_shorts:
                    normalized_long = long_weights / np.sum(long_weights)
                    normalized_short = np.zeros_like(short_weights)
                elif not has_longs and has_shorts:
                    num_assets = len(raw_weights)
                    normalized_long = np.ones(num_assets) / num_assets
                    normalized_short = np.zeros(num_assets)
                else:
                    num_assets = len(raw_weights)
                    normalized_long = np.ones(num_assets) / num_assets
                    normalized_short = np.zeros(num_assets)

                combined_weights = normalized_long - normalized_short
                clipped_weights = np.clip(combined_weights, clip_bounds[0], clip_bounds[1])

                # Re-separate after clipping explicitly
                long_clipped = np.maximum(clipped_weights, 0)
                short_clipped = np.abs(np.minimum(clipped_weights, 0))

                has_long_clipped = np.sum(long_clipped) > 0
                has_short_clipped = np.sum(short_clipped) > 0

                if has_long_clipped and has_short_clipped:
                    final_long = desired_long * long_clipped / np.sum(long_clipped)
                    final_short = desired_short * short_clipped / np.sum(short_clipped)
                elif has_long_clipped and not has_short_clipped:
                    final_long = long_clipped / np.sum(long_clipped)
                    final_short = np.zeros_like(short_clipped)
                else:
                    num_assets = len(raw_weights)
                    final_long = np.ones(num_assets) / num_assets
                    final_short = np.zeros(num_assets)

                final_weights = final_long - final_short

                rebalance_date = pred_data_with_history.loc[pred_env.current_step, 'Date']
                # all_weights.append([rebalance_date] + final_weights.tolist())
                all_weights_iterations.append([iteration + 1, rebalance_date] + final_weights.tolist())
            
            obs, _, done, _, _ = pred_env.step(action)
    
        end_time = time.time()
        print(f"Iteration {iteration + 1}, start index {start_idx} completed in {end_time - start_time:.4f} seconds")

columns = ['Iteration', 'Date'] + etf_list
weights_df = pd.DataFrame(all_weights_iterations, columns=columns)
weights_df.to_csv('ppo_allocations_multiple_iterations_DIA_ETF.csv', index=False)
print("Saved all iterations' allocations to ppo_allocations_multiple_iterations_DIA_ETF.csv")



Merged successfully with aligned dates.


  data[f'Momentum_5d_{etf}'] = data[price_col].pct_change(periods=5)
  data[f'Momentum_10d_{etf}'] = data[price_col].pct_change(periods=10)
  data[f'Momentum_20d_{etf}'] = data[price_col].pct_change(periods=20)
  data[f'MA_5d_{etf}'] = data[price_col].rolling(5).mean()
  data[f'MA_20d_{etf}'] = data[price_col].rolling(20).mean()
  data[f'MA_Crossover_{etf}'] = data[f'MA_5d_{etf}'] - data[f'MA_20d_{etf}']
  data[f'Momentum_5d_{etf}'] = data[price_col].pct_change(periods=5)
  data[f'Momentum_10d_{etf}'] = data[price_col].pct_change(periods=10)
  data[f'Momentum_20d_{etf}'] = data[price_col].pct_change(periods=20)
  data[f'MA_5d_{etf}'] = data[price_col].rolling(5).mean()
  data[f'MA_20d_{etf}'] = data[price_col].rolling(20).mean()
  data[f'MA_Crossover_{etf}'] = data[f'MA_5d_{etf}'] - data[f'MA_20d_{etf}']
  data[f'Momentum_5d_{etf}'] = data[price_col].pct_change(periods=5)
  data[f'Momentum_10d_{etf}'] = data[price_col].pct_change(periods=10)
  data[f'Momentum_20d_{etf}'] = data[price_c


==== Starting Iteration 1/25 ====
Training period: 2009-02-06 to 2016-02-09
Validation period: 2016-02-10 to 2017-02-08
Prediction period: 2017-02-09 to 2018-02-08
Starting hyperparameter tuning...


KeyboardInterrupt: 