# Homework 11 — SOLUTION

# RL Portfolio Manager

**This is the instructor solution. Do not distribute to students.**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO, A2C, SAC
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.callbacks import BaseCallback
import yfinance as yf
import warnings
# Silence every Python warning for the rest of the session. This keeps the
# notebook output clean but also hides deprecation notices from the libraries.
warnings.filterwarnings('ignore')

# Global plot defaults; figures that pass their own `figsize` override the size.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 5)
# Seed NumPy's global RNG. The RL models further down are given their own
# `seed=` argument when they are constructed.
np.random.seed(42)

---
## Part 1: Data Setup (15 pts)

In [None]:
# Download 30 stocks
tickers = [
    'AAPL', 'MSFT', 'GOOGL', 'AMZN', 'JPM', 'JNJ', 'V', 'PG', 'UNH', 'HD',
    'MA', 'DIS', 'NVDA', 'BAC', 'ADBE', 'CRM', 'CMCSA', 'XOM', 'CSCO', 'PFE',
    'NFLX', 'ABT', 'KO', 'PEP', 'TMO', 'AVGO', 'COST', 'WMT', 'MRK', 'CVX',
]

print(f"Downloading {len(tickers)} stocks...")
data = yf.download(tickers, start='2018-01-01', end='2024-01-01',
                   auto_adjust=True, progress=False)
# Select the adjusted closes in the same column order as `tickers`, because
# later cells label weight columns with `tickers`.
close = data['Close'][tickers]

# NOTE(review): presumably a ticker that fails to download comes back as an
# all-NaN column -- confirm against the installed yfinance version. Such a
# column would make the row-wise dropna() below discard every row, so fail
# here with a clear message instead of an IndexError further down.
failed = [ticker for ticker in tickers if close[ticker].isna().all()]
if failed:
    raise RuntimeError(f"No price data downloaded for: {failed}")

# Keep only the dates on which every ticker has a price.
prices = close.dropna()
if prices.empty:
    raise RuntimeError("No date has prices for all tickers; cannot build returns.")
returns = prices.pct_change().dropna()

print(f"Prices shape: {prices.shape}")
print(f"Date range: {prices.index[0].date()} to {prices.index[-1].date()}")
print(f"Missing values: {prices.isna().sum().sum()}")

In [None]:
# Chronological split: everything before 2023 is training data, 2023 is held out
train_mask = returns.index < '2023-01-01'
test_mask = returns.index >= '2023-01-01'

# The environment works on plain arrays, so drop the date index here
train_returns = returns.loc[train_mask].to_numpy()
test_returns = returns.loc[test_mask].to_numpy()

train_dates = returns.index[train_mask]
test_dates = returns.index[test_mask]
print(f"Train: {train_returns.shape[0]} days ({train_dates[0].date()} to {train_dates[-1].date()})")
print(f"Test:  {test_returns.shape[0]} days ({test_dates[0].date()} to {test_dates[-1].date()})")

In [None]:
class PortfolioEnv(gym.Env):
    """Long-only portfolio environment with configurable rewards.

    At every step the agent emits one score per asset. A softmax turns the
    scores into non-negative weights that sum to one; those weights earn row
    ``t`` of ``returns_data`` minus a proportional transaction cost, and then
    drift with the realised returns.

    Observation (length ``3 * n_assets``): the current weights, followed by
    the per-asset mean and standard deviation of the previous ``lookback``
    rows of returns.

    Args:
        returns_data: 2-D array-like of per-period asset returns with shape
            ``(n_periods, n_assets)``. Must contain more than ``lookback``
            rows so that at least one step can be taken.
        lookback: Number of past rows summarised in each observation; also
            the index of the first row that is traded.
        transaction_cost: Cost charged per unit of turnover, where turnover
            is the sum of absolute weight changes.
        reward_type: ``'return'``, ``'sharpe'`` or ``'return_dd'``. Any other
            value is rewarded exactly like ``'return'``.
        reward_lambda: Weight of the drawdown penalty for ``'return_dd'``.

    Raises:
        ValueError: If ``returns_data`` is not 2-D, ``lookback`` is smaller
            than 1, or there are not more rows than ``lookback``.
    """

    # Number of most recent portfolio returns used by the 'sharpe' reward.
    SHARPE_WINDOW = 20

    def __init__(self, returns_data, lookback=20, transaction_cost=0.001,
                 reward_type='return', reward_lambda=2.0):
        super().__init__()
        # Accept any array-like (e.g. a DataFrame): step() indexes rows by
        # integer position, which only works on an ndarray.
        returns_data = np.asarray(returns_data)
        if returns_data.ndim != 2:
            raise ValueError(
                "returns_data must be 2-D (n_periods, n_assets), "
                f"got shape {returns_data.shape}"
            )
        if lookback < 1:
            raise ValueError(f"lookback must be >= 1, got {lookback}")
        if returns_data.shape[0] <= lookback:
            # Without this check the very first step() would raise IndexError.
            raise ValueError(
                f"returns_data has {returns_data.shape[0]} rows but "
                f"lookback={lookback}; need more rows than lookback"
            )

        self.returns_data = returns_data
        self.n_assets = returns_data.shape[1]
        self.lookback = lookback
        self.tc = transaction_cost
        self.reward_type = reward_type
        self.reward_lambda = reward_lambda

        obs_dim = 3 * self.n_assets
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32
        )
        self.action_space = spaces.Box(
            low=-1, high=1, shape=(self.n_assets,), dtype=np.float32
        )

    def reset(self, seed=None, options=None):
        """Start a new episode at row ``lookback`` with equal weights.

        Returns:
            Tuple of the initial observation and an empty info dict.
        """
        super().reset(seed=seed)
        self.t = self.lookback
        self.weights = np.ones(self.n_assets) / self.n_assets
        self.portfolio_value = 1.0
        self.peak_value = 1.0
        self.returns_buffer = []
        self.history = []
        self.weight_history = []
        return self._get_obs(), {}

    def _get_obs(self):
        """Return weights plus mean and std of the last ``lookback`` rows."""
        recent = self.returns_data[self.t - self.lookback:self.t]
        mean_ret = recent.mean(axis=0)
        vol = recent.std(axis=0) + 1e-8
        return np.concatenate([self.weights, mean_ret, vol]).astype(np.float32)

    def _softmax(self, x):
        """Map raw scores to positive weights that sum to one."""
        # Subtracting the max leaves the result unchanged but avoids overflow.
        e = np.exp(x - x.max())
        return e / e.sum()

    def _compute_reward(self, port_return):
        """Turn this step's net portfolio return into the configured reward."""
        if self.reward_type == 'return':
            return port_return
        elif self.reward_type == 'sharpe':
            self.returns_buffer.append(port_return)
            # Until a full window exists, fall back to the raw return.
            if len(self.returns_buffer) < self.SHARPE_WINDOW:
                return port_return
            recent = np.array(self.returns_buffer[-self.SHARPE_WINDOW:])
            return recent.mean() / (recent.std() + 1e-8)
        elif self.reward_type == 'return_dd':
            # portfolio_value already includes this step's return (see step()).
            self.peak_value = max(self.peak_value, self.portfolio_value)
            dd = (self.peak_value - self.portfolio_value) / self.peak_value
            return port_return - self.reward_lambda * dd
        # Unrecognised reward types behave like 'return'.
        return port_return

    def step(self, action):
        """Rebalance to ``softmax(action)`` and realise row ``t`` of returns.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``terminated`` becomes True once every row has been consumed;
            ``truncated`` is always False and ``info`` is always empty.
        """
        new_weights = self._softmax(action)
        # Turnover is measured against the drifted weights from the last step.
        turnover = np.abs(new_weights - self.weights).sum()
        tc_cost = self.tc * turnover

        asset_returns = self.returns_data[self.t]
        port_return = np.dot(new_weights, asset_returns) - tc_cost

        self.portfolio_value *= (1 + port_return)
        # Let the holdings drift with realised returns, then renormalise.
        self.weights = new_weights * (1 + asset_returns)
        self.weights /= self.weights.sum()
        self.t += 1

        reward = self._compute_reward(port_return)

        self.history.append({
            'portfolio_value': self.portfolio_value,
            'return': port_return,
            'turnover': turnover,
        })
        # Record the target weights chosen this step, not the drifted ones.
        self.weight_history.append(new_weights.copy())

        terminated = self.t >= len(self.returns_data)
        return self._get_obs(), reward, terminated, False, {}

# Smoke-test the environment: one reset followed by a single random step
env = PortfolioEnv(train_returns)
obs, _ = env.reset()
action = env.action_space.sample()
step_result = env.step(action)
# step() returns (obs, reward, terminated, truncated, info); only the first
# three entries are reported here.
obs2, reward, done = step_result[0], step_result[1], step_result[2]
print(f"Obs shape: {obs.shape}, Action shape: {action.shape}")
print(f"Reward: {reward:.6f}, Done: {done}")
print("Environment working correctly.")

---
## Part 2: Train PPO, A2C, SAC Agents (25 pts)

In [None]:
# Callback to track training rewards
class RewardCallback(BaseCallback):
    """Record the summed reward of each finished training episode.

    Only index 0 of the vectorised ``rewards`` / ``dones`` arrays is read, so
    with several sub-environments only the first one would be tracked.
    """

    def __init__(self):
        super().__init__()
        # One entry per completed episode.
        self.episode_rewards = []
        # Running total for the episode currently in progress.
        self.current_rewards = 0

    def _on_step(self):
        """Add this step's reward and close out the episode when it ends.

        Always returns True.
        """
        step_reward = self.locals['rewards'][0]
        episode_finished = self.locals['dones'][0]

        self.current_rewards += step_reward
        if episode_finished:
            self.episode_rewards.append(self.current_rewards)
            self.current_rewards = 0
        return True

In [None]:
# Train the three agents one after another on the training returns.
# Each agent gets its own single-environment DummyVecEnv (PortfolioEnv with
# its default settings), its own RewardCallback, the same seed (42) and the
# same budget of 150_000 timesteps.

# Train PPO
print("Training PPO...")
ppo_env = DummyVecEnv([lambda: PortfolioEnv(train_returns)])
ppo_cb = RewardCallback()
ppo_model = PPO(
    'MlpPolicy', ppo_env,
    learning_rate=3e-4,
    n_steps=512,
    batch_size=128,
    n_epochs=10,
    gamma=0.99,
    gae_lambda=0.95,
    clip_range=0.2,
    verbose=0,
    seed=42,
)
ppo_model.learn(total_timesteps=150_000, callback=ppo_cb)
# Number of completed training episodes recorded by the callback.
print(f"  Episodes: {len(ppo_cb.episode_rewards)}")

# Train A2C
print("Training A2C...")
a2c_env = DummyVecEnv([lambda: PortfolioEnv(train_returns)])
a2c_cb = RewardCallback()
a2c_model = A2C(
    'MlpPolicy', a2c_env,
    learning_rate=3e-4,
    n_steps=128,
    gamma=0.99,
    gae_lambda=0.95,
    verbose=0,
    seed=42,
)
a2c_model.learn(total_timesteps=150_000, callback=a2c_cb)
print(f"  Episodes: {len(a2c_cb.episode_rewards)}")

# Train SAC
print("Training SAC...")
sac_env = DummyVecEnv([lambda: PortfolioEnv(train_returns)])
sac_cb = RewardCallback()
sac_model = SAC(
    'MlpPolicy', sac_env,
    learning_rate=3e-4,
    buffer_size=100_000,
    batch_size=256,
    gamma=0.99,
    tau=0.005,
    learning_starts=1000,
    verbose=0,
    seed=42,
)
sac_model.learn(total_timesteps=150_000, callback=sac_cb)
print(f"  Episodes: {len(sac_cb.episode_rewards)}")

print("\nAll agents trained.")

In [None]:
# Plot training curves (per-episode reward totals, lightly smoothed)
fig, ax = plt.subplots(figsize=(12, 5))
training_logs = {'PPO': ppo_cb, 'A2C': a2c_cb, 'SAC': sac_cb}
for name, cb in training_logs.items():
    # An agent that never finished an episode has nothing to draw.
    if not cb.episode_rewards:
        continue
    rewards = pd.Series(cb.episode_rewards)
    # Rolling mean over at most 5 episodes; min_periods=1 keeps the first points.
    window = min(5, len(rewards))
    smoothed = rewards.rolling(window, min_periods=1).mean()
    ax.plot(smoothed.values, label=name)

ax.set_title('Training Episode Rewards')
ax.set_xlabel('Episode')
ax.set_ylabel('Cumulative Reward')
ax.legend()
plt.show()

---
## Part 3: Benchmark Comparison (25 pts)

In [None]:
# Helper functions
def evaluate_agent(model, test_data, reward_type='return'):
    """Run one full episode of a strategy on ``test_data``.

    Args:
        model: ``None`` for the equal-weight baseline, a callable
            ``policy(obs, env)`` for a hand-written rule, or any object with
            a ``predict(obs, deterministic=True)`` method.
        test_data: Returns array handed to ``PortfolioEnv``.
        reward_type: Reward setting forwarded to the environment.

    Returns:
        Tuple of a DataFrame built from ``env.history`` and an array built
        from ``env.weight_history``.
    """
    env = PortfolioEnv(test_data, reward_type=reward_type)

    def choose_action(observation):
        # All-zero scores become equal weights after the env's softmax.
        if model is None:
            return np.zeros(env.n_assets)
        if callable(model):
            return model(observation, env)
        predicted, _state = model.predict(observation, deterministic=True)
        return predicted

    obs, _ = env.reset()
    while True:
        obs, _, terminated, truncated, _ = env.step(choose_action(obs))
        if terminated or truncated:
            break
    return pd.DataFrame(env.history), np.array(env.weight_history)

def compute_metrics(results):
    """Summarise a backtest into headline performance numbers.

    The portfolio is taken to start at a value of 1.0, a starting point that
    is not itself a row of ``results``. 'Total Return' relies on this, and
    'Max Drawdown' uses the same baseline so that a loss from the very first
    day is counted as drawdown.

    Args:
        results: DataFrame with columns ``'return'`` (per-period portfolio
            return), ``'portfolio_value'`` and ``'turnover'``; must have at
            least one row.

    Returns:
        Dict with keys 'Total Return', 'Ann. Sharpe', 'Max Drawdown',
        'Ann. Volatility' and 'Avg Turnover'. Annualisation uses 252 periods.
    """
    rets = results['return'].values
    pv = results['portfolio_value']
    period_vol = rets.std()

    # The running peak must never be below the starting capital of 1.0,
    # otherwise an initial decline is measured from an already reduced value.
    running_peak = pv.cummax().clip(lower=1.0)
    drawdown = (running_peak - pv) / running_peak

    return {
        'Total Return': pv.iloc[-1] - 1,
        # The small epsilon guards against division by zero for flat returns.
        'Ann. Sharpe': np.sqrt(252) * rets.mean() / (period_vol + 1e-8),
        'Max Drawdown': drawdown.max(),
        'Ann. Volatility': period_vol * np.sqrt(252),
        'Avg Turnover': results['turnover'].mean(),
    }

In [None]:
# Minimum variance policy (rolling covariance of the env's recent returns)
def min_var_policy(obs, env):
    """Return action scores that encode long-only minimum-variance weights.

    ``obs`` is accepted for interface compatibility but not used; the
    covariance is estimated from the last ``env.lookback`` rows of
    ``env.returns_data`` before ``env.t``.
    """
    n = env.n_assets
    window = env.returns_data[env.t - env.lookback:env.t]
    # A small ridge on the diagonal keeps the sample covariance invertible.
    sigma = np.cov(window.T) + 1e-6 * np.eye(n)
    try:
        precision = np.linalg.inv(sigma)
    except np.linalg.LinAlgError:
        # Inversion failed: fall back to equal weights.
        weights = np.ones(n) / n
    else:
        unit = np.ones(n)
        weights = precision @ unit / (unit @ precision @ unit)
        weights = np.clip(weights, 0, None)  # long-only
        weights /= weights.sum()
    # The log is the (approximate) inverse of the env's softmax; the epsilon
    # avoids log(0) for assets clipped to zero weight.
    return np.log(weights + 1e-8).astype(np.float32)

# Momentum policy: overweight recent winners
def momentum_policy(obs, env):
    """Score assets by the rank of their compounded return over the lookback.

    ``obs`` is accepted for interface compatibility but not used. The ranks
    (0 = worst performer) are standardised to zero mean and unit scale before
    being returned as float32 action scores.
    """
    window = env.returns_data[env.t - env.lookback:env.t]
    growth = np.prod(1 + window, axis=0) - 1

    # Build ranks by inverting the sort permutation: the asset at sorted
    # position k receives rank k.
    order = np.argsort(growth)
    ranks = np.empty(len(order), dtype=float)
    ranks[order] = np.arange(len(order))

    centered = ranks - ranks.mean()
    scores = centered / (ranks.std() + 1e-8)
    return scores.astype(np.float32)

In [None]:
# Evaluate all strategies on the held-out test returns.
# None selects the equal-weight baseline inside evaluate_agent; plain
# functions are used as hand-written policies; everything else is a model.
strategies = {
    'PPO': ppo_model,
    'A2C': a2c_model,
    'SAC': sac_model,
    'Equal Weight': None,
    'Min Variance': min_var_policy,
    'Momentum': momentum_policy,
}

all_results = {}
all_metrics = {}

for name, model in strategies.items():
    res, _ = evaluate_agent(model, test_returns)
    all_results[name] = res
    all_metrics[name] = compute_metrics(res)

# Rule-based benchmarks are drawn dashed, learned agents solid.
benchmark_names = ('Equal Weight', 'Min Variance', 'Momentum')

# Plot cumulative returns
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

for name, res in all_results.items():
    style = '--' if name in benchmark_names else '-'
    axes[0].plot(res['portfolio_value'].values, label=name, linestyle=style)

axes[0].set_title('Cumulative Returns — All Strategies (Test Set)')
axes[0].set_xlabel('Day')
axes[0].set_ylabel('Portfolio Value')
axes[0].legend(fontsize=9)

# Drawdown comparison
for name, res in all_results.items():
    pv = res['portfolio_value']
    # The recorded series starts after the first day's return, so the starting
    # capital of 1.0 is not a row. Floor the running peak at 1.0 so that a
    # loss from day one shows up as drawdown.
    running_peak = pv.cummax().clip(lower=1.0)
    dd = (running_peak - pv) / running_peak
    style = '--' if name in benchmark_names else '-'
    axes[1].plot(dd.values, label=name, linestyle=style, alpha=0.7)

axes[1].set_title('Drawdown — All Strategies')
axes[1].set_xlabel('Day')
axes[1].set_ylabel('Drawdown')
axes[1].legend(fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
# Summary table: one row per strategy, each metric rendered as display text
metrics_df = pd.DataFrame(all_metrics).T
column_formats = {
    'Total Return': '{:.2%}',
    'Ann. Sharpe': '{:.2f}',
    'Max Drawdown': '{:.2%}',
    'Ann. Volatility': '{:.2%}',
    'Avg Turnover': '{:.4f}',
}
for column, template in column_formats.items():
    metrics_df[column] = metrics_df[column].map(template.format)
print(metrics_df.to_string())

**Commentary:**

The RL agents produce results that are broadly comparable to classical baselines. Equal-weight remains a strong baseline due to its implicit diversification and low-cost rebalancing: in this environment it is reset to 1/N every day, so it pays transaction costs only on the small drift caused by that day's price moves. The minimum-variance portfolio tends to have lower volatility but also lower returns. RL agents can sometimes reduce drawdowns relative to equal-weight, particularly when trained with drawdown-penalizing rewards, but rarely generate significantly higher Sharpe ratios out-of-sample.

---
## Part 4: Reward Shaping Experiments (20 pts)

In [None]:
# Train PPO with different reward functions.
# Each entry is (display name, PortfolioEnv keyword overrides). For the first
# two entries the overrides are empty, so the display name itself doubles as
# the reward_type (see the kwargs.get fallback below).
reward_configs = [
    ('return', {}),
    ('sharpe', {}),
    ('return_dd (lam=1)', {'reward_type': 'return_dd', 'reward_lambda': 1.0}),
    ('return_dd (lam=5)', {'reward_type': 'return_dd', 'reward_lambda': 5.0}),
]

# Trained model per display name, in the order of reward_configs.
reward_models = {}
for name, kwargs in reward_configs:
    rtype = kwargs.get('reward_type', name)
    # 2.0 matches PortfolioEnv's own default; the env only reads it for 'return_dd'.
    rlam = kwargs.get('reward_lambda', 2.0)
    print(f"Training PPO with reward='{name}'...")
    # Default arguments (rt=..., rl=...) freeze this iteration's values inside
    # the lambda instead of leaving them bound to the loop variables.
    env = DummyVecEnv([lambda rt=rtype, rl=rlam: PortfolioEnv(
        train_returns, reward_type=rt, reward_lambda=rl
    )])
    # Same seed for every variant; each gets 100_000 timesteps.
    model = PPO('MlpPolicy', env, learning_rate=3e-4, n_steps=512,
                batch_size=128, n_epochs=10, verbose=0, seed=42)
    model.learn(total_timesteps=100_000)
    reward_models[name] = model

print("Done.")

In [None]:
# Evaluate reward variants on the test set. evaluate_agent is called with its
# default reward_type; the recorded history (value, return, turnover) does
# not depend on the reward setting.
reward_metrics = {}

fig, axes = plt.subplots(1, 3, figsize=(16, 4))

for name, model in reward_models.items():
    res, _ = evaluate_agent(model, test_returns)
    reward_metrics[name] = compute_metrics(res)
    axes[0].plot(res['portfolio_value'].values, label=name)
    pv = res['portfolio_value']
    # The recorded series starts after the first day's return, so the starting
    # capital of 1.0 is not a row. Floor the running peak at 1.0 so that a
    # loss from day one shows up as drawdown.
    running_peak = pv.cummax().clip(lower=1.0)
    dd = (running_peak - pv) / running_peak
    axes[1].plot(dd.values, label=name, alpha=0.7)
    # 20-day rolling mean of daily turnover; the first 19 points are NaN.
    axes[2].plot(pd.Series(res['turnover'].values).rolling(20).mean(), label=name, alpha=0.7)

axes[0].set_title('Portfolio Value')
axes[0].legend(fontsize=8)
axes[1].set_title('Drawdown')
axes[1].legend(fontsize=8)
axes[2].set_title('Rolling Turnover (20d)')
axes[2].legend(fontsize=8)

for ax in axes:
    ax.set_xlabel('Day')

plt.tight_layout()
plt.show()

print(pd.DataFrame(reward_metrics).T.round(4).to_string())

**Analysis:**

- The `return_dd` reward with lambda=5 most aggressively reduces drawdowns, but at the cost of lower returns. The agent becomes too conservative.
- The `sharpe` reward produces moderate turnover and a balanced risk/return profile.
- The raw `return` reward leads to the highest turnover as the agent chases short-term returns.
- In practice, `return_dd` with lambda in the range 1-3 offers a good balance. The Sharpe-based reward is harder to tune because the rolling window introduces lag.

---
## Part 5: Analysis of Agent Behavior (15 pts)

In [None]:
# Analyze PPO agent behavior on test set
best_model = ppo_model
res, weights = evaluate_agent(best_model, test_returns)

# Weight evolution — keep the 10 stocks with the largest average weight and
# lump every remaining stock into a single "Other" band
weights_df = pd.DataFrame(weights, columns=tickers)
top_10 = weights_df.mean().nlargest(10).index.tolist()
other = weights_df.drop(columns=top_10).sum(axis=1)

fig, axes = plt.subplots(2, 1, figsize=(14, 8))
weight_ax, turnover_ax = axes

plot_df = weights_df[top_10].assign(Other=other)
n_days = len(plot_df)
weight_ax.stackplot(range(n_days), plot_df.values.T,
                    labels=plot_df.columns, alpha=0.8)
weight_ax.set_title('PPO Agent — Portfolio Weight Evolution (Top 10 + Other)')
weight_ax.set_xlabel('Day')
weight_ax.set_ylabel('Weight')
# Legend sits outside the axes, to the right of the plot
weight_ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize=8)
weight_ax.set_xlim(0, n_days)

# Turnover: raw daily series plus its 20-day moving average
daily_turnover = res['turnover'].values
turnover_ma = pd.Series(daily_turnover).rolling(20).mean()
turnover_ax.plot(daily_turnover, alpha=0.5, linewidth=0.8, label='Daily')
turnover_ax.plot(turnover_ma, color='red', linewidth=2, label='20-day MA')
turnover_ax.set_title('PPO Agent — Daily Turnover')
turnover_ax.set_xlabel('Day')
turnover_ax.set_ylabel('Turnover')
turnover_ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Overfitting check: run the same model on the data it was trained on and on
# the held-out year, then put the metrics side by side
res_train, _ = evaluate_agent(best_model, train_returns)
res_test, _ = evaluate_agent(best_model, test_returns)

print("PPO — Train vs Test Performance:")
split_metrics = {
    'Train': compute_metrics(res_train),
    'Test': compute_metrics(res_test),
}
comparison = pd.DataFrame(split_metrics)
print(comparison.round(4).to_string())

In [None]:
# Multi-seed robustness: retrain PPO from scratch with seeds 0-4 and look at
# how much the test-set metrics vary between runs.
print("\nMulti-seed robustness (PPO, 5 seeds):")
# One metrics dict (from compute_metrics) per seed.
seed_results = []

for seed in range(5):
    # Fresh environment and model per seed; only `seed` differs between runs.
    env = DummyVecEnv([lambda: PortfolioEnv(train_returns)])
    model = PPO('MlpPolicy', env, learning_rate=3e-4, n_steps=512,
                batch_size=128, n_epochs=10, seed=seed, verbose=0)
    model.learn(total_timesteps=80_000)
    res, _ = evaluate_agent(model, test_returns)
    m = compute_metrics(res)
    seed_results.append(m)

# Rows are seeds, columns are metrics.
seed_df = pd.DataFrame(seed_results)
print(f"{'Metric':<20} {'Mean':>10} {'Std':>10}")
print('-' * 40)
for col in seed_df.columns:
    # Mean and standard deviation of each metric across the five seeds.
    print(f"{col:<20} {seed_df[col].mean():>10.4f} {seed_df[col].std():>10.4f}")

**Behavioral Analysis:**

The PPO agent tends to concentrate positions in a subset of stocks (typically those with the highest recent Sharpe ratios) and adjusts allocations based on recent volatility. Turnover is moderate — because the environment applies a fresh set of softmax target weights at every step, there is some trading every day, but large reallocations occur mainly when return/risk profiles shift. The train/test gap in Sharpe ratio indicates some degree of overfitting, which is expected given that RL agents optimize over the full training trajectory. Multi-seed analysis shows moderate variance, suggesting the agent is learning some signal rather than purely memorizing. Overall, the agent behaves like a risk-aware momentum trader, but should not be deployed without further walk-forward validation and position concentration limits.