# Week 11 Seminar — RL for Portfolio Management (Hands-On)

**Exercises:**
1. Set up a FinRL-style portfolio environment (a custom Gymnasium env) with 10 stocks, train PPO (25 min)
2. Custom reward functions: return, Sharpe-based, return - lambda * drawdown (25 min)
3. Compare PPO vs A2C vs DDPG (20 min)
4. Discussion: How to evaluate whether an RL agent has learned or merely overfit (20 min)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gymnasium as gym
from gymnasium import spaces
from stable_baselines3 import PPO, A2C, DDPG
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.noise import NormalActionNoise
import yfinance as yf
import warnings
# Suppress every Python warning for the rest of the session so cell output
# stays readable.
warnings.filterwarnings('ignore')

# Apply the plotting style first, then override the default figure size on
# top of it (the order matters: the style is applied before the override).
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (12, 5)
# Seed NumPy's global random number generator.
np.random.seed(42)

---
## Exercise 1: Set Up Environment with 10 Stocks, Train PPO (25 min)

We'll build a clean portfolio environment that works with real stock data.

In [None]:
# Download 10 stocks
tickers = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'JPM',
           'JNJ', 'XOM', 'PG', 'NVDA', 'V']

print("Downloading stock data...")
data = yf.download(tickers, start='2019-01-01', end='2024-01-01',
                   auto_adjust=True, progress=False)
# Keep the columns in `tickers` order and drop every date on which at least
# one ticker has no close, so all assets share the same calendar.
prices = data['Close'][tickers].dropna()

# A failed download (or a single ticker with no data) leaves nothing after
# dropna(); fail here with a clear message instead of an opaque IndexError
# further down.
if len(prices) < 2:
    raise RuntimeError(
        "Price download returned fewer than two complete rows for "
        f"{tickers}; check the network connection and ticker symbols."
    )

# Simple daily returns as a plain (n_days, n_assets) array; the first row is
# dropped because it has no previous close to compare against.
returns = prices.pct_change().dropna().values

print(f"Data shape: {returns.shape}  ({returns.shape[0]} days, {returns.shape[1]} stocks)")
# index[1] because the first price row has no return.
print(f"Date range: {prices.index[1].date()} to {prices.index[-1].date()}")

In [None]:
class PortfolioEnv(gym.Env):
    """Portfolio allocation environment with a configurable reward.

    Each step the agent emits one raw score per asset. The scores are passed
    through a softmax, so the resulting weights are positive and sum to 1
    (long-only, fully invested). The observation is the concatenation of the
    current weights with the per-asset mean and standard deviation of the
    last ``lookback`` rows of returns, i.e. ``3 * n_assets`` float32 values.

    Args:
        returns_data: 2-D array-like of simple returns, one row per period
            and one column per asset.
        lookback: Number of past rows summarised in the observation; also
            the index of the first row traded after ``reset``.
        transaction_cost: Proportional cost charged on turnover (the L1
            distance between the new and the current weights).
        reward_type: ``'return'`` (net portfolio return), ``'sharpe'``
            (rolling mean / std of net returns) or ``'return_dd'`` (net
            return minus a drawdown penalty). Any other value falls back to
            the plain net return.
        drawdown_penalty: Multiplier applied to the current drawdown in the
            ``'return_dd'`` reward.
        sharpe_window: Number of most recent net returns used by the
            ``'sharpe'`` reward. Until that many have been collected in the
            current episode, the raw net return is used instead.

    Raises:
        ValueError: If ``returns_data`` is not 2-D, if ``lookback`` leaves
            no row to trade on, or if ``sharpe_window`` is not positive.
    """

    def __init__(self, returns_data, lookback=20, transaction_cost=0.001,
                 reward_type='return', drawdown_penalty=2.0, sharpe_window=20):
        super().__init__()
        # Accept any array-like (e.g. a DataFrame) but index it as an ndarray.
        returns_data = np.asarray(returns_data)
        if returns_data.ndim != 2:
            raise ValueError(
                "returns_data must be 2-D (periods x assets), "
                f"got {returns_data.ndim} dimension(s)"
            )
        # reset() starts at t = lookback, so at least one row must follow.
        if not 1 <= lookback < returns_data.shape[0]:
            raise ValueError(
                f"lookback must be in [1, {returns_data.shape[0] - 1}] for "
                f"{returns_data.shape[0]} rows of data, got {lookback}"
            )
        if sharpe_window < 1:
            raise ValueError(f"sharpe_window must be >= 1, got {sharpe_window}")

        self.returns_data = returns_data
        self.n_assets = returns_data.shape[1]
        self.lookback = lookback
        self.tc = transaction_cost
        self.reward_type = reward_type
        self.drawdown_penalty = drawdown_penalty
        self.sharpe_window = sharpe_window

        obs_dim = 3 * self.n_assets  # weights + mean_ret + vol
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(obs_dim,), dtype=np.float32
        )
        # Raw per-asset scores; step() turns them into weights via softmax.
        self.action_space = spaces.Box(
            low=-1, high=1, shape=(self.n_assets,), dtype=np.float32
        )

    def reset(self, seed=None, options=None):
        """Start a new episode at row ``lookback`` with equal weights.

        Returns:
            ``(observation, info)`` where ``info`` is an empty dict.
        """
        super().reset(seed=seed)
        self.t = self.lookback
        self.weights = np.ones(self.n_assets) / self.n_assets
        self.portfolio_value = 1.0
        self.peak_value = 1.0
        self.returns_history = []  # net returns, used by the 'sharpe' reward
        self.history = []          # one dict per step, see step()
        return self._get_obs(), {}

    def _get_obs(self):
        """Build the observation from the weights and the trailing window."""
        # The slice ends at t (exclusive), so the row that step() is about to
        # realise is never part of the observation.
        recent = self.returns_data[self.t - self.lookback:self.t]
        mean_ret = recent.mean(axis=0)
        vol = recent.std(axis=0) + 1e-8
        return np.concatenate([self.weights, mean_ret, vol]).astype(np.float32)

    def _softmax(self, x):
        """Return the softmax of ``x``, shifted by its max for stability."""
        e = np.exp(x - x.max())
        return e / e.sum()

    def _compute_reward(self, port_return):
        """Map the net portfolio return of this step to the configured reward."""
        if self.reward_type == 'return':
            return port_return

        elif self.reward_type == 'sharpe':
            self.returns_history.append(port_return)
            # Not enough history yet for a rolling estimate.
            if len(self.returns_history) < self.sharpe_window:
                return port_return
            recent = np.array(self.returns_history[-self.sharpe_window:])
            return recent.mean() / (recent.std() + 1e-8)

        elif self.reward_type == 'return_dd':
            # portfolio_value has already been updated for this step.
            self.peak_value = max(self.peak_value, self.portfolio_value)
            drawdown = (self.peak_value - self.portfolio_value) / self.peak_value
            return port_return - self.drawdown_penalty * drawdown

        # Unknown reward types fall back to the plain net return.
        return port_return

    def step(self, action):
        """Rebalance to the weights implied by ``action`` and realise one row.

        Returns:
            ``(observation, reward, terminated, truncated, info)``.
            ``terminated`` becomes True once the last row of ``returns_data``
            has been consumed; ``truncated`` is always False and ``info`` is
            an empty dict.
        """
        new_weights = self._softmax(action)
        # Turnover is measured against the drifted weights held coming into
        # this step.
        turnover = np.abs(new_weights - self.weights).sum()
        tc_cost = self.tc * turnover

        asset_returns = self.returns_data[self.t]
        port_return = np.dot(new_weights, asset_returns) - tc_cost

        self.portfolio_value *= (1 + port_return)
        # Let the weights drift with the realised returns, then renormalise.
        self.weights = new_weights * (1 + asset_returns)
        self.weights /= self.weights.sum()
        self.t += 1

        reward = self._compute_reward(port_return)

        self.history.append({
            'portfolio_value': self.portfolio_value,
            'return': port_return,
            'turnover': turnover,
        })

        terminated = self.t >= len(self.returns_data)
        return self._get_obs(), reward, terminated, False, {}

In [None]:
# Train/test split: 80/20
# The split is chronological (no shuffling): the first 80% of days are used
# for training and the remaining 20% are held out for testing.
split = int(0.8 * len(returns))
train_returns = returns[:split]
test_returns = returns[split:]

print(f"Train: {train_returns.shape[0]} days")
print(f"Test:  {test_returns.shape[0]} days")

# Train PPO
train_env = DummyVecEnv([lambda: PortfolioEnv(train_returns, reward_type='return')])

ppo_model = PPO(
    'MlpPolicy', train_env,
    learning_rate=3e-4,
    n_steps=256,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    # Seeding NumPy alone does not pin the policy initialisation or action
    # sampling; pass an explicit seed to the algorithm so reruns of this
    # cell are comparable.
    seed=42,
    verbose=0,
)

print("Training PPO...")
ppo_model.learn(total_timesteps=100_000)
print("Done.")

In [None]:
# Helper: run agent on test data
def evaluate_agent(model, test_data, reward_type='return', deterministic=True):
    """Roll a policy through one full episode and return its per-step log.

    Args:
        model: Object exposing ``predict(obs, deterministic=...)``, or
            ``None`` for the equal-weight baseline.
        test_data: Returns array handed to a fresh ``PortfolioEnv``.
        reward_type: Reward type of the evaluation environment.
        deterministic: Forwarded to ``model.predict``.

    Returns:
        DataFrame built from the environment's ``history`` (one row per
        step: ``portfolio_value``, ``return``, ``turnover``).
    """
    env = PortfolioEnv(test_data, reward_type=reward_type)
    obs, _ = env.reset()
    while True:
        if model is not None:
            action, _ = model.predict(obs, deterministic=deterministic)
        else:
            # All-zero scores become equal weights after the env's softmax.
            action = np.zeros(env.n_assets)
        obs, _, terminated, truncated, _ = env.step(action)
        if terminated or truncated:
            break
    return pd.DataFrame(env.history)

def compute_metrics(results):
    """Summarise one evaluation run.

    Args:
        results: DataFrame with ``portfolio_value``, ``return`` and
            ``turnover`` columns, one row per step, for a portfolio that
            started at a value of 1.0.

    Returns:
        Dict with ``'Total Return'`` (final value minus the starting 1.0),
        ``'Sharpe'`` (mean / population std of per-step returns scaled by
        sqrt(252), no risk-free rate), ``'Max Drawdown'`` (largest relative
        drop from the running peak) and ``'Avg Turnover'``.

    Raises:
        IndexError: If ``results`` has no rows.
    """
    rets = results['return'].values
    pv = results['portfolio_value']
    # The log only holds post-step values, so the starting capital of 1.0 is
    # not in the series. Floor the running peak at 1.0 so a loss from the
    # very first step counts as drawdown, consistent with 'Total Return'.
    running_peak = pv.cummax().clip(lower=1.0)
    drawdown = (running_peak - pv) / running_peak
    return {
        'Total Return': pv.iloc[-1] - 1,
        'Sharpe': np.sqrt(252) * rets.mean() / (rets.std() + 1e-8),
        'Max Drawdown': drawdown.max(),
        'Avg Turnover': results['turnover'].mean(),
    }

# Evaluate
# Roll the trained agent and the equal-weight baseline over the held-out data.
res_ppo = evaluate_agent(ppo_model, test_returns)
res_ew = evaluate_agent(None, test_returns)

# Label -> per-step log; drives both the plot and the metrics table.
curves = {'PPO': res_ppo, 'Equal Weight': res_ew}

for label, run in curves.items():
    plt.plot(run['portfolio_value'].values, label=label)
plt.title('PPO vs Equal Weight (Test Set)')
plt.xlabel('Day')
plt.ylabel('Portfolio Value')
plt.legend()
plt.show()

# One column per strategy, one row per metric.
metrics = pd.DataFrame({label: compute_metrics(run) for label, run in curves.items()})
print(metrics.round(4))

---
## Exercise 2: Custom Reward Functions (25 min)

Train PPO with three different reward signals and compare behavior.

| Reward | Formula |
|--------|---------|
| `return` | $r_t = r_p$ |
| `sharpe` | $r_t = \bar{r}_{20} / \sigma_{20}$ |
| `return_dd` | $r_t = r_p - 2 \cdot \text{drawdown}_t$ |

In [None]:
# Train agents with different reward functions
reward_types = ['return', 'sharpe', 'return_dd']
models = {}

for rtype in reward_types:
    print(f"Training PPO with reward='{rtype}'...")
    # Bind rtype as a default argument so the factory keeps this iteration's
    # value rather than looking the loop variable up later.
    env = DummyVecEnv([lambda rt=rtype: PortfolioEnv(train_returns, reward_type=rt)])
    model = PPO(
        'MlpPolicy', env,
        learning_rate=3e-4,
        n_steps=256,
        batch_size=64,
        n_epochs=10,
        gamma=0.99,
        # Same explicit seed for every variant, so the reward function is
        # the only thing that differs between runs and the cell is
        # repeatable.
        seed=42,
        verbose=0,
    )
    model.learn(total_timesteps=80_000)
    models[rtype] = model
    print("  Done.")

print("\nAll models trained.")

In [None]:
# Evaluate all on test set
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Roll each agent through the test set exactly once and reuse the result for
# both panels; the rollout is deterministic, so a second pass would only
# repeat the same work.
test_results = {rtype: evaluate_agent(model, test_returns)
                for rtype, model in models.items()}

all_metrics = {}
for rtype, res in test_results.items():
    axes[0].plot(res['portfolio_value'].values, label=f'PPO ({rtype})')
    all_metrics[f'PPO ({rtype})'] = compute_metrics(res)

res_ew = evaluate_agent(None, test_returns)
axes[0].plot(res_ew['portfolio_value'].values, label='Equal Weight', linestyle='--', color='black')
all_metrics['Equal Weight'] = compute_metrics(res_ew)

axes[0].set_title('Portfolio Value by Reward Type')
axes[0].set_xlabel('Day')
axes[0].set_ylabel('Portfolio Value')
axes[0].legend()

# Drawdown comparison
for rtype, res in test_results.items():
    pv = res['portfolio_value']
    # The environment starts every episode at a value of 1.0 but only logs
    # post-step values; floor the running peak at 1.0 so an early loss is
    # shown as drawdown from the starting capital.
    running_peak = pv.cummax().clip(lower=1.0)
    dd = (running_peak - pv) / running_peak
    axes[1].plot(dd.values, label=f'PPO ({rtype})', alpha=0.7)

axes[1].set_title('Drawdown by Reward Type')
axes[1].set_xlabel('Day')
axes[1].set_ylabel('Drawdown')
axes[1].legend()

plt.tight_layout()
plt.show()

print(pd.DataFrame(all_metrics).round(4))

### Observations

- **Raw return** reward: agent tends to concentrate positions (higher risk, higher potential return)
- **Sharpe-based** reward: agent trades less, more diversified
- **Return - drawdown** reward: agent actively avoids drawdowns, sometimes at cost of return

**Key insight:** The reward function is the most important design choice in financial RL.

---
## Exercise 3: Compare PPO vs A2C vs DDPG (20 min)

In [None]:
# Train A2C
print("Training A2C...")
a2c_env = DummyVecEnv([lambda: PortfolioEnv(train_returns, reward_type='return')])
a2c_model = A2C(
    'MlpPolicy', a2c_env,
    learning_rate=3e-4,
    n_steps=128,
    gamma=0.99,
    # Explicit seed so reruns of this cell are comparable.
    seed=42,
    verbose=0,
)
a2c_model.learn(total_timesteps=80_000)
print("Done.")

# Train DDPG
print("Training DDPG...")
ddpg_env = DummyVecEnv([lambda: PortfolioEnv(train_returns, reward_type='return')])
n_actions = train_returns.shape[1]
# Zero-mean Gaussian noise with a standard deviation of 0.1 per action
# dimension, passed to DDPG as its exploration noise.
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions)
)
ddpg_model = DDPG(
    'MlpPolicy', ddpg_env,
    learning_rate=1e-3,
    action_noise=action_noise,
    buffer_size=50_000,
    batch_size=64,
    gamma=0.99,
    # Explicit seed so reruns of this cell are comparable.
    seed=42,
    verbose=0,
)
ddpg_model.learn(total_timesteps=80_000)
print("Done.")

In [None]:
# Compare all algorithms
# All three agents use the plain 'return' reward; the PPO entry is the one
# stored under that key in `models`.
algo_models = dict(PPO=models['return'], A2C=a2c_model, DDPG=ddpg_model)

algo_metrics = {}
for name in algo_models:
    model = algo_models[name]
    res = evaluate_agent(model, test_returns)
    equity_curve = res['portfolio_value'].values
    plt.plot(equity_curve, label=name)
    algo_metrics[name] = compute_metrics(res)

# Equal-weight baseline, drawn as a dashed black reference line.
res_ew = evaluate_agent(None, test_returns)
baseline_curve = res_ew['portfolio_value'].values
plt.plot(baseline_curve, label='Equal Weight', linestyle='--', color='black')
algo_metrics['Equal Weight'] = compute_metrics(res_ew)

plt.title('Algorithm Comparison (Test Set)')
plt.xlabel('Day')
plt.ylabel('Portfolio Value')
plt.legend()
plt.show()

summary_table = pd.DataFrame(algo_metrics)
print(summary_table.round(4))

### Typical observations

- **PPO**: Most stable training, consistent results across runs
- **A2C**: Faster training but higher variance between runs
- **DDPG**: Can learn good policies but sensitive to hyperparameters
- **Equal weight** is often competitive — a humbling but important baseline

---
## Exercise 4: Discussion — Learned or Overfit? (20 min)

### How to tell if your RL agent learned something real:

**1. Train vs Test performance gap**
- If train Sharpe >> test Sharpe, the agent memorized the training data
- Healthy gap: < 30% degradation

**2. Robustness checks**
- Train on multiple time periods — does the agent behave consistently?
- Shuffle training data order — does it still learn?
- Add noise to observations — graceful degradation?

**3. Behavioral analysis**
- Does the agent's allocation make economic sense?
- Does it trade too much? (High turnover = overfitting to noise)
- Does it concentrate in one stock? (Probably memorized that stock's run)

**4. Multiple seeds**
- Train 5-10 agents with different random seeds
- Report mean +/- std of metrics
- If variance across seeds > signal, you're overfitting

**5. Walk-forward validation**
- Train on 2019-2020, test on 2021
- Train on 2019-2021, test on 2022
- Train on 2019-2022, test on 2023
- Look for consistent out-of-sample performance

In [None]:
# Quick robustness check: train vs test performance
# The same trained agent is rolled over the data it was fitted on and over
# the held-out data.
res_train = evaluate_agent(models['return'], train_returns)
res_test = evaluate_agent(models['return'], test_returns)

train_metrics = compute_metrics(res_train)
test_metrics = compute_metrics(res_test)

comparison = pd.DataFrame({'Train': train_metrics, 'Test': test_metrics})
print("PPO (return reward) — Train vs Test:")
print(comparison.round(4))

# Relative degradation is only meaningful against a positive in-sample
# Sharpe: with a zero or negative denominator the ratio explodes or flips
# sign and would report nonsense.
train_sharpe = train_metrics['Sharpe']
test_sharpe = test_metrics['Sharpe']
if train_sharpe > 0:
    print(f"\nSharpe degradation: {1 - test_sharpe / train_sharpe:.1%}")
else:
    print("\nSharpe degradation: n/a (train Sharpe is not positive)")

In [None]:
# Multi-seed robustness
print("Training PPO with 5 different seeds...")
seed_results = []

# Hyperparameters shared by every run; only the seed changes.
ppo_kwargs = dict(learning_rate=3e-4, n_steps=256, batch_size=64, verbose=0)

for seed in range(5):
    env = DummyVecEnv([lambda: PortfolioEnv(train_returns)])
    model = PPO('MlpPolicy', env, seed=seed, **ppo_kwargs)
    model.learn(total_timesteps=50_000)

    # Score this seed on the held-out data.
    res = evaluate_agent(model, test_returns)
    m = compute_metrics(res)
    seed_results.append(m)
    print(f"  Seed {seed}: Sharpe={m['Sharpe']:.3f}, Return={m['Total Return']:.3%}")

# One row per seed; report mean +/- standard deviation across seeds.
seed_df = pd.DataFrame(seed_results)
sharpe_by_seed = seed_df['Sharpe']
return_by_seed = seed_df['Total Return']
print(f"\nMean Sharpe: {sharpe_by_seed.mean():.3f} +/- {sharpe_by_seed.std():.3f}")
print(f"Mean Return: {return_by_seed.mean():.3%} +/- {return_by_seed.std():.3%}")

---
## Summary

1. Built a reusable `PortfolioEnv` with configurable reward functions
2. Reward design is the single most important choice in financial RL
3. PPO is the most reliable algorithm; DDPG can work but needs tuning
4. Always check for overfitting: train/test gap, multi-seed, behavioral analysis
5. Equal weight remains a tough baseline to beat consistently