In [7]:
import sys
import os

# Add Week 3 to Python path
sys.path.insert(0, os.path.abspath(".."))

import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.logger import configure

from day2_environment.trading_env import TradingEnv


In [8]:
log_dir = "logs/ppo_day5"
os.makedirs(log_dir, exist_ok=True)

env = TradingEnv(max_steps=100)
env = Monitor(env, log_dir)


In [9]:
new_logger = configure(log_dir, ["stdout", "csv"])

model = PPO(
    policy="MlpPolicy",
    env=env,
    verbose=1
)

model.set_logger(new_logger)


Logging to logs/ppo_day5
Using cpu device
Wrapping the env in a DummyVecEnv.


In [10]:
model.learn(total_timesteps=50_000)


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 100      |
|    ep_rew_mean     | -0.419   |
| time/              |          |
|    fps             | 513      |
|    iterations      | 1        |
|    time_elapsed    | 3        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 100         |
|    ep_rew_mean          | -0.499      |
| time/                   |             |
|    fps                  | 416         |
|    iterations           | 2           |
|    time_elapsed         | 9           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013274909 |
|    clip_fraction        | 0.197       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.09       |
|    explained_variance   | 0.553       |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x13e15b98050>

In [11]:
import pandas as pd

progress_path = os.path.join(log_dir, "progress.csv")

progress = pd.read_csv(
    progress_path,
    usecols=[
        "time/total_timesteps",
        "rollout/ep_rew_mean",
        "train/entropy_loss"
    ]
)

# Downsample heavily to avoid memory issues
progress_small = progress.iloc[::20]

progress_small.to_csv("day5_metrics.csv", index=False)

print("Saved day5_metrics.csv")
print(progress_small.head())
print(progress_small.tail())


Saved day5_metrics.csv
    time/total_timesteps  rollout/ep_rew_mean  train/entropy_loss
0                   2048            -0.419067                 NaN
20                 43008            -0.025752           -0.533455
    time/total_timesteps  rollout/ep_rew_mean  train/entropy_loss
0                   2048            -0.419067                 NaN
20                 43008            -0.025752           -0.533455


In [12]:
obs, _ = env.reset()

for _ in range(30):
    action, _ = model.predict(obs, deterministic=True)
    obs, reward, terminated, truncated, info = env.step(action)

    print(
        f"Action: {action}, "
        f"Reward: {reward:.5f}, "
        f"Inventory: {info['inventory']:.2f}"
    )

    if terminated or truncated:
        obs, _ = env.reset()


Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inventory: 0.00
Action: 0, Reward: 0.00000, Inve

In [13]:
model.save("ppo_trading_model")


Gym has been unmaintained since 2022 and does not support NumPy 2.0 amongst other critical functionality.
Please upgrade to Gymnasium, the maintained drop-in replacement of Gym, or contact the authors of your software and request that they upgrade.
Users of this version of Gym should be able to simply replace 'import gym' with 'import gymnasium as gym' in the vast majority of cases.
See the migration guide at https://gymnasium.farama.org/introduction/migration_guide/ for additional information.


## Day 5 â€“ Learnability Verdict

A 50,000-timestep PPO training run was completed successfully using the
risk-aware TradingEnv developed in earlier stages.

Training metrics were logged using Stable-Baselines3 monitoring utilities.
Due to repeated Jupyter kernel crashes during plotting on Windows,
metrics were exported and analyzed externally from the generated CSV logs.

Observations:
- Mean episode reward stabilized near zero over training.
- Policy entropy decreased gradually, indicating convergence of the policy.
- The learned policy converged to a deterministic and conservative
  Hold-dominant strategy with near-zero inventory.

Verdict:
The agent demonstrates stable and coherent learning behavior.
The environment, reward function, and PPO algorithm together form a
well-posed and learnable system.

One concrete improvement:
Introduce structured order flow and multi-agent interactions to create
non-zero expected trading opportunities and richer market dynamics.
