In [None]:
## Imports
import gymnasium as gym
from typing import Callable

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.vec_env import VecMonitor
import torch as th
from stable_baselines3.common.callbacks import EvalCallback

import pickle

## Import Our environment
from dev_env import tradingEng





In [None]:
## Load Paths
def _read_pickle(file_name):
    """Open `file_name` in binary mode and return the object unpickled from it."""
    # NOTE: pickle.load can execute arbitrary code; only point this at files
    # produced by a trusted source.
    with open(file_name, "rb") as handle:
        return pickle.load(handle)


# Each training set is stored as two pickle files; the halves are loaded in
# order and joined with `+`.
paths1 = _read_pickle("ZeroCorrFrs1Half") + _read_pickle("ZeroCorrFrs2Half")
paths2 = _read_pickle("ZeroCorrSnd1Half") + _read_pickle("ZeroCorrSnd2Half")

# Paths stored in a single file; used further down for the evaluation env.
paths_ev = _read_pickle("ZeroCorrTest")


In [None]:
## Linear LR schedule, see SB3 Documentation at https://stable-baselines3.readthedocs.io/en/master/guide/examples.html#learning-rate-schedule
def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """
    Build a learning-rate schedule that scales linearly with remaining progress.

    :param initial_value: Learning rate used at the very start of training.
    :return: a callable mapping the remaining training progress to the
      learning rate for that point in training
    """
    def scaled_rate(progress_remaining: float) -> float:
        """
        Scale the initial rate by the fraction of training still to run.

        :param progress_remaining: remaining progress, going from 1
          (beginning) down to 0.
        :return: learning rate for this point in training
        """
        current_rate = progress_remaining * initial_value
        return current_rate

    return scaled_rate

## Policy Kwargs
# Keyword arguments forwarded to the SB3 policy constructor via PPO(policy_kwargs=...).
# - activation_fn: non-linearity used in the MLP layers.
# - net_arch: hidden-layer widths for the separate policy (pi) and value (vf) networks.
# - optimizer_class: optimizer used for the policy parameters. This was previously
#   nested inside `net_arch` under the misspelled key `optimizers_class`; `net_arch`
#   is only consulted for its `pi` / `vf` entries, so the setting never reached the
#   policy. It now sits at the top level under the keyword the policy accepts.
#   Adam is also SB3's default optimizer, so this should not change training behaviour.
policy_kwargs = dict(
    activation_fn=th.nn.LeakyReLU,
    net_arch=dict(pi=[512, 512, 256, 128], vf=[512, 512, 256, 128]),
    optimizer_class=th.optim.Adam,
)


In [None]:
#####################
# Training section  #
#####################
# Run configuration: change the observation/action setting here only. The
# environments and every log / model path below are derived from these values,
# so they cannot drift out of sync.
OBS_MODE = 'xs'
ACTION_MODE = 'big'
RUN_NAME = OBS_MODE + ACTION_MODE  # 'xsbig'

# Sizes are kept as products of their original factors.
# (252 is presumably trading days per year -- TODO confirm.)
ROLLOUT_STEPS = 252 * 12 * 5   # PPO n_steps (per environment)
MINIBATCH_SIZE = 252 * 4 * 5   # PPO batch_size
TOTAL_TIMESTEPS = 3_000_000    # int, as documented for `learn` (was the float 3e6)

# Two training environments, one per training path set, stepped in the same
# process and wrapped in a monitor that writes episode statistics to a log file.
envs = VecMonitor(DummyVecEnv([
    lambda: tradingEng(paths1, action=ACTION_MODE, obs=OBS_MODE),
    lambda: tradingEng(paths2, action=ACTION_MODE, obs=OBS_MODE),
]), filename=f'logs-{RUN_NAME}')

# Single evaluation environment built from the held-out paths.
ev_env = VecMonitor(DummyVecEnv([
    lambda: tradingEng(paths_ev, action=ACTION_MODE, obs=OBS_MODE),
]))

# Periodically evaluates the current policy and saves the best model seen so far.
# NOTE(review): per the SB3 docs `eval_freq` counts callback calls (vectorized
# env steps), so with 2 training envs an evaluation runs every
# 2 * ROLLOUT_STEPS timesteps -- confirm this is the intended cadence.
eval_callback = EvalCallback(
    ev_env,
    best_model_save_path=f'./logs/models/{RUN_NAME}_best',
    log_path=f'./logs/eval_logs/{RUN_NAME}',
    eval_freq=ROLLOUT_STEPS,
    deterministic=True,
    render=False,
    verbose=1,  # integer verbosity level (was the bool True)
    n_eval_episodes=16,
)

model = PPO(
    "MlpPolicy",
    envs,
    batch_size=MINIBATCH_SIZE,
    learning_rate=linear_schedule(0.0025),
    policy_kwargs=policy_kwargs,
    n_steps=ROLLOUT_STEPS,
    normalize_advantage=True,
    gamma=0.9,
    verbose=1,
)

model.learn(total_timesteps=TOTAL_TIMESTEPS, log_interval=2, callback=eval_callback)
model.save(f'./logs/models/{RUN_NAME}_final')  # <- Save final model in case we need it

Using cpu device
Eval num_timesteps=60480, episode_reward=-0.10 +/- 0.04
Episode length: 1259.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 1.26e+03    |
|    mean_reward          | -0.103      |
| time/                   |             |
|    total_timesteps      | 60480       |
| train/                  |             |
|    approx_kl            | 0.005688921 |
|    clip_fraction        | 0.0543      |
|    clip_range           | 0.2         |
|    entropy_loss         | -56.7       |
|    explained_variance   | -2.34       |
|    learning_rate        | 0.00247     |
|    loss                 | -0.00559    |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.00392    |
|    std                  | 0.997       |
|    value_loss           | 0.000894    |
-----------------------------------------
New best mean reward!
---------------------------------
| rollout/           |          |
|    e