In [1]:
# Imports
import gymnasium as gym
from typing import Callable

from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.vec_env import SubprocVecEnv
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.vec_env import VecMonitor
import torch as th

import pickle
import gc

# Import Our environment
from dev_env import tradingEng




In [2]:
# Load Paths
with open("ZeroCorrFrs1Half","rb") as fp:
    paths1 = pickle.load(fp)
with open("ZeroCorrFrs2Half","rb") as fp:
    paths2 = pickle.load(fp)
with open("ZeroCorrSnd1Half","rb") as fp:
    paths3 = pickle.load(fp)
with open("ZeroCorrSnd2Half","rb") as fp:
    paths4 = pickle.load(fp)

In [3]:
## LR schedule
def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """
    Linear learning rate schedule.

    :param initial_value: Initial learning rate.
    :return: schedule that computes
      current learning rate depending on remaining progress
    """
    def func(progress_remaining: float) -> float:
        """
        Progress will decrease from 1 (beginning) to 0.

        :param progress_remaining:
        :return: current learning rate
        """
        return progress_remaining * initial_value

    return func

import numpy as np
  ## LR schedule
def exp_schedule(initial_value: float, decayRate = 5) -> Callable[[float], float]:
    """
    Exp learning rate schedule.

    :param initial_value: Initial learning rate.
    :return: schedule that computes
      current learning rate depending on remaining progress
    """
    def func(progress_remaining: float) -> float:
        """
        Progress will decrease from 1 (beginning) to 0.

        :param progress_remaining:
        :return: current learning rate
        """
        factor = 1 - progress_remaining
        return (np.exp((1-factor)*decayRate) - 1) * initial_value / (np.exp(decayRate) - 1)

    return func

In [None]:
# Def Env
def start_and_release(set, action = 'small-More-Trust', obs = 'big'):
    ret = tradingEng(set, action = action, obs = obs, resetdate=5)
    del(set)
    gc.collect()
    return ret

#t = start_and_release(paths1,action='small-More-Trust', obs = 'auto')
envs = VecMonitor(DummyVecEnv([
    lambda: start_and_release(paths1,action = 'big-Magnus', obs = 'xs',),
    lambda: start_and_release(paths2,action = 'big-Magnus', obs = 'xs',),
    lambda: start_and_release(paths3,action = 'big-Magnus', obs = 'xs',),
    lambda: start_and_release(paths4,action = 'big-Magnus', obs = 'xs'), #4
]), filename='TrainLog')

# Instantiate the agent
policy_kwargs = dict(activation_fn=th.nn.LeakyReLU,
                     net_arch=dict(pi=[512,512,256,128,64,64,64,64,36,18], vf=[512,512,256,128,64,64,64,64,36,18], optimizers_class = th.optim.Adam, ortho_init = True, use_sde = True, log_std_init = .35, use_expln = True, squash_output = True))
model = PPO("MlpPolicy", envs, batch_size = 2*5*252*2, learning_rate=linear_schedule(0.001), policy_kwargs=policy_kwargs, n_steps=2*5*252*4, normalize_advantage=True, gamma = 0.9, verbose = 1)

model.learn(total_timesteps=int(5e6), log_interval=1)
# Save the agent
model.save("PPOs")

Using cpu device
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1.26e+03 |
|    ep_rew_mean     | -0.602   |
| time/              |          |
|    fps             | 1974     |
|    iterations      | 1        |
|    time_elapsed    | 20       |
|    total_timesteps | 40320    |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 1.26e+03    |
|    ep_rew_mean          | -0.6        |
| time/                   |             |
|    fps                  | 1532        |
|    iterations           | 2           |
|    time_elapsed         | 52          |
|    total_timesteps      | 80640       |
| train/                  |             |
|    approx_kl            | 0.005219996 |
|    clip_fraction        | 0.0408      |
|    clip_range           | 0.2         |
|    entropy_loss         | -58.1       |
|    explained_variance   | -2.06       |
|    learning

In [14]:
import numpy as np
print(env._get_obs())
path = paths[800]
lambdax = path.lambdas[25]
r = path.r[25]
t = path.t_s[25]
print(lambdax)
print(r)

#model.predict(observation=np.asmatrix([lambdax,r,t]),state=None, deterministic=True)

[[-0.99977158  0.05536838 -0.56841282]]
0.005196713067477058
0.030745992173113156
