In [None]:
from agents import DPM_Agent, agent_loss, sampled_agent_reward
from trading_env.environment import TradingEnv

import random
import pandas as pd
import numpy as np
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Synthetic price curves: exp(rate * t) for a handful of growth/decay rates.
r = np.logspace(start=-4, stop=-2, num=4)

# Mirror the magnitudes: the negated rates come first, then the positive ones.
rates = np.concatenate([-r, r])
data_length = 100
time_array = np.linspace(0, data_length - 1, num=data_length)

# Row i holds rates[i] * t for every time step t (same result as an outer product).
rates_v_time = rates[:, np.newaxis] * time_array[np.newaxis, :]

# Duplicate the exponent along a new trailing axis of size 2, then exponentiate.
data = np.exp(np.repeat(rates_v_time[:, :, np.newaxis], 2, axis=2))

data.shape

In [None]:
# Seed every random number generator this notebook imports before building the
# agent and environment, so that any randomness they draw on is repeatable
# after a "Restart Kernel -> Run All".
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Untrained agent, and an environment built over the full synthetic data set.
agent = DPM_Agent()
env = TradingEnv(data)

In [None]:
# Baseline: run the untrained agent through the environment once, then keep a
# handle on the portfolio history the environment exposes afterwards.
loss = agent_loss(env, agent)
portfolio_vals = env.portfolio_value_hist

# NOTE(review): the x-axis starts at index 49 of time_array; presumably the
# history begins at that tick -- confirm against TradingEnv.
plt.figure(figsize=(8, 8))
plt.plot(time_array[49:], portfolio_vals)
plt.xticks(rotation=45)
plt.title('Agent performance on validation set without training.')
plt.show()

# Headline numbers for the run, printed in a fixed order.
for caption, tensor in (
    ('Initial value of portfolio: ', portfolio_vals[0]),
    ('Final value of portfolio: ', portfolio_vals[-1]),
    ('Maximum value of portfolio: ', max(portfolio_vals)),
    ('Minimum value of portfolio: ', min(portfolio_vals)),
):
    print(caption, tensor.numpy())

In [None]:
TRADING_DAYS_PER_YEAR = 253

#@tf.function
def train_step(agent,batch):
    """Run one gradient update of the agent on ``batch`` and return its reward.

    A fresh ``TradingEnv`` is built over ``batch`` with ``train_noise=0``, the
    loss is computed by ``agent_loss`` with discount ``dsct=0.99``, and the
    gradients are rescaled per variable by their mean absolute value before
    being handed to ``agent.opt``.

    Args:
        agent: object exposing ``model.trainable_variables`` and an optimizer
            ``opt`` with ``apply_gradients``.
        batch: data passed straight to ``TradingEnv``.

    Returns:
        ``-TRADING_DAYS_PER_YEAR * loss / (env._end_tick - env._start_tick)``,
        i.e. the negated loss per tick scaled to TRADING_DAYS_PER_YEAR ticks.
    """
    env = TradingEnv(batch,train_noise=0)

    # Only the forward pass needs to be recorded; computing the gradient inside
    # the context would make the tape trace the backward pass as well.
    with tf.GradientTape() as tape:
        tape.watch(agent.model.trainable_variables)
        loss = agent_loss(env,agent,dsct = 0.99)

    variables = agent.model.trainable_variables
    grads = tape.gradient(loss,variables)

    # Rescale each gradient to unit mean magnitude. divide_no_nan yields zeros
    # (instead of NaN) when a gradient is identically zero, and variables that
    # received no gradient at all (None) are left out of the update.
    updates = [
        (tf.math.divide_no_nan(g, tf.reduce_mean(tf.abs(g))), v)
        for g, v in zip(grads,variables)
        if g is not None
    ]
    agent.opt.apply_gradients(updates)

    reward = -TRADING_DAYS_PER_YEAR * loss/(env._end_tick-env._start_tick)

    return reward

In [None]:
def Callback_EarlyStopping(RewardsList, min_delta=0, patience=30):
    """Decide whether training should stop, given the reward history so far.

    The mean reward over the most recent ``patience`` epochs is compared with
    the mean over the third-most-recent window of ``patience`` epochs. Training
    stops when the relative change between them falls below ``min_delta``.

    Args:
        RewardsList: sequence of per-epoch rewards, oldest first.
        min_delta: minimum relative improvement (as a fraction, not percent)
            required to keep training.
        patience: window length in epochs; no stopping decision is made until
            at least ``3 * patience`` rewards have been recorded.

    Returns:
        True if training should stop (latest reward is NaN, or the relative
        change is below ``min_delta``), otherwise False.
    """
    # Nothing recorded yet, so there is nothing to judge.
    if len(RewardsList) == 0:
        return False

    # NaN is the only value that compares unequal to itself.
    latest = RewardsList[-1]
    if latest != latest:
        print('NAN error')
        return True

    # No early stopping until 3*patience epochs are available.
    if len(RewardsList)//patience < 3 :
        return False

    newest_first = RewardsList[::-1]
    mean_previous = np.mean(newest_first[2*patience:3*patience])  # third-last window
    mean_recent = np.mean(newest_first[:patience])  # last window

    delta = mean_recent - mean_previous  # absolute change
    # Normalise by the magnitude of the baseline so the sign of the relative
    # change always matches the sign of the improvement, even when rewards
    # are negative.
    scale = abs(mean_previous)
    if scale == 0:
        # A zero baseline has no meaningful ratio; keep only the direction.
        percent_delta = 0.0 if delta == 0 else float(np.copysign(np.inf, delta))
    else:
        percent_delta = delta / scale  # relative change

    if percent_delta < min_delta :
        print(f"*CB_ES* Percent change in reward value: {percent_delta*1e2:.4f}")
        return True
    return False

In [None]:
# Reward histories; the training one gains one entry per completed epoch.
train_avg_rewards, val_avg_rewards = [], []

# Run configuration.
MAX_EPOCHS = 100
n_substocks = 20
VALIDATION_SAMPLES = 50

for epoch in range(MAX_EPOCHS):
    # One update per epoch, using the whole data set as the batch.
    reward = train_step(agent, data)
    training_rewards = [reward]

    train_avg_rewards.append(np.mean(training_rewards))

    # Leave the loop as soon as the early-stopping check fires.
    if Callback_EarlyStopping(train_avg_rewards):
        break

In [None]:
# Simulate trained agent to view behavior after training
loss = agent_loss(env,agent)

# One reward is appended per completed epoch, so the history length is the
# number of epochs actually run. The loop variable `epoch` is zero-based (one
# short of that count) and undefined if the loop body never executed.
epochs_run = len(train_avg_rewards)

# NOTE(review): the x-axis starts at index 49 of time_array; presumably the
# history begins at that tick -- confirm against TradingEnv.
plt.figure(figsize=(8,8))
plt.plot(time_array[49:], env.portfolio_value_hist)
plt.xticks(rotation=45)
plt.title(f'Agent performance on validation set after {epochs_run} training epochs.')
plt.show()
print('Initial value of portfolio: ',env.portfolio_value_hist[0].numpy())
print('Final value of portfolio: ',env.portfolio_value_hist[-1].numpy())
print('Maximum value of portfolio: ',max(env.portfolio_value_hist).numpy())
print('Minimum value of portfolio: ',min(env.portfolio_value_hist).numpy())