In [1]:
from agents.dpm_agent import DPMAgent
from trading_env.environment import TradingEnv


In [2]:
import keras.backend as K
from keras.callbacks import TensorBoard

In [3]:
import dill
import numpy as np

In [4]:
# Location of the dill-serialised price table (the "yf" prefix presumably
# stands for Yahoo Finance — TODO confirm the data's provenance).
yf_file = "./data/archive_data/yf_data.dill"
# NOTE(security): dill.load, like pickle.load, can execute arbitrary code while
# deserialising. Only load archives that come from a trusted source.
with open(yf_file,'rb') as dill_file:
    yf_df = dill.load(dill_file)
    

In [5]:
# Split the long-format table into one array per ticker, keeping the tickers
# in order of first appearance, and stack the per-ticker arrays along a new
# leading axis. Columns 2..5 of each ticker's rows are kept.
symbols = yf_df['Symbol'].unique()
stocks = np.array([
    yf_df[yf_df['Symbol'] == symbol].iloc[:, 2:6].to_numpy()
    for symbol in symbols
])
    

In [6]:
 

def get_advantages(values, masks, rewards, gamma=0.99, lmbda=0.95):
    """Compute GAE returns and normalised advantages for one rollout.

    Args:
        values: Value estimates for every visited state plus one bootstrap
            estimate for the state after the last step, i.e. it must hold
            ``len(rewards) + 1`` entries.
        masks: Per-step flags; a falsy entry (episode ended on that step)
            stops both the bootstrap term and the advantage accumulation
            from crossing the episode boundary.
        rewards: Per-step rewards.
        gamma: Discount factor.
        lmbda: GAE smoothing factor.

    Returns:
        A tuple ``(returns, advantages)`` where ``returns`` is a list with
        one entry per step (advantage estimate plus value estimate) and
        ``advantages`` is ``returns - values[:-1]`` standardised to zero
        mean and unit standard deviation.
    """
    returns = []
    gae = 0
    # Walk the rollout backwards so each step can reuse the accumulated
    # advantage of the step that follows it.
    for i in reversed(range(len(rewards))):
        delta = rewards[i] + gamma * values[i + 1] * masks[i] - values[i]
        gae = delta + gamma * lmbda * masks[i] * gae
        returns.append(gae + values[i])
    # Entries were collected last-step-first; restore chronological order.
    returns.reverse()

    adv = np.array(returns) - values[:-1]
    # The small epsilon keeps the division finite when all advantages are equal.
    return returns, (adv - np.mean(adv)) / (np.std(adv) + 1e-10)

In [7]:
# Build the two agents whose `.model` attributes are used for prediction and
# fitting in the training loop.
# NOTE(review): the positional arguments (21, 4, 1) are not explained in this
# notebook — presumably asset count, features per asset and one further
# dimension; confirm against DPMAgent's signature.
actor = DPMAgent(21,4,1,'actor')
critic = DPMAgent(21,4,1,'critic')



In [8]:
def test_reward(max_steps=21):
    """Run one greedy evaluation episode and return the summed reward.

    The environment is reset and then stepped with the arg-max of the
    actor's output until the episode ends or ``max_steps`` steps were taken.
    Relies on the notebook globals ``env``, ``actor``, ``dummy_n`` and
    ``dummy_1``.

    Args:
        max_steps: Upper bound on environment steps for the episode. The
            default of 21 matches the previously hard-coded cut-off.

    Returns:
        The sum of the rewards returned by ``env.step`` during the episode.
    """
    state = env.reset()
    total_reward = 0
    print('testing...')
    for _ in range(max_steps):
        state_input = K.expand_dims(state, 0)
        # Use the actor built in this notebook; the name `model_actor` that
        # was referenced here before is never defined, so the call raised
        # NameError on a fresh kernel.
        action_probs = actor.model.predict([state_input, dummy_n, dummy_1, dummy_1, dummy_1], steps=1)
        # NOTE(review): the training loop passes the full distribution to
        # env.step, while evaluation passes only the arg-max index — confirm
        # which form TradingEnv.step expects.
        action = np.argmax(action_probs)
        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward

In [9]:
def one_hot_encoding(probs):
    """Return a one-hot matrix marking the arg-max column of every row.

    Args:
        probs: 2-D array-like of shape (rows, classes).

    Returns:
        An array with the same shape and dtype as ``probs`` that is 1 at
        each row's arg-max column and 0 everywhere else.
    """
    probs = np.asarray(probs)
    one_hot = np.zeros_like(probs)
    # Pair every row index with that row's own arg-max column. Indexing with
    # `[:, argmax]` instead would set, in every row, the winning columns of
    # all rows.
    rows = np.arange(probs.shape[0])
    one_hot[rows, np.argmax(probs, axis=1)] = 1
    return one_hot

In [10]:
# Create the trading environment over the per-symbol price array and take the
# first observation.
env = TradingEnv(stocks)
obs = env.reset()
# Observation shape; not referenced again in this notebook.
state_dims = env.observation_space.shape
# Number of actions = length of the first axis of the action space.
n_actions = env.action_space.shape[0]

# All-zero placeholders handed to the actor as its four extra inputs whenever
# only a forward pass (predict) is needed.
dummy_n = np.zeros((1, 1, n_actions))
dummy_1 = np.zeros((1, 1, 1))

In [11]:
# Current observation fed to the networks; starts at the first reset
# observation and is advanced step by step inside the training loop.
stock_state = obs

In [12]:
# Inspect the action-space shape; its first entry is the value n_actions was
# set from.
env.action_space.shape

(22,)

In [13]:
# TensorBoard callback writing to ./logs; passed to both the actor and the
# critic fit calls in the training loop.
tensor_board = TensorBoard(log_dir='./logs')

In [14]:
# Number of environment steps collected per training iteration.
ppo_steps = 128
# Set by the training loop once the best evaluation reward passes its threshold.
target_reached = False
# Best average evaluation reward seen so far.
best_reward = 0
# Count of completed training iterations.
iters = 0
# Upper bound on training iterations.
max_iters = 1

In [15]:



# PPO training loop. Each iteration:
#   1. collects `ppo_steps` transitions with the current actor,
#   2. computes GAE returns/advantages from the critic's value estimates,
#   3. fits the actor and the critic on the collected batch,
#   4. evaluates the actor with test_reward() and checkpoints on improvement.
while not target_reached and iters < max_iters:

    # Rollout buffers for this iteration.
    states = []
    actions = []
    values = []
    masks = []
    rewards = []
    actions_probs = []
    actions_onehot = []
    for itr in range(ppo_steps):
        state_input = K.expand_dims(stock_state, 0)
        action_dist = actor.model.predict([state_input, dummy_n, dummy_1, dummy_1, dummy_1], steps=1)
        q_value = critic.model.predict([state_input], steps=1)
        action = np.random.choice(n_actions, p=action_dist[0, :])
        action_onehot = np.zeros(n_actions)
        action_onehot[action] = 1

        # NOTE(review): the environment is stepped with the whole distribution,
        # while the sampled `action` is only logged and used as the one-hot
        # training target — confirm this is what TradingEnv.step expects.
        observation, reward, done, info = env.step(action_dist)
        print('itr: ' + str(itr) + ', action=' + str(action) + ', reward=' + str(reward) + ', q val=' + str(q_value))
        mask = not done

        states.append(stock_state)
        actions.append(action)
        actions_onehot.append(action_onehot)
        values.append(q_value)
        masks.append(mask)
        rewards.append(reward)
        actions_probs.append(action_dist)

        # When the episode ended, continue from the observation of the fresh
        # episode; discarding env.reset()'s return value would keep feeding
        # the terminal observation to the networks.
        stock_state = env.reset() if done else observation

    # Bootstrap value for the state *after* the last collected step (not the
    # last visited state), as get_advantages reads values[i + 1].
    bootstrap_input = K.expand_dims(stock_state, 0)
    q_value = critic.model.predict([bootstrap_input], steps=1)
    values.append(q_value)
    returns, advantages = get_advantages(values, masks, rewards)

    states = np.array(states)
    actions_probs = np.array(actions_probs)
    # Shapes are passed positionally: the `newshape=` keyword of np.reshape is
    # deprecated in recent NumPy releases.
    rewards = np.reshape(rewards, (-1, 1, 1))
    values = np.array(values[:-1])
    actions_onehot = np.reshape(actions_onehot, (-1, n_actions))

    # NOTE(review): in the recorded run this fit call fails with a batch-size
    # mismatch inside a Concatenate layer of the actor model; the cause appears
    # to lie in the model definition, which is not visible here — TODO confirm.
    actor_loss = actor.model.fit(
        [states, actions_probs, advantages, rewards, values],
        [actions_onehot], verbose=True, shuffle=True, epochs=8,
        callbacks=[tensor_board])
    # Fit the critic built in this notebook; the name `model_critic` that was
    # referenced here before is never defined.
    critic_loss = critic.model.fit([states], [np.reshape(returns, (-1, 1))], shuffle=True, epochs=8,
                                   verbose=True, callbacks=[tensor_board])

    avg_reward = np.mean([test_reward() for _ in range(5)])
    print('total test reward=' + str(avg_reward))
    if avg_reward > best_reward:
        print('best reward=' + str(avg_reward))
        actor.model.save('model_actor_{}_{}.hdf5'.format(iters, avg_reward))
        critic.model.save('model_critic_{}_{}.hdf5'.format(iters, avg_reward))
        best_reward = avg_reward
    if best_reward > 0.9 or iters > max_iters:
        target_reached = True
    iters += 1
    # test_reward() stepped the shared environment, so start the next rollout
    # from a fresh episode and from that episode's first observation.
    stock_state = env.reset()

env.close()

itr: 0, action=16, reward=0.0025982670979001147, q val=[[-0.2037347]]
itr: 1, action=6, reward=-0.007858264446218835, q val=[[-0.2037347]]
itr: 2, action=19, reward=0.004422573840911233, q val=[[-0.2037347]]
itr: 3, action=7, reward=0.0047424370505121365, q val=[[-0.2037347]]
itr: 4, action=17, reward=0.00691183404786655, q val=[[-0.2037347]]
itr: 5, action=0, reward=-0.005909008021246886, q val=[[-0.2037347]]
itr: 6, action=0, reward=0.0020144899926779823, q val=[[-0.2037347]]
itr: 7, action=2, reward=-0.02764351333310642, q val=[[-0.2037347]]
itr: 8, action=0, reward=-0.03819000663464601, q val=[[-0.2037347]]
itr: 9, action=17, reward=-0.017453876375786305, q val=[[-0.2037347]]
itr: 10, action=15, reward=0.030895193373914075, q val=[[-0.2037347]]
itr: 11, action=9, reward=0.004481912475622163, q val=[[-0.2037347]]
itr: 12, action=10, reward=-0.006514291667175617, q val=[[-0.2037347]]
itr: 13, action=16, reward=0.004926753619528598, q val=[[-0.2037347]]
itr: 14, action=1, reward=0.016

ValueError: in user code:

    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:805 train_function  *
        return step_function(self, iterator)
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:795 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:1259 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:2730 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\distribute\distribute_lib.py:3417 _call_for_each_replica
        return fn(*args, **kwargs)
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:788 run_step  **
        outputs = model.train_step(data)
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\training.py:754 train_step
        y_pred = self(x, training=True)
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\base_layer.py:1012 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\functional.py:424 call
        return self._run_internal_graph(
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\functional.py:560 _run_internal_graph
        outputs = node.layer(*args, **kwargs)
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\engine\base_layer.py:1012 __call__
        outputs = call_fn(inputs, *args, **kwargs)
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\layers\merge.py:183 call
        return self._merge_function(inputs)
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\layers\merge.py:522 _merge_function
        return K.concatenate(inputs, axis=self.axis)
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\keras\backend.py:2989 concatenate
        return array_ops.concat([to_dense(x) for x in tensors], axis)
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\util\dispatch.py:201 wrapper
        return target(*args, **kwargs)
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\ops\array_ops.py:1677 concat
        return gen_array_ops.concat_v2(values=values, axis=axis, name=name)
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\ops\gen_array_ops.py:1206 concat_v2
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\op_def_library.py:748 _apply_op_helper
        op = g._create_op_internal(op_type_name, inputs, dtypes=None,
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\func_graph.py:590 _create_op_internal
        return super(FuncGraph, self)._create_op_internal(  # pylint: disable=protected-access
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\ops.py:3528 _create_op_internal
        ret = Operation(
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\ops.py:2015 __init__
        self._c_op = _create_c_op(self._graph, node_def, inputs,
    C:\Users\rajin\AppData\Roaming\Python\Python38\site-packages\tensorflow\python\framework\ops.py:1856 _create_c_op
        raise ValueError(str(e))

    ValueError: Dimension 0 in both shapes must be equal, but are 32 and 1. Shapes are [32,21,1] and [1,21,1]. for '{{node model/concatenate/concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](model/conv2/Relu, model/Cast, model/concatenate/concat/axis)' with input shapes: [32,21,1,20], [1,21,1,1], [] and with computed input tensors: input[2] = <3>.


In [16]:
# Inspect the stacked observation batch: one leading entry per rollout step.
states.shape

(128, 21, 64, 4)

In [17]:
# Inspect the stacked actor outputs recorded during the rollout.
actions_probs.shape

(128, 1, 22)

In [18]:
# Inspect the normalised advantages returned by get_advantages.
advantages.shape

(128, 1, 1)