<a href="https://colab.research.google.com/github/shelan-de-livera/finalprojecttestrepo/blob/main/test2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tf-agents

Collecting tf-agents
  Downloading tf_agents-0.18.0-py3-none-any.whl (1.4 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.1/1.4 MB[0m [31m2.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.4/1.4 MB[0m [31m20.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m17.2 MB/s[0m eta [36m0:00:00[0m
Collecting gym<=0.23.0,>=0.17.0 (from tf-agents)
  Downloading gym-0.23.0.tar.gz (624 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m624.4/624.4 kB[0m [31m38.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pygame==2.1.3 (from tf-ag

In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [None]:
import gin
import numpy as np
import pandas as pd
import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import
import tensorflow_probability as tfp

In [None]:
from tf_agents.networks import categorical_projection_network
from tf_agents.networks import network
from tf_agents.networks import normal_projection_network
from tf_agents.specs import tensor_spec
from tf_agents.utils import nest_utils
from tf_agents.networks import encoding_network
from tf_agents.keras_layers import dynamic_unroll_layer
from tf_agents.trajectories import time_step

# agents - agents.agent.py


In [None]:
from abc import ABC, abstractmethod

In [None]:
# from abc import ABC, abstractmethod

class Agent(ABC):
    """Abstract interface that every agent in this notebook implements.

    All seven methods are abstract, so the class cannot be instantiated
    directly. Each base implementation does nothing and returns ``None``.
    """

    @abstractmethod
    def initialize(self):
        """Abstract hook taking no arguments; subclasses must override it."""

    @abstractmethod
    def save(self):
        """Abstract hook taking no arguments; subclasses must override it."""

    @abstractmethod
    def load(self):
        """Abstract hook taking no arguments; subclasses must override it."""

    @abstractmethod
    def train(self, **kwargs) -> list:
        """Abstract training entry point; annotated to return a list."""

    @abstractmethod
    def eval(self, **kwargs) -> float:
        """Abstract evaluation entry point; annotated to return a float."""

    @abstractmethod
    def compute_action(self, **kwargs):
        """Abstract hook accepting keyword arguments; subclasses must override it."""

    @abstractmethod
    def get_action_probabilities(self, **kwargs):
        """Abstract hook accepting keyword arguments; subclasses must override it."""


# agents.tfagents - tfagents.tfagent.py


In [None]:
from typing import Callable
from tf_agents.drivers.tf_driver import TFDriver
from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.metrics.tf_metrics import AverageReturnMetric
from tf_agents.policies.tf_policy import TFPolicy
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.utils.common import Checkpointer, function
from tf_agents.agents.tf_agent import TFAgent

In [None]:
# import numpy as np
# import tensorflow as tf
# from abc import ABC, abstractmethod
# from typing import Callable
# from tf_agents.drivers.tf_driver import TFDriver
# from tf_agents.drivers.dynamic_step_driver import DynamicStepDriver
# from tf_agents.drivers.dynamic_episode_driver import DynamicEpisodeDriver
# from tf_agents.environments.tf_py_environment import TFPyEnvironment
# from tf_agents.metrics.tf_metrics import AverageReturnMetric
# from tf_agents.policies.tf_policy import TFPolicy
# from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
# from tf_agents.utils.common import Checkpointer, function
# from tf_agents.agents.tf_agent import TFAgent
# from agents.agent import Agent


class TFAgentBase(Agent, ABC):
    """Base class that wraps a TF-Agents `TFAgent` with training, evaluation and checkpointing.

    Subclasses provide the experience-collection driver (`_get_collect_driver`)
    and the per-iteration training step (`_get_train_step_fn`). This class
    provides replay-memory construction, the train/eval loop, checkpoint
    handling and action computation.
    """

    def __init__(
            self,
            agent: TFAgent,
            checkpoint_filepath: str,
            env_batch_size: int,
            replay_memory_capacity: int,
            replay_memory_batch_size: int,
            trajectory_num_steps: int,
            clear_memory_after_train_iteration: bool
    ):
        """Stores the agent and settings and creates the checkpointer.

        Args:
          agent: The wrapped TF-Agents agent; both the agent and `agent.policy`
            are registered with the checkpointer.
          checkpoint_filepath: Passed to `Checkpointer` as `ckpt_dir`.
          env_batch_size: Batch size the training environment must have
            (asserted in `train`). Also used as the replay buffer batch size
            and when requesting an initial policy state.
          replay_memory_capacity: `max_length` of the replay buffer.
          replay_memory_batch_size: `sample_batch_size` of the replay dataset.
          trajectory_num_steps: Datasets are built with
            `num_steps = trajectory_num_steps + 1`. `None` is forwarded as
            `num_steps=None`.
          clear_memory_after_train_iteration: If true, the replay memory is
            cleared after every training iteration.
        """
        self._agent = agent
        self._checkpointer = Checkpointer(
            ckpt_dir=checkpoint_filepath,
            max_to_keep=1,
            agent=agent,
            policy=agent.policy
        )

        self._env_batch_size = env_batch_size
        self._replay_memory_capacity = replay_memory_capacity
        self._replay_memory_batch_size = replay_memory_batch_size
        self._trajectory_num_steps = trajectory_num_steps
        self._clear_memory_after_train_iteration = clear_memory_after_train_iteration

    @property
    def train_step_counter(self) -> tf.Variable:
        """The wrapped agent's `train_step_counter`."""
        return self._agent.train_step_counter

    @property
    def policy(self):
        """The wrapped agent's `policy` (used for evaluation in this class)."""
        return self._agent.policy

    @property
    def collect_policy(self):
        """The wrapped agent's `collect_policy`."""
        return self._agent.collect_policy

    def initialize(self):
        """Initializes the wrapped agent, then calls `initialize_or_restore` on the checkpointer."""
        self._agent.initialize()
        self._checkpointer.initialize_or_restore()

    def save(self):
        """Saves a checkpoint tagged with the agent's current train step counter value."""
        self._checkpointer.save(self._agent.train_step_counter.value())

    def load(self):
        """Calls `initialize_or_restore` on the checkpointer."""
        self._checkpointer.initialize_or_restore()

    def _get_replay_memory(self) -> TFUniformReplayBuffer:
        """Builds a uniform replay buffer from the agent's `collect_data_spec`."""
        return TFUniformReplayBuffer(
            data_spec=self._agent.collect_data_spec,
            batch_size=self._env_batch_size,
            max_length=self._replay_memory_capacity
        )

    def _get_replay_memory_dataset(self, replay_memory: TFUniformReplayBuffer) -> tf.data.Dataset:
        """Builds a prefetched sampling dataset over `replay_memory`.

        Args:
          replay_memory: Buffer to sample from.

        Returns:
          The dataset returned by `replay_memory.as_dataset(...)`, prefetched.
        """
        if self._trajectory_num_steps is None:
            # `train` accepts `trajectory_num_steps=None`; mirror its handling
            # here instead of failing on `None + 1`.
            return replay_memory.as_dataset(
                sample_batch_size=self._replay_memory_batch_size,
                num_steps=None,
                num_parallel_calls=None,
            ).prefetch(1)

        num_steps = self._trajectory_num_steps + 1

        return replay_memory.as_dataset(
            sample_batch_size=self._replay_memory_batch_size,
            num_steps=num_steps,
            num_parallel_calls=num_steps + 1,
        ).prefetch(num_steps + 1)

    def _get_step_driver(
            self,
            env: TFPyEnvironment,
            policy: TFPolicy,
            observers: list,
            num_steps: int
    ):
        """Returns a `DynamicStepDriver` built from the given arguments."""
        return DynamicStepDriver(
            env=env,
            policy=policy,
            observers=observers,
            num_steps=num_steps
        )

    def _get_episode_driver(
            self,
            env: TFPyEnvironment,
            policy: TFPolicy,
            observers: list,
            num_episodes: int
    ):
        """Returns a `DynamicEpisodeDriver` built from the given arguments."""
        return DynamicEpisodeDriver(env=env, policy=policy, observers=observers, num_episodes=num_episodes)

    @abstractmethod
    def _get_collect_driver(self, train_env: TFPyEnvironment, observers: list) -> TFDriver:
        """Returns the driver used to collect training experience.

        `train` passes `[replay_memory.add_batch]` as `observers` and calls the
        driver's `run` once per training iteration.
        """
        pass

    @abstractmethod
    def _get_train_step_fn(self) -> Callable:
        """Returns the per-iteration training step.

        The callable is invoked as
        `train_step(replay_memory=..., dataset_iter=...)`, and its return value
        is reported as the train loss.
        """
        pass

    def train(
            self,
            train_env: TFPyEnvironment,
            eval_env: TFPyEnvironment,
            train_iterations: int,
            eval_episodes: int,
            iterations_per_eval: int,
            iterations_per_log: int,
            iterations_per_checkpoint: int,
            save_best_only: bool
    ) -> list:
        """Builds replay memory, dataset and collect driver, then runs the training loop.

        Args:
          train_env: Environment used for collection; its `batch_size` must
            equal the `env_batch_size` given to the constructor.
          eval_env: Environment used for evaluation.
          train_iterations: Number of collect + train iterations.
          eval_episodes: Episodes per evaluation run.
          iterations_per_eval: Evaluate every this many iterations (non-zero).
          iterations_per_log: Print progress every this many iterations (non-zero).
          iterations_per_checkpoint: When `save_best_only` is false, save every
            this many iterations (non-zero).
          save_best_only: If true, save only when an evaluation yields a new
            best average return.

        Returns:
          The list of average returns, one per evaluation run (including the
          evaluation performed before the first training iteration).

        Raises:
          AssertionError: If `train_env.batch_size` differs from the configured
            environment batch size.
        """
        assert train_env.batch_size == self._env_batch_size, \
            f'AssertionError: Expected environment batch size to be {self._env_batch_size}, got {train_env.batch_size}'

        # Same construction as before, now through the shared helper.
        replay_memory = self._get_replay_memory()
        dataset = replay_memory.as_dataset(
            sample_batch_size=self._replay_memory_batch_size,
            num_steps=None if self._trajectory_num_steps is None else self._trajectory_num_steps + 1,
            num_parallel_calls=self._trajectory_num_steps
        ).prefetch(1 if self._trajectory_num_steps is None else self._trajectory_num_steps)
        collect_driver = self._get_collect_driver(train_env=train_env, observers=[replay_memory.add_batch])
        collect_driver.run = function(collect_driver.run)

        return self._train(
            train_env=train_env,
            eval_env=eval_env,
            train_iterations=train_iterations,
            eval_episodes=eval_episodes,
            iterations_per_eval=iterations_per_eval,
            iterations_per_log=iterations_per_log,
            iterations_per_checkpoint=iterations_per_checkpoint,
            replay_memory=replay_memory,
            replay_memory_dataset=dataset,
            train_collect_driver=collect_driver,
            save_best_only=save_best_only
        )

    def _train(
            self,
            train_env: TFPyEnvironment,
            eval_env: TFPyEnvironment,
            train_iterations: int,
            eval_episodes: int,
            iterations_per_eval: int,
            iterations_per_log: int,
            iterations_per_checkpoint: int,
            replay_memory: TFUniformReplayBuffer,
            replay_memory_dataset: tf.data.Dataset,
            train_collect_driver: TFDriver,
            save_best_only: bool
    ) -> list:
        """Runs the collect / train / evaluate / checkpoint loop.

        One evaluation is performed before any training. Then, for each
        iteration `1..train_iterations`: collect experience, run one train
        step, optionally clear the replay memory, and evaluate, checkpoint and
        log on their respective schedules.

        Args:
          train_env: Unused here; collection goes through `train_collect_driver`.
          eval_env: Environment used for evaluation.
          train_iterations: Number of iterations to run.
          eval_episodes: Episodes per evaluation run.
          iterations_per_eval: Evaluation period in iterations (non-zero).
          iterations_per_log: Logging period in iterations (non-zero).
          iterations_per_checkpoint: Checkpoint period in iterations (non-zero),
            used only when `save_best_only` is false.
          replay_memory: Replay buffer passed to the train step and optionally
            cleared after each iteration.
          replay_memory_dataset: Dataset whose iterator is passed to the train step.
          train_collect_driver: Driver whose `run` collects experience.
          save_best_only: If true, checkpoints are written only on a new best
            average return.

        Returns:
          List of average returns, one entry per evaluation run.
        """
        average_returns = []
        max_avg_reward = -np.inf

        eval_metric = AverageReturnMetric(batch_size=eval_env.batch_size, buffer_size=200)
        eval_env_driver = self._get_episode_driver(
            env=eval_env,
            policy=self._agent.policy,
            observers=[eval_metric],
            num_episodes=eval_episodes
        )
        dataset_iter = iter(replay_memory_dataset)

        eval_env_driver.run = function(eval_env_driver.run)
        # NOTE(review): this re-wraps `agent.train` on every call to `_train`;
        # confirm that repeated wrapping is harmless if `train` is called twice.
        self._agent.train = function(self._agent.train)
        train_step = self._get_train_step_fn()

        def evaluate(iteration, best_avg_reward):
            """Evaluates once, records the result and saves on a new best if requested.

            Returns `(avg_reward, best_avg_reward)` after this evaluation.
            """
            eval_env_driver.run()
            current_avg_reward = eval_metric.result().numpy()
            average_returns.append(current_avg_reward)
            eval_metric.reset()

            if best_avg_reward < current_avg_reward:
                best_avg_reward = current_avg_reward

                if save_best_only:
                    print(
                        f'\nNew best average return found at {best_avg_reward}! '
                        f'Saving checkpoint at iteration {iteration}'
                    )
                    self.save()
            return current_avg_reward, best_avg_reward

        print('Training has started...')

        # Baseline evaluation before any training (reported as iteration 0).
        avg_reward, max_avg_reward = evaluate(0, max_avg_reward)

        for i in range(1, train_iterations + 1):
            train_collect_driver.run()
            train_loss = train_step(replay_memory=replay_memory, dataset_iter=dataset_iter)

            if self._clear_memory_after_train_iteration:
                replay_memory.clear()

            if i % iterations_per_eval == 0:
                avg_reward, max_avg_reward = evaluate(i, max_avg_reward)

            if not save_best_only and i % iterations_per_checkpoint == 0:
                print(f'\nSaving checkpoint at iteration {i}')
                self.save()

            if i % iterations_per_log == 0:
                # `avg_reward` is the result of the most recent evaluation,
                # which may be older than this iteration.
                print(f'\nIteration: {i}'
                      f'\nTrain Loss: {train_loss}'
                      f'\nAverage Return: {avg_reward}')
        return average_returns

    def eval(self, eval_env: TFPyEnvironment, num_episodes: int) -> float:
        """Runs the agent's `policy` for `num_episodes` episodes and returns the average return.

        Args:
          eval_env: Environment to evaluate on.
          num_episodes: Number of episodes passed to the episode driver.

        Returns:
          `AverageReturnMetric.result().numpy()` after the run.
        """
        eval_metric = AverageReturnMetric(batch_size=eval_env.batch_size, buffer_size=200)

        eval_env_driver = self._get_episode_driver(
            env=eval_env,
            policy=self._agent.policy,
            observers=[eval_metric],
            num_episodes=num_episodes
        )
        eval_env_driver.run = function(eval_env_driver.run)
        eval_env_driver.run()
        return eval_metric.result().numpy()

    def _get_action_step(self, time_step, policy_state=None, use_greedy_policy: bool = True):
        """Returns `policy.action(...)` for the given time step.

        Args:
          time_step: Time step passed to the policy.
          policy_state: Policy state to use. When `None`, the policy's initial
            state for `env_batch_size` is requested.
          use_greedy_policy: Selects the agent's `policy` when true, otherwise
            its `collect_policy`.
        """
        policy = self._agent.policy if use_greedy_policy else self._agent.collect_policy

        if policy_state is None:
            policy_state = policy.get_initial_state(batch_size=self._env_batch_size)
        return policy.action(time_step=time_step, policy_state=policy_state)

    def compute_action(self, time_step, policy_state=None, use_greedy_policy: bool = True) -> int:
        """Returns the `action` field of the policy step from `_get_action_step`.

        NOTE(review): annotated as `int`, but the value is whatever
        `policy.action(...).action` yields — confirm the actual type against
        the policy in use.
        """
        return self._get_action_step(
            time_step=time_step,
            policy_state=policy_state,
            use_greedy_policy=use_greedy_policy
        ).action

    def get_action_probabilities(self, time_step, policy_state=None, use_greedy_policy: bool = True) -> np.ndarray:
        """Returns the `info` field of the policy step from `_get_action_step`.

        NOTE(review): whether `info` actually holds action probabilities
        depends on the policy's info spec — confirm against the policy in use.
        """
        return self._get_action_step(
            time_step=time_step,
            policy_state=policy_state,
            use_greedy_policy=use_greedy_policy
        ).info


# 1. agents.tfagents.networks - networks.lstm_encoding_network.py

---




In [None]:
# coding=utf-8
# Copyright 2020 The TF-Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Keras LSTM Encoding Network.

Implements a network that will generate the following layers:

  [optional]: preprocessing_layers  # preprocessing_layers
  [optional]: (Add | Concat(axis=-1) | ...)  # preprocessing_combiner
  [optional]: Conv2D # input_conv_layer_params
  Flatten
  [optional]: Dense  # input_fc_layer_params
  [optional]: LSTM cell
  [optional]: Dense  # output_fc_layer_params
"""

# from __future__ import absolute_import
# from __future__ import division
# from __future__ import print_function

# import gin
# import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import

# from tf_agents.keras_layers import dynamic_unroll_layer
# from tf_agents.networks import encoding_network
# from tf_agents.networks import network
# from tf_agents.specs import tensor_spec
# from tf_agents.trajectories import time_step
# from tf_agents.utils import nest_utils

# Value passed as the `implementation` argument of every
# `tf.keras.layers.LSTMCell` built by `LSTMEncodingNetwork` below.
KERAS_LSTM_FUSED = 2


@gin.configurable
class LSTMEncodingNetwork(network.Network):
    """Recurrent network: input encoder, then an RNN layer, then optional dense layers.

    The input encoder is an `EncodingNetwork`; the recurrent part is either a
    `DynamicUnroll` over LSTM cell(s) sized by `lstm_size` or a layer produced
    by `rnn_construction_fn`.
    """

    def __init__(
            self,
            input_tensor_spec,
            preprocessing_layers=None,
            preprocessing_combiner=None,
            conv_layer_params=None,
            input_fc_layer_params=(75, 40),
            lstm_size=None,
            output_fc_layer_params=(75, 40),
            activation_fn=tf.keras.activations.relu,
            rnn_construction_fn=None,
            rnn_construction_kwargs=None,
            conv_type='1d',
            dtype=tf.float32,
            name='LSTMEncodingNetwork',
    ):
        """Creates an instance of `LSTMEncodingNetwork`.

        Input preprocessing is possible via `preprocessing_layers` and
        `preprocessing_combiner` Layers.  If the `preprocessing_layers` nest is
        shallower than `input_tensor_spec`, then the layers will get the subnests.
        For example, if:

        ```python
        input_tensor_spec = ([TensorSpec(3)] * 2, [TensorSpec(3)] * 5)
        preprocessing_layers = (Layer1(), Layer2())
        ```

        then preprocessing will call:

        ```python
        preprocessed = [preprocessing_layers[0](observations[0]),
                        preprocessing_layers[1](observations[1])]
        ```

        However if

        ```python
        preprocessing_layers = ([Layer1() for _ in range(2)],
                                [Layer2() for _ in range(5)])
        ```

        then preprocessing will call:
        ```python
        preprocessed = [
          layer(obs) for layer, obs in zip(flatten(preprocessing_layers),
                                           flatten(observations))
        ]
        ```

        Args:
          input_tensor_spec: A nest of `tensor_spec.TensorSpec` representing the
            observations.
          preprocessing_layers: (Optional.) A nest of `tf.keras.layers.Layer`
            representing preprocessing for the different observations. All of these
            layers must not be already built.
          preprocessing_combiner: (Optional.) A keras layer that takes a flat list
            of tensors and combines them.  Good options include
            `tf.keras.layers.Add` and `tf.keras.layers.Concatenate(axis=-1)`. This
            layer must not be already built.
          conv_layer_params: Optional list of convolution layers parameters, where
            each item is a length-three tuple indicating (filters, kernel_size,
            stride).
          input_fc_layer_params: Optional list of fully connected parameters, where
            each item is the number of units in the layer. These feed into the
            recurrent layer.
          lstm_size: An iterable of ints specifying the LSTM cell sizes to use.
            It must support `len()` and indexing; a single entry builds one
            `LSTMCell`, several entries build `StackedRNNCells`.
          output_fc_layer_params: Optional list of fully connected parameters, where
            each item is the number of units in the layer. These are applied on top
            of the recurrent layer.
          activation_fn: Activation function, e.g. tf.keras.activations.relu.
          rnn_construction_fn: (Optional.) Alternate RNN construction function, e.g.
            tf.keras.layers.LSTM, tf.keras.layers.CuDNNLSTM. It is invalid to
            provide both rnn_construction_fn and lstm_size. The returned layer
            must expose `.cell.state_size`, which is used to build the state spec.
          rnn_construction_kwargs: (Optional.) Dictionary of arguments to pass to
            rnn_construction_fn.

            The RNN will be constructed via:

            ```
            rnn_layer = rnn_construction_fn(**rnn_construction_kwargs)
            ```
          conv_type: Forwarded unchanged to the input `EncodingNetwork` as its
            `conv_type` argument. Defaults to '1d'.
          dtype: The dtype to use by the convolution, LSTM, and fully connected
            layers.
          name: A string representing name of the network.

        Raises:
          ValueError: If any of `preprocessing_layers` is already built.
          ValueError: If `preprocessing_combiner` is already built.
          ValueError: If neither `lstm_size` nor `rnn_construction_fn` are provided.
          ValueError: If both `lstm_size` and `rnn_construction_fn` are provided.
        """
        if lstm_size is None and rnn_construction_fn is None:
            raise ValueError('Need to provide either custom rnn_construction_fn or '
                             'lstm_size.')
        # Truthiness check: an empty `lstm_size` does not trigger this error.
        if lstm_size and rnn_construction_fn:
            raise ValueError('Cannot provide both custom rnn_construction_fn and '
                             'lstm_size.')

        # Shared by the input encoder and the output dense layers.
        kernel_initializer = tf.compat.v1.variance_scaling_initializer(
            scale=2.0, mode='fan_in', distribution='truncated_normal')

        input_encoder = encoding_network.EncodingNetwork(
            input_tensor_spec,
            preprocessing_layers=preprocessing_layers,
            preprocessing_combiner=preprocessing_combiner,
            conv_layer_params=conv_layer_params,
            fc_layer_params=input_fc_layer_params,
            activation_fn=activation_fn,
            kernel_initializer=kernel_initializer,
            conv_type=conv_type,
            dtype=dtype)

        # Create RNN cell
        if rnn_construction_fn:
            rnn_construction_kwargs = rnn_construction_kwargs or {}
            lstm_network = rnn_construction_fn(**rnn_construction_kwargs)
        else:
            if len(lstm_size) == 1:
                cell = tf.keras.layers.LSTMCell(
                    lstm_size[0],
                    dtype=dtype,
                    implementation=KERAS_LSTM_FUSED)
            else:
                cell = tf.keras.layers.StackedRNNCells(
                    [tf.keras.layers.LSTMCell(size, dtype=dtype,
                                              implementation=KERAS_LSTM_FUSED)
                     for size in lstm_size])
            # Wrap the cell(s) so they can be unrolled over a time dimension.
            lstm_network = dynamic_unroll_layer.DynamicUnroll(cell)

        output_encoder = []
        if output_fc_layer_params:
            output_encoder = [
                tf.keras.layers.Dense(
                    num_units,
                    activation=activation_fn,
                    kernel_initializer=kernel_initializer,
                    dtype=dtype) for num_units in output_fc_layer_params
            ]

        # One-element list used as a mutable counter so the nested function can
        # number the state specs 'network_state_0', 'network_state_1', ...
        counter = [-1]

        def create_spec(size):
            counter[0] += 1
            return tensor_spec.TensorSpec(
                size, dtype=dtype, name='network_state_%d' % counter[0])

        # Read for both construction paths, so a layer returned by
        # `rnn_construction_fn` must also expose `.cell.state_size`.
        state_spec = tf.nest.map_structure(create_spec,
                                           lstm_network.cell.state_size)

        super(LSTMEncodingNetwork, self).__init__(
            input_tensor_spec=input_tensor_spec, state_spec=state_spec, name=name)

        self._conv_layer_params = conv_layer_params
        self._input_encoder = input_encoder
        self._lstm_network = lstm_network
        self._output_encoder = output_encoder

    def call(self,
             observation,
             step_type,
             network_state=(),
             training=False):
        """Apply the network.

        Args:
          observation: A tuple of tensors matching `input_tensor_spec`.
          step_type: A tensor of `StepType`.
          network_state: (optional.) The network state.
          training: Whether the output is being used for training.

        Returns:
          `(outputs, network_state)` - the network output and next network state.

        Raises:
          ValueError: If observation tensors lack outer `(batch,)` or
            `(batch, time)` axes.
        """
        num_outer_dims = nest_utils.get_outer_rank(observation,
                                                   self.input_tensor_spec)
        if num_outer_dims not in (1, 2):
            raise ValueError(
                'Input observation must have a batch or batch x time outer shape.')

        has_time_dim = num_outer_dims == 2
        if not has_time_dim:
            # Add a time dimension to the inputs.
            observation = tf.nest.map_structure(lambda t: tf.expand_dims(t, 1),
                                                observation)
            step_type = tf.nest.map_structure(lambda t: tf.expand_dims(t, 1),
                                              step_type)

        # The input encoder is called with an empty state and the state it
        # returns is discarded.
        state, _ = self._input_encoder(
            observation, step_type=step_type, network_state=(), training=training)

        network_kwargs = {}
        if isinstance(self._lstm_network, dynamic_unroll_layer.DynamicUnroll):
            # Only DynamicUnroll receives a reset mask, marking FIRST steps.
            network_kwargs['reset_mask'] = tf.equal(step_type,
                                                    time_step.StepType.FIRST,
                                                    name='mask')

        # Unroll over the time sequence.
        output = self._lstm_network(
            inputs=state,
            initial_state=network_state,
            training=training,
            **network_kwargs)

        if isinstance(self._lstm_network, dynamic_unroll_layer.DynamicUnroll):
            state, network_state = output
        else:
            # For a custom RNN layer the first element is taken as the output and
            # the remaining elements are repacked into the cell's state structure.
            state = output[0]
            network_state = tf.nest.pack_sequence_as(
                self._lstm_network.cell.state_size, tf.nest.flatten(output[1:]))

        for layer in self._output_encoder:
            state = layer(state, training=training)

        if not has_time_dim:
            # Remove time dimension from the state.
            state = tf.squeeze(state, [1])

        return state, network_state


# 1A. agents.tfagents.networks - networks.actor_distribution_network.py


In [None]:
# coding=utf-8
# Copyright 2020 The TF-Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Sample Keras actor network that generates distributions."""

# from __future__ import absolute_import
# from __future__ import division
# from __future__ import print_function

# import gin
# import numpy as np
# import tensorflow as tf
# import tensorflow_probability as tfp

# from tf_agents.networks import categorical_projection_network
# from tf_agents.networks import encoding_network
# from tf_agents.networks import network
# from tf_agents.networks import normal_projection_network
# from tf_agents.specs import tensor_spec
# from tf_agents.utils import nest_utils


def _categorical_projection_net(action_spec, logits_init_output_factor=0.1):
    """Builds a `CategoricalProjectionNetwork` for `action_spec`.

    Args:
      action_spec: Spec passed positionally to the projection network.
      logits_init_output_factor: Forwarded under the same keyword name.

    Returns:
      The constructed `CategoricalProjectionNetwork`.
    """
    projection_cls = categorical_projection_network.CategoricalProjectionNetwork
    return projection_cls(
        action_spec,
        logits_init_output_factor=logits_init_output_factor,
    )


def _normal_projection_net(action_spec,
                           init_action_stddev=0.35,
                           init_means_output_factor=0.1,
                           seed_stream_class=tfp.util.SeedStream,
                           seed=None):
    """Builds a `NormalProjectionNetwork` for `action_spec`.

    Args:
      action_spec: Spec passed positionally to the projection network.
      init_action_stddev: Value `s` from which the std bias initializer
        `log(exp(s) - 1)` is computed.
      init_means_output_factor: Forwarded under the same keyword name.
      seed_stream_class: Forwarded under the same keyword name.
      seed: Forwarded under the same keyword name.

    Returns:
      The constructed `NormalProjectionNetwork`, created with
      `scale_distribution=False`.
    """
    projection_kwargs = dict(
        init_means_output_factor=init_means_output_factor,
        # log(exp(s) - 1): the inverse of softplus evaluated at s.
        std_bias_initializer_value=np.log(np.exp(init_action_stddev) - 1),
        scale_distribution=False,
        seed_stream_class=seed_stream_class,
        seed=seed,
    )
    return normal_projection_network.NormalProjectionNetwork(
        action_spec, **projection_kwargs)


@gin.configurable
class ActorDistributionNetwork(network.DistributionNetwork):
    """Creates an actor producing either Normal or Categorical distribution.
    Note: By default, this network uses `NormalProjectionNetwork` for continuous
    projection which by default uses `tanh_squash_to_spec` to normalize its
    output. Due to the nature of the `tanh` function, values near the spec bounds
    cannot be returned.
    """

    # Captured at class-definition time. A later cell of this notebook defines
    # another function named `_normal_projection_net`, which rebinds the
    # module-level name. Comparing against the global at call time would then no
    # longer recognise this class's own default, and `seed` /
    # `seed_stream_class` would be silently dropped.
    _default_continuous_projection_net = staticmethod(_normal_projection_net)

    def __init__(self,
                 input_tensor_spec,
                 output_tensor_spec,
                 preprocessing_layers=None,
                 preprocessing_combiner=None,
                 conv_layer_params=None,
                 fc_layer_params=(200, 100),
                 dropout_layer_params=None,
                 activation_fn=tf.keras.activations.relu,
                 kernel_initializer=None,
                 seed_stream_class=tfp.util.SeedStream,
                 seed=None,
                 batch_squash=True,
                 dtype=tf.float32,
                 discrete_projection_net=_categorical_projection_net,
                 continuous_projection_net=_normal_projection_net,
                 conv_type='1d',
                 name='ActorDistributionNetwork'):
        """Creates an instance of `ActorDistributionNetwork`.
        Args:
          input_tensor_spec: A nest of `tensor_spec.TensorSpec` representing the
            input.
          output_tensor_spec: A nest of `tensor_spec.BoundedTensorSpec` representing
            the output.
          preprocessing_layers: (Optional.) A nest of `tf.keras.layers.Layer`
            representing preprocessing for the different observations.
            All of these layers must not be already built. For more details see
            the documentation of `networks.EncodingNetwork`.
          preprocessing_combiner: (Optional.) A keras layer that takes a flat list
            of tensors and combines them. Good options include
            `tf.keras.layers.Add` and `tf.keras.layers.Concatenate(axis=-1)`.
            This layer must not be already built. For more details see
            the documentation of `networks.EncodingNetwork`.
          conv_layer_params: Optional list of convolution layers parameters, where
            each item is a length-three tuple indicating (filters, kernel_size,
            stride).
          fc_layer_params: Optional list of fully_connected parameters, where each
            item is the number of units in the layer.
          dropout_layer_params: Optional list of dropout layer parameters, each item
            is the fraction of input units to drop or a dictionary of parameters
            according to the keras.Dropout documentation. The additional parameter
            `permanent`, if set to True, allows to apply dropout at inference for
            approximated Bayesian inference. The dropout layers are interleaved with
            the fully connected layers; there is a dropout layer after each fully
            connected layer, except if the entry in the list is None. This list must
            have the same length of fc_layer_params, or be None.
          activation_fn: Activation function, e.g. tf.nn.relu, slim.leaky_relu, ...
          kernel_initializer: Initializer to use for the kernels of the conv and
            dense layers. If none is provided, glorot_uniform is used.
          seed_stream_class: The seed stream class. This is almost always
            tfp.util.SeedStream, except for in unit testing, when one may want to
            seed all the layers deterministically. Only forwarded when
            `continuous_projection_net` is this class's default.
          seed: Seed forwarded to the default continuous projection network.
            Ignored when a custom `continuous_projection_net` is supplied.
          batch_squash: If True the outer_ranks of the observation are squashed into
            the batch dimension. This allows encoding networks to be used with
            observations with shape [BxTx...].
          dtype: The dtype to use by the convolution and fully connected layers.
          discrete_projection_net: Callable that generates a discrete projection
            network; it is called with the action spec.
          continuous_projection_net: Callable that generates a continuous projection
            network; it is called with the action spec (plus `seed` and
            `seed_stream_class` when it is this class's default).
          conv_type: Forwarded unchanged to `EncodingNetwork` as its `conv_type`
            argument. Defaults to '1d'.
          name: A string representing name of the network.
        Raises:
          ValueError: If `input_tensor_spec` contains more than one observation.
            NOTE(review): not checked in this class; presumably raised by the
            encoder — confirm.
        """

        if not kernel_initializer:
            kernel_initializer = tf.compat.v1.keras.initializers.glorot_uniform()

        encoder = encoding_network.EncodingNetwork(
            input_tensor_spec,
            preprocessing_layers=preprocessing_layers,
            preprocessing_combiner=preprocessing_combiner,
            conv_layer_params=conv_layer_params,
            fc_layer_params=fc_layer_params,
            dropout_layer_params=dropout_layer_params,
            activation_fn=activation_fn,
            kernel_initializer=kernel_initializer,
            batch_squash=batch_squash,
            conv_type=conv_type,
            dtype=dtype)

        # The function object that was this class's default when the class was
        # defined; immune to later rebinding of the module-level name.
        default_continuous_net = type(self)._default_continuous_projection_net

        def map_proj(spec):
            """Builds the projection network for one leaf of the action spec."""
            if tensor_spec.is_discrete(spec):
                return discrete_projection_net(spec)

            kwargs = {}
            # Only the built-in default is known to accept the seed arguments.
            if continuous_projection_net is default_continuous_net:
                kwargs['seed'] = seed
                kwargs['seed_stream_class'] = seed_stream_class
            return continuous_projection_net(spec, **kwargs)

        projection_networks = tf.nest.map_structure(map_proj, output_tensor_spec)
        output_spec = tf.nest.map_structure(lambda proj_net: proj_net.output_spec,
                                            projection_networks)

        super(ActorDistributionNetwork, self).__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=(),
            output_spec=output_spec,
            name=name)

        self._encoder = encoder
        self._projection_networks = projection_networks
        self._output_tensor_spec = output_tensor_spec

    @property
    def output_tensor_spec(self):
        """The `output_tensor_spec` passed to the constructor."""
        return self._output_tensor_spec

    def call(self,
             observations,
             step_type,
             network_state,
             training=False,
             mask=None):
        """Encodes the observations and applies every projection network.

        Args:
          observations: Input matching `input_tensor_spec`.
          step_type: Forwarded to the encoder.
          network_state: Forwarded to the encoder; the encoder's returned state
            is returned unchanged.
          training: Forwarded to the encoder and the projection networks.
          mask: Forwarded to each projection network.

        Returns:
          `(output_actions, network_state)`, where `output_actions` has the
          structure of the projection networks and holds the distribution each
          one returned.
        """
        state, network_state = self._encoder(
            observations,
            step_type=step_type,
            network_state=network_state,
            training=training)
        outer_rank = nest_utils.get_outer_rank(observations, self.input_tensor_spec)

        def call_projection_net(proj_net):
            # Each projection network returns a pair; only the first element
            # (the distribution) is kept.
            distribution, _ = proj_net(
                state, outer_rank, training=training, mask=mask)
            return distribution

        output_actions = tf.nest.map_structure(
            call_projection_net, self._projection_networks)
        return output_actions, network_state

# 1B. agents.tfagents.networks - networks.actor_distribution_rnn_network.py


In [None]:
# coding=utf-8
# Copyright 2020 The TF-Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Sample Keras actor network  with LSTM cells that generates distributions."""

# from __future__ import absolute_import
# from __future__ import division
# from __future__ import print_function


# import gin
# import numpy as np
# import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import

# from tf_agents.networks import categorical_projection_network
# from tf_agents.networks import network
# from tf_agents.networks import normal_projection_network
# from tf_agents.specs import tensor_spec
# from tf_agents.utils import nest_utils

# # from agents.tfagents.networks.lstm_encoding_network import LSTMEncodingNetwork  -- not needed here: LSTMEncodingNetwork is defined in an earlier cell of this notebook


def _categorical_projection_net(action_spec, logits_init_output_factor=0.1):
    """Builds the default projection network used for discrete actions.

    Args:
      action_spec: Action spec handed to the projection network.
      logits_init_output_factor: Forwarded unchanged to
        `CategoricalProjectionNetwork` under the same keyword.

    Returns:
      A `categorical_projection_network.CategoricalProjectionNetwork`.
    """
    projection_cls = categorical_projection_network.CategoricalProjectionNetwork
    projection_net = projection_cls(
        action_spec,
        logits_init_output_factor=logits_init_output_factor)
    return projection_net


def _normal_projection_net(action_spec,
                           init_action_stddev=0.35,
                           init_means_output_factor=0.1):
    """Builds the default projection network used for continuous actions.

    Args:
      action_spec: Action spec handed to the projection network.
      init_action_stddev: Desired initial action standard deviation. It is
        converted with `log(exp(x) - 1)` before being passed on as
        `std_bias_initializer_value`.
      init_means_output_factor: Forwarded unchanged to
        `NormalProjectionNetwork` under the same keyword.

    Returns:
      A `normal_projection_network.NormalProjectionNetwork`.
    """
    # log(exp(x) - 1) is the inverse of softplus; presumably the projection
    # network applies softplus to the bias so the initial stddev comes out as
    # `init_action_stddev` -- confirm against NormalProjectionNetwork.
    inverse_softplus_stddev = np.log(np.exp(init_action_stddev) - 1)

    projection_kwargs = dict(
        init_means_output_factor=init_means_output_factor,
        std_bias_initializer_value=inverse_softplus_stddev)
    return normal_projection_network.NormalProjectionNetwork(
        action_spec, **projection_kwargs)


@gin.configurable
class ActorDistributionRnnNetwork(network.DistributionNetwork):
    """Creates an actor producing either Normal or Categorical distribution.

    Observations are encoded by an `LSTMEncodingNetwork`; every leaf of the
    output spec then gets its own projection network that turns the encoding
    into a distribution.

    Note: By default, this network uses `NormalProjectionNetwork` for continuous
    projection which by default uses `tanh_squash_to_spec` to normalize its
    output. Due to the nature of the `tanh` function, values near the spec bounds
    cannot be returned.
    """

    def __init__(self,
                 input_tensor_spec,
                 output_tensor_spec,
                 preprocessing_layers=None,
                 preprocessing_combiner=None,
                 conv_layer_params=None,
                 input_fc_layer_params=(200, 100),
                 input_dropout_layer_params=None,
                 lstm_size=None,
                 output_fc_layer_params=(200, 100),
                 activation_fn=tf.keras.activations.relu,
                 dtype=tf.float32,
                 discrete_projection_net=_categorical_projection_net,
                 continuous_projection_net=_normal_projection_net,
                 rnn_construction_fn=None,
                 rnn_construction_kwargs=None,
                 conv_type='1d',
                 name='ActorDistributionRnnNetwork'):
        """Creates an instance of `ActorDistributionRnnNetwork`.
        Args:
          input_tensor_spec: A nest of `tensor_spec.TensorSpec` representing the
            input.
          output_tensor_spec: A nest of `tensor_spec.BoundedTensorSpec` representing
            the output.
          preprocessing_layers: (Optional.) A nest of `tf.keras.layers.Layer`
            representing preprocessing for the different observations.
            All of these layers must not be already built. For more details see
            the documentation of `networks.EncodingNetwork`.
          preprocessing_combiner: (Optional.) A keras layer that takes a flat list
            of tensors and combines them. Good options include
            `tf.keras.layers.Add` and `tf.keras.layers.Concatenate(axis=-1)`.
            This layer must not be already built. For more details see
            the documentation of `networks.EncodingNetwork`.
          conv_layer_params: Optional list of convolution layers parameters, where
            each item is a length-three tuple indicating (filters, kernel_size,
            stride).
          input_fc_layer_params: Optional list of fully_connected parameters, where
            each item is the number of units in the layer. This is applied before
            the LSTM cell.
          input_dropout_layer_params: Must be None or empty. Dropout is not
            supported by this network; any truthy value raises `ValueError`.
          lstm_size: An iterable of ints specifying the LSTM cell sizes to use.
          output_fc_layer_params: Optional list of fully_connected parameters, where
            each item is the number of units in the layer. This is applied after the
            LSTM cell.
          activation_fn: Activation function, e.g. tf.nn.relu, slim.leaky_relu, ...
          dtype: The dtype to use by the convolution and fully connected layers.
          discrete_projection_net: Callable that generates a discrete projection
            network to be called with some hidden state and the outer_rank of the
            state.
          continuous_projection_net: Callable that generates a continuous projection
            network to be called with some hidden state and the outer_rank of the
            state.
          rnn_construction_fn: (Optional.) Alternate RNN construction function, e.g.
            tf.keras.layers.LSTM, tf.keras.layers.CuDNNLSTM. It is invalid to
            provide both rnn_construction_fn and lstm_size.
          rnn_construction_kwargs: (Optional.) Dictionary of arguments to pass to
            rnn_construction_fn. `None` (the default) is treated as an empty
            dictionary.
            The RNN will be constructed via:
            ```
            rnn_layer = rnn_construction_fn(**rnn_construction_kwargs)
            ```
          conv_type: Forwarded unchanged to `LSTMEncodingNetwork` as `conv_type`
            (presumably selects the convolution dimensionality, e.g. '1d' --
            confirm against the encoder).
          name: A string representing name of the network.
        Raises:
          ValueError: If `input_dropout_layer_params` is truthy.
        """
        if input_dropout_layer_params:
            raise ValueError('Dropout layer is not supported.')

        # A literal `{}` default is created once and shared by every instance,
        # so anything downstream that mutates it would leak across networks.
        # Build a fresh dict per instance instead.
        if rnn_construction_kwargs is None:
            rnn_construction_kwargs = {}

        lstm_encoder = LSTMEncodingNetwork(
            input_tensor_spec=input_tensor_spec,
            preprocessing_layers=preprocessing_layers,
            preprocessing_combiner=preprocessing_combiner,
            conv_layer_params=conv_layer_params,
            input_fc_layer_params=input_fc_layer_params,
            lstm_size=lstm_size,
            output_fc_layer_params=output_fc_layer_params,
            activation_fn=activation_fn,
            rnn_construction_fn=rnn_construction_fn,
            rnn_construction_kwargs=rnn_construction_kwargs,
            conv_type=conv_type,
            dtype=dtype,
            name=name)

        def map_proj(spec):
            # Pick the projection factory matching the kind of action spec.
            if tensor_spec.is_discrete(spec):
                return discrete_projection_net(spec)
            else:
                return continuous_projection_net(spec)

        # One projection network per leaf of the output spec; the network's
        # output spec mirrors that nest.
        projection_networks = tf.nest.map_structure(map_proj, output_tensor_spec)
        output_spec = tf.nest.map_structure(lambda proj_net: proj_net.output_spec,
                                            projection_networks)

        super(ActorDistributionRnnNetwork, self).__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=lstm_encoder.state_spec,
            output_spec=output_spec,
            name=name)

        self._lstm_encoder = lstm_encoder
        self._projection_networks = projection_networks
        self._output_tensor_spec = output_tensor_spec

    @property
    def output_tensor_spec(self):
        """The `output_tensor_spec` this network was constructed with."""
        return self._output_tensor_spec

    def call(self, observation, step_type, network_state=(), training=False):
        """Encodes the observation and projects it onto action distributions.

        Args:
          observation: Observation nest; its outer rank is measured against
            `input_tensor_spec`.
          step_type: Forwarded to the LSTM encoder.
          network_state: Recurrent state forwarded to the LSTM encoder.
          training: Forwarded to the encoder and to every projection network.

        Returns:
          A tuple `(output_actions, network_state)`: a nest, structured like the
          projection networks, holding the first element each projection
          network returns, plus the state returned by the encoder.
        """
        state, network_state = self._lstm_encoder(
            observation, step_type=step_type, network_state=network_state,
            training=training)
        outer_rank = nest_utils.get_outer_rank(observation, self.input_tensor_spec)
        output_actions = tf.nest.map_structure(
            lambda proj_net: proj_net(state, outer_rank, training=training)[0],
            self._projection_networks)
        return output_actions, network_state

# 1C. agents.tfagents.networks - networks.value_network.py

In [None]:
# coding=utf-8
# Copyright 2020 The TF-Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Sample Keras Value Network.
Implements a network that will generate the following layers:
  [optional]: preprocessing_layers  # preprocessing_layers
  [optional]: (Add | Concat(axis=-1) | ...)  # preprocessing_combiner
  [optional]: Conv2D # conv_layer_params
  Flatten
  [optional]: Dense  # fc_layer_params
  Dense -> 1         # Value output
"""

# from __future__ import absolute_import
# from __future__ import division
# from __future__ import print_function

# import gin
# import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import

# from tf_agents.networks import encoding_network
# from tf_agents.networks import network


@gin.configurable
class ValueNetwork(network.Network):
    """Feed-forward critic: encodes observations and emits one value per item."""

    def __init__(self,
                 input_tensor_spec,
                 preprocessing_layers=None,
                 preprocessing_combiner=None,
                 conv_layer_params=None,
                 fc_layer_params=(75, 40),
                 dropout_layer_params=None,
                 activation_fn=tf.keras.activations.relu,
                 kernel_initializer=None,
                 batch_squash=True,
                 dtype=tf.float32,
                 conv_type='1d',
                 name='ValueNetwork'):
        """Builds the encoder and the single-unit value head.

        The network is stateless (`state_spec=()`). All encoder-related
        arguments are forwarded unchanged to `encoding_network.EncodingNetwork`.

        Args:
          input_tensor_spec: A `tensor_spec.TensorSpec` or a tuple of specs
            representing the input observations.
          preprocessing_layers: (Optional.) Nest of `tf.keras.layers.Layer`
            preprocessing the individual observations; forwarded to the
            encoder.
          preprocessing_combiner: (Optional.) Keras layer combining the
            preprocessed tensors; forwarded to the encoder.
          conv_layer_params: (Optional.) Convolution layer parameters; forwarded
            to the encoder.
          fc_layer_params: (Optional.) Fully connected layer sizes; forwarded to
            the encoder.
          dropout_layer_params: (Optional.) Dropout parameters; forwarded to the
            encoder.
          activation_fn: Activation function, e.g. `tf.keras.activations.relu`;
            forwarded to the encoder.
          kernel_initializer: Initializer for the encoder kernels. When falsy,
            `tf.compat.v1.keras.initializers.glorot_uniform()` is used.
          batch_squash: Forwarded to the encoder as `batch_squash`.
          dtype: Forwarded to the encoder as `dtype`.
          conv_type: Forwarded to the encoder as `conv_type`.
          name: A string representing name of the network.
        """
        super(ValueNetwork, self).__init__(
            input_tensor_spec=input_tensor_spec, state_spec=(), name=name)

        # Fall back to Glorot-uniform when the caller supplies no initializer.
        encoder_kernel_init = (
            kernel_initializer
            or tf.compat.v1.keras.initializers.glorot_uniform())

        encoder_kwargs = dict(
            preprocessing_layers=preprocessing_layers,
            preprocessing_combiner=preprocessing_combiner,
            conv_layer_params=conv_layer_params,
            fc_layer_params=fc_layer_params,
            dropout_layer_params=dropout_layer_params,
            activation_fn=activation_fn,
            kernel_initializer=encoder_kernel_init,
            batch_squash=batch_squash,
            conv_type=conv_type,
            dtype=dtype)
        self._encoder = encoding_network.EncodingNetwork(
            input_tensor_spec, **encoder_kwargs)

        # Linear single-unit head, initialised uniformly in [-0.03, 0.03].
        value_head_init = tf.random_uniform_initializer(
            minval=-0.03, maxval=0.03)
        self._postprocessing_layers = tf.keras.layers.Dense(
            1, activation=None, kernel_initializer=value_head_init)

    def call(self, observation, step_type=None, network_state=(), training=False):
        """Computes value estimates for `observation`.

        Args:
          observation: Observations passed to the encoder.
          step_type: Forwarded to the encoder.
          network_state: Forwarded to the encoder.
          training: Forwarded to the encoder and the value head.

        Returns:
          A tuple of the value head output with its last axis squeezed away and
          the network state returned by the encoder.
        """
        encoded, next_network_state = self._encoder(
            observation,
            step_type=step_type,
            network_state=network_state,
            training=training)
        values = self._postprocessing_layers(encoded, training=training)
        return tf.squeeze(values, -1), next_network_state

# 1D. agents.tfagents.networks - networks.value_rnn_network.py

---



In [None]:
# coding=utf-8
# Copyright 2020 The TF-Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Sample Keras Value Network with LSTM cells .
Implements a network that will generate the following layers:
  [optional]: preprocessing_layers  # preprocessing_layers
  [optional]: (Add | Concat(axis=-1) | ...)  # preprocessing_combiner
  [optional]: Conv2D # conv_layer_params
  Flatten
  [optional]: Dense  # input_fc_layer_params
  [optional]: LSTM   # lstm_cell_params
  [optional]: Dense  # output_fc_layer_params
  Dense -> 1         # Value output
"""

# from __future__ import absolute_import
# from __future__ import division
# from __future__ import print_function

# import gin
# import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import

# from tf_agents.networks import network

# # from agents.tfagents.networks.lstm_encoding_network import LSTMEncodingNetwork


@gin.configurable
class ValueRnnNetwork(network.Network):
    """Recurrent critic: LSTM-encodes observations into one value per item."""

    def __init__(self,
                 input_tensor_spec,
                 preprocessing_layers=None,
                 preprocessing_combiner=None,
                 conv_layer_params=None,
                 input_fc_layer_params=(75, 40),
                 input_dropout_layer_params=None,
                 lstm_size=(40,),
                 output_fc_layer_params=(75, 40),
                 activation_fn=tf.keras.activations.relu,
                 dtype=tf.float32,
                 conv_type='1d',
                 name='ValueRnnNetwork'):
        """Builds the LSTM encoder and the single-unit value head.

        The network's `state_spec` is taken from the encoder. All
        encoder-related arguments are forwarded unchanged to
        `LSTMEncodingNetwork`.

        Args:
          input_tensor_spec: A nest of `tensor_spec.TensorSpec` representing the
            input observations.
          preprocessing_layers: (Optional.) Nest of `tf.keras.layers.Layer`
            preprocessing the individual observations; forwarded to the
            encoder.
          preprocessing_combiner: (Optional.) Keras layer combining the
            preprocessed tensors; forwarded to the encoder.
          conv_layer_params: (Optional.) Convolution layer parameters; forwarded
            to the encoder.
          input_fc_layer_params: (Optional.) Fully connected layer sizes,
            forwarded to the encoder as `input_fc_layer_params`.
          input_dropout_layer_params: Accepted but ignored; it is discarded
            before the encoder is built.
          lstm_size: Forwarded to the encoder as `lstm_size`.
          output_fc_layer_params: (Optional.) Fully connected layer sizes,
            forwarded to the encoder as `output_fc_layer_params`.
          activation_fn: Activation function, e.g. `tf.keras.activations.relu`;
            forwarded to the encoder.
          dtype: Forwarded to the encoder as `dtype`.
          conv_type: Forwarded to the encoder as `conv_type`.
          name: A string representing name of the network; also used as the
            encoder's name.
        """
        # Kept in the signature only; dropout is never applied here.
        del input_dropout_layer_params

        encoder_kwargs = dict(
            input_tensor_spec=input_tensor_spec,
            preprocessing_layers=preprocessing_layers,
            preprocessing_combiner=preprocessing_combiner,
            conv_layer_params=conv_layer_params,
            input_fc_layer_params=input_fc_layer_params,
            lstm_size=lstm_size,
            output_fc_layer_params=output_fc_layer_params,
            activation_fn=activation_fn,
            dtype=dtype,
            conv_type=conv_type,
            name=name)
        recurrent_encoder = LSTMEncodingNetwork(**encoder_kwargs)

        # Linear single-unit head, initialised uniformly in [-0.03, 0.03].
        value_head_init = tf.random_uniform_initializer(
            minval=-0.03, maxval=0.03)
        value_head = tf.keras.layers.Dense(
            1, activation=None, kernel_initializer=value_head_init)

        super(ValueRnnNetwork, self).__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=recurrent_encoder.state_spec,
            name=name)

        # Sub-layers are attached only after the base class is initialised.
        self._lstm_encoder = recurrent_encoder
        self._postprocessing_layers = value_head

    def call(self,
             observation,
             step_type=None,
             network_state=(),
             training=False):
        """Computes value estimates for `observation`.

        Args:
          observation: Observations passed to the LSTM encoder.
          step_type: Forwarded to the LSTM encoder.
          network_state: Recurrent state forwarded to the LSTM encoder.
          training: Forwarded to the encoder and the value head.

        Returns:
          A tuple of the value head output with its last axis squeezed away and
          the network state returned by the encoder.
        """
        encoded, next_network_state = self._lstm_encoder(
            observation,
            step_type=step_type,
            network_state=network_state,
            training=training)
        values = self._postprocessing_layers(encoded, training=training)
        return tf.squeeze(values, -1), next_network_state

# 1. agents.tfagents - tfagents.ppo.py


In [None]:
import math
from typing import Callable
from tf_agents.agents.ppo.ppo_clip_agent import PPOClipAgent
from tf_agents.drivers.tf_driver import TFDriver
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.utils.common import function

In [None]:
# import math
# import tensorflow as tf
# from typing import Callable
# from tf_agents.agents.ppo.ppo_clip_agent import PPOClipAgent
# from tf_agents.drivers.tf_driver import TFDriver
# from tf_agents.environments.tf_py_environment import TFPyEnvironment
# from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
# from tf_agents.utils.common import function
# # from agents.tfagents.tfagent import TFAgentBase
# # from agents.tfagents.networks.actor_distribution_network import ActorDistributionNetwork
# # from agents.tfagents.networks.actor_distribution_rnn_network import ActorDistributionRnnNetwork
# # from agents.tfagents.networks.value_network import ValueNetwork
# # from agents.tfagents.networks.value_rnn_network import ValueRnnNetwork


class PPOAgent(TFAgentBase):
    """PPO agent built from a `PPOClipAgent` and this notebook's networks.

    Feed-forward actor/value networks are used when `lstm_layers` is None,
    recurrent ones otherwise. Training either consumes the whole replay memory
    at once or iterates over it in minibatches, depending on
    `replay_memory_batch_size`.
    """

    def __init__(
            self,
            input_tensor_spec,
            action_spec,
            time_step_spec,
            env_batch_size: int,
            checkpoint_filepath: str,
            fc_layers: list or None,
            conv_layers: list[tuple[int, int, int]] or None = None,
            conv_type: str = '1d',
            lstm_layers: list[int] or None = None,
            train_sequence_len: int = 1,
            preprocessing_layers: list or dict = None,
            preprocessing_combiner: list or dict or Callable = None,
            greedy_eval: bool = True,
            epsilon_clipping: float = 0.3,
            lambda_value: float = 0.95,
            entropy_regularization_init: float = 0.0,
            entropy_regularization_min: float = 0.0,
            entropy_regularization_decay_steps: int = 0,
            num_epochs: int = 40,
            use_gae: bool = True,
            gamma: float = 0.99,
            replay_memory_capacity: int = 1000,
            replay_memory_batch_size: int or None = None,
            collection_episodes_per_iteration: int = 5,
            train_step_counter: tf.Variable or None = None,
            learning_rate: float = 0.0005
    ):
        """Builds the networks, the `PPOClipAgent` and the base-class plumbing.

        Args:
          input_tensor_spec: Observation spec handed to the actor and value
            networks.
          action_spec: Action spec handed to the actor network and the agent.
          time_step_spec: Time-step spec handed to the agent.
          env_batch_size: Forwarded to `TFAgentBase`.
          checkpoint_filepath: Forwarded to `TFAgentBase`.
          fc_layers: Fully connected layer sizes. Used as `fc_layer_params` for
            the feed-forward networks and as `output_fc_layer_params` for the
            recurrent ones.
          conv_layers: Forwarded to the networks as `conv_layer_params`.
          conv_type: Forwarded to the networks as `conv_type`.
          lstm_layers: When not None, recurrent networks are built with
            `lstm_size=lstm_layers`; this requires `train_sequence_len > 1`.
          train_sequence_len: Forwarded to `TFAgentBase` as
            `trajectory_num_steps`; must be >= 1.
          preprocessing_layers: Forwarded to the networks.
          preprocessing_combiner: Forwarded to the networks.
          greedy_eval: Forwarded to `PPOClipAgent`.
          epsilon_clipping: Forwarded to `PPOClipAgent` as
            `importance_ratio_clipping`.
          lambda_value: Forwarded to `PPOClipAgent`.
          entropy_regularization_init: Entropy coefficient; the constant value
            when no decay is configured, otherwise the start of the schedule.
          entropy_regularization_min: End value of the decay schedule.
          entropy_regularization_decay_steps: When > 0, the entropy coefficient
            follows a `PolynomialDecay` schedule over this many steps,
            evaluated at `train_step_counter`.
          num_epochs: Forwarded to `PPOClipAgent`.
          use_gae: Forwarded to `PPOClipAgent`.
          gamma: Forwarded to `PPOClipAgent` as `discount_factor`.
          replay_memory_capacity: Forwarded to `TFAgentBase`.
          replay_memory_batch_size: None to train on the whole replay memory in
            one call; otherwise the minibatch size.
          collection_episodes_per_iteration: Number of episodes the collect
            driver runs.
          train_step_counter: Step counter shared with `PPOClipAgent`. When
            None, a fresh `tf.Variable(initial_value=0)` is created for this
            agent.
          learning_rate: Learning rate of the Adam optimizer.
        """
        assert train_sequence_len >= 1, \
            f'AssertionError: train_sequence_len is expected to be >= 1, got {train_sequence_len}'
        assert lstm_layers is None or train_sequence_len > 1, \
            'AssertionError: train_sequence_len is expected to be greater than 1 if lstm_layers is not None'

        assert entropy_regularization_decay_steps == 0 or entropy_regularization_init == 0.0 or \
               entropy_regularization_min < entropy_regularization_init <= 1.0, \
            'AssertionError: entropy_regularization_decay_steps is expected to be zero ' \
            'or entropy_regularization_init is expected to be zero ' \
            'or entropy_regularization_min < entropy_regularization_init <= 1.0, ' \
            f'got entropy_regularization_decay_steps={entropy_regularization_decay_steps}, ' \
            f'entropy_regularization_init={entropy_regularization_init}, ' \
            f'entropy_regularization_min={entropy_regularization_min}'

        # A `tf.Variable(...)` default in the signature is evaluated once, at
        # definition time, so every agent would share (and advance) the same
        # counter. Create it per instance instead.
        if train_step_counter is None:
            train_step_counter = tf.Variable(initial_value=0)

        if entropy_regularization_decay_steps > 0:
            entropy_regularization_fn = tf.keras.optimizers.schedules.PolynomialDecay(
                initial_learning_rate=entropy_regularization_init,
                decay_steps=entropy_regularization_decay_steps,
                end_learning_rate=entropy_regularization_min
            )
            # NOTE(review): the agent receives a callable here rather than a
            # float -- confirm the installed tf-agents PPOClipAgent accepts a
            # callable for `entropy_regularization`.
            entropy_regularization = lambda: entropy_regularization_fn(train_step_counter)
        else:
            entropy_regularization = entropy_regularization_init

        # Keyword arguments common to all four network constructors below.
        shared_net_kwargs = dict(
            input_tensor_spec=input_tensor_spec,
            preprocessing_layers=preprocessing_layers,
            preprocessing_combiner=preprocessing_combiner,
            conv_layer_params=conv_layers,
            conv_type=conv_type,
            activation_fn='gelu'
        )

        if lstm_layers is None:
            actor_network = ActorDistributionNetwork(
                output_tensor_spec=action_spec,
                fc_layer_params=fc_layers,
                **shared_net_kwargs
            )
            value_network = ValueNetwork(
                fc_layer_params=fc_layers,
                **shared_net_kwargs
            )
        else:
            # `fc_layers` only configures the layers after the LSTM; the layers
            # before it keep each network's own `input_fc_layer_params` default.
            actor_network = ActorDistributionRnnNetwork(
                output_tensor_spec=action_spec,
                lstm_size=lstm_layers,
                output_fc_layer_params=fc_layers,
                **shared_net_kwargs
            )
            value_network = ValueRnnNetwork(
                lstm_size=lstm_layers,
                output_fc_layer_params=fc_layers,
                **shared_net_kwargs
            )

        agent = PPOClipAgent(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
            actor_net=actor_network,
            value_net=value_network,
            greedy_eval=greedy_eval,
            importance_ratio_clipping=epsilon_clipping,
            lambda_value=lambda_value,
            discount_factor=gamma,
            entropy_regularization=entropy_regularization,
            num_epochs=num_epochs,
            use_gae=use_gae,
            train_step_counter=train_step_counter
        )

        super().__init__(
            agent=agent,
            checkpoint_filepath=checkpoint_filepath,
            env_batch_size=env_batch_size,
            replay_memory_capacity=replay_memory_capacity,
            replay_memory_batch_size=replay_memory_batch_size,
            trajectory_num_steps=train_sequence_len,
            clear_memory_after_train_iteration=True
        )

        self._replay_memory_batch_size = replay_memory_batch_size
        self._collection_episodes_per_iteration = collection_episodes_per_iteration

        # Wrap the per-minibatch step once so the wrapped function is reused
        # across training iterations.
        self._optimized_minibatch_train_fn = function(self._train_step_minibatch_fn)

    def _get_collect_driver(self, train_env: TFPyEnvironment, observers: list) -> TFDriver:
        """Returns an episode driver that runs the collect policy on `train_env`.

        The driver runs `collection_episodes_per_iteration` episodes.
        """
        return super()._get_episode_driver(
            env=train_env,
            policy=self.collect_policy,
            observers=observers,
            num_episodes=self._collection_episodes_per_iteration
        )

    def _train_step_batch_fn(self, replay_memory: TFUniformReplayBuffer, dataset_iter: iter) -> float:
        """Trains once on everything in `replay_memory` and returns the loss.

        `dataset_iter` is unused; it is accepted so both train-step functions
        share the same signature.
        """
        trajectories = replay_memory.gather_all()
        return self._agent.train(experience=trajectories).loss

    def _train_step_minibatch_fn(self, dataset_iter: iter) -> float:
        """Trains on the next item of `dataset_iter` and returns the loss."""
        trajectories, _ = next(dataset_iter)
        return self._agent.train(experience=trajectories).loss

    def _train_step_minibatch(self, replay_memory: TFUniformReplayBuffer, dataset_iter: iter) -> float:
        """Runs one minibatch train step per batch and returns the mean loss.

        The number of steps is `ceil(num_frames / replay_memory_batch_size)`.
        With an empty replay memory no step is run and a zero loss is returned.
        """
        train_loss = 0
        optimized_minibatch_train_fn = self._optimized_minibatch_train_fn

        num_memory_items = replay_memory.num_frames()
        num_batches = math.ceil(num_memory_items / self._replay_memory_batch_size)

        if num_batches == 0:
            # Nothing to train on; the average below would divide by zero.
            return tf.constant(0.0)

        for _ in range(num_batches):
            train_loss += optimized_minibatch_train_fn(dataset_iter=dataset_iter)
        return train_loss / num_batches

    def _get_train_step_fn(self) -> Callable:
        """Selects full-batch or minibatch training based on the batch size."""
        if self._replay_memory_batch_size is None:
            return function(self._train_step_batch_fn)
        else:
            return self._train_step_minibatch


# 2A. agents.tfagents.networks - networks.q_network.py


In [None]:
# coding=utf-8
# Copyright 2020 The TF-Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Sample Keras networks for DQN."""

# from __future__ import absolute_import
# from __future__ import division
# from __future__ import print_function

# import gin
# import tensorflow as tf  # pylint: disable=g-explicit-tensorflow-version-import

# from tf_agents.networks import encoding_network
# from tf_agents.networks import network


def validate_specs(action_spec, observation_spec):
    """Validates that the spec contains exactly one scalar-like action.

    Args:
      action_spec: A (possibly nested) action spec; it is flattened with
        `tf.nest.flatten` before being checked.
      observation_spec: Ignored; not currently validated.

    Raises:
      ValueError: If the flattened spec does not hold exactly one action, or
        if that action's shape is neither `()` nor `(1,)`.
    """
    del observation_spec  # not currently validated

    flat_action_spec = tf.nest.flatten(action_spec)
    # `!= 1` rather than `> 1`: an empty nest must be rejected here too,
    # otherwise it surfaces as an IndexError on the lookup below.
    if len(flat_action_spec) != 1:
        raise ValueError('Network only supports action_specs with a single action.')

    if flat_action_spec[0].shape not in [(), (1,)]:
        raise ValueError(
            'Network only supports action_specs with shape in [(), (1,)])')


@gin.configurable
class QNetwork(network.Network):
    """Feed-forward Q-network: an `EncodingNetwork` followed by a dense Q-value layer.

    The dense head has one output unit per discrete action, where the number
    of actions is derived from the (single) action spec's inclusive bounds.
    """

    def __init__(self,
                 input_tensor_spec,
                 action_spec,
                 preprocessing_layers=None,
                 preprocessing_combiner=None,
                 conv_layer_params=None,
                 fc_layer_params=(75, 40),
                 dropout_layer_params=None,
                 activation_fn=tf.keras.activations.relu,
                 kernel_initializer=None,
                 batch_squash=True,
                 dtype=tf.float32,
                 conv_type='1d',
                 q_layer_activation_fn=None,
                 name='QNetwork'):
        """Creates an instance of `QNetwork`.
        Args:
          input_tensor_spec: A nest of `tensor_spec.TensorSpec` representing the
            input observations.
          action_spec: A nest of `tensor_spec.BoundedTensorSpec` representing the
            actions.
          preprocessing_layers: (Optional.) A nest of `tf.keras.layers.Layer`
            representing preprocessing for the different observations.
            All of these layers must not be already built. For more details see
            the documentation of `networks.EncodingNetwork`.
          preprocessing_combiner: (Optional.) A keras layer that takes a flat list
            of tensors and combines them. Good options include
            `tf.keras.layers.Add` and `tf.keras.layers.Concatenate(axis=-1)`.
            This layer must not be already built. For more details see
            the documentation of `networks.EncodingNetwork`.
          conv_layer_params: Optional list of convolution layers parameters, where
            each item is a length-three tuple indicating (filters, kernel_size,
            stride).
          fc_layer_params: Optional list of fully_connected parameters, where each
            item is the number of units in the layer.
          dropout_layer_params: Optional list of dropout layer parameters, where
            each item is the fraction of input units to drop. The dropout layers are
            interleaved with the fully connected layers; there is a dropout layer
            after each fully connected layer, except if the entry in the list is
            None. This list must have the same length of fc_layer_params, or be
            None.
          activation_fn: Activation function, e.g. tf.keras.activations.relu.
          kernel_initializer: Initializer to use for the kernels of the conv and
            dense layers of the encoder. If none is provided, the encoder
            presumably falls back to a default variance_scaling_initializer
            (TODO confirm against `EncodingNetwork`). The Q-value layer does not
            use this argument; it always uses its own uniform initializer.
          batch_squash: If True the outer_ranks of the observation are squashed into
            the batch dimension. This allow encoding networks to be used with
            observations with shape [BxTx...].
          dtype: The dtype to use by the convolution and fully connected layers.
          conv_type: Convolution type string, forwarded unchanged to
            `EncodingNetwork` (defaults to '1d').
          q_layer_activation_fn: Activation function for the Q layer.
          name: A string representing the name of the network.
        Raises:
          ValueError: If `action_spec` contains more than one action, or if that
            action's shape is not `()` or `(1,)` (see `validate_specs`; the
            observation spec is not currently validated).
        """
        validate_specs(action_spec, input_tensor_spec)
        action_spec = tf.nest.flatten(action_spec)[0]
        # Bounds are treated as inclusive, hence the +1.
        num_actions = action_spec.maximum - action_spec.minimum + 1
        encoder_input_tensor_spec = input_tensor_spec

        encoder = encoding_network.EncodingNetwork(
            encoder_input_tensor_spec,
            preprocessing_layers=preprocessing_layers,
            preprocessing_combiner=preprocessing_combiner,
            conv_layer_params=conv_layer_params,
            fc_layer_params=fc_layer_params,
            dropout_layer_params=dropout_layer_params,
            activation_fn=activation_fn,
            kernel_initializer=kernel_initializer,
            batch_squash=batch_squash,
            conv_type=conv_type,
            dtype=dtype)

        # Q-value head: small uniform kernel init and a constant -0.2 bias.
        q_value_layer = tf.keras.layers.Dense(
            num_actions,
            activation=q_layer_activation_fn,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-0.03, maxval=0.03),
            bias_initializer=tf.constant_initializer(-0.2),
            dtype=dtype)

        # The sub-layers are built above but only attached to `self` after the
        # base-class constructor has run.
        super(QNetwork, self).__init__(
            input_tensor_spec=input_tensor_spec,
            state_spec=(),
            name=name)

        self._encoder = encoder
        self._q_value_layer = q_value_layer

    def call(self, observation, step_type=None, network_state=(), training=False):
        """Runs the given observation through the network.
        Args:
          observation: The observation to provide to the network.
          step_type: The step type for the given observation. See `StepType` in
            time_step.py.
          network_state: A state tuple to pass to the network, mainly used by RNNs.
          training: Whether the output is being used for training.
        Returns:
          A tuple `(q_value, network_state)`: the output of the Q-value layer
          applied to the encoder output, and the network state returned by the
          encoder.
        """
        state, network_state = self._encoder(
            observation, step_type=step_type, network_state=network_state,
            training=training)
        q_value = self._q_value_layer(state, training=training)
        return q_value, network_state

# 2. agents.tfagents - tfagents.dqn.py

In [None]:
from typing import Callable
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers.tf_driver import TFDriver
from tf_agents.environments.tf_py_environment import TFPyEnvironment
from tf_agents.policies.random_tf_policy import RandomTFPolicy
from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
from tf_agents.utils.common import function

In [None]:
# import tensorflow as tf
# from typing import Callable
# from tf_agents.agents.dqn import dqn_agent
# from tf_agents.drivers.tf_driver import TFDriver
# from tf_agents.environments.tf_py_environment import TFPyEnvironment
# from tf_agents.policies.random_tf_policy import RandomTFPolicy
# from tf_agents.replay_buffers.tf_uniform_replay_buffer import TFUniformReplayBuffer
# from tf_agents.utils.common import function
# # from agents.tfagents.tfagent import TFAgentBase
# # from agents.tfagents.networks.q_network import QNetwork


class DQNAgent(TFAgentBase):
    """DQN / Double-DQN agent assembled from `QNetwork` and TF-Agents' `dqn_agent`.

    Builds an online and a target Q-network with identical hyper-parameters,
    wraps them in `dqn_agent.DdqnAgent` (or `dqn_agent.DqnAgent`), and hands
    the result to `TFAgentBase`. Before the regular training loop starts, the
    replay memory is pre-filled with transitions from a random policy.
    """

    def __init__(
            self,
            input_tensor_spec,
            action_spec,
            time_step_spec,
            env_batch_size: int,
            checkpoint_filepath: str,
            fc_layers: list or None,
            conv_layers: list[tuple[int, int, int]] or None = None,
            conv_type: str = '1d',
            preprocessing_layers: list or dict = None,
            preprocessing_combiner: list or dict or Callable = None,
            n_step: int = 3,
            double_dqn: bool = True,
            tau: float = 0.005,
            target_update_steps: int = 1,
            epsilon_init: float = 0.1,
            epsilon_min: float = 0.1,
            epsilon_decay_steps: int = 0,
            gamma: float = 0.99,
            replay_memory_capacity: int = 50000,
            replay_memory_batch_size: int = 64,
            initial_collect_steps: int = 10000,
            collection_steps_per_iteration: int = 1,
            train_step_counter: tf.Variable or None = None
    ):
        """Creates the Q-networks, the TF-Agents agent and the base-class state.

        Args:
            input_tensor_spec: Observation spec passed to both Q-networks.
            action_spec: Action spec passed to the Q-networks and the agent.
            time_step_spec: Time-step spec passed to the agent.
            env_batch_size: Forwarded to `TFAgentBase`.
            checkpoint_filepath: Forwarded to `TFAgentBase`.
            fc_layers: Passed to `QNetwork` as `fc_layer_params`.
            conv_layers: Passed to `QNetwork` as `conv_layer_params`.
            conv_type: Passed to `QNetwork` as `conv_type`.
            preprocessing_layers: Passed to `QNetwork`.
            preprocessing_combiner: Passed to `QNetwork`.
            n_step: Used as the agent's `n_step_update` and as the base class's
                `trajectory_num_steps`; also scales the collect driver's steps.
            double_dqn: If True use `DdqnAgent`, otherwise `DqnAgent`.
            tau: Passed to the agent as `target_update_tau`.
            target_update_steps: Passed to the agent as `target_update_period`.
            epsilon_init: Epsilon for epsilon-greedy collection; the starting
                value when a decay schedule is used.
            epsilon_min: Final epsilon of the decay schedule. Only used when
                `epsilon_decay_steps > 0`.
            epsilon_decay_steps: Number of train steps over which epsilon goes
                from `epsilon_init` to `epsilon_min`; 0 keeps it constant.
            gamma: Discount factor passed to the agent.
            replay_memory_capacity: Forwarded to `TFAgentBase`.
            replay_memory_batch_size: Forwarded to `TFAgentBase`.
            initial_collect_steps: Number of random-policy steps collected
                into the replay memory before training.
            collection_steps_per_iteration: Multiplied by `n_step` to get the
                number of environment steps per collect-driver run.
            train_step_counter: Counter variable shared with the agent and the
                epsilon schedule. When None, a fresh `tf.Variable(0)` is
                created for this instance.
        """
        assert epsilon_decay_steps == 0 or (
            epsilon_decay_steps > 0 and epsilon_init is not None and epsilon_min < epsilon_init <= 1.0
        ), \
            'AssertionError: epsilon_decay_steps is expected to be zero or epsilon_min < epsilon_init <= 1.0, ' \
            f'got epsilon_decay_steps={epsilon_decay_steps}, epsilon_init={epsilon_init}, epsilon_min={epsilon_min}'

        # A `tf.Variable` default in the signature is evaluated once at
        # definition time and would be shared (and jointly incremented) by
        # every agent constructed without an explicit counter.
        if train_step_counter is None:
            train_step_counter = tf.Variable(initial_value=0)

        if epsilon_decay_steps > 0:
            epsilon_greedy_fn = tf.keras.optimizers.schedules.PolynomialDecay(
                initial_learning_rate=epsilon_init,
                decay_steps=epsilon_decay_steps,
                end_learning_rate=epsilon_min
            )
            # Callable so that epsilon is re-evaluated from the current step.
            epsilon_greedy = lambda: epsilon_greedy_fn(train_step_counter)
        else:
            epsilon_greedy = epsilon_init

        def build_q_network() -> QNetwork:
            """Build one Q-network; called twice so online and target nets match."""
            return QNetwork(
                input_tensor_spec=input_tensor_spec,
                action_spec=action_spec,
                preprocessing_layers=preprocessing_layers,
                preprocessing_combiner=preprocessing_combiner,
                fc_layer_params=fc_layers,
                conv_layer_params=conv_layers,
                conv_type=conv_type,
                activation_fn='gelu'
            )

        q_network = build_q_network()
        target_network = build_q_network()

        # Both agent classes are constructed with the same keyword arguments.
        agent_class = dqn_agent.DdqnAgent if double_dqn else dqn_agent.DqnAgent
        agent = agent_class(
            time_step_spec=time_step_spec,
            action_spec=action_spec,
            q_network=q_network,
            target_q_network=target_network,
            optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
            n_step_update=n_step,
            target_update_tau=tau,
            target_update_period=target_update_steps,
            gamma=gamma,
            epsilon_greedy=epsilon_greedy,
            train_step_counter=train_step_counter
        )

        super().__init__(
            agent=agent,
            checkpoint_filepath=checkpoint_filepath,
            env_batch_size=env_batch_size,
            replay_memory_capacity=replay_memory_capacity,
            replay_memory_batch_size=replay_memory_batch_size,
            trajectory_num_steps=n_step,
            clear_memory_after_train_iteration=False
        )

        self._n_step = n_step
        self._initial_collect_steps = initial_collect_steps
        self._collection_steps_per_iteration = collection_steps_per_iteration

    def _get_collect_driver(self, train_env: TFPyEnvironment, observers: list) -> TFDriver:
        """Return a step driver that runs the collect policy on `train_env`.

        Each run collects `collection_steps_per_iteration * n_step` steps.
        """
        return super()._get_step_driver(
            env=train_env,
            policy=self.collect_policy,
            observers=observers,
            num_steps=self._collection_steps_per_iteration * self._n_step
        )

    def _train_step_fn(self, replay_memory: TFUniformReplayBuffer, dataset_iter: iter) -> float:
        """Draw the next item from `dataset_iter` and train the agent on it.

        `replay_memory` is accepted but not used here.

        Returns:
            The `.loss` of the agent's train result.
        """
        trajectories, _ = next(dataset_iter)
        return self._agent.train(trajectories).loss

    def _get_train_step_fn(self) -> Callable:
        """Return `_train_step_fn` wrapped with `function(...)`."""
        return function(self._train_step_fn)

    def _train(
            self,
            train_env: TFPyEnvironment,
            eval_env: TFPyEnvironment,
            train_iterations: int,
            eval_episodes: int,
            iterations_per_eval: int,
            iterations_per_log: int,
            iterations_per_checkpoint: int,
            replay_memory: TFUniformReplayBuffer,
            replay_memory_dataset: tf.data.Dataset,
            train_collect_driver: TFDriver,
            save_best_only: bool
    ) -> list:
        """Pre-fill the replay memory with random experience, then train.

        Runs a `RandomTFPolicy` for `initial_collect_steps` steps on
        `train_env`, adding every batch to `replay_memory`, and then delegates
        to the base-class `_train` with all arguments unchanged.

        Returns:
            Whatever the base-class `_train` returns.
        """
        print('Collecting Initial Samples...')

        initial_collect_policy = RandomTFPolicy(train_env.time_step_spec(), train_env.action_spec())
        initial_collect_driver = super()._get_step_driver(
            env=train_env,
            policy=initial_collect_policy,
            observers=[replay_memory.add_batch],
            num_steps=self._initial_collect_steps
        )
        initial_collect_driver.run = function(initial_collect_driver.run)
        initial_collect_driver.run()

        return super()._train(
            train_env=train_env,
            eval_env=eval_env,
            train_iterations=train_iterations,
            eval_episodes=eval_episodes,
            iterations_per_eval=iterations_per_eval,
            iterations_per_log=iterations_per_log,
            iterations_per_checkpoint=iterations_per_checkpoint,
            replay_memory=replay_memory,
            replay_memory_dataset=replay_memory_dataset,
            train_collect_driver=train_collect_driver,
            save_best_only=save_best_only
        )


# environments.actions - environments.actions.py

In [None]:
from enum import Enum

In [None]:
# from enum import Enum


class Action(Enum):
    """Discrete trading actions available to the agent.

    The integer values matter: the environment passes the raw action integer
    to the reward function's `get_reward(i, action)`, which uses it as a
    column index, and `len(Action)` sizes the discrete action space.
    """
    BUY = 0
    SELL = 1
    HOLD = 2

# metrics.metric - metrics.metric.py

In [None]:
# from abc import ABC, abstractmethod


class Metric(ABC):
    """Abstract base for a metric that is accumulated over one episode.

    Subclasses implement `reset`, `update` and `result`. `register` stores the
    current `result()` in a history list exposed as `episode_metrics`.
    """

    def __init__(self, name: str):
        self._episode_metrics = []
        self._name = name

    @property
    def name(self) -> str:
        """Name given to the metric at construction."""
        return self._name

    @property
    def episode_metrics(self) -> list[float]:
        """History of registered results (the internal list, not a copy)."""
        return self._episode_metrics

    @abstractmethod
    def reset(self):
        """Clear the per-episode state."""

    @abstractmethod
    def update(self, log_pnl: float):
        """Fold one step's `log_pnl` into the per-episode state."""

    @abstractmethod
    def result(self) -> float:
        """Return the metric value for the current per-episode state."""

    def register(self):
        """Append the current `result()` to the history."""
        episode_value = self.result()
        self._episode_metrics.append(episode_value)


# environments.environment - environments.environment.py


In [None]:
import gym

In [None]:
# import gym
# import numpy as np
# from environments.actions import Action
# from metrics.metric import Metric

class TradingEnvironment(gym.Env):
    """Gym environment that walks through a fixed array of precomputed states.

    Each step looks up the reward for the current state index and action from
    the configured reward function, advances the index by one, and reports
    `done` at every `episode_steps` boundary or when the last state is reached
    (in which case the index wraps back to 0).

    Note that `step` returns a 3-tuple `(next_state, reward, done)`.
    """

    def __init__(self, env_config: dict):
        """Validate and unpack `env_config`.

        Args:
            env_config: Must contain the keys:
                'states': array of states indexed along axis 0; its dtype
                    must match the observation space dtype (float32).
                'reward_fn': object exposing `get_reward(i=..., action=...)`.
                'episode_steps': number of steps per episode.
                'metrics': list of `Metric` objects, or None for no metrics.

        Raises:
            AssertionError: If a key is missing, there are fewer usable
                states than `episode_steps`, or the states dtype is wrong.
        """
        for required_key in ('states', 'reward_fn', 'episode_steps', 'metrics'):
            # The message names the actual key; the 'reward_fn' check used
            # to report "reward_function", which is not the expected key.
            assert required_key in env_config, f'AssertionError: Expected "{required_key}" in env_config'

        self._states = env_config['states']
        self._reward_function = env_config['reward_fn']
        self._episode_steps = env_config['episode_steps']

        self._metrics = env_config['metrics']

        if self._metrics is None:
            self._metrics = []

        # Highest index `step` may advance to; one less than the row count.
        self._num_states = self._states.shape[0] - 1

        assert self._num_states >= self._episode_steps, \
            'AssertionError: Not enough states are provided in the environment: ' \
            f'num_states = {self._num_states}, episode_steps = {self._episode_steps}'

        self._state_index = 0

        self.observation_space = gym.spaces.Box(
            low=self._states.min(axis=0),
            high=self._states.max(axis=0),
            shape=self._states.shape[1:],
            dtype=np.float32
        )
        self.action_space = gym.spaces.Discrete(n=len(Action))

        assert self._states.dtype == self.observation_space.dtype, \
            f'AssertionError: Expected states to have dtype = {self.observation_space.dtype}, got {self._states.dtype}'

    @property
    def metrics(self) -> list[Metric]:
        """The metric objects updated by this environment."""
        return self._metrics

    def update_metrics(self, log_pnl: float):
        """Call `update(log_pnl=...)` on every metric."""
        for metric in self._metrics:
            metric.update(log_pnl=log_pnl)

    def register_metrics(self):
        """Call `register()` on every metric."""
        for metric in self._metrics:
            metric.register()

    def reset(self) -> np.ndarray:
        """Reset all metrics and return the state at the current index.

        The state index itself is not rewound here; it only returns to 0 when
        `step` reaches the last state.
        """
        for metric in self._metrics:
            metric.reset()
        return self._states[self._state_index]

    def step(self, action: int) -> tuple[np.ndarray, float, bool]:
        """Apply `action` at the current state and advance by one state.

        Args:
            action: Integer action, compared against `Action.HOLD.value`.

        Returns:
            `(next_state, reward, done)`.
        """
        # Reward is looked up for the index *before* advancing.
        reward = self._reward_function.get_reward(i=self._state_index, action=action)

        self._state_index += 1
        next_state = self._states[self._state_index]

        if self._state_index == self._num_states:
            # Ran out of data: finish the episode and wrap around.
            done = True
            self._state_index = 0
        elif self._state_index % self._episode_steps == 0:
            done = True
        else:
            done = False

        # HOLD contributes nothing to the metrics, whatever its reward is.
        log_pnl = 0.0 if action == Action.HOLD.value else reward
        self.update_metrics(log_pnl=log_pnl)

        if done:
            self.register_metrics()
        return next_state, reward, done

    def render(self, mode=None):
        """Print the state at the current index; `mode` is ignored."""
        print('\n--- Current State ---')
        print(self._states[self._state_index])


# rules.rule import Rule


In [None]:
# import numpy as np
# from abc import ABC, abstractmethod


class Rule(ABC):
    """Abstract action filter: maps a proposed action to the action to execute."""

    @abstractmethod
    def filter(self, action: int) -> int:
        """Return the (possibly replaced) action for the proposed `action`."""

# environments.wrappers.tf.tfenv import TFTradingEnvironment


In [None]:
from tf_agents.environments.py_environment import PyEnvironment
from tf_agents.specs.array_spec import BoundedArraySpec
from tf_agents.trajectories import time_step

In [None]:
# import numpy as np
# from tf_agents.environments.py_environment import PyEnvironment
# from tf_agents.specs.array_spec import BoundedArraySpec
# from tf_agents.trajectories import time_step
# from environments.actions import Action
# from environments.environment import TradingEnvironment
# from metrics.metric import Metric
# from rules.rule import Rule


class TFTradingEnvironment(PyEnvironment):
    """`PyEnvironment` adapter around a `TradingEnvironment`.

    Translates the wrapped environment's `(observation, reward, done)` step
    results into TF-Agents `TimeStep`s and exposes its metrics.
    """

    def __init__(self, env: TradingEnvironment):
        super().__init__()

        self._env = env
        self._done = False
        self._discount_rate = 1.0

        # The observation spec mirrors the wrapped env's observation space.
        observation_space = self._env.observation_space
        self._observation_spec = BoundedArraySpec(
            shape=observation_space.shape,
            dtype=observation_space.dtype,
            minimum=observation_space.low,
            maximum=observation_space.high,
            name='observation'
        )
        # A single scalar action in [0, len(Action) - 1].
        self._action_spec = BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=len(Action) - 1, name='action'
        )

    def get_metrics(self) -> list[Metric]:
        """Return the wrapped environment's metric objects."""
        return self._env.metrics

    def get_episode_metrics(self) -> dict[str, float]:
        """Return the current `result()` of every metric, keyed by metric name."""
        return dict((metric.name, metric.result()) for metric in self._env.metrics)

    def action_spec(self) -> BoundedArraySpec:
        return self._action_spec

    def observation_spec(self) -> BoundedArraySpec:
        return self._observation_spec

    def _reset(self) -> time_step.TimeStep:
        """Reset the wrapped environment and return a restart time step."""
        self._done = False
        return time_step.restart(observation=self._env.reset())

    def _step(self, action_spec) -> time_step.TimeStep:
        """Advance the wrapped environment by one step.

        Despite its name, `action_spec` is the action value itself; `.item()`
        is called on it to obtain a Python scalar. If the previous step ended
        the episode, the environment is reset instead of stepped.
        """
        if self._done:
            return self._reset()

        observation, reward, self._done = self._env.step(action_spec.item())

        if not self._done:
            return time_step.transition(observation=observation, reward=reward, discount=self._discount_rate)
        return time_step.termination(observation=observation, reward=reward)

    def render(self, **kwargs):
        """Delegate to the wrapped environment's `render`; kwargs are ignored."""
        self._env.render()


class TFRuleTradingEnvironment(TFTradingEnvironment):
    """`TFTradingEnvironment` that passes every action through a chain of rules."""

    def __init__(self, env: TradingEnvironment, rules: list[Rule] or None):
        super().__init__(env=env)

        self._rules = rules

    def _step(self, action_spec) -> time_step.TimeStep:
        """Filter the action through each rule in order, then step the env.

        With `rules=None` the action is used unchanged. If the previous step
        ended the episode, the environment is reset instead of stepped.
        """
        if self._done:
            return self._reset()

        action = action_spec.item()
        # Each rule sees the output of the previous one.
        for rule in (self._rules if self._rules is not None else ()):
            action = rule.filter(action=action)

        observation, reward, self._done = self._env.step(action)

        if not self._done:
            return time_step.transition(observation=observation, reward=reward, discount=self._discount_rate)
        return time_step.termination(observation=observation, reward=reward)


# environments.rewards.function import RewardFunction

In [None]:
# import numpy as np
# from abc import ABC, abstractmethod


class RewardFunction(ABC):
    """Base class holding a precomputed reward matrix.

    A subclass supplies a two-column matrix through `_build_reward_fn`. The
    constructor adds a fee term to both columns and appends a third column,
    so the stored matrix is looked up as `[row, action]` with three columns.
    """

    def __init__(
            self,
            timeframe_size: int,
            target_horizon_len: int,
            highs: np.ndarray,
            lows: np.ndarray,
            closes: np.ndarray,
            fees_percentage: float,
            verbose: bool = False
    ):
        trade_rewards = self._build_reward_fn(
            timeframe_size=timeframe_size,
            target_horizon_len=target_horizon_len,
            highs=highs,
            lows=lows,
            closes=closes
        )

        # Fee term in log space, added to the first two columns in place.
        fee_term = np.log((1 - fees_percentage)/(1 + fees_percentage))
        trade_rewards[:, 0:2] += fee_term

        # Third column: negated row-wise best reward, capped at zero.
        third_column = -trade_rewards.max(axis=1)
        third_column[third_column > 0] = 0

        self._rewards_fn = np.concatenate((trade_rewards, third_column[:, np.newaxis]), axis=1)

        if verbose:
            print(f'Rewards: {self._rewards_fn.shape}')

    @property
    def reward_fn(self) -> np.ndarray:
        """The stored reward matrix (the array itself, not a copy)."""
        return self._rewards_fn

    @reward_fn.setter
    def reward_fn(self, reward_fn: np.ndarray):
        self._rewards_fn = reward_fn

    def __call__(self, i: int, action: int) -> float:
        """Shorthand for `get_reward(i=i, action=action)`."""
        return self.get_reward(i=i, action=action)

    def get_reward(self, i: int, action: int) -> float:
        """Return the matrix entry at row `i`, column `action`."""
        return self._rewards_fn[i, action]

    def get_reward_fn_shape(self):
        """Return the shape of the stored reward matrix."""
        return self._rewards_fn.shape

    @abstractmethod
    def _build_reward_fn(
            self,
            timeframe_size: int,
            target_horizon_len: int,
            highs: np.ndarray,
            lows: np.ndarray,
            closes: np.ndarray
    ) -> np.ndarray:
        """Return the two-column reward matrix before fees are applied."""


#  environments.rewards.smurf import SmurfRewardFunction

In [None]:
# from environments.rewards.function import RewardFunction


class SmurfRewardFunction:
    """Wrapper that overwrites column 2 of a reward function with a constant.

    Column 2 is the column selected by `Action.HOLD` (value 2). The wrapped
    reward function's matrix is modified in place, so the original object is
    affected as well.
    """

    def __init__(
            self,
            reward_function: 'RewardFunction',
            hold_reward: float = 0.0055,
            verbose: bool = True
    ):
        """Overwrite column 2 of the wrapped reward matrix.

        Args:
            reward_function: Object exposing a settable `reward_fn` matrix,
                `get_reward(i=..., action=...)` and `get_reward_fn_shape()`.
            hold_reward: Constant written into every row of column 2.
                Defaults to the previously hard-coded 0.0055.
            verbose: If True (the default, matching the previous
                unconditional behaviour), print the first row of the
                modified matrix.
        """
        self._reward_function = reward_function

        smurf_rf = self._reward_function.reward_fn
        smurf_rf[:, 2] = hold_reward
        self._reward_function.reward_fn = smurf_rf

        if verbose:
            print(self._reward_function.reward_fn[0])

    def __call__(self, i: int, action: int) -> float:
        """Shorthand for `get_reward(i=i, action=action)`."""
        return self.get_reward(i=i, action=action)

    def get_reward(self, i: int, action: int) -> float:
        """Delegate the lookup to the wrapped reward function."""
        return self._reward_function.get_reward(i=i, action=action)

    def get_reward_fn_shape(self):
        """Return the wrapped reward function's matrix shape."""
        return self._reward_function.get_reward_fn_shape()


# environments.rewards.marketorder import MarketOrderRF


In [None]:
# import numpy as np
# from environments.rewards.function import RewardFunction


class MarketOrderRF(RewardFunction):
    """Reward function computed from close prices only.

    For each index `i`, the two columns compare `closes[i - 1]` with the
    maximum and minimum close in the window `closes[i: i + target_horizon_len]`.
    """

    def __init__(
            self,
            timeframe_size: int,
            target_horizon_len: int,
            highs: np.ndarray,
            lows: np.ndarray,
            closes: np.ndarray,
            fees_percentage: float,
            verbose: bool = False
    ):
        """Forward all arguments to `RewardFunction`.

        Args:
            timeframe_size: First index for which a reward row is built; must
                be at least 1 because each row reads `closes[i - 1]`.
            target_horizon_len: Length of the look-ahead window.
            highs: Unused by this reward function.
            lows: Unused by this reward function.
            closes: Close prices.
            fees_percentage: Fee fraction passed to the base class.
            verbose: Passed to the base class, which prints the matrix shape
                when True.
        """
        super().__init__(
            timeframe_size=timeframe_size,
            target_horizon_len=target_horizon_len,
            highs=highs,
            lows=lows,
            closes=closes,
            fees_percentage=fees_percentage,
            verbose=verbose
        )

    def _build_reward_fn(
            self,
            timeframe_size: int,
            target_horizon_len: int,
            highs: np.ndarray,
            lows: np.ndarray,
            closes: np.ndarray
    ) -> np.ndarray:
        """Build the two-column float32 reward matrix.

        Column 0 is `log(max(window) / closes[i - 1])` and column 1 is
        `log(closes[i - 1] / min(window))`.

        Raises:
            ValueError: If `timeframe_size < 1`.
        """
        if timeframe_size < 1:
            # With i == 0, `closes[i - 1]` would silently wrap around to the
            # last close and yield a meaningless reward row.
            raise ValueError(f'timeframe_size must be >= 1, got {timeframe_size}')

        rows = []
        for i in range(timeframe_size, closes.shape[0] - target_horizon_len + 1):
            reference_close = closes[i - 1]
            window = closes[i: i + target_horizon_len]
            rows.append([
                np.log(window.max()/reference_close),
                np.log(reference_close/window.min())
            ])

        # reshape keeps the result two-dimensional even when no rows exist.
        return np.float32(rows).reshape(-1, 2)


# environments.rewards.marketlimitorder import MarketLimitOrderRF

In [None]:
# import numpy as np
# from environments.rewards.function import RewardFunction


class MarketLimitOrderRF(RewardFunction):
    """Reward function computed from highs, lows and closes.

    For each index `i`, the two columns compare `closes[i - 1]` with the
    maximum high and the minimum low in the window `[i, i + target_horizon_len)`.
    """

    def __init__(
            self,
            timeframe_size: int,
            target_horizon_len: int,
            highs: np.ndarray,
            lows: np.ndarray,
            closes: np.ndarray,
            fees_percentage: float,
            verbose: bool = False
    ):
        """Forward all arguments to `RewardFunction`.

        Args:
            timeframe_size: First index for which a reward row is built; must
                be at least 1 because each row reads `closes[i - 1]`.
            target_horizon_len: Length of the look-ahead window.
            highs: High prices.
            lows: Low prices.
            closes: Close prices.
            fees_percentage: Fee fraction passed to the base class.
            verbose: Passed to the base class, which prints the matrix shape
                when True.
        """
        super().__init__(
            timeframe_size=timeframe_size,
            target_horizon_len=target_horizon_len,
            highs=highs,
            lows=lows,
            closes=closes,
            fees_percentage=fees_percentage,
            verbose=verbose
        )

    def _build_reward_fn(
            self,
            timeframe_size: int,
            target_horizon_len: int,
            highs: np.ndarray,
            lows: np.ndarray,
            closes: np.ndarray
    ) -> np.ndarray:
        """Build the two-column float32 reward matrix.

        Column 0 is `log(max(highs window) / closes[i - 1])` and column 1 is
        `log(closes[i - 1] / min(lows window))`.

        Raises:
            ValueError: If `timeframe_size < 1`.
        """
        if timeframe_size < 1:
            # With i == 0, `closes[i - 1]` would silently wrap around to the
            # last close and yield a meaningless reward row.
            raise ValueError(f'timeframe_size must be >= 1, got {timeframe_size}')

        rows = []
        for i in range(timeframe_size, closes.shape[0] - target_horizon_len + 1):
            reference_close = closes[i - 1]
            window_end = i + target_horizon_len
            rows.append([
                np.log(highs[i: window_end].max()/reference_close),
                np.log(reference_close/lows[i: window_end].min())
            ])

        # reshape keeps the result two-dimensional even when no rows exist.
        return np.float32(rows).reshape(-1, 2)


# metrics.trading.pnl import CumulativeLogReturn

In [None]:
# from metrics.metric import Metric


class CumulativeLogReturn(Metric):
    """Running sum of the `log_pnl` values seen since the last reset."""

    def __init__(self):
        super().__init__(name='Cumulative Log Returns')
        self._log_pnl_sum = 0

    def reset(self):
        """Start a new sum at zero."""
        self._log_pnl_sum = 0

    def update(self, log_pnl: float):
        """Add `log_pnl` to the running sum."""
        self._log_pnl_sum = self._log_pnl_sum + log_pnl

    def result(self) -> float:
        """Return the running sum."""
        running_total = self._log_pnl_sum
        return running_total


# metrics.trading.risk import InvestmentRisk

In [None]:
# from metrics.metric import Metric


class InvestmentRisk(Metric):
    """Share of losing steps among all steps with a non-zero `log_pnl`."""

    def __init__(self):
        super().__init__(name='Investment Risk')
        self._sum_good_transactions = 0
        self._sum_bad_transactions = 0

    def reset(self):
        """Zero both counters."""
        self._sum_good_transactions = 0
        self._sum_bad_transactions = 0

    def update(self, log_pnl: float):
        """Count a positive `log_pnl` as good and a negative one as bad.

        A `log_pnl` of exactly zero changes neither counter.
        """
        if log_pnl > 0:
            self._sum_good_transactions += 1
        elif log_pnl < 0:
            self._sum_bad_transactions += 1

    def result(self) -> float:
        """Return bad / (bad + good), or 0 when nothing has been counted."""
        bad_count = self._sum_bad_transactions
        counted = bad_count + self._sum_good_transactions
        if counted == 0:
            return 0
        return bad_count/counted


# metrics.trading.sharpe import SharpeRatio

In [None]:
# import numpy as np
# from metrics.metric import Metric


class SharpeRatio(Metric):
    """`exp(mean / std)` of the `log_pnl` values seen since the last reset."""

    def __init__(self):
        super().__init__(name='Sharpe')
        self._episode_log_pnls = []

    def reset(self):
        """Forget all recorded values."""
        self._episode_log_pnls = []

    def update(self, log_pnl: float):
        """Record one `log_pnl` value."""
        self._episode_log_pnls.append(log_pnl)

    def result(self) -> float:
        """Return `exp(mean / std)` over the recorded values.

        NOTE(review): with no recorded values, or when all values are equal
        (std == 0), the division is by zero / NaN and the result is NaN or
        inf. Confirm what callers expect in those cases.
        """
        returns = np.float64(self._episode_log_pnls)
        mean_return = returns.mean()
        return_std = returns.std()
        ratio = mean_return/return_std
        return np.exp(ratio)


# metrics.trading.sortino import SortinoRatio

In [None]:
# import numpy as np
# from metrics.metric import Metric


class SortinoRatio(Metric):
    """`exp(mean / std of negative values)` of the recorded `log_pnl` values."""

    def __init__(self):
        super().__init__(name='Sortino')
        self._episode_log_pnls = []

    def reset(self):
        """Forget all recorded values."""
        self._episode_log_pnls = []

    def update(self, log_pnl: float):
        """Record one `log_pnl` value."""
        self._episode_log_pnls.append(log_pnl)

    def result(self) -> float:
        """Return `exp(mean(all values) / std(negative values))`.

        NOTE(review): when no recorded value is negative, the std is taken
        over an empty selection and the result is NaN; a single negative value
        (or equal ones) gives std == 0. Confirm what callers expect there.
        """
        returns = np.float64(self._episode_log_pnls)
        mean_return = returns.mean()
        negative_returns = returns[returns < 0]
        downside_std = negative_returns.std()
        return np.exp(mean_return/downside_std)


# metrics.trading.drawdown import MaximumDrawdown

In [None]:
# from metrics.metric import Metric


class MaximumDrawdown(Metric):
    """Largest relative drop of the cumulative `log_pnl` below its running peak.

    After every update the ratio `cumulative_sum / peak` is recorded (or 1
    while the peak is still 0); the result is `1 - min(ratio)`.
    """

    def __init__(self):
        super().__init__(name='Maximum Drawdown')
        self._log_pnl_sum = 0
        self._log_pnl_sum_peak = 0
        self._hourly_mdds = []

    def reset(self):
        """Clear the cumulative sum, its peak and the recorded ratios."""
        self._log_pnl_sum = 0
        self._log_pnl_sum_peak = 0
        self._hourly_mdds = []

    def update(self, log_pnl: float):
        """Add `log_pnl` to the cumulative sum and record the sum/peak ratio.

        The peak starts at 0 and only ever increases, so while the cumulative
        sum has never been positive the recorded ratio is 1.
        NOTE(review): that means losses before the first gain are not counted
        as drawdown - confirm this is intended.
        """
        self._log_pnl_sum += log_pnl

        if self._log_pnl_sum_peak < self._log_pnl_sum:
            self._log_pnl_sum_peak = self._log_pnl_sum

        self._hourly_mdds.append(1 if self._log_pnl_sum_peak == 0 else self._log_pnl_sum/self._log_pnl_sum_peak)

    def result(self) -> float:
        """Return `1 - min(recorded ratios)`, or 0 when nothing was recorded.

        The `default=1` avoids the ValueError that `min([])` raised when the
        result was requested before the first `update` (for example right
        after a reset).
        """
        log_mdd = min(self._hourly_mdds, default=1)
        return 1 - log_mdd


# rules.nconsecutive import NConsecutive    NOT YET IMPLEMENTED

In [None]:
# import numpy as np
# from environments.actions import Action
# from rules.rule import Rule


class NConsecutive(Rule):
    """Let an action through only after it was proposed N times in a row.

    Keeps the most recent `window_size` proposed actions (including the
    current one). While fewer than `window_size` actions have been seen, or
    when the kept actions are not all identical, `Action.HOLD` is returned.

    The history is never cleared by this class, so it carries over between
    episodes.
    """

    def __init__(self, window_size: int):
        """
        Args:
            window_size: Number of identical consecutive proposals required.

        Raises:
            ValueError: If `window_size < 1`.
        """
        if window_size < 1:
            # A non-positive window previously crashed later, inside
            # `filter`, with an IndexError from popping an empty list.
            raise ValueError(f'window_size must be >= 1, got {window_size}')

        self._window_size = window_size
        self._actions_queue = []

    def filter(self, action: int) -> int:
        """Record `action` and return it if the window is full and unanimous.

        The window is evaluated *after* the current action has been recorded,
        so the N-th identical proposal in a row is already let through.
        Previously the call that filled the window always returned HOLD,
        which required N + 1 proposals at start-up (and made `window_size=1`
        swallow the very first action).
        """
        # Newest first; keep at most `window_size` entries.
        self._actions_queue = [action] + self._actions_queue[:self._window_size - 1]

        window_full = len(self._actions_queue) == self._window_size
        if window_full and len(set(self._actions_queue)) == 1:
            return action
        return Action.HOLD.value


# Training **agents**

In [None]:
# !pip install config

# database.entities.crypto import Crypto

In [None]:
# from database.entities.crypto import Crypto

class Crypto:
    """Read-only record describing a crypto asset: symbol, name and start year."""

    def __init__(self, symbol: str, name: str, start_year: int):
        self._symbol, self._name, self._start_year = symbol, name, start_year

    @property
    def symbol(self) -> str:
        """Symbol given at construction."""
        return self._symbol

    @property
    def name(self) -> str:
        """Name given at construction."""
        return self._name

    @property
    def start_year(self) -> int:
        """Start year given at construction."""
        return self._start_year


# database.network.network import DatasetDownloader

In [None]:
# import pandas as pd
# from database.entities.crypto import Crypto
# from abc import ABC, abstractmethod


class DatasetDownloader(ABC):
    """Abstract base for downloaders that fetch and update per-crypto history files.

    Provides `_store_dataset`, which checks the date column before writing the
    frame to CSV; subclasses implement the download and update steps.
    """

    def __init__(self, date_column_name: str, verbose: bool):
        self._verbose = verbose
        self._date_column_name = date_column_name

    @property
    def date_column_name(self) -> str:
        """Name of the column checked for uniqueness and ordering on store."""
        return self._date_column_name

    @property
    def verbose(self) -> bool:
        """Verbosity flag given at construction."""
        return self._verbose

    def _store_dataset(self, dataset_df: pd.DataFrame, filepath: str, columns: list or None = None):
        """Check the date column, then write `dataset_df` to `filepath` as CSV.

        Args:
            dataset_df: Frame to store; must contain the date column.
            filepath: Destination passed to `DataFrame.to_csv`.
            columns: Optional subset of columns to write.

        Raises:
            AssertionError: If the date column has duplicates or is not
                monotonically increasing.
        """
        date_column = self.date_column_name

        assert not dataset_df.duplicated(subset=date_column).any(), \
            f'AssertionError: Date column is expected to be unique, got duplicates'

        assert dataset_df[date_column].is_monotonic_increasing, \
            f'AssertionError: Date column is expected to be monotonic and increasing'

        # The index is not written to the file.
        dataset_df.to_csv(filepath, columns=columns, index=False)

    @abstractmethod
    def download_historical_data(self, crypto: Crypto, history_filepath: str) -> bool:
        """Download the history of `crypto` to `history_filepath`; return a success flag."""

    @abstractmethod
    def update_historical_data(self, crypto: Crypto, history_filepath: str) -> bool:
        """Update the history of `crypto` at `history_filepath`; return a success flag."""


# database.network.coinapi.coinapi import CoinAPIDownloader


In [None]:
import requests
from urllib.parse import urlencode

In [None]:
# import requests
# from abc import ABC, abstractmethod
# from urllib.parse import urlencode
# from database.network.network import DatasetDownloader


class CoinAPIDownloader(DatasetDownloader, ABC):
    """Abstract downloader that queries CoinAPI over HTTP using a pool of API keys.

    Subclasses supply the date column name and the request parameters. This
    class contributes URL encoding and a request loop that tries each API key
    in turn until one of them yields an HTTP 200 response.

    Args:
        verbose: When true, the key in use (masked) and the response status of
            every attempt are printed.
    """

    # Environment variable with a comma-separated list of API keys. When it is
    # set and non-empty it replaces the keys embedded in ``__init__``.
    _API_KEYS_ENV_VAR = 'COINAPI_API_KEYS'

    # Timeout (seconds) for a single HTTP request. Without a timeout
    # ``requests.get`` may block indefinitely on an unresponsive connection.
    _REQUEST_TIMEOUT_SECONDS = 60

    def __init__(self, verbose: bool):
        import os  # function-scope import: the enclosing cell only imports requests/urlencode

        super().__init__(date_column_name=self._get_date_column_name(), verbose=verbose)

        configured_keys = [
            key.strip()
            for key in os.environ.get(self._API_KEYS_ENV_VAR, '').split(',')
            if key.strip()
        ]

        # SECURITY(review): the keys below are committed in plain text. They are
        # kept only as a backward-compatible fallback; they should be rotated and
        # supplied through the COINAPI_API_KEYS environment variable instead.
        self._api_key_list = configured_keys or [
            '70E10174-E29D-449F-9F2E-6E8362931DD9',
            '27E5E40C-7A6B-45EB-A5C8-8311B049A741',
            '8F6252DE-0AD7-478F-91C7-141141E8BE8B',
            '3B49210E-100B-4F8D-9011-2BA5D38274BA',
            'BF6BF46F-B44B-416E-9656-2D2AAFBC058B',
            'B21A98A2-C953-4C73-84CF-CFFB6F712200',
            '51667E99-7686-4496-B23D-6DA54F7E37AE',
            '0921F87B-BF55-4B78-B8B0-E023B4D7A2E2',
            '3F9E3251-029C-457A-9ADA-7F21A440AAF9',
            '41EBEA2D-1A4B-4654-8A41-186639B9AB9F',
            '6B93AEC2-910C-4064-80FB-91AED487AB97',
            '83049379-23DE-4CB0-8299-7137BB836D48',
            'B08FCA1F-F454-4C34-AC01-42F16354BCBC',
            '12E5D72C-25A6-4ED6-8384-7C291EC43768',
            '4F287859-5A00-47EF-AC91-8A2629F8C6A1',
            '3744F705-2C4A-406C-AA96-EB1B557A84EF',
            '3F77D500-457E-4A96-9CE1-1DEF3FC7033B',
            '455C2228-0D6F-4B62-8336-4BAA24C1A46E',
            '7E37E058-670C-4ED6-B7BE-DC00F309D9FF',
            '0F517C3D-162C-4C5E-AE18-544B201C9BC0'
        ]

    @property
    def api_key_list(self) -> list[str]:
        """API keys tried, in order, by ``_get_response``."""
        return self._api_key_list

    @abstractmethod
    def _get_date_column_name(self) -> str:
        """Return the name of the date column of the downloaded dataset."""
        pass

    @abstractmethod
    def _get_request_params(self) -> dict[str, str]:
        """Return the query parameters (without the API key) for a request."""
        pass

    @staticmethod
    def _mask_api_key(api_key: str) -> str:
        """Return a log-safe form of ``api_key`` that exposes only its first 8 characters."""
        return f'{api_key[:8]}...'

    @staticmethod
    def _encode_request_url(base_url: str, request_params: dict, api_key: str) -> str:
        """Build the request URL from ``base_url``, the parameters and the API key.

        The caller's ``request_params`` is left untouched: the key is added to
        a copy, so it cannot leak into dictionaries that outlive this call.

        Args:
            base_url: Endpoint URL without a query string.
            request_params: Query parameters to encode.
            api_key: Value sent as the ``apikey`` query parameter.

        Returns:
            ``base_url`` followed by ``?`` and the url-encoded parameters.
        """
        query_params = {**request_params, 'apikey': api_key}
        return f'{base_url}?{urlencode(query_params)}'

    def _get_response(self, base_url: str, request_params: dict) -> requests.Response or None:
        """Request ``base_url`` with each API key until one returns HTTP 200.

        Args:
            base_url: Endpoint URL without a query string.
            request_params: Query parameters shared by every attempt.

        Returns:
            The first response whose status code is 200, or None when every
            key was tried without success.

        Raises:
            requests.RequestException: Network failures (including the request
                timeout) are not caught here and propagate to the caller.
        """
        for api_key in self._api_key_list:
            if self._verbose:
                # Only a masked key is printed so notebook outputs never carry credentials.
                print(f'Using apikey: {self._mask_api_key(api_key)}')

            encoded_request_url = self._encode_request_url(
                base_url=base_url,
                request_params=request_params,
                api_key=api_key
            )
            response = requests.get(encoded_request_url, timeout=self._REQUEST_TIMEOUT_SECONDS)

            if self._verbose:
                print(f'Response Status: {response.status_code} - {response.reason}')

            if response.status_code == 200:
                return response
        return None


# database.network.coinapi.ohlcv import OHLCVDownloader

In [None]:
import io

In [None]:
# import io
# import pandas as pd
# from enum import Enum
# from database.entities.crypto import Crypto
# from database.network.coinapi.coinapi import CoinAPIDownloader


class OHLCVDownloader(CoinAPIDownloader):
    """CoinAPI downloader for OHLCV data quoted in USD.

    ``download_historical_data`` fetches history from the ``/history`` endpoint
    and writes it to a CSV file. ``update_historical_data`` fetches rows from
    the ``/latest`` endpoint and merges them into an existing CSV file.

    Args:
        historical_frequency: A ``HistoricalFrequency`` member, or one of the
            strings ``'1HRS'`` / ``'1MIN'``.
        verbose: When true, progress messages are printed.

    Raises:
        NotImplementedError: If ``historical_frequency`` is a string that does
            not match any ``HistoricalFrequency`` value.
    """

    class HistoricalFrequency(Enum):
        """Period identifiers sent as the ``period_id`` request parameter."""
        MINUTE = '1MIN'
        HOUR = '1HRS'

    def __init__(self, historical_frequency: HistoricalFrequency or str, verbose: bool):
        frequency = historical_frequency
        if isinstance(historical_frequency, str):
            # Translate the raw string into the enum member that carries it.
            frequency = next(
                (member for member in self.HistoricalFrequency if member.value == historical_frequency),
                None
            )
            if frequency is None:
                raise NotImplementedError(f'"{historical_frequency}" frequency has not been implemented yet')
        self._historical_frequency = frequency

        super().__init__(verbose=verbose)

        # '{}' is replaced by the crypto symbol when a request is built.
        self._history_request_url = 'https://rest.coinapi.io/v1/ohlcv/{}/USD/history'
        self._latest_request_url = 'https://rest.coinapi.io/v1/ohlcv/{}/USD/latest'
        # Values sent as the 'limit' parameter of a full download / an update.
        self._download_limit = 100000
        self._update_limit = 1000

    def _get_date_column_name(self) -> str:
        """Return the column used as the date column of OHLCV datasets."""
        return 'time_period_end'

    def _get_request_params(self) -> dict:
        """Return the request parameter template.

        ``time_start`` and ``limit`` hold ``str.format`` placeholders that the
        download/update methods fill in (or remove) before the request is sent.
        """
        return dict(
            period_id=self._historical_frequency.value,
            output_format='csv',
            csv_set_delimiter=',',
            time_start='{}-{}-{}T00:00:00',
            limit='{}'
        )

    def download_historical_data(self, crypto: Crypto, history_filepath: str) -> bool:
        """Download history starting on January 1st of ``crypto.start_year``.

        Args:
            crypto: Asset whose symbol, name and start year drive the request.
            history_filepath: CSV file that receives the downloaded rows.

        Returns:
            True when a response with status 200 was stored, False otherwise.
        """
        if self.verbose:
            print(f'Downloading {crypto.name} market history data for {crypto.start_year}')

        params = self._get_request_params()
        params.update(
            time_start=params['time_start'].format(crypto.start_year, '01', '01'),
            limit=params['limit'].format(self._download_limit)
        )

        response = self._get_response(
            base_url=self._history_request_url.format(crypto.symbol),
            request_params=params
        )
        if response is None or response.status_code != 200:
            return False

        ohlcv_df = pd.read_csv(io.StringIO(response.text), sep=',')
        super()._store_dataset(dataset_df=ohlcv_df, filepath=history_filepath)
        return True

    def update_historical_data(self, crypto: Crypto, history_filepath: str) -> bool:
        """Append the latest rows to the CSV stored at ``history_filepath``.

        The latest rows are sorted by the date column, concatenated after the
        stored history, and rows whose date already exists are dropped (the
        first occurrence, i.e. the stored one, is kept).

        Args:
            crypto: Asset whose symbol and name drive the request.
            history_filepath: Existing CSV file that is read and then rewritten.

        Returns:
            True when a response with status 200 was merged and stored,
            False otherwise.
        """
        if self.verbose:
            print(f'Updating {crypto.name} market latest data for {crypto.start_year}')

        params = self._get_request_params()
        params['limit'] = params['limit'].format(self._update_limit)
        # This request is sent without a start date.
        params.pop('time_start')

        response = self._get_response(
            base_url=self._latest_request_url.format(crypto.symbol),
            request_params=params
        )
        if response is None or response.status_code != 200:
            return False

        date_column = self.date_column_name
        history_df = pd.read_csv(history_filepath)
        latest_df = pd.read_csv(io.StringIO(response.text), sep=',').sort_values(
            by=date_column, ascending=True
        )
        merged_df = pd.concat((history_df, latest_df), ignore_index=True).drop_duplicates(subset=date_column)

        super()._store_dataset(dataset_df=merged_df, filepath=history_filepath)
        return True


# config - import config.py

In [None]:
# from database.entities.crypto import Crypto
# from database.network.coinapi.ohlcv import OHLCVDownloader

class Config:
    """Static configuration shared by the notebook: assets, file paths, feature lists and clusters."""

    # --- Database ---
    # Supported assets keyed by symbol.
    # NOTE(review): Tron's exchange ticker is usually 'TRX' - confirm that 'TRON'
    # is the symbol the data provider expects.
    # NOTE(review): BAT normally stands for 'Basic Attention Token' - confirm
    # before renaming, the name may be used as a lookup or file key elsewhere.
    supported_cryptos = {
        'BTC': Crypto(symbol='BTC', name='bitcoin', start_year=2017),
        'ETH': Crypto(symbol='ETH', name='ethereum', start_year=2017),
        'SOL': Crypto(symbol='SOL',  name='solana', start_year=2020),
        'ADA': Crypto(symbol='ADA', name='ada', start_year=2017),
        'BNB': Crypto(symbol='BNB', name='bnb', start_year=2019),
        'XRP': Crypto(symbol='XRP', name='xrp', start_year=2019),
        'DOGE': Crypto(symbol='DOGE', name='doge', start_year=2020),
        'MATIC': Crypto(symbol='MATIC', name='polygon', start_year=2020),
        'TRON': Crypto(symbol='TRON', name='tron', start_year=2018),
        'LTC': Crypto(symbol='LTC', name='litecoin', start_year=2018),
        'DOT': Crypto(symbol='DOT', name='polkadot', start_year=2021),
        'AVAX': Crypto(symbol='AVAX', name='avalanche', start_year=2021),
        'XMR': Crypto(symbol='XMR', name='monero', start_year=2018),
        'BAT': Crypto(symbol='BAT', name='basic authentication token', start_year=2018),
        'LRC': Crypto(symbol='LRC', name='loopring', start_year=2018)
    }

    ohlcv_dataset_period_id = OHLCVDownloader.HistoricalFrequency.HOUR
    # Path templates; '{}' is a str.format placeholder.
    ohlcv_history_filepath = 'database/storage/downloads/ohlcv/{}.csv'
    gtrends_history_filepath = 'database/storage/downloads/gtrends/{}.csv'
    # NOTE(review): this path has no '{}' placeholder, so calling .format(...) on
    # it returns the same file for every dataset name. It is also an absolute
    # Colab/Drive path; the template kept in the comment below is the portable form.
    dataset_save_filepath = '/content/drive/MyDrive/Colab Notebooks/Sample Tradernet/Binance_BTCUSDT_d_EDIT.csv'
    # dataset_save_filepath = 'database/storage/datasets/{}.csv'
    # Every column name of a dataset file.
    all_features = [
        'date', 'open', 'high', 'low', 'close', 'volume', 'trades',
        'open_log_returns', 'high_log_returns', 'low_log_returns',
        'close_log_returns', 'volume_log_returns', 'trades_log_returns', 'hour',
        'dema', 'vwap', 'bband_up', 'bband_down', 'adl', 'obv',
        'macd_signal_diffs', 'stoch', 'aroon_up', 'aroon_down', 'rsi', 'adx', 'cci',
        'close_dema', 'close_vwap', 'bband_up_close', 'close_bband_down', 'adl_diffs2', 'obv_diffs2', 'trends'
    ]
    # Subset of columns selected as model inputs by the dataset readers.
    regression_features = [
        'open_log_returns', 'high_log_returns', 'low_log_returns',
        'close_log_returns', 'volume_log_returns', 'trades_log_returns', 'hour',
        'macd_signal_diffs', 'stoch', 'aroon_up', 'aroon_down', 'rsi', 'adx', 'cci',
        'close_dema', 'close_vwap', 'bband_up_close', 'close_bband_down', 'adl_diffs2', 'obv_diffs2', 'trends'
    ]

    # --- Model ---
    checkpoint_dir = 'database/storage/checkpoints/'

    # --- Clustering ---
    # Every entry must be a key of supported_cryptos. The first cluster used to
    # list 'XPR', a typo for 'XRP' that is not a supported symbol.
    # NOTE(review): 'ETH' appears in both clusters - confirm this is intended.
    crypto_clusters = [
        ['BTC', 'ETH', 'SOL', 'ADA', 'XRP', 'DOGE', 'DOT', 'AVAX', 'BAT', 'LRC'],
        ['ETH', 'BNB', 'MATIC', 'TRON', 'LTC', 'XMR']
    ]


In [None]:
# import config
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Reading Datasets for Tradernet


def read_dataset_for_tradernet(
        dataset_filepath,
        timeframe_size,
        target_horizon_len,
        num_eval_samples,
        fees,
        reward_fn_instance
):
    """Load a dataset and build the train/eval inputs and reward functions.

    The feature columns are min-max scaled to [0, 1]; the scaler is fitted on
    the leading rows only and the remaining rows are transformed with the
    fitted statistics. Sliding windows of ``timeframe_size`` rows are then built
    and the last ``num_eval_samples`` windows are set aside for evaluation.

    Args:
        dataset_filepath: Value formatted into ``Config.dataset_save_filepath``.
        timeframe_size: Number of consecutive rows per input window.
        target_horizon_len: Horizon length forwarded to the reward function;
            it also reduces the number of windows that are built.
        num_eval_samples: Number of trailing windows used for evaluation.
        fees: Forwarded to the reward function as ``fees_percentage``.
        reward_fn_instance: Callable (e.g. a class) that builds a reward
            function from the keyword arguments passed below.

    Returns:
        Tuple ``(x_train, train_reward_fn, x_eval, eval_reward_fn)``.

    Raises:
        AssertionError: If the number of windows differs from the first
            dimension reported by the matching reward function.
    """
    # Load the dataset and pick the model input columns
    dataset_df = pd.read_csv(Config.dataset_save_filepath.format(dataset_filepath))
    features_df = dataset_df[Config.regression_features]

    # Scale: fit on the leading rows, transform the rest with the same scaler
    scaler = MinMaxScaler(feature_range=(0, 1.0))
    features = features_df.to_numpy(dtype=np.float32)
    num_rows = features.shape[0]

    split_row = num_rows - num_eval_samples - target_horizon_len - timeframe_size + 1
    # Both slices are views, so assigning through them rewrites `features` in place.
    fit_rows, remaining_rows = features[:split_row], features[split_row:]
    fit_rows[:] = scaler.fit_transform(fit_rows)
    remaining_rows[:] = scaler.transform(remaining_rows)

    # Sliding windows over the scaled rows
    window_starts = range(num_rows - timeframe_size - target_horizon_len + 1)
    inputs = np.float32([features[start: start + timeframe_size] for start in window_starts])

    # The trailing windows are kept for evaluation
    num_train_inputs = inputs.shape[0] - num_eval_samples
    x_train, x_eval = inputs[:num_train_inputs], inputs[num_train_inputs:]

    # Raw (unscaled) price columns feed the reward functions
    closes, highs, lows = (
        dataset_df[column].to_numpy(dtype=np.float32) for column in ('close', 'high', 'low')
    )

    def _reward_fn_for(rows):
        # Build a reward function over the given slice of the price columns.
        return reward_fn_instance(
            timeframe_size=timeframe_size,
            target_horizon_len=target_horizon_len,
            highs=highs[rows],
            lows=lows[rows],
            closes=closes[rows],
            fees_percentage=fees
        )

    train_stop = num_rows - num_eval_samples
    eval_start = num_rows - num_eval_samples - timeframe_size - target_horizon_len + 1
    train_reward_fn = _reward_fn_for(slice(None, train_stop))
    eval_reward_fn = _reward_fn_for(slice(eval_start, None))

    assert x_train.shape[0] == train_reward_fn.get_reward_fn_shape()[0], \
        f'AssertionError: DimensionMismatch: x_train: {x_train.shape}, train_reward_fn: {train_reward_fn.get_reward_fn_shape()}'
    assert x_eval.shape[0] == eval_reward_fn.get_reward_fn_shape()[0], \
        f'AssertionError: DimensionMismatch: x_eval: {x_eval.shape}, eval_reward_fn: {eval_reward_fn.get_reward_fn_shape()}'

    return x_train, train_reward_fn, x_eval, eval_reward_fn

In [None]:
# Reading Datasets for Smurf Agent


def read_dataset_for_smurf_agent(
        dataset_filepath,
        timeframe_size,
        target_horizon_len,
        num_eval_samples,
        fees,
        reward_fn_instance
):
    """Load a dataset for the Smurf agent.

    The loading, scaling, windowing and train/eval split are delegated to
    ``read_dataset_for_tradernet`` so both agents share one preprocessing
    implementation. The only Smurf-specific step is wrapping each reward
    function in a ``SmurfRewardFunction``.

    Args:
        dataset_filepath: Value formatted into ``Config.dataset_save_filepath``.
        timeframe_size: Number of consecutive rows per input window.
        target_horizon_len: Horizon length forwarded to the reward function.
        num_eval_samples: Number of trailing windows used for evaluation.
        fees: Forwarded to the reward function as ``fees_percentage``.
        reward_fn_instance: Callable (e.g. a class) that builds the reward
            function that gets wrapped.

    Returns:
        Tuple ``(x_train, train_reward_fn, x_eval, eval_reward_fn)`` where both
        reward functions are ``SmurfRewardFunction`` instances.

    Raises:
        AssertionError: If the number of windows differs from the first
            dimension reported by the matching (wrapped) reward function.
    """
    x_train, base_train_reward_fn, x_eval, base_eval_reward_fn = read_dataset_for_tradernet(
        dataset_filepath=dataset_filepath,
        timeframe_size=timeframe_size,
        target_horizon_len=target_horizon_len,
        num_eval_samples=num_eval_samples,
        fees=fees,
        reward_fn_instance=reward_fn_instance
    )

    train_reward_fn = SmurfRewardFunction(reward_function=base_train_reward_fn)
    eval_reward_fn = SmurfRewardFunction(reward_function=base_eval_reward_fn)

    # Re-check the shapes on the wrapped reward functions, as they are the ones returned.
    assert x_train.shape[0] == train_reward_fn.get_reward_fn_shape()[0], \
        f'AssertionError: DimensionMismatch: x_train: {x_train.shape}, train_reward_fn: {train_reward_fn.get_reward_fn_shape()}'
    assert x_eval.shape[0] == eval_reward_fn.get_reward_fn_shape()[0], \
        f'AssertionError: DimensionMismatch: x_eval: {x_eval.shape}, eval_reward_fn: {eval_reward_fn.get_reward_fn_shape()}'

    return x_train, train_reward_fn, x_eval, eval_reward_fn

In [None]:
# Building Tradernet Agent


def build_tradernet_agent(
        agent_instance,
        observation_spec,
        action_spec,
        time_step_spec,
        env_batch_size,
        checkpoint_filepath,
        fc_layers,
        conv_layers
):
    """Create an agent by calling ``agent_instance`` with keyword arguments.

    Args:
        agent_instance: Callable (e.g. an agent class) that builds the agent.
        observation_spec: Passed to the agent as ``input_tensor_spec``.
        action_spec: Passed through unchanged.
        time_step_spec: Passed through unchanged.
        env_batch_size: Passed through unchanged.
        checkpoint_filepath: Passed through unchanged.
        fc_layers: Passed through unchanged.
        conv_layers: Passed through unchanged.

    Returns:
        Whatever ``agent_instance`` returns.
    """
    agent_kwargs = {
        # The agent names this argument differently from the caller.
        'input_tensor_spec': observation_spec,
        'action_spec': action_spec,
        'time_step_spec': time_step_spec,
        'env_batch_size': env_batch_size,
        'checkpoint_filepath': checkpoint_filepath,
        'fc_layers': fc_layers,
        'conv_layers': conv_layers
    }
    return agent_instance(**agent_kwargs)

In [None]:
# Building Smurf Agent


def build_smurf_agent(
        agent_instance,
        observation_spec,
        action_spec,
        time_step_spec,
        env_batch_size,
        checkpoint_filepath,
        fc_layers,
        conv_layers
):
    """Create the Smurf agent by calling ``agent_instance`` with keyword arguments.

    Args:
        agent_instance: Callable (e.g. an agent class) that builds the agent.
        observation_spec: Handed to the agent under the name ``input_tensor_spec``.
        action_spec: Forwarded as is.
        time_step_spec: Forwarded as is.
        env_batch_size: Forwarded as is.
        checkpoint_filepath: Forwarded as is.
        fc_layers: Forwarded as is.
        conv_layers: Forwarded as is.

    Returns:
        The object produced by ``agent_instance``.
    """
    constructor_kwargs = dict(
        input_tensor_spec=observation_spec,
        action_spec=action_spec,
        time_step_spec=time_step_spec,
        env_batch_size=env_batch_size,
        checkpoint_filepath=checkpoint_filepath,
        fc_layers=fc_layers,
        conv_layers=conv_layers
    )
    agent = agent_instance(**constructor_kwargs)
    return agent

In [None]:
# Building Tradernet Agent Trainer


def train_tradernet_agent(
        dataset_filepath,
        timeframe_size,
        target_horizon_len,
        num_eval_samples,
        fees,
        reward_fn_instance,
        agent_instance,
        checkpoint_filepath,
        fc_layers,
        conv_layers,
        train_episode_steps,
        train_iterations,
        eval_episodes,
        steps_per_eval,
        steps_per_log,
        steps_per_checkpoint,
        save_best_only
):
    """Read a dataset, build train/eval environments and train a TraderNet agent.

    Args:
        dataset_filepath, timeframe_size, target_horizon_len, num_eval_samples,
        fees, reward_fn_instance: Forwarded to ``read_dataset_for_tradernet``.
        agent_instance, checkpoint_filepath, fc_layers, conv_layers: Forwarded
            to ``build_tradernet_agent``.
        train_episode_steps: ``episode_steps`` of the training environment.
        train_iterations, eval_episodes, save_best_only: Forwarded to
            ``agent.train``.
        steps_per_eval, steps_per_log, steps_per_checkpoint: Forwarded to
            ``agent.train`` as the matching ``iterations_per_*`` arguments.

    Returns:
        Tuple ``(eval_avg_returns, eval_metrics)``: the value returned by
        ``agent.train`` and the metrics reported by the evaluation environment.
    """
    def _make_trading_env(states, reward_fn, episode_steps):
        # Every environment receives its own freshly created metric objects.
        return TFTradingEnvironment(
            env=TradingEnvironment(env_config={
                'states': states,
                'reward_fn': reward_fn,
                'episode_steps': episode_steps,
                'metrics': [CumulativeLogReturn(), InvestmentRisk(), SharpeRatio(), SortinoRatio(), MaximumDrawdown()]
            })
        )

    x_train, train_reward_fn, x_eval, eval_reward_fn = read_dataset_for_tradernet(
        dataset_filepath=dataset_filepath,
        timeframe_size=timeframe_size,
        target_horizon_len=target_horizon_len,
        num_eval_samples=num_eval_samples,
        fees=fees,
        reward_fn_instance=reward_fn_instance
    )

    train_env = _make_trading_env(x_train, train_reward_fn, train_episode_steps)
    # The evaluation episode length is derived from the number of eval windows.
    eval_env = _make_trading_env(x_eval, eval_reward_fn, x_eval.shape[0] - 1)

    tf_train_env = TFPyEnvironment(environment=train_env)
    tf_eval_env = TFPyEnvironment(environment=eval_env)

    # Specs and batch size are taken from the training environment.
    agent = build_tradernet_agent(
        agent_instance=agent_instance,
        observation_spec=tf_train_env.observation_spec(),
        action_spec=tf_train_env.action_spec(),
        time_step_spec=tf_train_env.time_step_spec(),
        env_batch_size=tf_train_env.batch_size,
        checkpoint_filepath=checkpoint_filepath,
        fc_layers=fc_layers,
        conv_layers=conv_layers,
    )
    agent.initialize()

    training_schedule = dict(
        train_iterations=train_iterations,
        eval_episodes=eval_episodes,
        iterations_per_eval=steps_per_eval,
        iterations_per_log=steps_per_log,
        iterations_per_checkpoint=steps_per_checkpoint,
        save_best_only=save_best_only
    )
    eval_avg_returns = agent.train(train_env=tf_train_env, eval_env=tf_eval_env, **training_schedule)

    # Metrics are read from the unwrapped evaluation environment after training.
    return eval_avg_returns, eval_env.get_metrics()

In [None]:
# Building Smurf Agent Trainer


def train_smurf_agent(
        dataset_filepath,
        timeframe_size,
        target_horizon_len,
        num_eval_samples,
        fees,
        reward_fn_instance,
        agent_instance,
        checkpoint_filepath,
        fc_layers,
        conv_layers,
        train_episode_steps,
        train_iterations,
        eval_episodes,
        steps_per_eval,
        steps_per_log,
        steps_per_checkpoint,
        save_best_only
):
    """Read a dataset, build train/eval environments and train a Smurf agent.

    Args:
        dataset_filepath, timeframe_size, target_horizon_len, num_eval_samples,
        fees, reward_fn_instance: Forwarded to ``read_dataset_for_smurf_agent``.
        agent_instance, checkpoint_filepath, fc_layers, conv_layers: Forwarded
            to ``build_smurf_agent``.
        train_episode_steps: ``episode_steps`` of the training environment.
        train_iterations, eval_episodes, save_best_only: Forwarded to
            ``agent.train``.
        steps_per_eval, steps_per_log, steps_per_checkpoint: Forwarded to
            ``agent.train`` as the matching ``iterations_per_*`` arguments.

    Returns:
        Tuple ``(eval_avg_returns, eval_metrics)``: the value returned by
        ``agent.train`` and the metrics reported by the evaluation environment.
    """
    x_train, train_reward_fn, x_eval, eval_reward_fn = read_dataset_for_smurf_agent(
        dataset_filepath=dataset_filepath,
        timeframe_size=timeframe_size,
        target_horizon_len=target_horizon_len,
        num_eval_samples=num_eval_samples,
        fees=fees,
        reward_fn_instance=reward_fn_instance
    )

    # Training environment: fixed-length episodes over the training windows.
    train_env_config = {
        'states': x_train,
        'reward_fn': train_reward_fn,
        'episode_steps': train_episode_steps,
        'metrics': [CumulativeLogReturn(), InvestmentRisk(), SharpeRatio(), SortinoRatio(), MaximumDrawdown()]
    }
    train_env = TFTradingEnvironment(env=TradingEnvironment(env_config=train_env_config))

    # Evaluation environment: episode length derived from the number of eval
    # windows, with its own metric objects.
    eval_env_config = {
        'states': x_eval,
        'reward_fn': eval_reward_fn,
        'episode_steps': x_eval.shape[0] - 1,
        'metrics': [CumulativeLogReturn(), InvestmentRisk(), SharpeRatio(), SortinoRatio(), MaximumDrawdown()]
    }
    eval_env = TFTradingEnvironment(env=TradingEnvironment(env_config=eval_env_config))

    tf_train_env, tf_eval_env = (
        TFPyEnvironment(environment=environment) for environment in (train_env, eval_env)
    )

    # Specs and batch size are taken from the training environment.
    agent = build_smurf_agent(
        agent_instance=agent_instance,
        observation_spec=tf_train_env.observation_spec(),
        action_spec=tf_train_env.action_spec(),
        time_step_spec=tf_train_env.time_step_spec(),
        env_batch_size=tf_train_env.batch_size,
        checkpoint_filepath=checkpoint_filepath,
        fc_layers=fc_layers,
        conv_layers=conv_layers,
    )
    agent.initialize()

    eval_avg_returns = agent.train(
        train_env=tf_train_env,
        eval_env=tf_eval_env,
        train_iterations=train_iterations,
        eval_episodes=eval_episodes,
        iterations_per_eval=steps_per_eval,
        iterations_per_log=steps_per_log,
        iterations_per_checkpoint=steps_per_checkpoint,
        save_best_only=save_best_only
    )

    # Metrics are read from the unwrapped evaluation environment after training.
    eval_metrics = eval_env.get_metrics()
    return eval_avg_returns, eval_metrics

In [None]:
# Building Train Configs for Tradernet Agent

# Dataset label -> value handed to the trainer as `dataset_filepath`.
tradernet_datasets_dict = {symbol: symbol for symbol in ('BTC', 'ETH', 'ADA', 'XRP', 'LTC')}

# Reward function label -> reward function class.
tradernet_rewards_dict = dict([
    ('Market-Orders', MarketOrderRF),
    ('Market-Limit Orders', MarketLimitOrderRF)
])

# Agent label -> agent class plus its training schedule.
tradernet_agents_configs = {
    'DDQN': dict(
        agent_instance=DQNAgent,
        train_iterations=50000,
        steps_per_eval=500,
        steps_per_log=500,
        steps_per_checkpoint=500
    ),
    'PPO': dict(
        agent_instance=PPOAgent,
        train_iterations=1000,
        steps_per_eval=10,
        steps_per_log=10,
        steps_per_checkpoint=10
    )
}

# Settings shared by every TraderNet run, whatever the agent, dataset or reward.
tradernet_train_dict = dict(
    timeframe_size=12,
    target_horizon_len=20,
    num_eval_samples=2250,
    fees=0.007,
    fc_layers=[256, 256],
    conv_layers=[(32, 3, 1)],
    train_episode_steps=100,
    eval_episodes=1,
    save_best_only=True
)

In [None]:
# Building Train Configs for Smurf Agent

# Dataset label -> value handed to the trainer as `dataset_filepath`.
smurf_datasets_dict = dict(BTC='BTC', ETH='ETH', ADA='ADA', XRP='XRP', LTC='LTC')

# Reward function label -> reward function class.
smurf_rewards_dict = {}
smurf_rewards_dict['Market-Orders'] = MarketOrderRF
smurf_rewards_dict['Market-Limit Orders'] = MarketLimitOrderRF

# Agent label -> agent class plus its training schedule (PPO is listed first here).
smurf_agents_configs = dict(
    PPO={
        'agent_instance': PPOAgent,
        'train_iterations': 1000,
        'steps_per_eval': 10,
        'steps_per_log': 10,
        'steps_per_checkpoint': 10
    },
    DDQN={
        'agent_instance': DQNAgent,
        'train_iterations': 50000,
        'steps_per_eval': 500,
        'steps_per_log': 500,
        'steps_per_checkpoint': 500
    }
)

# Settings shared by every Smurf run, whatever the agent, dataset or reward.
smurf_train_dict = dict(
    timeframe_size=12,
    target_horizon_len=20,
    num_eval_samples=2250,
    fees=0.01,
    fc_layers=[256, 256],
    conv_layers=[(32, 3, 1)],
    train_episode_steps=100,
    eval_episodes=1,
    save_best_only=True
)

In [None]:
# Run TraderNet Experiments

import os  # used to create the CSV output directory before writing into it

# Results are stored as tradernet_results[agent][dataset][reward_fn] = (eval_avg_returns, eval_metrics).
tradernet_results = {
    'PPO': {dataset_name: {} for dataset_name in tradernet_datasets_dict.keys()},
    'DDQN': {dataset_name: {} for dataset_name in tradernet_datasets_dict.keys()}
}

# Plot styling lookups (not used by the loop below).
tradernet_colors = {
    'BTC': 'green',
    'ETH': 'blue',
    'XRP': 'red',
    'ADA': 'black',
    'LTC': 'orange'
}
# NOTE(review): 'Market-Limit-Orders' does not match the reward name
# 'Market-Limit Orders' used in tradernet_rewards_dict; a lookup by
# reward_fn_name would raise KeyError. Confirm which spelling the plots use.
tradernet_linestyles = {
    'Market-Orders': '--',
    'Market-Limit-Orders': '-'
}

for agent_name, agent_config in tradernet_agents_configs.items():
    for dataset_name, dataset_filepath in tradernet_datasets_dict.items():
        for reward_fn_name, reward_fn_instance in tradernet_rewards_dict.items():
            # Re-seed TensorFlow before every run.
            tf.random.set_seed(seed=0)

            train_params = {
                'dataset_filepath': dataset_filepath,
                'reward_fn_instance': reward_fn_instance,
                'checkpoint_filepath': f'database/storage/checkpoints/experiments/tradernet/{agent_name}/{dataset_name}/{reward_fn_name}/',
                **tradernet_train_dict,
                **agent_config
            }
            eval_avg_returns, eval_metrics = train_tradernet_agent(**train_params)

            tradernet_results[agent_name][dataset_name][reward_fn_name] = (eval_avg_returns, eval_metrics)

        # DataFrame.to_csv does not create missing directories, so make sure the
        # per-agent output folder exists before exporting the results.
        results_dir = f'experiments/tradernet/{agent_name}'
        os.makedirs(results_dir, exist_ok=True)

        for reward_fn_name, reward_fn_results in tradernet_results[agent_name][dataset_name].items():
            eval_avg_returns, eval_metrics = reward_fn_results

            metrics_dict = {
                # NOTE(review): the axis assumes 10000 steps between evaluations,
                # while the agent configs evaluate every 500 / 10 iterations -
                # confirm the intended unit.
                'steps': [10000*i for i in range(len(eval_avg_returns))],
                'average_returns': eval_avg_returns,
                **{metric.name: metric.episode_metrics for metric in eval_metrics}
            }
            metrics_df = pd.DataFrame(metrics_dict)
            metrics_df.to_csv(f'{results_dir}/{dataset_name}_{reward_fn_name}.csv', index=False)


KeyError: ignored

In [None]:
# Run TraderNet Experiments with Smurf

import os  # used to create the CSV output directory before writing into it

# Results are stored as smurf_results[agent][dataset][reward_fn] = (eval_avg_returns, eval_metrics).
smurf_results = {
    'PPO': {dataset_name: {} for dataset_name in smurf_datasets_dict.keys()},
    'DDQN': {dataset_name: {} for dataset_name in smurf_datasets_dict.keys()}
}

# Plot styling lookups (not used by the loop below).
smurf_colors = {
    'BTC': 'green',
    'ETH': 'blue',
    'XRP': 'red',
    'ADA': 'black',
    'LTC': 'orange'
}
# NOTE(review): 'Market-Limit-Orders' does not match the reward name
# 'Market-Limit Orders' used in smurf_rewards_dict; a lookup by
# reward_fn_name would raise KeyError. Confirm which spelling the plots use.
smurf_linestyles = {
    'Market-Orders': '--',
    'Market-Limit-Orders': '-'
}

for agent_name, agent_config in smurf_agents_configs.items():
    for dataset_name, dataset_filepath in smurf_datasets_dict.items():
        for reward_fn_name, reward_fn_instance in smurf_rewards_dict.items():
            # Re-seed TensorFlow before every run.
            tf.random.set_seed(seed=0)

            train_params = {
                'dataset_filepath': dataset_filepath,
                'reward_fn_instance': reward_fn_instance,
                'checkpoint_filepath': f'database/storage/checkpoints/experiments/smurf/{agent_name}/{dataset_name}/{reward_fn_name}/',
                **smurf_train_dict,
                **agent_config
            }
            eval_avg_returns, eval_metrics = train_smurf_agent(**train_params)

            smurf_results[agent_name][dataset_name][reward_fn_name] = (eval_avg_returns, eval_metrics)

        # DataFrame.to_csv does not create missing directories, so make sure the
        # per-agent output folder exists before exporting the results.
        results_dir = f'experiments/smurf/{agent_name}'
        os.makedirs(results_dir, exist_ok=True)

        for reward_fn_name, reward_fn_results in smurf_results[agent_name][dataset_name].items():
            eval_avg_returns, eval_metrics = reward_fn_results

            metrics_dict = {
                # NOTE(review): the axis assumes 10000 steps between evaluations,
                # while the agent configs evaluate every 10 / 500 iterations -
                # confirm the intended unit.
                'steps': [10000*i for i in range(len(eval_avg_returns))],
                'average_returns': eval_avg_returns,
                **{metric.name: metric.episode_metrics for metric in eval_metrics}
            }
            metrics_df = pd.DataFrame(metrics_dict)
            metrics_df.to_csv(f'{results_dir}/{dataset_name}_{reward_fn_name}.csv', index=False)