In [None]:
#!pip install tensorflow-metal -U #decorator==4.4.2  # gymnasium==0.26.3 #gymnasium[box2d] tensorboard keras tensorflow moviepy

In [None]:
import copy
import datetime
import os
import pickle
import random
import time
from collections import defaultdict
from typing import List

import gymnasium as gym
import keras
import numpy as np
import tensorflow as tf
from gymnasium.utils.save_video import save_video
from keras import layers
from keras.layers import Dense

# Setting to CPU
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [None]:
# Environment sanity check: print the installed gymnasium version and report
# whether TensorFlow lists at least one physical GPU device.
import gymnasium

print(gymnasium.__version__)


# True when tf.config.list_physical_devices('GPU') returns a non-empty list.
gpu = len(tf.config.list_physical_devices('GPU')) > 0
print("GPU is", "available" if gpu else "NOT AVAILABLE")

In [None]:
def set_seed(seed: int):
    """Seed Python's ``random`` module and NumPy's global RNG with ``seed``.

    Only these two generators are seeded; TensorFlow's RNG is not touched here.
    Returns ``None``.
    """
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)


def create_env(seed, hardcore: bool = False):
    """Create a ``BipedalWalker-v3`` environment and reset it once with ``seed``.

    Args:
        seed: seed forwarded to the initial ``reset`` call.
        hardcore: forwarded to ``gym.make`` as the ``hardcore`` option.

    Returns:
        The environment, created with ``render_mode="rgb_array_list"``.
    """
    walker_env = gym.make(
        "BipedalWalker-v3",
        hardcore=hardcore,
        render_mode="rgb_array_list",
    )
    # Seeded reset so the environment starts from a reproducible state.
    walker_env.reset(seed=seed)
    return walker_env


class Networks:
    """Factory for functional Keras actor / critic models.

    Supported backbones are "MLP" and "LSTM". Layer widths come from
    ``actor_network`` / ``critic_network``; the input and output widths are read
    from the module-level ``num_parameters_recorded_per_states`` and
    ``num_actions_per_step``.
    """

    # Backbones for which the builders below contain an implementation.
    _SUPPORTED_BACKBONES = ("MLP", "LSTM")

    def __init__(
        self,
        backbone: str,
        critic_network: List[List[int]],
        actor_network: List[int],
        agent=None,
        twin=False,
    ):
        """
        Args:
            backbone: "MLP" or "LSTM" ("Transformers" is recognised but not implemented).
            critic_network: three lists of layer widths, indexed the way
                ``generate_critic_network`` reads them:
                    [0] state branch   - empty or one width (only [0][0] is used)
                    [1] action branch  - two widths
                    [2] after concatenation - one width
                Example: [[400], [400, 300], [200]].
            actor_network: two layer widths, e.g. [96, 96].
            agent: optional owner; only its ``name`` is read (for the log directory).
            twin: when using this class for TD3, twin shall be True. It is only
                stored here; no method of this class reads it.
        """
        # define the backbone of your network
        self.backbone = backbone

        # define the network architecture
        self.critic_network = critic_network
        self.actor_network = actor_network

        # passing an agent
        self.agent = agent

        self.twin = twin

    def _require_supported_backbone(self):
        """Raise instead of silently returning ``None`` for a backbone we cannot build."""
        if self.backbone in self._SUPPORTED_BACKBONES:
            return
        if self.backbone == "Transformers":
            raise NotImplementedError(
                "The 'Transformers' backbone is not implemented yet."
            )
        raise ValueError(
            f"Unknown backbone {self.backbone!r}; expected one of {self._SUPPORTED_BACKBONES}."
        )

    def log_model(self, model: "tf.keras.Model", model_type: str):
        """Best-effort: attach a TensorBoard callback to ``model``.

        The log directory is ``logs/<agent.name>/<model_type>/<timestamp>``.
        When no agent is attached there is no directory name to log under, so
        the call is a silent no-op. Any other failure is reported as a
        ``RuntimeWarning`` rather than raised, so logging never aborts model
        construction.
        """
        # getattr: subclasses that skip __init__ may not have the attribute at all.
        agent = getattr(self, "agent", None)
        if agent is None:
            return

        try:
            logdir = os.path.join(
                "logs",
                agent.name,
                model_type,
                datetime.datetime.now().strftime("%Y%m%d-%H%M%S"),
            )
            tensorboard_callback = tf.keras.callbacks.TensorBoard(
                log_dir=logdir, update_freq=1
            )
            tensorboard_callback.set_model(model=model)
        except Exception as exc:  # logging is optional; report and carry on
            import warnings

            warnings.warn(
                f"TensorBoard logging skipped for {model_type!r}: {exc!r}",
                RuntimeWarning,
            )

    def generate_actor_network(self, name: str = "actor"):
        """Build the actor: state -> Dense -> (LSTM x2 | Dense) -> tanh actions.

        Args:
            name: label used for the TensorBoard log sub-directory
                (e.g. "actor", "target_actor").

        Raises:
            NotImplementedError: backbone is "Transformers".
            ValueError: backbone is not recognised.
        """
        self._require_supported_backbone()

        # shape must be a tuple; "(n)" without the comma is just the int n
        inputs = layers.Input(
            shape=(num_parameters_recorded_per_states,), name="state_input"
        )
        out = layers.Dense(
            self.actor_network[0], activation="relu", name="MLP_state_layer"
        )(inputs)

        if self.backbone == "LSTM":
            # expand dimensions of input layers to have the timestamps
            out = layers.Lambda(
                lambda x: tf.expand_dims(x, 1), name="dimension_expand"
            )(out)
            # NOTE(review): with return_state=True this layer returns the sequence
            # plus its final states, and that whole list is fed to the next LSTM.
            # Kept as in the original - confirm this is the intended wiring.
            out = layers.LSTM(
                self.actor_network[1],
                return_sequences=True,
                return_state=True,
                dropout=0.5,
                name="LSTM_backbone_1",
            )(out)
            out = layers.LSTM(
                self.actor_network[1], dropout=0.5, name="LSTM_backbone_2"
            )(out)
        else:
            out = layers.Dense(
                self.actor_network[1], activation="relu", name="MLP_backbone"
            )(out)

        # actions are from -1 to 1 so tanh is a natural choice, sigmoid can be used for scaled values
        outputs = layers.Dense(
            num_actions_per_step, activation="tanh", name="MLP_last_layer"
        )(out)
        model = tf.keras.Model(inputs, outputs)

        self.log_model(model=model, model_type=name)

        return model

    def generate_critic_network(self, name: str = "critic"):
        """Build the critic: separate state / action branches, concatenated, one Q output.

        Args:
            name: label used for the TensorBoard log sub-directory
                (e.g. "critic", "target_critic").

        Raises:
            NotImplementedError: backbone is "Transformers".
            ValueError: backbone is not recognised.
        """
        self._require_supported_backbone()

        # --- state branch: critic_network[0] (optional single Dense layer) ---
        state_input = layers.Input(
            shape=(num_parameters_recorded_per_states,), name="state_input"
        )
        if len(self.critic_network[0]) >= 1:
            state_out = layers.Dense(
                self.critic_network[0][0],
                activation="relu",
                name="MLP_state_layer",
            )(state_input)
        else:
            state_out = state_input

        # --- action branch: critic_network[1] (two layers) ---
        action_input = layers.Input(
            shape=(num_actions_per_step,), name="action_input"
        )
        action_output = layers.Dense(
            self.critic_network[1][0], activation="relu", name="MLP_action_layer"
        )(action_input)

        if self.backbone == "LSTM":
            # expand dimensions of input layers to count for the timesteps
            out = layers.Lambda(lambda x: tf.expand_dims(x, 1))(action_output)
            # NOTE(review): same return_state wiring as in the actor - confirm intent.
            out = layers.LSTM(
                self.critic_network[1][1],
                return_sequences=True,
                return_state=True,
                dropout=0.5,
                name="LSTM_action_layer_backbone_1",
            )(out)
            action_output = layers.LSTM(
                self.critic_network[1][1],
                dropout=0.5,
                name="LSTM_action_layer_backbone_2",
            )(out)
        else:
            action_output = layers.Dense(
                self.critic_network[1][1],
                activation="relu",
                name="MLP_action_layer_backbone",
            )(action_output)

        # Both are passed through separate layer before concatenating
        concat = layers.Concatenate()([state_out, action_output])

        # --- head: critic_network[2] then a single linear Q-value ---
        out = layers.Dense(
            self.critic_network[2][0], activation="relu", name="after_Concatenation"
        )(concat)
        outputs = layers.Dense(1)(out)

        # Outputs single value for a given state-action pair
        model = tf.keras.Model([state_input, action_input], outputs)
        self.log_model(model=model, model_type=name)

        return model


class Actor(keras.Model):
    """Actor network: Dense(state_dim) -> Dense(256, relu) x 2 -> Dense(action_dim, tanh)."""

    def __init__(self, state_dim, action_dim):
        """
        Args:
            state_dim: observation size; used as the width of the first (linear) layer.
            action_dim: action size; used as the width of the tanh output layer.
        """
        super().__init__()
        # Attribute names and creation order are kept so weight ordering is unchanged.
        self.inp_layer = Dense(state_dim)
        self.layer_1 = Dense(256, activation='relu')
        self.layer_2 = Dense(256, activation='relu')
        self.out_layer = Dense(action_dim, activation='tanh')

    def call(self, observation):
        """Run ``observation`` through the four layers in order and return the result."""
        hidden = observation
        for dense in (self.inp_layer, self.layer_1, self.layer_2, self.out_layer):
            hidden = dense(hidden)
        return hidden


class CriticDDPG(keras.Model):
    """Single critic network mapping a (state, action) pair to one value."""

    def __init__(self, state_dim, action_dim):
        """
        Args:
            state_dim: size of the state the environment produces.
            action_dim: size of the action the environment accepts.

        The first (linear) layer has ``state_dim + action_dim`` units, followed by
        two 256-unit relu layers and a single linear output unit.
        """
        super().__init__()
        # Attribute names and creation order are kept so weight ordering is unchanged.
        self.inp_layer_1 = Dense(state_dim + action_dim)
        self.layer_1_1 = Dense(256, activation='relu')
        self.layer_2_1 = Dense(256, activation='relu')
        self.out_layer_1 = Dense(1)

    def call(self, observation_plus_action):
        """Concatenate the inputs along axis 1 and forward them through the network."""
        features = tf.concat(observation_plus_action, 1)
        for dense in (
            self.inp_layer_1,
            self.layer_1_1,
            self.layer_2_1,
            self.out_layer_1,
        ):
            features = dense(features)
        return features


class NetworksV2(Networks):
    """Factory that builds the subclassed ``Actor`` / ``CriticDDPG`` models.

    Layer sizes are fixed inside those classes, so no architecture lists are
    needed; input/output widths come from the module-level
    ``num_parameters_recorded_per_states`` and ``num_actions_per_step``.
    """

    def __init__(self, agent=None):
        """
        Args:
            agent: optional owner whose ``name`` is used by ``log_model`` for the
                log directory. ``NetworksV2()`` with no arguments still works.
        """
        # The previous ``super(Networks, self)`` only created a proxy object and
        # never ran the parent initialiser, so no instance attribute
        # (``agent``, ``backbone``, ...) was ever set.
        super().__init__(
            backbone="MLP",  # Actor / CriticDDPG are built from Dense layers only
            critic_network=[],
            actor_network=[],
            agent=agent,
        )

    def generate_critic_network(self, name: str = 'critic'):
        """Create a ``CriticDDPG`` and pass it to ``log_model`` under ``name``."""
        critic = CriticDDPG(num_parameters_recorded_per_states, num_actions_per_step)
        self.log_model(critic, name)
        return critic

    def generate_actor_network(self, name: str = 'actor'):
        """Create an ``Actor`` and pass it to ``log_model`` under ``name``."""
        actor = Actor(num_parameters_recorded_per_states, num_actions_per_step)
        self.log_model(actor, name)
        return actor


class ReplayBuffer:
    """Fixed-size FIFO experience replay stored as parallel NumPy arrays.

    Sampling is either uniform or, when ``td_error_prioritization`` is set,
    proportional to the TD-loss recorded with each transition.
    """

    def __init__(
        self,
        buffer_size: int,
        batch_size: int,
        td_error_prioritization: bool = False,
        state_dim: int = None,
        action_dim: int = None,
    ):
        """
        Args:
            buffer_size: number of transitions kept; older ones are overwritten.
            batch_size: number of transitions returned per sample.
            td_error_prioritization: sample proportionally to stored TD-loss.
            state_dim: state width; defaults to the module-level
                ``num_parameters_recorded_per_states``.
            action_dim: action width; defaults to the module-level
                ``num_actions_per_step``.
        """
        if state_dim is None:
            state_dim = num_parameters_recorded_per_states
        if action_dim is None:
            action_dim = num_actions_per_step

        self.capacity = buffer_size
        self.batch_size = batch_size
        self.td_error_prioritization = td_error_prioritization

        # logs number of times store() has been called
        self.counter = 0

        # One array per tuple element instead of a list of tuples.
        self.state_buffer = np.zeros((self.capacity, state_dim))
        self.action_buffer = np.zeros((self.capacity, action_dim))
        self.reward_buffer = np.zeros((self.capacity, 1))
        self.next_state_buffer = np.zeros((self.capacity, state_dim))
        self.terminal_or_truncated = np.ones((self.capacity, 1), dtype=bool)
        if self.td_error_prioritization:
            self.priority_value = np.zeros(self.capacity)

    def store(self, observation, td_loss: float):
        """Write one transition at the current FIFO slot.

        Args:
            observation: 5-tuple (state, action, reward, next_state, terminal flag).
            td_loss: priority for this transition; only used with prioritisation.
                May be a scalar or a single-element array.
        """
        index = self.counter % self.capacity

        (
            self.state_buffer[index],
            self.action_buffer[index],
            self.reward_buffer[index],
            self.next_state_buffer[index],
            self.terminal_or_truncated[index],
        ) = observation

        if self.td_error_prioritization:
            # Take the single value explicitly instead of relying on NumPy's
            # implicit array-to-scalar conversion.
            self.priority_value[index] = np.asarray(td_loss, dtype=float).reshape(-1)[0]

        self.counter += 1

    def _sample_indices(self):
        """Draw ``batch_size`` row indices (with replacement) from the filled part."""
        record_range = min(self.counter, self.capacity)
        if record_range == 0:
            raise ValueError("Cannot sample from an empty replay buffer.")

        if self.td_error_prioritization:
            priorities = self.priority_value[:record_range]
            total = priorities.sum()
            if np.isfinite(total) and total > 0:
                return np.random.choice(
                    record_range, self.batch_size, p=priorities / total
                )
            # All priorities zero (or not finite): normalising would produce NaN
            # weights, so fall through to uniform sampling.

        return np.random.choice(record_range, self.batch_size)

    def mini_batch_sample(self):
        """Sample a mini-batch and return it as tensors.

        Returns:
            (state, action, reward, next_state, terminal flag) batches; the reward
            batch is cast to float32.

        Raises:
            ValueError: the buffer is empty.
        """
        batch_indices = self._sample_indices()

        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices])
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices])
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices])
        reward_batch = tf.cast(reward_batch, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(self.next_state_buffer[batch_indices])
        terminal_or_truncated_batch = tf.convert_to_tensor(
            self.terminal_or_truncated[batch_indices]
        )

        return (
            state_batch,
            action_batch,
            reward_batch,
            next_state_batch,
            terminal_or_truncated_batch,
        )


class OrnsteinUhlenbeckNoise:
    """Stateful noise process: each sample drifts toward ``mu`` and adds Gaussian noise."""

    def __init__(
        self, action_size: int, mu: float = 0, theta: float = 0.15, sigma: float = 0.2
    ):
        """
        Args:
            action_size: length of the noise vector.
            mu: value the state is pulled toward (broadcast to ``action_size``).
            theta: strength of the pull toward ``mu``.
            sigma: scale applied to the Gaussian draw.
        """
        self.theta = theta
        self.sigma = sigma
        self.mu = mu * np.ones(action_size)
        # Start at mu, on an independent copy so mu itself is never aliased.
        self.noise_state = self.mu.copy()
        self.noise_len = len(self.noise_state)

    def sample(self):
        """Advance the process one step and return the new noise state."""
        pull_to_mean = self.theta * (self.mu - self.noise_state)
        # Gaussian draw with standard deviation 0.5, scaled by sigma.
        random_kick = self.sigma * np.random.normal(0, 0.5, self.noise_len)
        step = pull_to_mean + random_kick
        self.noise_state = self.noise_state + step
        return self.noise_state


def save_object(obj, filename):
    """Pickle ``obj`` to ``filename``, creating missing parent directories first.

    Args:
        obj: any picklable object.
        filename: destination path.
    """
    if isinstance(filename, (str, bytes, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            # open() does not create directories; without this a fresh output
            # folder makes the save fail with FileNotFoundError.
            os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'wb') as outp:
        pickle.dump(obj, outp)


def load_object(filename):
    """Read ``filename`` in binary mode and return the object unpickled from it.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(filename, 'rb') as source:
        return pickle.load(source)

In [None]:
# Seed Python's `random` and NumPy before anything stochastic runs.
seed = 42
set_seed(seed)

# Build one environment and read the problem dimensions from it. The names
# below are module-level globals read by the network factories, ReplayBuffer
# and DDPGAgent defined elsewhere in this notebook.
env = create_env(seed)
num_parameters_recorded_per_states = env.observation_space.shape[0]  # first dimension of the observation shape
num_actions_per_step = env.action_space.shape[0]  # first dimension of the action shape
# Only the first component's bounds are read and later applied to every action
# component when clipping.
upper_limit = env.action_space.high[0]
lower_limit = env.action_space.low[0]

In [None]:
class DDPGAgent:
    """DDPG agent: actor/critic networks with target copies, a replay buffer and
    Ornstein-Uhlenbeck exploration noise.

    Relies on the module-level ``num_actions_per_step``, ``lower_limit`` and
    ``upper_limit``.
    """

    def __init__(
        self,
        backbone: str,  # Network setup variable
        critic_network: List[List[int]],  # Network setup variable
        actor_network: List[int],  # Network setup variable
        tdl_priority: bool,  # Sampling variable
        buffer_size: int,  # Sampling variable
        batch_size: int,  # Sampling variable
        critic_lr: float,  # Network learning parameter
        actor_lr: float,  # Network learning parameter
        gamma: float,  # Network learning parameter
        beta: float,  # Network learning parameter
        ou_mu: float = 0,  # Exploration noise parameter
        ou_theta: float = 0.15,  # Exploration noise parameter
        ou_sigma: float = 0.2,  # Exploration noise parameter
        name: str = 'basic',  # appends to agent name for folder structure purposes
        network_type: type = Networks,  # Which class to use Networks or NetworksV2
        warmup_steps: int = 10_000,  # stored transitions required before learning starts
        random_exploration_steps: int = 50_000,  # stored transitions before the actor is used
    ):
        """
        Args:
            backbone, critic_network, actor_network: forwarded to ``network_type``
                only when it is the class named 'Networks'; any other class is
                instantiated without arguments. ``backbone`` is also part of the
                agent name.
            tdl_priority: compute a TD-error per transition and sample the
                replay buffer proportionally to it.
            buffer_size, batch_size: replay buffer capacity and mini-batch size.
            critic_lr, actor_lr: Adam learning rates.
            gamma: discount factor.
            beta: soft-update rate for the target networks.
            ou_mu, ou_theta, ou_sigma: parameters of the exploration noise.
            name: suffix of the agent name (``DDPG_<backbone>_<name>``).
            network_type: network factory class.
            warmup_steps: ``train`` only stores transitions until the buffer
                counter exceeds this value.
            random_exploration_steps: ``predict_action`` samples random actions
                until the buffer counter reaches this value.
        """
        self.name = f"DDPG_{backbone}_{name}"

        self.tdl_priority = tdl_priority
        self.warmup_steps = warmup_steps
        self.random_exploration_steps = random_exploration_steps

        # initialise a replay buffer D
        self.buffer = ReplayBuffer(buffer_size, batch_size, tdl_priority)

        # define the network
        ntk = (
            network_type(backbone, critic_network, actor_network, agent=self)
            if network_type.__name__ == 'Networks'
            else network_type()
        )
        print(f'Building networks with class {network_type.__name__}.')

        # create target and regular actor network
        self.actor_network = ntk.generate_actor_network()
        self.target_actor = ntk.generate_actor_network(name='target_actor')
        # Making the weights equal initially
        self.target_actor.set_weights(self.actor_network.get_weights())

        # create target and regular critic network
        self.critic_network = ntk.generate_critic_network()
        self.target_critic = ntk.generate_critic_network(name='target_critic')
        # Making the weights equal initially
        self.target_critic.set_weights(self.critic_network.get_weights())

        # Learning rate for actor-critic models
        self.critic_optimizer = tf.keras.optimizers.legacy.Adam(critic_lr)
        self.actor_optimizer = tf.keras.optimizers.legacy.Adam(actor_lr)

        # Discount factor for future rewards
        self.gamma = gamma

        # target learning rate
        self.beta = beta

        # initiate noise
        self.noise = OrnsteinUhlenbeckNoise(
            num_actions_per_step, ou_mu, ou_theta, ou_sigma
        )

    @staticmethod
    def _as_batch(value):
        """Convert ``value`` to a tensor, adding a leading batch axis to rank-1 input."""
        tensor = tf.convert_to_tensor(value)
        if tensor.shape.rank == 1:
            tensor = tf.expand_dims(tensor, 0)
        return tensor

    def _td_error(self, observation_tuple):
        """Absolute TD-error |y - Q(s, a)| of one transition, as a NumPy array."""
        prev_state, action, reward, state, terminal_or_truncated = observation_tuple
        prev_state = self._as_batch(prev_state)
        state = self._as_batch(state)
        action = self._as_batch(action)

        # True value, y: bootstrap from the NEXT state unless the episode ended.
        q_s_dash_best_a_dash = (
            self.target_critic([state, self.target_actor(state)])
            if not terminal_or_truncated
            else 0
        )
        y = reward + self.gamma * q_s_dash_best_a_dash
        # Prediction, y_hat: must be evaluated on the state the action was taken
        # in (prev_state). It was previously evaluated on the next state.
        y_chapeau = self.critic_network([prev_state, action])
        return tf.math.abs(tf.math.subtract(y, y_chapeau)).numpy()

    def _soft_update(self, target, online):
        """target <- beta * online + (1 - beta) * target, weight by weight."""
        target.set_weights(
            [
                self.beta * w_online + (1 - self.beta) * w_target
                for w_online, w_target in zip(
                    online.get_weights(), target.get_weights()
                )
            ]
        )

    def train(self, observation_tuple):
        """Store one transition and, after warm-up, run one actor/critic update.

        Args:
            observation_tuple: (prev_state, action, reward, state, terminal flag).
        """
        # td_loss is only used for tdl_priority!
        td_loss = self._td_error(observation_tuple) if self.tdl_priority else 0

        # store transition in buffer
        self.buffer.store(observation_tuple, td_loss)

        if self.buffer.counter <= self.warmup_steps:
            return

        # minibatch sampled from buffer
        (
            state_batch,
            action_batch,
            reward_batch,
            next_state_batch,
            terminal_batch,
        ) = self.buffer.mini_batch_sample()

        # Bellman target, computed outside the tape so it is a constant for the
        # critic update. max Q(s', a') is forced to 0 for terminal transitions.
        target_actions = self.target_actor(next_state_batch, training=True)
        q_s_dash_best_a_dash = self.target_critic(
            [next_state_batch, target_actions], training=True
        )
        q_s_dash_best_a_dash = tf.where(
            terminal_batch,
            tf.zeros_like(q_s_dash_best_a_dash),
            q_s_dash_best_a_dash,
        )
        y = reward_batch + self.gamma * q_s_dash_best_a_dash

        # perform gradient descent step for critic network
        with tf.GradientTape() as tape:
            # Agent's prediction for the current action-state value
            critic_value = self.critic_network(
                [state_batch, action_batch], training=True
            )
            # MSE loss
            critic_loss = tf.math.reduce_mean(
                tf.math.square(tf.math.subtract(y, critic_value))
            )

        # Update agent's critic's parameters only
        critic_grad = tape.gradient(
            critic_loss, self.critic_network.trainable_variables
        )
        self.critic_optimizer.apply_gradients(
            zip(critic_grad, self.critic_network.trainable_variables)
        )

        # perform gradient descent step for actor network
        with tf.GradientTape() as tape:
            actions = self.actor_network(state_batch, training=True)
            critic_value = self.critic_network([state_batch, actions], training=True)
            # Negative sign: maximise the critic's value of the actor's action
            actor_loss = tf.math.scalar_mul(-1, tf.math.reduce_mean(critic_value))

        actor_grad = tape.gradient(actor_loss, self.actor_network.trainable_variables)
        self.actor_optimizer.apply_gradients(
            zip(actor_grad, self.actor_network.trainable_variables)
        )

        # update target network parameters
        self._soft_update(self.target_actor, self.actor_network)
        self._soft_update(self.target_critic, self.critic_network)

    def predict_action(self, state, env):
        """Choose an action for ``state``.

        Until ``random_exploration_steps`` transitions have been stored, a random
        action is sampled from ``env.action_space``. Afterwards the actor's output
        plus OU noise is clipped to ``[lower_limit, upper_limit]`` and the first
        row is returned as a tensor.
        """
        if self.buffer.counter < self.random_exploration_steps:
            # pure random exploration phase
            return env.action_space.sample()

        # using actor network, predict actions value
        selected_actions = self.actor_network(state)

        # calculating noise (kept on the instance, as before)
        self.prev_ou_level = self.noise.sample()
        self.prev_ou_level = np.expand_dims(self.prev_ou_level, 0)

        # Adding noise to action
        selected_actions = np.add(self.prev_ou_level, selected_actions.numpy())

        # We make sure action is within bounds
        action = np.clip(selected_actions, lower_limit, upper_limit)
        return tf.convert_to_tensor(action)[0]

In [None]:
def run_experiment(
    env, agent, total_n_episodes, seed, training=True, record_modulo: int = 100
):
    """Run ``agent`` in ``env`` for ``total_n_episodes`` episodes.

    Args:
        env: environment; it is wrapped in ``RecordEpisodeStatistics`` here.
        agent: object exposing ``name``, ``buffer.counter``, ``predict_action``
            and ``train``.
        total_n_episodes: number of episodes to run.
        seed: seeds the RNGs and is passed to every ``env.reset`` call.
        training: when True, ``agent.train`` is called after every step.
        record_modulo: actions are logged for episodes divisible by this value.

    Returns:
        (agent, wrapped env, dict mapping logged episode index -> list of actions).
    """
    # collect the 3 top performance: {episode index: episode reward}
    three_best_performing_agent = {x: -200 for x in range(3)}

    # paths to save videos
    run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    top_3_agents_path = os.path.join(agent.name, 'top_3_agents', run_stamp)
    capped_cubic_path = os.path.join(agent.name, 'capped_cubic_video', run_stamp)

    # record statistics
    env = gym.wrappers.RecordEpisodeStatistics(env, deque_size=total_n_episodes)

    # set seed
    set_seed(seed)

    record_episode_actions_log = defaultdict(list)
    step_starting_index = 0

    # Run the agent for N episodes
    for episode in range(total_n_episodes):
        # collect actions of some episodes to analyse algorithm performance
        record_episode_actions = episode % record_modulo == 0

        # convert the observation to a tensor and add a batch dimension to fit
        # the input shape requirement of the network
        prev_state = tf.expand_dims(tf.convert_to_tensor(env.reset(seed=seed)[0]), 0)

        episode_reward = 0
        steps = 0
        while True:
            steps += 1
            action = agent.predict_action(prev_state, env)

            if record_episode_actions:
                record_episode_actions_log[episode].append(action)

            # Receive state and reward from environment.
            state, reward, terminal, truncated, _info = env.step(action)

            if training:
                agent.train((prev_state, action, reward, state, terminal))

            episode_reward += reward

            if terminal or truncated:
                break

            prev_state = tf.expand_dims(tf.convert_to_tensor(state), 0)

        # ---- video bookkeeping for the finished episode ----
        # Reset per episode. The old flag started as True, which kept episode 0
        # out of the capped-cubic recorder even though its schedule starts at 0.
        saved_as_top_3 = False
        worst_top_3_reward = min(three_best_performing_agent.values())
        if worst_top_3_reward < episode_reward and episode > 100:
            # find the entry of the previously least performing episode and its video
            index = next(
                key
                for key, value in three_best_performing_agent.items()
                if value == worst_top_3_reward
            )
            remove_video_path = os.path.join(
                top_3_agents_path, 'rl-video-episode-' + str(index) + '.mp4'
            )

            # remove videos of episodes that are no longer in the top 3
            if os.path.isfile(remove_video_path):
                os.remove(remove_video_path)

            # replace it with the new episode
            three_best_performing_agent.pop(index)
            three_best_performing_agent[episode] = episode_reward

            # save its video (trigger always fires)
            save_video(
                env.render(),
                top_3_agents_path,
                fps=env.metadata["render_fps"],
                episode_trigger=lambda _episode_id: True,
                step_starting_index=step_starting_index,
                episode_index=episode,
            )
            saved_as_top_3 = True

        if not saved_as_top_3:
            # default trigger: capped cubic episode indices 0, 1, 8, 27, ..., 729, 1000, 2000, ...
            save_video(
                env.render(),
                capped_cubic_path,
                fps=env.metadata["render_fps"],
                step_starting_index=step_starting_index,
                episode_index=episode,
            )

        # ---- progress report: statistics over the LAST 100 episodes ----
        # ([:-100] selected everything except the last 100, i.e. nothing at all
        # during the first 100 episodes.)
        R_100MA_episode_rewards = np.round(
            np.mean(np.array(env.return_queue).flatten()[-100:]), 2
        )
        R_100MA_episode_lengths = np.round(
            np.mean(np.array(env.length_queue).flatten()[-100:]), 2
        )
        print(
            f"In episode {episode}, the reward is {round(episode_reward, 2)},"
            f"\nthe 100 moving avg reward is ==> {np.round(R_100MA_episode_rewards)},*"
            f"\nAvg episode length is ==> {R_100MA_episode_lengths}"
            f"\n{agent.buffer.counter} steps have been completed"
            f"\nand {steps} timesteps were taken this episode.\n"
        )

    return agent, env, record_episode_actions_log

In [None]:
# Build the DDPG agent. Because network_type is NetworksV2, DDPGAgent
# instantiates the factory without arguments: backbone / critic_network /
# actor_network are not forwarded to it (backbone still ends up in the agent name).
agent_ddpg = DDPGAgent(
    backbone="MLP",
    critic_network=[[400], [400, 300], [200]],  # alternative tried: 256, 256
    actor_network=[400, 300],  # alternative tried: 256, 256
    tdl_priority=True,  # sample the replay buffer proportionally to TD-error
    buffer_size=50_000,
    batch_size=64,
    critic_lr=0.0004,
    actor_lr=0.0004,
    gamma=0.98,  # discount factor
    beta=0.005,  # tau: soft-update rate of the target networks
    ou_mu=0,
    ou_theta=0.3,
    ou_sigma=0.8,  # alternative tried: 1.0
    name='50k_buffer',
    network_type=NetworksV2,
)

# Run the experiment on a fresh environment and time it with wall-clock time.
total_n_episodes = 1_000
ddpg_env = create_env(seed)
start_time = time.time()
# run_experiment returns the same agent object and the env wrapped with
# episode statistics, so ddpg_env is rebound to the wrapped environment.
ddpg_agent, ddpg_env, ddpg_record_episode_actions_log = run_experiment(
    ddpg_env, agent_ddpg, total_n_episodes, seed
)

end_time = time.time()
print(f'{total_n_episodes} took {round(end_time - start_time, 2)} seconds.')

In [None]:
def save_object(obj, filename):
    """Pickle ``obj`` to ``filename``, creating missing parent directories first.

    Args:
        obj: any picklable object.
        filename: destination path.
    """
    if isinstance(filename, (str, bytes, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            # open() does not create directories; without this a fresh output
            # folder makes the save fail with FileNotFoundError.
            os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'wb') as outp:
        pickle.dump(obj, outp)


# Per-episode returns / lengths recorded by the statistics wrapper, flattened to
# 1-D the same way the metric helpers in this notebook read them. This works
# whether the queue holds scalars or one-element arrays.
ep_rewards = np.array(ddpg_env.return_queue).flatten()
ep_duration = np.array(ddpg_env.length_queue).flatten()
ep_actions = ddpg_record_episode_actions_log

# Output location - edit both values for your own run.
job_timestamp = '20230423-094145'
local_path = '/path/to/save/to'

video_render_path = f'{local_path}/top_3_agents/{job_timestamp}'
# open() will not create the folder, so make sure it exists before saving.
os.makedirs(video_render_path, exist_ok=True)


save_object(ep_rewards, f"{video_render_path}/ep_rewards_{total_n_episodes}")
save_object(ep_duration, f"{video_render_path}/ep_duration_{total_n_episodes}")
save_object(ep_actions, f"{video_render_path}/ep_actions_{total_n_episodes}")

In [None]:
# metrics
import matplotlib.pyplot as plt
import pandas as pd


def success_rate(env):
    """Fraction of recorded episodes whose return is at least 200.

    Args:
        env: object exposing ``return_queue`` (per-episode returns).

    Returns:
        A float in [0, 1]; 0.0 when no episode has been recorded.
    """
    reward_queue = np.array(env.return_queue).flatten()
    if reward_queue.size == 0:
        return 0.0
    # np.where(...) returns a tuple, so len() of it was always 1; count the
    # matching episodes instead.
    return np.count_nonzero(reward_queue >= 200) / reward_queue.size


def R100_MA(env, total_n_episodes, name):
    """Moving average of episode returns over windows of up to 100 episodes.

    Entry ``i`` is the mean of the returns of episodes ``i`` .. ``i + 99``
    (fewer near the end of the queue).

    Args:
        env: object exposing ``return_queue``.
        total_n_episodes: number of entries to compute.
        name: key under which the result is stored.

    Returns:
        ``{name: np.ndarray of length total_n_episodes}``.
    """
    # Convert the queue once; it does not change inside the loop.
    episode_returns = np.array(env.return_queue).flatten()

    window_means = np.empty(total_n_episodes, dtype=float)
    for start in range(total_n_episodes):
        window_means[start] = np.mean(episode_returns[start : start + 100])

    return {name: window_means}


def plot_R100_MA(*R100_MA_env_Z):
    """Plot moving-average curves over a scatter of the per-episode rewards.

    Each positional argument is a pair ``(moving_averages, env)``:
    ``moving_averages`` maps an agent name to its curve and ``env`` exposes
    ``return_queue``. The figure is saved to 'moving_average_plot.png' and shown.
    """
    for pair in R100_MA_env_Z:
        episode_rewards = np.array(pair[1].return_queue)
        episode_axis = np.arange(0, len(episode_rewards), 1)

        for curve in pair[0].values():
            plt.plot(curve, 'r', label='best fit')
            plt.scatter(episode_axis, episode_rewards, label='rewards')
            # legend placed outside the axes, to the right
            plt.legend(
                bbox_to_anchor=(1.04, 0.0, 0.2, 1),
                loc="lower left",
                borderaxespad=0,
                mode='expand',
            )
            plt.xlabel('episodes')
            plt.ylabel('reward')

    plt.title('Moving average of Reward for 100 episodes')
    plt.savefig('moving_average_plot.png', dpi=1024)
    plt.show()


def plot_episode_actions(action_log, agent_name):
    """Plot, per logged episode, the action traces (left) and their density (right).

    Args:
        action_log: mapping of episode index -> list of actions taken in it.
        agent_name: shown in the figure title.

    The figure is saved to 'actions_plot.png' and shown.
    """
    fig, axs = plt.subplots(
        nrows=len(action_log), width_ratios=[3, 1], ncols=2, figsize=(15, 18)
    )
    plt.subplots_adjust(hspace=1.1)
    fig.suptitle(f"{agent_name}\nactions in an episode ", fontsize=18, y=0.95)

    # One (trace axis, density axis) pair per row of the grid.
    axis_pairs = axs.ravel().reshape(-1, 2)
    for k_episode, (trace_ax, density_ax) in zip(action_log.keys(), axis_pairs):
        episode_actions = action_log[k_episode]
        frame = pd.DataFrame(episode_actions)
        frame.index = np.arange(0, len(episode_actions), 1)

        frame.plot(ax=trace_ax)
        trace_ax.set(xlabel='timestep', ylabel='action value')
        frame.plot(ax=density_ax, kind='density')

        # Per-axis legends are removed; a single figure-level legend is added below.
        for axis in (trace_ax, density_ax):
            axis.title.set_text(f'episode {k_episode}')
            axis.get_legend().remove()

    labels = ['Hip 1 Torque', 'Hip 2 Torque', 'Knee 1 Torque', 'Knee 2 Torque']
    fig.legend(labels=labels, loc="upper right")
    plt.savefig('actions_plot.png', dpi=1024)
    plt.show()


def sample_efficiency(*n_env, threshold=0.7):
    """Environment steps spent before the first episode that beats the threshold.

    Sample efficiency is measured as the number of agent-environment
    interactions until a performance threshold is reached. The threshold is
    ``int(threshold * 300)`` and an episode reaches it when its return is
    strictly greater.

    Args:
        *n_env: objects exposing ``return_queue`` and ``length_queue``.
        threshold: fraction of 300 used as the return threshold.

    Returns:
        One entry per environment: the summed lengths of all episodes before the
        first one above the threshold, or 0 if the threshold is never reached.
    """
    reward_bar = int(threshold * 300)

    interactions = []
    for env in n_env:
        episode_returns = np.array(env.return_queue).flatten()
        episode_lengths = np.array(env.length_queue).flatten()

        # Explicit "never reached" check instead of a bare except, so genuine
        # errors (e.g. a missing attribute) are no longer reported as 0.
        reached_at = np.flatnonzero(episode_returns > reward_bar)
        if reached_at.size == 0:
            interactions.append(0)
        else:
            interactions.append(sum(episode_lengths[: reached_at[0]]))

    return interactions


def robustness(agent, actor_weights=None, seed=42, n_episodes=10):
    """Average return of ``agent`` over ``n_episodes`` in the hardcore environment.

    The agent is run without training (``training=False``).

    Args:
        agent: agent to evaluate.
        actor_weights: optional weights loaded into ``agent.actor_network``
            before evaluation; when omitted the agent's current weights are used.
            (This argument used to be required but ignored.)
        seed: seed for the environment and the run.
        n_episodes: number of evaluation episodes.

    Returns:
        Mean of the recorded episode returns.
    """
    if actor_weights is not None:
        agent.actor_network.set_weights(actor_weights)

    hardcore_env = create_env(seed, hardcore=True)
    _agent, hardcore_env, _actions_log = run_experiment(
        hardcore_env, agent, n_episodes, seed, training=False
    )
    return np.mean(np.array(hardcore_env.return_queue).flatten())


# success rate of the agent, computed from the returns recorded on ddpg_env
success_rate_ddpg = success_rate(ddpg_env)

# moving average of the reward over 100 episodes, keyed by the agent name
R100_MA_ddpg = R100_MA(ddpg_env, total_n_episodes, ddpg_agent.name)

# plot the actions of the logged episodes
plot_episode_actions(ddpg_record_episode_actions_log, ddpg_agent.name)

# sample efficiency of the DDPG agent
ddpg_sample_efficiency = sample_efficiency(ddpg_env, threshold=0.7)

# measure robustness of the DDPG agent by running it for n episodes in the
# hardcore environment (disabled)
# robustness_ddpg = robustness(ddpg_agent, seed=42, n_episodes=10)

# plot the moving average together with the per-episode rewards
plot_R100_MA((R100_MA_ddpg, ddpg_env))