<a href="https://colab.research.google.com/github/rihat99/openai_gym/blob/main/SAC_half_cheetah_slow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import tensorflow_probability as tfp

# !pip install box2d-py
!pip install pybullet
import pybullet
import pybullet_envs
import gym
# Show every environment id that pybullet_envs makes available.
print(pybullet_envs.getList())

# Training environment; its per-episode step cap is reused later to size the
# pre-sampled exploration noise. (To shorten episodes, assign a new value to
# env._max_episode_steps and mirror it in max_steps.)
env = gym.make("HalfCheetahBulletEnv-v0")
max_steps = env._max_episode_steps

# Record the library versions this run used.
print(tf.__version__, gym.__version__, sep="\n")

In [None]:
# Load the TensorBoard notebook extension so `%tensorboard` can be used below.
%load_ext tensorboard

In [None]:
# Dimensions of the observation and action vectors, taken from the env.
num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

# Only the first component of the action bounds is read; the same scalar
# bounds are then applied to every action dimension.
upper_bound = env.action_space.high[0]
lower_bound = env.action_space.low[0]

for _label, _value in (
    ("Size of State Space ->  {}", num_states),
    ("Size of Action Space ->  {}", num_actions),
    ("Max Value of Action ->  {}", upper_bound),
    ("Min Value of Action ->  {}", lower_bound),
):
    print(_label.format(_value))

Size of State Space ->  26
Size of Action Space ->  6
Max Value of Action ->  1.0
Min Value of Action ->  -1.0


In [None]:
# Base width of the hidden Dense layers (the critic also uses layer_size//2
# and layer_size//4 for its input branches).
layer_size = 256
# Activation used by every hidden layer of the actor and the critics.
activation = "elu"
# Half-width of the uniform range used to initialise the actor's output heads.
w_bound = 3e-3

def get_actor():
    """Build the Gaussian policy network.

    A state vector of length ``num_states`` goes through two hidden Dense
    layers of ``layer_size`` units and ends in two linear heads of
    ``num_actions`` units each.

    Returns:
        tf.keras.Model: model with outputs ``[mean, log_std]``. Both heads are
        unbounded here; clipping and tanh squashing happen in ``Agent.policy``.
    """

    def small_uniform():
        # A fresh initializer per kernel/bias, drawing from [-w_bound, w_bound]
        # so both heads start close to zero.
        return tf.random_uniform_initializer(-w_bound, w_bound)

    # `shape` must be a tuple; `(num_states)` is just a parenthesised int.
    inputs = layers.Input(shape=(num_states,))
    out = layers.Dense(layer_size, activation=activation)(inputs)
    out = layers.Dense(layer_size, activation=activation)(out)

    mean = layers.Dense(
        num_actions,
        kernel_initializer=small_uniform(),
        bias_initializer=small_uniform(),
    )(out)
    log_std = layers.Dense(
        num_actions,
        kernel_initializer=small_uniform(),
        bias_initializer=small_uniform(),
    )(out)

    model = tf.keras.Model(inputs, [mean, log_std])
    return model


def get_critic():
    """Build one Q-network taking a state and an action.

    The state passes through two Dense layers (``layer_size//2`` then
    ``layer_size`` units) and the action through one (``layer_size//4``
    units). The two branches are concatenated and followed by two more hidden
    layers and a single linear output unit.

    Returns:
        tf.keras.Model: model called as ``model([state, action])`` that
        outputs one value per state-action pair.
    """
    # `shape` must be a tuple; `(n)` is just a parenthesised int.
    state_input = layers.Input(shape=(num_states,))
    state_out = layers.Dense(layer_size//2, activation=activation)(state_input)
    state_out = layers.Dense(layer_size, activation=activation)(state_out)

    action_input = layers.Input(shape=(num_actions,))
    action_out = layers.Dense(layer_size//4, activation=activation)(action_input)

    # Each input gets its own layer(s) before the two branches are merged.
    concat = layers.Concatenate()([state_out, action_out])

    out = layers.Dense(layer_size, activation=activation)(concat)
    out = layers.Dense(layer_size, activation=activation)(out)
    outputs = layers.Dense(1)(out)

    model = tf.keras.Model([state_input, action_input], outputs)

    return model

In [None]:
class OUActionNoise:
    """Temporally correlated noise from a discretised Ornstein-Uhlenbeck process.

    Each call advances the process by one step of size ``dt`` and returns the
    new value, so consecutive samples depend on each other.

    Args:
        mean: value the process reverts to; its shape sets the sample shape.
        std_deviation: scale of the random term.
        theta: strength of the pull back towards ``mean``.
        dt: step size of the discretisation.
        x_initial: starting value; zeros shaped like ``mean`` when ``None``.
    """

    def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None):
        self.theta = theta
        self.mean = mean
        self.std_dev = std_deviation
        self.dt = dt
        self.x_initial = x_initial
        self.reset()

    def __call__(self):
        """Advance the process one step and return the new value."""
        # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process.
        # The random term is drawn with the shape of `mean`, so each component
        # gets its own sample instead of all sharing one scalar draw.
        x = (
            self.x_prev
            + self.theta * (self.mean - self.x_prev) * self.dt
            + self.std_dev
            * np.sqrt(self.dt)
            * np.random.normal(size=np.shape(self.mean))
        )
        # Remember the value so the next sample depends on this one.
        self.x_prev = x
        return x

    def reset(self):
        """Return the process to ``x_initial``, or to zeros if none was given."""
        if self.x_initial is not None:
            self.x_prev = self.x_initial
        else:
            self.x_prev = np.zeros_like(self.mean)

In [None]:
class Agent:
    """Soft Actor-Critic learner with a fixed-size circular replay buffer.

    The class reads several notebook-level names instead of receiving them as
    arguments: ``num_states``, ``num_actions``, ``upper_bound``,
    ``lower_bound``, ``gamma``, ``actor_model``, ``critic_model_a/b``,
    ``target_critic_a/b``, ``critic_optimizer``, ``actor_optimizer``,
    ``alpha_optimizer``, ``train_summary_writer`` and ``total_step``. Each
    must be defined before the method that uses it is called.
    """

    def __init__(self, buffer_capacity=100000, batch_size=64, alpha = 0.2):
        """Allocate the replay buffer and the trainable entropy coefficient.

        Args:
            buffer_capacity: maximum number of transitions kept.
            batch_size: number of transitions sampled per ``learn`` call.
            alpha: initial value of the entropy coefficient.
        """
        self.buffer_capacity = buffer_capacity
        self.batch_size = batch_size
        # alpha is a variable because `update` adjusts it by gradient descent.
        self.alpha = tf.Variable(alpha, dtype=tf.float32)
        self.target_entropy = tf.constant(-num_actions, dtype=tf.float32)
        # Small constant added inside the log of the tanh correction in
        # `policy` so the argument never reaches zero.
        self.sigma_noise = 1e-6
        # Number of times record() has been called.
        self.buffer_counter = 0
        # One array per transition field instead of a list of tuples.
        # float32 halves the memory of the default float64; `learn` hands the
        # batches to the networks as float32 anyway.
        self.state_buffer = np.zeros((self.buffer_capacity, num_states), dtype=np.float32)
        self.action_buffer = np.zeros((self.buffer_capacity, num_actions), dtype=np.float32)
        self.reward_buffer = np.zeros((self.buffer_capacity, 1), dtype=np.float32)
        self.next_state_buffer = np.zeros((self.buffer_capacity, num_states), dtype=np.float32)
        self.done_buffer = np.zeros((self.buffer_capacity, 1), dtype=np.float32)

    def record(self, obs_tuple):
        """Store one transition, overwriting the oldest once the buffer is full.

        Args:
            obs_tuple: ``(state, action, reward, next_state, done)``.
        """
        # Wrap around to the start when buffer_capacity is exceeded.
        index = self.buffer_counter % self.buffer_capacity

        self.state_buffer[index] = obs_tuple[0]
        self.action_buffer[index] = obs_tuple[1]
        self.reward_buffer[index] = obs_tuple[2]
        self.next_state_buffer[index] = obs_tuple[3]
        self.done_buffer[index] = obs_tuple[4]

        self.buffer_counter += 1

    def policy(self, state, noise=0):
        """Sample tanh-squashed actions from the actor for a batch of states.

        Args:
            state: batch of states fed to ``actor_model``; the log-probability
                is summed over axis 1, so a 2-D batch is expected.
            noise: extra exploration noise added to the squashed actions
                before scaling and clipping. It is not reflected in the
                returned log-probability.

        Returns:
            tuple: ``(actions, log_action, mu)`` where ``actions`` are the
            sampled actions clipped to ``[lower_bound, upper_bound]``,
            ``log_action`` is the per-sample log-probability with the tanh
            correction applied (shape ``(batch, 1)``), and ``mu`` is the
            tanh-squashed mean scaled by ``upper_bound``.
        """
        mu, log_sigma = actor_model(state)
        # Keep the standard deviation inside [exp(-20), exp(2)].
        log_sigma = tf.clip_by_value(log_sigma, -20, 2)

        sigma = tf.exp(log_sigma)
        dist = tfp.distributions.Normal(mu, sigma)
        raw_actions = dist.sample()
        log_action = dist.log_prob(raw_actions)

        mu = tf.tanh(mu)
        actions = tf.tanh(raw_actions)

        # Change-of-variables correction for the tanh squashing.
        log_action -= tf.math.log(1.0 - tf.math.pow(actions, 2) + self.sigma_noise)
        log_action = tf.reduce_sum(log_action, axis=1, keepdims=True)

        actions = (actions + noise) * upper_bound
        mu = mu * upper_bound
        actions = tf.clip_by_value(actions, lower_bound, upper_bound)
        return actions, log_action, mu

    @tf.function
    def update(
        self, state_batch, action_batch, reward_batch, next_state_batch, done_batch,
    ):
        """Run one gradient step on both critics, the actor and ``alpha``.

        Args:
            state_batch: sampled states.
            action_batch: actions taken in those states.
            reward_batch: rewards received.
            next_state_batch: resulting states.
            done_batch: episode-end flags. Accepted but currently not used in
                the target (see the note in the body).

        Returns:
            tuple: ``(critic_loss_a, critic_loss_b, actor_loss)``.
        """
        # The target is built outside any GradientTape, so no gradient flows
        # through it into the target critics or the actor.
        next_actions, next_log_action, _ = self.policy(next_state_batch)

        target_Q_values_a = target_critic_a([next_state_batch, next_actions], training=True)
        target_Q_values_b = target_critic_b([next_state_batch, next_actions], training=True)
        # Use the smaller of the two target estimates.
        min_target_Q_values = tf.math.minimum(target_Q_values_a, target_Q_values_b)
        soft_Q_target = min_target_Q_values - self.alpha * next_log_action

        # NOTE(review): done_batch is not applied here, so the target always
        # bootstraps from the next state. The training loop stores the raw
        # `done` returned by env.step, which presumably includes the episode
        # step cap; confirm how termination should be distinguished from
        # truncation before multiplying by (1 - done_batch).
        y = reward_batch + gamma * soft_Q_target

        with tf.GradientTape() as tape:
            critic_value_a = critic_model_a([state_batch, action_batch], training=True)
            critic_loss_a = tf.math.reduce_mean(tf.math.square(y - critic_value_a))

        critic_grad_a = tape.gradient(critic_loss_a, critic_model_a.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad_a, critic_model_a.trainable_variables)
        )

        with tf.GradientTape() as tape:
            critic_value_b = critic_model_b([state_batch, action_batch], training=True)
            critic_loss_b = tf.math.reduce_mean(tf.math.square(y - critic_value_b))

        critic_grad_b = tape.gradient(critic_loss_b, critic_model_b.trainable_variables)
        critic_optimizer.apply_gradients(
            zip(critic_grad_b, critic_model_b.trainable_variables)
        )

        with tf.GradientTape() as tape:
            actions, log_action, _ = self.policy(state_batch)

            critic_value_a = critic_model_a([state_batch, actions], training=True)
            critic_value_b = critic_model_b([state_batch, actions], training=True)
            min_Q = tf.math.minimum(critic_value_a, critic_value_b)
            # Minimising alpha*log_pi - Q maximises the critic value plus the
            # entropy bonus.
            soft_Q = self.alpha * log_action - min_Q
            actor_loss = tf.reduce_mean(soft_Q)

        # Gradients are taken w.r.t. the actor only; the critics are not
        # changed by this step.
        actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables)
        actor_optimizer.apply_gradients(
            zip(actor_grad, actor_model.trainable_variables))

        with tf.GradientTape() as tape:
            actions, log_action, _ = self.policy(state_batch)
            # alpha is optimised directly, not through log(alpha).
            alpha_loss = -tf.reduce_mean(self.alpha * (log_action + self.target_entropy))

        # Only alpha receives this gradient.
        alpha_grads = tape.gradient(alpha_loss, [self.alpha])
        alpha_optimizer.apply_gradients(zip(alpha_grads, [self.alpha]))

        return critic_loss_a, critic_loss_b, actor_loss

    def learn(self):
        """Sample a batch from the buffer, run ``update`` and log the losses.

        Indices are drawn with replacement from the filled part of the buffer.
        Scalars are written to ``train_summary_writer`` at step ``total_step``.
        """
        # Only the filled part of the buffer may be sampled.
        record_range = min(self.buffer_counter, self.buffer_capacity)
        batch_indices = np.random.choice(record_range, self.batch_size)

        # Hand every field to TensorFlow as float32.
        state_batch = tf.convert_to_tensor(self.state_buffer[batch_indices], dtype=tf.float32)
        action_batch = tf.convert_to_tensor(self.action_buffer[batch_indices], dtype=tf.float32)
        reward_batch = tf.convert_to_tensor(self.reward_buffer[batch_indices], dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(
            self.next_state_buffer[batch_indices], dtype=tf.float32
        )
        done_batch = tf.convert_to_tensor(self.done_buffer[batch_indices], dtype=tf.float32)

        critic_loss_a, critic_loss_b, actor_loss = self.update(
            state_batch, action_batch, reward_batch, next_state_batch, done_batch
        )

        with train_summary_writer.as_default():
            tf.summary.scalar('train/critic_loss_a', critic_loss_a, step=total_step)
            tf.summary.scalar('train/critic_loss_b', critic_loss_b, step=total_step)
            tf.summary.scalar('train/actor_loss', actor_loss, step=total_step)
            tf.summary.scalar('train/alpha', self.alpha, step=total_step)


@tf.function
def update_target(target_weights, weights, tau):
    """Move the target variables a small step towards the online variables.

    Every variable in ``target_weights`` is overwritten in place with
    ``source * tau + target * (1 - tau)``, pairing the two sequences by
    position. ``tau`` is expected to be much smaller than one.
    """
    retain = 1 - tau
    for target_var, source_var in zip(target_weights, weights):
        blended = source_var * tau + target_var * retain
        target_var.assign(blended)

In [None]:
from IPython.display import clear_output

# Networks: one actor, two online critics and a target copy of each critic.
actor_model = get_actor()
critic_model_a, critic_model_b = get_critic(), get_critic()
target_critic_a, target_critic_b = get_critic(), get_critic()

# Each target starts as an exact copy of its online critic.
target_critic_a.set_weights(critic_model_a.get_weights())
target_critic_b.set_weights(critic_model_b.get_weights())

# Learning rates; both critics share a single optimizer instance.
critic_lr = actor_lr = alpha_lr = 0.001
critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)
actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
alpha_optimizer = tf.keras.optimizers.Adam(learning_rate=alpha_lr)

total_episodes = 100
gamma = 0.99           # discount factor for future rewards
alpha = 0.0            # initial entropy coefficient handed to the Agent
tau = 0.005            # rate at which targets track the online critics
std_dev_action = 0.1   # std of the Gaussian exploration noise
start_size = 10000     # transitions collected with random actions before learning

agent = Agent(buffer_capacity=200000, batch_size=64, alpha=alpha)

# Global step counter; Agent.learn uses it as the TensorBoard step.
total_step = 0

In [None]:
# NOTE: this cell needs `train_summary_writer` and `episode_summary_writer`,
# which are created in the TensorBoard setup cell further down the notebook.
# Run that cell before this one.
for ep in range(400, 600):

    prev_state = env.reset()
    episodic_reward, step = 0, 0
    # One row of exploration noise per possible step of the episode.
    noise = tf.random.normal(
        shape=(max_steps, num_actions), mean=0.0, stddev=std_dev_action
    )

    done = False
    while not done:
        tf_prev_state = tf.expand_dims(tf.convert_to_tensor(prev_state), 0)

        # Until the buffer holds start_size transitions, act at random.
        if agent.buffer_counter >= start_size:
            action, _ , _ = agent.policy(tf_prev_state, noise[step])
            action = tf.squeeze(action)
        else:
            action = env.action_space.sample()

        state, reward, done, info = env.step(action)
        agent.record((prev_state, action, reward, state, done))
        episodic_reward += reward

        # Checked after record(), so learning starts on the step that
        # brings the buffer up to start_size.
        if agent.buffer_counter >= start_size:
            agent.learn()
            for target_net, online_net in (
                (target_critic_a, critic_model_a),
                (target_critic_b, critic_model_b),
            ):
                update_target(target_net.variables, online_net.variables, tau)

        prev_state = state
        step += 1
        total_step += 1

    with episode_summary_writer.as_default():
        tf.summary.scalar('main/reward', episodic_reward, step=ep)
        tf.summary.scalar('main/steps', step, step=ep)

    clear_output(wait=True)
    print('Episode:', ep, 'Current Reward:', episodic_reward, 'Steps:', step)

Episode: 599 Current Reward: 2635.1066111801597 Steps: 1000


In [None]:
# Open TensorBoard inline on the directory the summary writers log into.
%tensorboard --logdir logs/gradient_tape

In [None]:
# NOTE: run this cell BEFORE the training loop. It creates
# `train_summary_writer` (used by Agent.learn) and `episode_summary_writer`
# (used by the training loop).
%reload_ext tensorboard

import datetime
# Deletes every previously written log under ./logs/ before starting fresh.
!rm -rf ./logs/

# One timestamped run directory with separate sub-directories for per-step
# training scalars and per-episode scalars.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
episode_log_dir = 'logs/gradient_tape/' + current_time + '/episode'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)
episode_summary_writer = tf.summary.create_file_writer(episode_log_dir)

In [None]:
# Persist the trained actor network under ./model.
actor_model.save('model')

INFO:tensorflow:Assets written to: model/assets


In [None]:
# System packages for headless rendering and video encoding; their output is
# discarded.
!sudo apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
# NOTE(review): pyvirtualdisplay is installed twice (pip3, then pip) —
# presumably one of the two is enough; confirm before trimming.
!pip3 install pyvirtualdisplay && pip install pyvirtualdisplay
!pip install -U colabgymrender
from colabgymrender.recorder import Recorder

# Wrap the training env so episodes stepped through `video_env` are recorded
# under ./video.
directory = './video'
video_env = Recorder(env, directory)

In [None]:
# Roll out one episode through the recorder, acting with the third value
# returned by agent.policy (the squashed mean) instead of a sampled action.
obs = video_env.reset()
done = False
while not done:
    obs = tf.expand_dims(tf.convert_to_tensor(obs), 0)
    _, _, action = agent.policy(obs)
    action = tf.squeeze(action)
    obs, reward, done, info = video_env.step(action)

video_env.play()

In [None]:
from gym.wrappers import Monitor

# Record one episode driven by the squashed policy mean.
# The environment must be the one the actor was built for: the actor's input
# and output sizes were read from "HalfCheetahBulletEnv-v0", so an env with
# other observation/action sizes (the cart-pole id used here before) cannot
# be driven by it.
env = Monitor(gym.make('HalfCheetahBulletEnv-v0'), './video', force=True)
try:
    state = env.reset()
    done = False
    while not done:
        _ , _ , action = agent.policy(state[np.newaxis])
        # Drop the batch axis: the env takes one action vector, not a batch.
        action = tf.squeeze(action).numpy()
        state, reward, done, info = env.step(action)
        env.render()
finally:
    # Close the monitor even if the rollout fails, so the recording is finalised.
    env.close()