In [None]:
!pip install pybullet==3.2.4
!pip install gym==0.22.0

Collecting pybullet==3.2.4
  Downloading pybullet-3.2.4.tar.gz (80.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.5/80.5 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pybullet
  Building wheel for pybullet (setup.py) ... [?25l[?25hdone
  Created wheel for pybullet: filename=pybullet-3.2.4-cp311-cp311-linux_x86_64.whl size=99854772 sha256=bfd25919a516e9fae9bb0b7ed70758bb445b9c78f4912b0df44f05a6ce78016b
  Stored in directory: /root/.cache/pip/wheels/6d/f6/49/7f3f8698a836ce4f918823c333d8d64150c20430af0ca9c305
Successfully built pybullet
Installing collected packages: pybullet
Successfully installed pybullet-3.2.4
Collecting gym==0.22.0
  Downloading gym-0.22.0.tar.gz (631 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m631.1/631.1 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting

In [None]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pybullet_envs
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque




## Step 1: We initialize the Experience Replay memory



 It acts as memory storage for past experiences **(state transitions)** that the agent has encountered, allowing it to learn from them efficiently.
*   Think of it like a **notebook**:
    - The agent writes down its experiences (state transitions) in this notebook.
    - Each experience consists of **(state, next_state, action, reward, done)**
      - State (s) → What the agent saw.
      - Next State (s ′) → What happened after taking an action.
      - Action (a) → What the agent did.
      - Reward (r) → What it got in return.
      - Done (d) → Whether the episode ended.

* Why use a Replay buffer?
    - It helps **break correlation** between consecutive experiences.
    - It allows for more efficient **re-use** of past data.
    - It improves training stability by **diversifying learning** across past and recent events.

In [None]:
class ReplayBuffer():
  """Fixed-capacity store of transitions used for experience replay.

  Each transition is a tuple ``(state, next_state, action, reward, done)``.
  New transitions are appended until ``max_size`` entries are stored; after
  that the oldest entries are overwritten in a circular fashion.
  """

  def __init__(self, max_size=1e6):
    """Create an empty buffer.

    Args:
      max_size: Maximum number of transitions to keep. Float values such as
        ``1e6`` are accepted and converted to ``int``.

    Raises:
      ValueError: If ``max_size`` is smaller than 1.
    """
    self.storage = []
    # Keep the capacity as an int so that `ptr` always stays a valid list index.
    self.max_size = int(max_size)
    if self.max_size < 1:
      raise ValueError("max_size must be at least 1")
    self.ptr = 0

  def add(self, transition):
    """ Circular buffer to add experiences"""
    if len(self.storage) >= self.max_size:
      self.storage[self.ptr] = transition # Only overwrite when full
      self.ptr = (self.ptr + 1) % self.max_size # update ptr
    else:
      self.storage.append(transition) # Add new elements initially

  def sample(self, batch_size):
    """Sampling a batch of random past experiences

    Indices are drawn uniformly with replacement via ``np.random.randint``.

    Args:
      batch_size: Number of transitions to draw.

    Returns:
      Tuple of five NumPy arrays ``(states, next_states, actions, rewards,
      dones)``, each with ``batch_size`` as its first dimension.

    Raises:
      ValueError: If the buffer is empty.
    """
    if not self.storage:
      raise ValueError("Cannot sample from an empty replay buffer")
    ind = np.random.randint(0, len(self.storage), size=batch_size)
    batch = [self.storage[i] for i in ind]
    # np.asarray avoids a copy when possible and, unlike np.array(copy=False),
    # does not raise under NumPy 2 when a copy is required (scalars, lists).
    return tuple(
      np.array([np.asarray(transition[field]) for transition in batch])
      for field in range(5)
    )


## Step 2: We build NN for Actor

It maps states to actions in a deterministic manner.

```
1.   Actor model
2.   Target model
```


*   Think of it as a **decision-maker**:
    - The actor receives the **current state** (what it sees) and outputs an **action** (what to do next).
    - It learns to choose **optimal actions** by interacting with the environment and maximizing expected rewards.

*   Why Scale by max_action?

  - In environments like robotics or autonomous control, actions often have a physical limit (e.g., [-2, 2] for torque control).
  - The tanh function limits the output between [−1,1], so multiplying by max_action ensures actions stay within the required range.

* Summary:
  - This Actor network **learns how to map states to actions**.
  - It’s **deterministic**, meaning it always outputs the same action for the same state.
  - It plays a crucial role in policy-based RL by generating actions to explore and interact with the environment. 🚀



In [None]:
class Actor(nn.Module):
  """It maps states to actions in a deterministic manner

  Architecture: ``state_dim -> 400 -> 300 -> action_dim`` with ReLU hidden
  activations and a tanh output scaled by ``max_action``.

  Args:
    state_dim: Number of input features (size of the state vector).
    action_dim: Number of output features (size of the action vector).
    max_action: Scale applied to the tanh output.
  """
  def __init__(self, state_dim, action_dim, max_action):
    super().__init__()
    self.layer_1 = nn.Linear(state_dim, 400)
    # The hidden sizes (400 -> 300) must match the Actor defined in the inference
    # cell; otherwise the checkpoint saved after training cannot be loaded there
    # (state_dict shape mismatch). They also mirror the Critic's hidden sizes.
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, action_dim)
    self.max_action = max_action

  def forward(self, x):
    """Return the action for state batch ``x``, bounded to [-max_action, max_action]."""
    x = F.relu(self.layer_1(x))
    x = F.relu(self.layer_2(x))
    # tanh to keep output in the range [-1, 1] &
    # scaled by max_action to match the action required by the env.
    x = self.max_action * torch.tanh(self.layer_3(x))
    return x


## Step 3: We build NN for two Critics

It evaluates **how good an action is in a given state** by estimating the Q-value.

```
1.   Critic Model 1
2.   Critic Target 1
3.   Critic Model 2
4.   Critic Target 2
```


* Think of it as a **judge**:
    - The Critic receives a state (s) and an action (a) and predicts a Q-value.
    - This Q-value tells us how good or bad the action is in that state based on expected future rewards.
    - The policy (Actor) improves by following the actions that get higher Q-values.
*   Why two critics?
    - This is a key feature of TD3, which **reduces overestimation bias** common in Q-learning.
    - Instead of relying on a single Q-value, TD3 takes the minimum Q-value from two critics to improve stability.





In [None]:
class Critic(nn.Module):
  """Twin Q-networks that score a (state, action) pair.

  Both networks share the same layout
  ``(state_dim + action_dim) -> 400 -> 300 -> 1`` but have independent
  weights: layers 1-3 form the first critic, layers 4-6 the second.
  """
  def __init__(self, state_dim, action_dim):
    super().__init__()
    input_dim = state_dim + action_dim

    # First critic
    self.layer_1 = nn.Linear(input_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, 1)

    # Second critic
    self.layer_4 = nn.Linear(input_dim, 400)
    self.layer_5 = nn.Linear(400, 300)
    self.layer_6 = nn.Linear(300, 1)

  @staticmethod
  def _run_head(xu, first, second, out):
    """Apply one critic head: two ReLU hidden layers followed by a linear output."""
    return out(F.relu(second(F.relu(first(xu)))))

  def forward(self, x, u):
    """Return the Q-values of both critics for states ``x`` and actions ``u``."""
    xu = torch.cat([x, u], 1) # state and action are concatenated along the feature axis
    q1 = self._run_head(xu, self.layer_1, self.layer_2, self.layer_3)
    q2 = self._run_head(xu, self.layer_4, self.layer_5, self.layer_6)
    return q1, q2

  def Q1(self, x, u):
    """Runs only on first critic NN: Used during policy updates (Actor optimization)"""
    xu = torch.cat([x, u], 1)
    return self._run_head(xu, self.layer_1, self.layer_2, self.layer_3)



## Step 4 - 15: Training process

* Three key features:

    - **Twin Critic Networks**: Uses two Q-functions and takes the minimum Q-value.
    - **Delayed Policy Updates**: The actor updates less frequently than the critic (every policy_freq steps).
    - **Target Policy Smoothing**: Adds noise to target actions to make training more robust.

* Step-by-step Breakdown:
    - Sample a batch from the replay buffer.
    - Compute the next action using the target actor and add clipped Gaussian noise.
    - Compute target Q-values by taking the minimum Q-value from the two target critics.
    - Compute Bellman target using: Qt = r + (1 - done) * γ * min(Qt1, Qt2)
    - Compute critic loss (MSE between predicted and target Q-values).
    - Update Critic network by backpropagation.
    - Update Actor network every policy_freq steps using gradient ascent on Q1.
    - Polyak Averaging for target networks.

* Summary of Strengths
  - Implements Twin Critics for reducing overestimation.
  - Uses target policy smoothing to prevent overfitting to Q-value spikes.
  - Correctly delays policy updates (every policy_freq steps).
  - Implements Polyak Averaging for stable training.
  - Efficient training loop that samples from replay buffer.

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building the whole training process

class TD3():
  """Twin Delayed DDPG (TD3) agent.

  Holds an actor, a twin critic, a target copy of each, and one Adam
  optimizer for the actor and one for the critic.
  """

  def __init__(self, state_dim, action_dim, max_action):
    """Actor-critic network setup

    Args:
      state_dim: Size of the state vector fed to the actor and the critic.
      action_dim: Size of the action vector.
      max_action: Bound used to scale actor outputs and to clamp target actions.
    """
    # Actor
    self.actor = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

    # Critics
    self.critic = Critic(state_dim, action_dim).to(device)
    self.critic_target = Critic(state_dim, action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
    self.max_action = max_action

  def select_action(self, state):
    """state --> Actor ---> action

    Args:
      state: NumPy array holding a single observation.

    Returns:
      1-D NumPy array with the action chosen by the current actor (no noise).
    """
    state = torch.Tensor(state.reshape(1, -1)).to(device)
    with torch.no_grad(): # inference only, no autograd graph needed
      return self.actor(state).cpu().numpy().flatten()

  @staticmethod
  def _soft_update(source, target, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    for param, target_param in zip(source.parameters(), target.parameters()):
      target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

  def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
    """Run ``iterations`` TD3 update steps on batches drawn from ``replay_buffer``.

    Args:
      replay_buffer: Object whose ``sample(batch_size)`` returns
        ``(states, next_states, actions, rewards, dones)`` arrays.
      iterations: Number of critic updates to perform.
      batch_size: Number of transitions per update.
      discount: Discount factor gamma of the Bellman target.
      tau: Polyak averaging rate for the target networks.
      policy_noise: Std of the Gaussian noise added to target actions.
      noise_clip: Absolute bound applied to that noise.
      policy_freq: The actor and the targets are updated every ``policy_freq`` iterations.
    """
    for it in range(iterations):
      ## Q-LEARNING PART ######################################################
      # Step 4: We sample a batch of transitions (s, s', a, r) from the memory
      batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
      state = torch.Tensor(batch_states).to(device)
      next_state = torch.Tensor(batch_next_states).to(device)
      action = torch.Tensor(batch_actions).to(device)
      # Rewards and done flags are forced to shape (batch, 1) so they line up with
      # the critics' (batch, 1) outputs. With 1-D (batch,) tensors the Bellman
      # target silently broadcasts to (batch, batch) and the loss is wrong.
      reward = torch.Tensor(batch_rewards).to(device).reshape(-1, 1)
      done = torch.Tensor(batch_dones).to(device).reshape(-1, 1)

      # The whole target is a constant w.r.t. the trained networks, so no graph is built for it
      with torch.no_grad():
        # Step 5: From the next state s', the Actor target plays the next action a'
        next_action = self.actor_target(next_state)

        # Step 6: We add Gaussian noise to this next action a' and we clamp it in a range of values supported by the environment
        noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip) # clipped Gaussian noise
        next_action = (next_action + noise).clamp(-self.max_action, self.max_action) # clamp action

        # Step 7: The two critic targets take each the couple (s', a') as input and return two Q values
        target_Q1, target_Q2 = self.critic_target(next_state, next_action)

        # Step 8: We keep the minimum of these two Q-values
        target_Q = torch.min(target_Q1, target_Q2) # Key feature: it prevents too optimistic estimates of the value of state

        # Step 9: We get the final target of the two critic models, which is: Qt = r + (1 - done) * γ * min(Qt1, Qt2)
        target_Q = reward + (1 - done) * discount * target_Q # Bellman target

      # Step 10: The two critic models take each the couple (s, a)
      current_Q1, current_Q2 = self.critic(state, action)

      # Step 11: We compute the loss coming from the two critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
      critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

      # Step 12: We backpropagate this critic loss and update the parameters of the two critic models with the Adam optimizer
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()

      ## POLICY-LEARNING PART ###################################################
      # Step 13: Once every policy_freq iterations, we update our Actor model by performing gradient ascent on the output of first critic model
      if it % policy_freq == 0:
        actor_loss = -self.critic.Q1(state, self.actor(state)).mean() # Gradient ascent
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Step 14: Still once every policy_freq iterations, we update the weights of the Actor target by polyak averaging
        self._soft_update(self.actor, self.actor_target, tau)

        # Step 15: Still once every policy_freq iterations, we update the weights of the Critic target by Polyak averaging
        self._soft_update(self.critic, self.critic_target, tau)

  # Save method to save the trained model
  def save(self, filename, directory):
    """Write ``<directory>/<filename>_actor.pth`` and ``<directory>/<filename>_critic.pth``."""
    # os.path.join inserts the separator that plain string concatenation dropped
    torch.save(self.actor.state_dict(), os.path.join(directory, '%s_actor.pth' % filename))
    torch.save(self.critic.state_dict(), os.path.join(directory, '%s_critic.pth' % filename))

  # Load method to load a pre-trained model
  def load(self, filename, directory):
    """Load actor and critic weights written by ``save`` and re-sync the targets."""
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine
    self.actor.load_state_dict(torch.load(os.path.join(directory, '%s_actor.pth' % filename), map_location=device))
    self.critic.load_state_dict(torch.load(os.path.join(directory, '%s_critic.pth' % filename), map_location=device))
    # Only the online networks are saved; start the targets from the same weights
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.critic_target.load_state_dict(self.critic.state_dict())


## We make a function that evaluates the policy by calculating its average reward over 10 episodes

In [None]:
def evaluate_policy(policy, eval_episodes=10):
  """Roll out ``policy`` on the global ``env`` and report the mean episode reward.

  Args:
    policy: Object exposing ``select_action(observation)``.
    eval_episodes: Number of full episodes to play.

  Returns:
    Sum of all rewards collected divided by ``eval_episodes``.
  """
  banner = "---------------------------------------"
  reward_sum = 0.
  for _ in range(eval_episodes):
    obs, done = env.reset(), False
    while not done:
      # No exploration noise is added here: the actor's action is used as is
      obs, reward, done, _ = env.step(policy.select_action(np.array(obs)))
      reward_sum += reward
  avg_reward = reward_sum / eval_episodes
  print (banner)
  print ("Average Reward over the Evaluation Step: %f" % (avg_reward))
  print (banner)
  return avg_reward

## We set the parameters

In [None]:
env_name = "AntBulletEnv-v0" # Name of the environment passed to gym.make
seed = 0 # Random seed value
# Step counts are plain ints (not 1e4-style floats) so counters derived from them stay integral
start_timesteps = 10_000 # Number of iteration/timesteps before which the model randomly chooses an action, and after which it starts to use the policy network
eval_freq = 5_000 # how often the evaluation step is performed (after how many timesteps)
max_timesteps = 500_000 # Total number of iterations/timesteps
save_models = True # Boolean checker whether or not to save the pretrained model
expl_noise = 0.1 # Exploration noise - STD value of exploration Gaussian noise
batch_size = 100 # Size of the batch
discount = 0.99 # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005 # Target network update rate
policy_noise = 0.2 # STD of Gaussian noise added to the target actions during training (target policy smoothing)
noise_clip = 0.5 # Maximum absolute value of that target-policy smoothing noise
policy_freq = 2 # Number of iterations to wait before the policy network (Actor model) is updated

## We create a filename for the two saved models: the actor and critic models

In [None]:
# Tag shared by the checkpoint and result files: TD3_<env_name>_<seed>
file_name = "_".join(("TD3", env_name, str(seed)))
banner = "---------------------------------------"
print (banner)
print ("Settings: %s" % (file_name))
print (banner)

---------------------------------------
Settings: TD3_AntBulletEnv-v0_0
---------------------------------------


## We create a folder inside which will be saved the trained models

In [None]:
# ./results receives the saved evaluation scores; ./pytorch_models receives checkpoints.
# exist_ok=True replaces the check-then-create pattern and is safe to re-run.
os.makedirs("./results", exist_ok=True)
if save_models:
  os.makedirs("./pytorch_models", exist_ok=True)

## We create PyBullet environment

In [None]:
# Instantiate the environment named by `env_name`.
# NOTE(review): "AntBulletEnv-v0" is presumably registered with gym as a side effect of `import pybullet_envs` — confirm.
env = gym.make(env_name)

## We set seeds and we get the necessary information on the states and actions in the chosen environment

In [None]:
# Seed every source of randomness used by the notebook
env.seed(seed)
env.action_space.seed(seed) # action_space.sample() drives the initial random-exploration phase
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Problem dimensions read from the environment's spaces
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0]) # upper bound of the first action component

print(state_dim, action_dim, max_action)

28 8 1.0


## We create the TD3 agent (the actor, the twin critics, and their target networks)

In [None]:
# Build the TD3 agent; its constructor creates the actor, the critic, their target copies and the optimizers.
policy = TD3(state_dim, action_dim, max_action)

## We create the Experience replay memory

In [None]:
# Empty replay memory with the class's default capacity (max_size=1e6 transitions).
replay_buffer = ReplayBuffer()

## We define a list where all the evaluation results over 10 episodes are stored

In [None]:
# Start the list with the score of the untrained policy; the training loop appends later evaluations.
evaluations = [evaluate_policy(policy)]

---------------------------------------
Average Reward over the Evaluation Step: 1.286181
---------------------------------------


## We create a new folder directory in which the final results (videos of the agent) will be populated

In [None]:
def mkdir(base, name):
  """Create the directory ``base/name`` (including missing parents) and return its path.

  Calling it again for an existing directory is a no-op.

  Args:
    base: Parent directory.
    name: Name of the sub-directory to create inside ``base``.

  Returns:
    The joined path ``os.path.join(base, name)``.
  """
  path = os.path.join(base, name)
  # exist_ok=True replaces the separate existence check
  os.makedirs(path, exist_ok=True)
  return path

# Directory layout for recorded videos: exp/brs/monitor
work_dir = mkdir('exp', 'brs')
monitor_dir = mkdir(work_dir, 'monitor')
# Read the episode step limit from the environment before any wrapper is applied
max_episode_steps = env._max_episode_steps
# Video recording is disabled here; when enabled, the env is wrapped in a Monitor writing to monitor_dir
save_env_vid = False
if save_env_vid:
  env = wrappers.Monitor(env, monitor_dir, force = True)
  env.reset()

## We initialize the variables

In [None]:
# Counters used by the training loop, all starting at zero
total_timesteps = timesteps_since_eval = episode_num = 0
# Starting in the "done" state makes the first loop iteration reset the environment
done = True
# Wall-clock time at which training starts
t0 = time.time()

## Training

In [None]:
# We start the main loop over max_timesteps timesteps
while total_timesteps < max_timesteps:

  # If episode is done
  if done:

    # If we are not at the very beginning, we start the training process of the model
    # (one update iteration per timestep of the episode that just finished)
    if total_timesteps != 0:
      print(f"Total Timesteps: {total_timesteps} Episode Num: {episode_num} Reward: {episode_reward}")
      policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

    # We evaluate the episode and we save the policy
    if timesteps_since_eval >= eval_freq:
      timesteps_since_eval %= eval_freq
      evaluations.append(evaluate_policy(policy))
      # Checkpoint only when requested: ./pytorch_models is created only if save_models is set
      if save_models:
        policy.save(file_name, directory="./pytorch_models")
      np.save("./results/%s" % (file_name), evaluations)

    # When the training step is done, we reset the state of the environment
    obs = env.reset()

    # Set the Done to False
    done = False

    # Set rewards and episode timesteps to zero
    episode_reward = 0
    episode_timesteps = 0
    episode_num += 1

  # Before start_timesteps timesteps, we play random actions
  if total_timesteps < start_timesteps:
    action = env.action_space.sample() # Exploration
  else: # After start_timesteps timesteps, we switch to the model
    action = policy.select_action(np.array(obs)) # Exploitation
    # If the explore_noise parameter is not 0, we add noise to the action and we clip it
    if expl_noise != 0:
      action = (action + np.random.normal(0, expl_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)

  # The agent performs the action in the environment, then reaches the next state and receives the reward
  new_obs, reward, done, _ = env.step(action)

  # An episode cut off by the step limit is stored as not-done, so the critic still
  # bootstraps from the next state. The limit comes from `max_episode_steps`, captured
  # earlier from the unwrapped env, rather than from the private attribute on `env`.
  # NOTE(review): gym wrappers may refuse access to underscore-prefixed attributes — confirm.
  done_bool = 0 if episode_timesteps + 1 == max_episode_steps else float(done)

  # We increase the total reward
  episode_reward += reward

  # We store the new transition into the Experience replay memory
  replay_buffer.add((obs, new_obs, action, reward, done_bool))

  # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
  obs = new_obs
  episode_timesteps += 1
  total_timesteps += 1
  timesteps_since_eval += 1

# We add the last policy evaluation to our list of evaluations and we save our model
evaluations.append(evaluate_policy(policy))
if save_models: policy.save("%s" % (file_name), directory="./pytorch_models")
np.save("./results/%s" % (file_name), evaluations)



Total Timesteps: 1000 Episode Num: 1 Reward: 514.4501235746966


  critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)


Total Timesteps: 1242 Episode Num: 2 Reward: 118.51042804752649
Total Timesteps: 2242 Episode Num: 3 Reward: 476.7937800553202
Total Timesteps: 3242 Episode Num: 4 Reward: 509.2301179105158
Total Timesteps: 4242 Episode Num: 5 Reward: 505.0347477280715
Total Timesteps: 5242 Episode Num: 6 Reward: 460.95903656011495
---------------------------------------
Average Reward over the Evaluation Step: 21.067685
---------------------------------------
Total Timesteps: 6161 Episode Num: 7 Reward: 502.1212387917405
Total Timesteps: 6359 Episode Num: 8 Reward: 96.94527423135878
Total Timesteps: 6518 Episode Num: 9 Reward: 77.86445725471131
Total Timesteps: 7518 Episode Num: 10 Reward: 512.6795672101807
Total Timesteps: 8518 Episode Num: 11 Reward: 467.71708045839785
Total Timesteps: 9157 Episode Num: 12 Reward: 330.39261506138115
Total Timesteps: 9177 Episode Num: 13 Reward: 3.5263351028857852
Total Timesteps: 10177 Episode Num: 14 Reward: 544.500120543213
---------------------------------------


## Inference

In [None]:
class Actor(nn.Module):
  """Deterministic policy network used at inference time.

  Layout: ``state_dim -> 400 -> 300 -> action_dim`` with ReLU hidden
  activations; the output is ``max_action * tanh(...)``.
  """

  def __init__(self, state_dim, action_dim, max_action):
    super().__init__()
    first_hidden, second_hidden = 400, 300
    self.layer_1 = nn.Linear(state_dim, first_hidden)
    self.layer_2 = nn.Linear(first_hidden, second_hidden)
    self.layer_3 = nn.Linear(second_hidden, action_dim)
    self.max_action = max_action

  def forward(self, x):
    """Return the bounded action for the state batch ``x``."""
    hidden = F.relu(self.layer_2(F.relu(self.layer_1(x))))
    # tanh squashes to [-1, 1]; max_action rescales it
    return self.max_action * torch.tanh(self.layer_3(hidden))

class Critic(nn.Module):
  """Two independent Q-networks evaluated on the same (state, action) input.

  Layers 1-3 form the first network and layers 4-6 the second; each maps
  ``state_dim + action_dim`` features through 400 and 300 hidden units to a
  single Q-value.
  """

  def __init__(self, state_dim, action_dim):
    super().__init__()
    in_features = state_dim + action_dim
    # First Critic neural network
    self.layer_1 = nn.Linear(in_features, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, 1)
    # Second Critic neural network
    self.layer_4 = nn.Linear(in_features, 400)
    self.layer_5 = nn.Linear(400, 300)
    self.layer_6 = nn.Linear(300, 1)

  def forward(self, x, u):
    """Return ``(Q1, Q2)`` for states ``x`` and actions ``u``."""
    joint = torch.cat((x, u), dim=1)
    # Hidden activations of the first and of the second network
    hidden_first = F.relu(self.layer_2(F.relu(self.layer_1(joint))))
    hidden_second = F.relu(self.layer_5(F.relu(self.layer_4(joint))))
    return self.layer_3(hidden_first), self.layer_6(hidden_second)

  def Q1(self, x, u):
    """Return only the first network's Q-value for states ``x`` and actions ``u``."""
    joint = torch.cat((x, u), dim=1)
    hidden = F.relu(self.layer_2(F.relu(self.layer_1(joint))))
    return self.layer_3(hidden)

# Selecting the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Building the whole Training Process into a class

class TD3(object):
  """Twin Delayed DDPG (TD3) agent: actor, twin critic, their targets and optimizers."""

  def __init__(self, state_dim, action_dim, max_action):
    """Create the online and target networks and one Adam optimizer per online network.

    Args:
      state_dim: Size of the state vector fed to the actor and the critic.
      action_dim: Size of the action vector.
      max_action: Bound used to scale actor outputs and to clamp target actions.
    """
    self.actor = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
    self.critic = Critic(state_dim, action_dim).to(device)
    self.critic_target = Critic(state_dim, action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
    self.max_action = max_action

  def select_action(self, state):
    """Return the actor's action for a single observation as a 1-D NumPy array."""
    state = torch.Tensor(state.reshape(1, -1)).to(device)
    with torch.no_grad(): # inference only, no autograd graph needed
      return self.actor(state).cpu().numpy().flatten()

  @staticmethod
  def _soft_update(source, target, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    for param, target_param in zip(source.parameters(), target.parameters()):
      target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

  def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
    """Run ``iterations`` TD3 update steps on batches drawn from ``replay_buffer``.

    Args:
      replay_buffer: Object whose ``sample(batch_size)`` returns
        ``(states, next_states, actions, rewards, dones)`` arrays.
      iterations: Number of critic updates to perform.
      batch_size: Number of transitions per update.
      discount: Discount factor gamma of the Bellman target.
      tau: Polyak averaging rate for the target networks.
      policy_noise: Std of the Gaussian noise added to target actions.
      noise_clip: Absolute bound applied to that noise.
      policy_freq: The actor and the targets are updated every ``policy_freq`` iterations.
    """

    for it in range(iterations):

      # Step 4: We sample a batch of transitions (s, s’, a, r) from the memory
      batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
      state = torch.Tensor(batch_states).to(device)
      next_state = torch.Tensor(batch_next_states).to(device)
      action = torch.Tensor(batch_actions).to(device)
      # Rewards and done flags are forced to shape (batch, 1) so they line up with
      # the critics' (batch, 1) outputs. With 1-D (batch,) tensors the Bellman
      # target silently broadcasts to (batch, batch) and the loss is wrong.
      reward = torch.Tensor(batch_rewards).to(device).reshape(-1, 1)
      done = torch.Tensor(batch_dones).to(device).reshape(-1, 1)

      # The whole target is a constant w.r.t. the trained networks, so no graph is built for it
      with torch.no_grad():
        # Step 5: From the next state s’, the Actor target plays the next action a’
        next_action = self.actor_target(next_state)

        # Step 6: We add Gaussian noise to this next action a’ and we clamp it in a range of values supported by the environment
        noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
        next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

        # Step 7: The two Critic targets take each the couple (s’, a’) as input and return two Q-values Qt1(s’,a’) and Qt2(s’,a’) as outputs
        target_Q1, target_Q2 = self.critic_target(next_state, next_action)

        # Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
        target_Q = torch.min(target_Q1, target_Q2)

        # Step 9: We get the final target of the two Critic models, which is: Qt = r + (1 - done) * γ * min(Qt1, Qt2), where γ is the discount factor
        target_Q = reward + (1 - done) * discount * target_Q

      # Step 10: The two Critic models take each the couple (s, a) as input and return two Q-values Q1(s,a) and Q2(s,a) as outputs
      current_Q1, current_Q2 = self.critic(state, action)

      # Step 11: We compute the loss coming from the two Critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
      critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

      # Step 12: We backpropagate this Critic loss and update the parameters of the two Critic models with the Adam optimizer
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()

      # Step 13: Once every policy_freq iterations, we update our Actor model by performing gradient ascent on the output of the first Critic model
      if it % policy_freq == 0:
        actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Step 14: Still once every policy_freq iterations, we update the weights of the Critic target by polyak averaging
        self._soft_update(self.critic, self.critic_target, tau)

        # Step 15: Still once every policy_freq iterations, we update the weights of the Actor target by polyak averaging
        self._soft_update(self.actor, self.actor_target, tau)

  # Making a save method to save a trained model
  def save(self, filename, directory):
    """Write ``<directory>/<filename>_actor.pth`` and ``<directory>/<filename>_critic.pth``."""
    torch.save(self.actor.state_dict(), os.path.join(directory, '%s_actor.pth' % filename))
    torch.save(self.critic.state_dict(), os.path.join(directory, '%s_critic.pth' % filename))

  # Making a load method to load a pre-trained model
  def load(self, filename, directory):
    """Load actor and critic weights written by ``save`` and re-sync the targets."""
    # map_location lets a checkpoint saved on GPU be loaded on a CPU-only machine
    self.actor.load_state_dict(torch.load(os.path.join(directory, '%s_actor.pth' % filename), map_location=device))
    self.critic.load_state_dict(torch.load(os.path.join(directory, '%s_critic.pth' % filename), map_location=device))
    # Only the online networks are saved; start the targets from the same weights
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.critic_target.load_state_dict(self.critic.state_dict())

def evaluate_policy(policy, eval_episodes=10):
  """Play ``eval_episodes`` episodes on the global ``env`` and print/return the mean reward.

  Args:
    policy: Object exposing ``select_action(observation)``.
    eval_episodes: Number of episodes to average over.

  Returns:
    Total reward collected over all episodes divided by ``eval_episodes``.
  """
  avg_reward = 0.
  episodes_played = 0
  while episodes_played < eval_episodes:
    obs = env.reset()
    while True:
      chosen_action = policy.select_action(np.array(obs))
      obs, reward, done, _ = env.step(chosen_action)
      avg_reward += reward
      if done:
        break
    episodes_played += 1
  avg_reward /= eval_episodes
  for line in (
    "---------------------------------------",
    "Average Reward over the Evaluation Step: %f" % (avg_reward),
    "---------------------------------------",
  ):
    print (line)
  return avg_reward

env_name = "AntBulletEnv-v0"
seed = 0

# Must match the name used when the checkpoint was saved: TD3_<env_name>_<seed>
file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")


eval_episodes = 10
save_env_vid = True
env = gym.make(env_name)
# Read the step limit before the env is wrapped by the Monitor
max_episode_steps = env._max_episode_steps
if save_env_vid:
  # Build the video directory here instead of relying on `monitor_dir` left over from
  # the training section, so this cell does not depend on that earlier state.
  monitor_dir = os.path.join('exp', 'brs', 'monitor')
  os.makedirs(monitor_dir, exist_ok=True)
  env = wrappers.Monitor(env, monitor_dir, force = True)
  env.reset()
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])

# Load the trained model from the checkpoint files on disk
policy = TD3(state_dim, action_dim, max_action)
policy.load(file_name, './pytorch_models/')

# Evaluate
_ = evaluate_policy(policy, eval_episodes=eval_episodes)