In [1]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt

from collections import UserDict

import gym
import gym.envs.registration

# Do this before importing pybullet_envs: wrap the registry in a UserDict that also exposes an `env_specs` attribute, so that it looks like the gym <0.26 registry.
registry = UserDict(gym.envs.registration.registry)
registry.env_specs = gym.envs.registration.registry
gym.envs.registration.registry = registry

import pybullet_envs

import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque

<h2> (1)  Experience Replay Memory</h2>

In [2]:
# experience replay memory
class ReplayBuffer(object):
    """Fixed-capacity store of transitions with uniform random sampling.

    Each stored transition is a 5-tuple ordered as
    ``(state, next_state, action, reward, done)``. Once ``max_size``
    transitions are held, new ones overwrite the oldest entries in a
    circular fashion.
    """

    def __init__(self, max_size=1e6):
        """Create an empty buffer.

        Args:
            max_size: Maximum number of transitions kept. The default is
                written as the float ``1e6``; it is converted to ``int`` so
                that the write pointer stays a valid list index.

        Raises:
            ValueError: If ``max_size`` is smaller than 1.
        """
        max_size = int(max_size)
        if max_size < 1:
            raise ValueError("max_size must be at least 1")
        self.storage = []
        self.max_size = max_size
        self.ptr = 0

    def add(self, transition):
        """Store one transition, overwriting the oldest once the buffer is full."""
        if len(self.storage) >= self.max_size:
            self.storage[self.ptr] = transition
            self.ptr = (self.ptr + 1) % self.max_size
        else:
            self.storage.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` transitions uniformly at random, with replacement.

        Returns:
            A tuple ``(states, next_states, actions, rewards, dones)`` of
            NumPy arrays whose first axis has length ``batch_size``;
            ``rewards`` and ``dones`` are reshaped to column vectors.

        Raises:
            ValueError: If the buffer is empty.
        """
        if not self.storage:
            raise ValueError("Cannot sample from an empty replay buffer")
        ind = np.random.randint(0, len(self.storage), batch_size)
        batch = [self.storage[i] for i in ind]
        # np.asarray avoids a copy when possible but, unlike
        # np.array(..., copy=False) under NumPy 2, never raises when a copy
        # is required (e.g. for Python scalars such as reward and done).
        batch_states = np.array([np.asarray(t[0]) for t in batch])
        batch_next = np.array([np.asarray(t[1]) for t in batch])
        batch_actions = np.array([np.asarray(t[2]) for t in batch])
        batch_rewards = np.array([np.asarray(t[3]) for t in batch]).reshape(-1, 1)
        batch_done = np.array([np.asarray(t[4]) for t in batch]).reshape(-1, 1)
        return batch_states, batch_next, batch_actions, batch_rewards, batch_done

<h2> (2) We build one neural network for Actor Model and one neural network for the Actor Target</h2>

In [3]:
class Actor(nn.Module):
    """Policy network mapping a batch of states to bounded actions.

    Three fully connected layers (state_dim -> 400 -> 300 -> action_dim)
    with ReLU activations in between; the output is squashed by tanh and
    scaled by ``max_action``.
    """

    def __init__(self, state_dim, action_dim, max_action):
        super().__init__()
        self.layer_1 = nn.Linear(state_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, action_dim)
        # Scale applied to the tanh output in forward().
        self.max_action = max_action

    def forward(self, x):
        """Return the action for ``x``, each component within [-max_action, max_action]."""
        hidden = F.relu(self.layer_1(x))
        hidden = F.relu(self.layer_2(hidden))
        squashed = torch.tanh(self.layer_3(hidden))
        return self.max_action * squashed
        

<h2>(3) Build two neural networks for the two Critic Models and two neural networks for the two Critic Targets</h2>

In [4]:
class Critic(nn.Module):
    """Twin Q-networks that score a (state, action) pair.

    Both heads take the state and action concatenated along the feature
    axis, so their first layer has ``state_dim + action_dim`` inputs.
    Layers 1-3 form the first critic and layers 4-6 the second; each head
    is input -> 400 -> 300 -> 1 with ReLU activations in between.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        # Width of torch.cat([state, action], dim=1): the feature counts add up.
        input_dim = state_dim + action_dim
        # critic model 1 neural network
        self.layer_1 = nn.Linear(input_dim, 400)
        self.layer_2 = nn.Linear(400, 300)
        self.layer_3 = nn.Linear(300, 1)
        # critic model 2 neural network
        self.layer_4 = nn.Linear(input_dim, 400)
        self.layer_5 = nn.Linear(400, 300)
        self.layer_6 = nn.Linear(300, 1)

    def _q1(self, xu):
        """Run the first critic head on an already concatenated input."""
        x1 = F.relu(self.layer_1(xu))
        x1 = F.relu(self.layer_2(x1))
        return self.layer_3(x1)

    def _q2(self, xu):
        """Run the second critic head on an already concatenated input."""
        x2 = F.relu(self.layer_4(xu))
        x2 = F.relu(self.layer_5(x2))
        return self.layer_6(x2)

    def forward(self, x, u):
        """Return the Q-values of both critics for states ``x`` and actions ``u``.

        Args:
            x: State batch, concatenated with ``u`` along dim 1.
            u: Action batch with the same leading dimension as ``x``.

        Returns:
            Tuple ``(q1, q2)``, each with one output value per batch row.
        """
        xu = torch.cat([x, u], dim=1)
        return self._q1(xu), self._q2(xu)

    def Q1(self, x, u):
        """Return only the first critic's Q-value for states ``x`` and actions ``u``."""
        return self._q1(torch.cat([x, u], dim=1))

<h2>(4) Training </h2>

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class TD3(object):
    """TD3 agent: an actor, a twin critic, and a target copy of each.

    The target networks start as exact copies of the online networks and
    are moved towards them by Polyak averaging during training.
    """

    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

        self.max_action = max_action

    @staticmethod
    def _soft_update(source, target, tau):
        """Polyak-average ``source`` parameters into ``target`` in place."""
        for param, target_param in zip(source.parameters(), target.parameters()):
            target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    def select_action(self, state):
        """Return the actor's action for a single state as a flat NumPy array."""
        state = torch.as_tensor(state, dtype=torch.float32, device=device).reshape(1, -1)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            return self.actor(state).cpu().numpy().flatten()

    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005,
              policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        """Run ``iterations`` TD3 update steps on batches from ``replay_buffer``.

        Args:
            replay_buffer: Object whose ``sample(batch_size)`` returns
                ``(states, next_states, actions, rewards, dones)`` arrays.
            iterations: Number of update steps to perform.
            batch_size: Number of transitions sampled per step.
            discount: Discount factor applied to the bootstrapped target.
            tau: Polyak averaging rate for the target networks.
            policy_noise: Std of the Gaussian noise added to the target action.
            noise_clip: Absolute bound that noise is clamped to.
            policy_freq: The actor and both targets are updated once every
                ``policy_freq`` iterations.
        """
        for it in range(iterations):

            # step 4 - sample a batch of transitions (s, s', a, r, done)
            batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
            # as_tensor with an explicit dtype also converts boolean `done` flags.
            states = torch.as_tensor(batch_states, dtype=torch.float32, device=device)
            next_states = torch.as_tensor(batch_next_states, dtype=torch.float32, device=device)
            actions = torch.as_tensor(batch_actions, dtype=torch.float32, device=device)
            rewards = torch.as_tensor(batch_rewards, dtype=torch.float32, device=device)
            dones = torch.as_tensor(batch_dones, dtype=torch.float32, device=device)

            # The target is a constant for the critic loss, so it is built
            # without tracking gradients.
            with torch.no_grad():
                # step 5 - from next state s', the actor target plays the next action a'
                next_action = self.actor_target(next_states)

                # step 6 - add clipped Gaussian noise to a' and clamp it to the supported action range
                noise = (torch.randn_like(next_action) * policy_noise).clamp(-noise_clip, noise_clip)
                next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

                # step 7 - the two critic targets take (s', a') and return Qt1(s', a') and Qt2(s', a')
                target_Q1, target_Q2 = self.critic_target(next_states, next_action)

                # step 8 - keep the minimum of the two Q values
                target_Q = torch.min(target_Q1, target_Q2)

                # step 9 - final target of the two critic models, Qt = r + gamma * min(Qt1, Qt2);
                # the bootstrap term is dropped where done == 1
                target_Q = rewards + (1 - dones) * discount * target_Q

            # step 10 - the two critic models take (s, a) and return two Q values
            current_Q1, current_Q2 = self.critic(states, actions)

            # step 11 - loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            # step 12 - backpropagate the critic loss and update both critics with the Adam optimizer
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # step 13 - once every policy_freq iterations, update the actor by gradient
            # ascent on the output of the first critic model
            if it % policy_freq == 0:
                actor_loss = -self.critic.Q1(states, self.actor(states)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # step 14 - update the weights of the actor target by Polyak averaging
                self._soft_update(self.actor, self.actor_target, tau)
                # step 15 - update the weights of the critic target by Polyak averaging
                self._soft_update(self.critic, self.critic_target, tau)

    # save the trained model
    def save(self, filename, directory):
        """Write the actor and critic weights to ``directory``."""
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    # load pre-trained model
    def load(self, filename, directory):
        """Load actor and critic weights written by :meth:`save`.

        ``load_state_dict`` expects a state dict, not a path, so the files
        are read with ``torch.load`` first. The target networks are then
        re-synchronised with the loaded weights, matching what ``__init__``
        does for freshly created networks.
        """
        actor_state = torch.load('%s/%s_actor.pth' % (directory, filename), map_location=device)
        critic_state = torch.load('%s/%s_critic.pth' % (directory, filename), map_location=device)
        self.actor.load_state_dict(actor_state)
        self.critic.load_state_dict(critic_state)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

<h2>Function to evaluate policy by calculating its average reward over 10 episodes</h2>

In [6]:
def evaluate_policy(policy, eval_episodes=10, eval_env=None):
    """Run the policy for ``eval_episodes`` episodes and report the mean return.

    Both gym step/reset conventions are accepted: ``reset()`` may return
    either ``obs`` or ``(obs, info)``, and ``step()`` may return either
    ``(obs, reward, done, info)`` or
    ``(obs, reward, terminated, truncated, info)``.

    Args:
        policy: Object exposing ``select_action(observation) -> action``.
        eval_episodes: Number of episodes to average over.
        eval_env: Environment to evaluate on. Defaults to the notebook's
            global ``env`` when omitted.

    Returns:
        The sum of rewards over all episodes divided by ``eval_episodes``.
    """
    if eval_env is None:
        eval_env = env
    total_reward = 0.
    for _ in range(eval_episodes):
        reset_result = eval_env.reset()
        # Newer API: reset() returns (obs, info) with info being a dict.
        if (isinstance(reset_result, tuple) and len(reset_result) == 2
                and isinstance(reset_result[1], dict)):
            obs = reset_result[0]
        else:
            obs = reset_result
        done = False
        while not done:
            action = policy.select_action(np.array(obs))
            step_result = eval_env.step(action)
            if len(step_result) == 5:
                obs, reward, terminated, truncated, _ = step_result
                done = terminated or truncated
            else:
                obs, reward, done, _ = step_result
            total_reward += reward
    avg_reward = total_reward / eval_episodes
    print ("---------------------------------------")
    print ("Average Reward over the Evaluation Step: %f" % (avg_reward))
    print ("---------------------------------------")
    return avg_reward

<h2>Set the parameters</h2>

In [7]:
# --- Environment and run control ---
# Name of the environment (set it to any continuous environment you want)
env_name = "HalfCheetahBulletEnv-v0"
# Random seed number
seed = 0
# Number of timesteps during which actions are chosen randomly; after that the policy network is used
start_timesteps = 1e4
# How often the evaluation step is performed (in timesteps)
eval_freq = 5e3
# Total number of timesteps
max_timesteps = 5e5
# Whether or not to save the trained model
save_models = True

# --- TD3 hyperparameters ---
# Exploration noise: std of the Gaussian exploration noise
expl_noise = 0.1
# Size of the batch
batch_size = 100
# Discount factor gamma, used in the calculation of the total discounted reward
discount = 0.99
# Target network update rate
tau = 0.005
# Std of the Gaussian noise added to the target actor's next action during training
policy_noise = 0.2
# Maximum absolute value of that Gaussian noise
noise_clip = 0.5
# Number of iterations to wait before the policy network (Actor model) is updated
policy_freq = 2

## We create a file name for the two saved models: the Actor and Critic models

In [8]:
# Base name shared by the saved Actor and Critic files: algorithm, environment, seed.
file_name = "_".join(("TD3", env_name, str(seed)))
print("---------------------------------------")
print("Settings: %s" % file_name)
print("---------------------------------------")

---------------------------------------
Settings: TD3_HalfCheetahBulletEnv-v0_0
---------------------------------------


## We create a folder inside which will be saved the trained models

In [9]:
# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs("./results", exist_ok=True)
if save_models:
    os.makedirs("./pytorch_models", exist_ok=True)

## We create the PyBullet environment

In [10]:
# Instantiate the environment registered under env_name; later cells read its
# observation/action spaces and step it during evaluation.
env = gym.make(env_name)

## We set seeds and we get the necessary information on the states and actions in the chosen environment

In [11]:
# Seed every source of randomness in scope so runs are repeatable.
env.seed(seed)
# The action space has its own sampler state; seed it as well so that any
# env.action_space.sample() calls are repeatable.
env.action_space.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

# Network sizes and the action bound are read from the environment's spaces.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
# Uses the first component's upper bound as the bound for all action components.
max_action = float(env.action_space.high[0])

## We create the TD3 policy (the Actor and Critic models together with their targets)

In [12]:
# Build the TD3 agent (actor, critic and their target networks) sized for this environment.
policy = TD3(state_dim, action_dim, max_action)

## We create the Experience Replay memory

In [13]:
# Empty replay buffer using the class's default capacity (max_size=1e6).
replay_buffer = ReplayBuffer()

## We define a list in which the evaluation results (average reward per evaluation run) are stored

In [14]:
# Seed the results list with an evaluation of the untrained policy.
# NOTE(review): only one episode is run here (the function's default is 10) — confirm this is intended.
evaluations = [evaluate_policy(policy,eval_episodes=1)]

  logger.warn(
  logger.warn(
  logger.warn(
  logger.deprecation(


KeyboardInterrupt: 