In [0]:
!pip install pybullet

In [0]:
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pybullet_envs
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque

In [0]:
# STEP 1
# We initialize the Experience Replay Memory with a size of 1e6.
# Then we populate it with new transitions
class ReplayBuffer(object):
  """Fixed-capacity store of transitions used for experience replay.

  Each transition is a 5-tuple ``(state, next_state, action, reward, done)``.
  Once ``max_size`` transitions are stored, every new transition overwrites
  the oldest one (ring buffer driven by ``self.ptr``).
  """

  def __init__(self, max_size = 1e6):
    """Create an empty buffer.

    Args:
      max_size: maximum number of transitions kept. The default is written
        as a float (1e6), so the value is coerced to ``int`` here to keep
        ``ptr`` usable as a list index.

    Raises:
      ValueError: if ``max_size`` is not positive.
    """
    max_size = int(max_size)
    if max_size <= 0:
      raise ValueError("max_size must be a positive integer")
    self.storage = []
    self.max_size = max_size
    self.ptr = 0

  def add(self, transition):
    """Store one transition, overwriting the oldest entry when full."""
    if len(self.storage) >= self.max_size:
      self.storage[self.ptr] = transition
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self, batch_size):
    """Draw ``batch_size`` transitions uniformly at random, with replacement.

    Returns:
      Tuple ``(states, next_states, actions, rewards, dones)`` of numpy
      arrays. ``rewards`` and ``dones`` are reshaped to column vectors of
      shape ``(batch_size, 1)``.
    """
    ind = np.random.randint(0, len(self.storage), batch_size)
    batch_states, batch_next_states, batch_actions, batch_rewards,\
      batch_dones = [], [], [], [], []
    for i in ind:
      state, next_state, action, reward, done = self.storage[i]
      # np.asarray avoids a copy when possible; np.array(..., copy=False)
      # raises under NumPy 2 whenever a copy is unavoidable (e.g. scalars).
      batch_states.append(np.asarray(state))
      batch_next_states.append(np.asarray(next_state))
      batch_actions.append(np.asarray(action))
      batch_rewards.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))
    return np.array(batch_states), np.array(batch_next_states),\
      np.array(batch_actions), np.array(batch_rewards).reshape(-1,1),\
        np.array(batch_dones).reshape(-1,1)


In [0]:
# STEP 2
# Build one DNN for the Actor model and one for Actor Target
class Actor(nn.Module):
  """Deterministic policy network: maps a state to an action.

  Three fully connected layers (state_dims -> 400 -> 300 -> action_dim) with
  ReLU activations; the output is squashed by tanh and scaled by
  ``max_action`` so every action component lies in
  ``[-max_action, max_action]``.
  """

  def __init__(self, state_dims, action_dim, max_action):
    """
    Args:
      state_dims: size of the state vector.
      action_dim: size of the action vector.
      max_action: scale applied to the tanh output (bound on |action|).
    """
    super(Actor,self).__init__() #activate the inheritance
    self.layer_1 = nn.Linear(state_dims,400)
    self.layer_2 = nn.Linear(400,300)
    self.layer_3 = nn.Linear(300,action_dim)
    self.max_action = max_action

  def forward(self,x):
    """Return the action for a batch of states ``x`` (batch, state_dims)."""
    x = F.relu(self.layer_1(x))
    x = F.relu(self.layer_2(x))
    x = self.max_action * torch.tanh(self.layer_3(x))
    return x


In [0]:
# STEP 3
# Build two DNNs for the two Critic models and two DNNs 
# for the two Critic Targets
class Critic(nn.Module):
  """Twin Q-networks evaluated on the same (state, action) input.

  Each critic is a fully connected network
  (state_dims + action_dim -> 400 -> 300 -> 1) producing one scalar Q-value
  per sample.
  """

  def __init__(self, state_dims, action_dim):
    """
    Args:
      state_dims: size of the state vector.
      action_dim: size of the action vector.
    """
    super(Critic,self).__init__() #activate the inheritance
    # First critic network
    self.layer_1 = nn.Linear(state_dims + action_dim,400)
    self.layer_2 = nn.Linear(400,300)
    self.layer_3 = nn.Linear(300,1) # a Q-value is a scalar per sample
    # Second critic network
    self.layer_4 = nn.Linear(state_dims + action_dim,400)
    self.layer_5 = nn.Linear(400,300)
    self.layer_6 = nn.Linear(300,1)

  def forward(self, x, u): # x - state, u - action
    """Return ``(Q1, Q2)``, each of shape (batch, 1)."""
    # dim=1 joins state and action features of each sample side by side
    xu = torch.cat([x,u],1)
    # forward propagation on the first critic
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1)
    # forward propagation on the second critic
    x2 = F.relu(self.layer_4(xu))
    x2 = F.relu(self.layer_5(x2))
    x2 = self.layer_6(x2)

    return x1, x2

  def Q1(self, x, u): # x - state, u - action
    """Return only the first critic's Q-value (used for the actor update)."""
    xu = torch.cat([x,u],1)
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    x1 = self.layer_3(x1)

    return x1
    


In [0]:
# STEP 4-15
# training process , create a T3D class, initialize variables

# selecting the device- cpu/gpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# building the whole training process into a class

class T3D(object):
  """Holds the actor/critic networks, their targets and their optimizers."""

  def __init__(self, state_dims, action_dim, max_action):
    """Build the networks on ``device`` and sync each target with its model.

    Args:
      state_dims: size of the state vector.
      action_dim: size of the action vector.
      max_action: bound on the magnitude of each action component.
    """
    # sizes are parameters so the class works with any environment
    self.actor = Actor(state_dims, action_dim, max_action).to(device) # GD
    self.actor_target = Actor(state_dims, action_dim, max_action).to(device)
    # start the target with the same weights as the model;
    # state_dict must be *called* -- load_state_dict needs the mapping
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

    self.critic = Critic(state_dims, action_dim).to(device) #GD
    self.critic_target = Critic(state_dims, action_dim).to(device) #polyak avg
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
    self.max_action = max_action

  def select_action(self, state):
    """Return the actor's action for a single state as a flat numpy array.

    Args:
      state: array-like observation; it is reshaped to a batch of one.
    """
    # convert on the numpy side first, then move the tensor to the device
    state = torch.as_tensor(
      np.asarray(state, dtype=np.float32).reshape(1, -1), device=device)
    with torch.no_grad(): # inference only, no graph needed
      action = self.actor(state)
    # numpy output so the caller can add noise / clip it
    return action.cpu().numpy().flatten()

  



In [0]:
# STEP 4-15
# sample from a batch of transitions(s,s',a,r) from the memory 
def train(self, replay_buffer, iterations, batch_size=100, discount=0.99,
            tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
  """Run ``iterations`` TD3 update steps on batches from ``replay_buffer``.

  ``self`` is taken explicitly: the function reads ``self.actor``,
  ``self.actor_target``, ``self.critic``, ``self.critic_target``,
  ``self.actor_optimizer``, ``self.critic_optimizer`` and
  ``self.max_action``. Call it as ``train(agent, ...)`` or bind it to the
  agent class.

  Args:
    replay_buffer: object whose ``sample(batch_size)`` returns
      ``(states, next_states, actions, rewards, dones)`` arrays; rewards and
      dones are expected as (batch, 1) columns.
    iterations: number of update steps.
    batch_size: transitions sampled per step.
    discount: discount factor gamma.
    tau: polyak averaging coefficient for the target networks.
    policy_noise: std of the Gaussian noise added to the target action.
    noise_clip: the noise is clamped to ``[-noise_clip, noise_clip]``.
    policy_freq: actor and target networks are updated once every
      ``policy_freq`` iterations.
  """
  # batch tensors must live where the networks live
  model_device = next(self.actor.parameters()).device

  for it in range(iterations):
    # STEP 4
    # sample a batch of transitions (s, s', a, r, done) from memory
    batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones\
        = replay_buffer.sample(batch_size)
    state = torch.as_tensor(batch_states, dtype=torch.float32, device=model_device)
    next_state = torch.as_tensor(batch_next_states, dtype=torch.float32, device=model_device)
    action = torch.as_tensor(batch_actions, dtype=torch.float32, device=model_device)
    reward = torch.as_tensor(batch_rewards, dtype=torch.float32, device=model_device)
    done = torch.as_tensor(batch_dones, dtype=torch.float32, device=model_device)

    # The target is a constant for the critic loss: build it without a graph.
    with torch.no_grad():
      # STEP 5
      # from the next state s', the actor target plays the next action a'
      next_action = self.actor_target(next_state)
      # STEP 6
      # add clipped Gaussian noise to a' and clamp the result to the
      # range of actions supported by the environment
      noise = (torch.randn_like(action) * policy_noise).clamp(-noise_clip, noise_clip)
      next_action = (next_action + noise).clamp(-self.max_action, self.max_action)
      # STEP 7
      # the two critic targets take (s', a') and return Qt1(s',a'), Qt2(s',a')
      target_Q1, target_Q2 = self.critic_target(next_state, next_action)
      # STEP 8
      # keep the minimum of these two Q-values
      target_Q = torch.min(target_Q1, target_Q2)
      # STEP 9
      # final target: Qt = r + gamma * min(Qt1, Qt2), with the bootstrap term
      # dropped when the episode is over (done == 1)
      target_Q = reward + (1 - done) * discount * target_Q

    # STEP 10
    # the two critic models take (s, a) and return two Q-values
    current_Q1, current_Q2 = self.critic(state, action)
    # STEP 11
    # loss coming from the two critic models
    critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
    # STEP 12
    # backpropagate the critic loss and update both critics
    self.critic_optimizer.zero_grad() # reset the gradients
    critic_loss.backward() # compute the gradients
    self.critic_optimizer.step() # update the weights

    # STEP 13
    # once every policy_freq iterations, update the actor by gradient ascent
    # on the output of the first critic (DPG part)
    if it % policy_freq == 0:
      actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
      self.actor_optimizer.zero_grad()
      actor_loss.backward()
      self.actor_optimizer.step()
      # STEP 14
      # on the same delayed schedule, polyak-average the actor target
      _polyak_update(self.actor, self.actor_target, tau)
      # STEP 15
      # on the same delayed schedule, polyak-average the critic target
      _polyak_update(self.critic, self.critic_target, tau)


def _polyak_update(source, target, tau):
  """Blend ``source`` weights into ``target`` in place: t <- tau*s + (1-tau)*t."""
  for param, target_param in zip(source.parameters(), target.parameters()):
    target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)


In [0]:
# Individual steps below for better readability

In [0]:
# STEP 5
# From the next state s', the actor target plays the next action a'.
# (Excerpt of the `train` loop body: `self` and `next_state` come from there,
# so this cell is illustrative and does not run on its own.)
next_action = self.actor_target.forward(next_state)


In [0]:
# STEP 6
# We add Gaussian noise to this next action a': the noise itself is clipped
# to [-noise_clip, noise_clip], then the noisy action is clamped to the
# range of values supported by the environment.
# (Excerpt of the `train` loop body; not runnable on its own.)
noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
noise = noise.clamp(-noise_clip, noise_clip)
next_action = (next_action + noise).clamp(-self.max_action, self.max_action)


In [0]:
# STEP 7
# The two critic targets each take the couple (s', a') as input and return
# two Q-values, Qt1(s', a') and Qt2(s', a'), as outputs.
# (Excerpt of the `train` loop body; not runnable on its own.)
target_Q1, target_Q2 = self.critic_target.forward(next_state, next_action)


In [0]:
# STEP 8
# We keep the minimum of these two Q-values.
# This is not yet the final target; the name target_Q is reused so the
# next step can overwrite it with the full target.
target_Q = torch.min(target_Q1, target_Q2)

In [0]:
# STEP 9
# We get the final target of the two critic models, which is:
#   Qt = r + gamma * min(Qt1, Qt2)
# With the episode flag (0 = episode not over, 1 = episode over):
#   target_Q = reward + (1 - done) * discount * target_Q
# The target must be treated as a constant in the critic loss, so the whole
# expression is detached from the computational graph.
target_Q = (reward + (1 - done) * discount * target_Q).detach()


In [0]:
# STEP 10
# The two critic models each take the couple (s, a) as input
# and return two Q-values.
current_Q1, current_Q2 = self.critic.forward(state, action)

In [0]:
# STEP 11
# We compute the loss coming from the two critic models: the sum of the
# mean-squared errors of each critic's Q-value against the shared target.
critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)


In [0]:
# STEP 12
# We backpropagate this critic loss and update the parameters of the two
# critic models with the Adam optimizer.
self.critic_optimizer.zero_grad() # initializing the gradients to zero
critic_loss.backward() # computing the gradients
self.critic_optimizer.step() # performing the weight updates


In [0]:
# STEP 13
# Once every policy_freq iterations, we update our actor model by
# performing gradient ascent on the output of the first critic model.
if it % policy_freq == 0:
  # this is the DPG part: maximise the mean Q1 of the actor's own actions
  actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
  self.actor_optimizer.zero_grad()
  actor_loss.backward()
  self.actor_optimizer.step()


In [0]:
# STEP 14
# Still once every two iterations (on the iterations where the actor is
# updated), we update the weights of the actor target by polyak averaging:
#   target <- tau * model + (1 - tau) * target
for param, target_param in zip(self.actor.parameters(),self.actor_target.parameters()):
  target_param.data.copy_(tau* param.data + (1-tau)* target_param.data)


In [0]:
# STEP 15
# Still once every two iterations (on the iterations where the actor is
# updated), we update the weights of the critic target by polyak averaging:
#   target <- tau * model + (1 - tau) * target
for param, target_param in zip(self.critic.parameters(),self.critic_target.parameters()):
  target_param.data.copy_(tau* param.data + (1-tau)* target_param.data)
