<a href="https://colab.research.google.com/github/sagar9926/ReinforcementLearning/blob/master/T3D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip3 install pybullet --upgrade
!pip install gym
import os
import time
import random
import numpy as np
import matplotlib.pyplot as plt
import pybullet_envs
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from gym import wrappers
from torch.autograd import Variable
from collections import deque


Requirement already up-to-date: pybullet in /usr/local/lib/python3.6/dist-packages (2.7.7)


In [0]:
class ReplayBuffer(object):
  """Fixed-capacity store of transitions that overwrites its oldest entries when full.

  Every stored transition is expected to be a 5-tuple
  ``(state, next_state, action, reward, done)``, which is how ``sample``
  unpacks it.
  """

  def __init__(self,max_size = 1000000):
    """Create an empty buffer that holds at most ``max_size`` transitions."""
    self.storage = []
    self.max_size = max_size
    # Index of the slot that will be overwritten next once the buffer is full.
    self.ptr = 0

  def add(self,transition):
    """Store one transition, replacing the oldest one when the buffer is full."""
    if len(self.storage) == self.max_size:
      self.storage[int(self.ptr)] = transition
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self,batch_size):
    """Draw ``batch_size`` transitions uniformly at random, with replacement.

    Returns:
      A tuple of five NumPy arrays
      ``(states, next_states, actions, rewards, dones)``. Rewards and dones
      are reshaped to column vectors of shape ``(batch_size, 1)``.

    Raises:
      ValueError: if the buffer holds no transitions yet.
    """
    if not self.storage:
      raise ValueError("cannot sample from an empty ReplayBuffer")

    ind = np.random.randint(0, len(self.storage), batch_size)
    batch_states,batch_next_states,batch_actions,batch_rewards,batch_dones = [],[],[],[],[]

    for i in ind:
      state,next_state,action,reward,done = self.storage[i]
      # np.asarray avoids a copy when possible but, unlike
      # np.array(..., copy=False) on NumPy >= 2.0, still accepts inputs that
      # need one (lists, Python scalars).
      batch_states.append(np.asarray(state))
      batch_next_states.append(np.asarray(next_state))
      batch_actions.append(np.asarray(action))
      batch_rewards.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))

    return (
      np.array(batch_states),
      np.array(batch_next_states),
      np.array(batch_actions),
      np.array(batch_rewards).reshape(-1,1),
      np.array(batch_dones).reshape(-1,1),
    )


In [0]:
class Actor(nn.Module):
  """Policy network: maps a batch of states to actions bounded by ``max_action``.

  The network is a 3-layer MLP (400 and 300 hidden units, ReLU) whose output
  goes through ``tanh`` and is scaled by ``max_action``, so every action
  component lies in ``[-max_action, max_action]``.
  """

  def __init__(self,state_dims,action_dim,max_action):
    """Build the layers.

    Args:
      state_dims: number of features in a state vector.
      action_dim: number of components in an action vector.
      max_action: scale applied to the tanh output in ``forward``.
    """
    super(Actor,self).__init__()
    self.layer_1 = nn.Linear(state_dims,400)
    self.layer_2 = nn.Linear(400,300)
    self.layer_3 = nn.Linear(300,action_dim)
    # forward() reads this to scale the tanh output, so it must be stored here.
    self.max_action = max_action

  def forward(self,x):
    """Return the action for each state in ``x`` (one row per state)."""
    x = F.relu(self.layer_1(x))
    x = F.relu(self.layer_2(x))
    # tanh squashes to [-1, 1]; scaling gives [-max_action, max_action].
    return self.max_action * torch.tanh(self.layer_3(x))



In [0]:
class Critic(nn.Module):
  """Twin Q-networks: two independent MLPs that each score a (state, action) pair.

  ``layer_1..layer_3`` form the first critic and ``layer_4..layer_6`` the
  second. Each critic outputs a single Q-value per input row.
  """

  def __init__(self,state_dims,action_dim):
    """Build both critics.

    Args:
      state_dims: number of features in a state vector.
      action_dim: number of components in an action vector.
    """
    super(Critic,self).__init__()

    # First critic network
    self.layer_1 = nn.Linear(state_dims + action_dim,400)
    self.layer_2 = nn.Linear(400,300)
    self.layer_3 = nn.Linear(300,1)

    # Second critic network. It needs its own attribute names; reusing
    # layer_1..3 would overwrite the first critic and leave a single network.
    self.layer_4 = nn.Linear(state_dims + action_dim,400)
    self.layer_5 = nn.Linear(400,300)
    self.layer_6 = nn.Linear(300,1)

  def _first_q(self,xu):
    """Forward propagation through the first critic on concatenated input."""
    x1 = F.relu(self.layer_1(xu))
    x1 = F.relu(self.layer_2(x1))
    return self.layer_3(x1)

  def _second_q(self,xu):
    """Forward propagation through the second critic on concatenated input."""
    x2 = F.relu(self.layer_4(xu))
    x2 = F.relu(self.layer_5(x2))
    return self.layer_6(x2)

  def forward(self,x,u): #x - state, u - action
    """Return ``(Q1, Q2)``, the two critics' values for states ``x`` and actions ``u``."""
    # dim=1 joins state and action features side by side for each row.
    xu = torch.cat([x,u],1)
    return self._first_q(xu),self._second_q(xu)

  def Q1(self,x,u):
    """Return only the first critic's value for states ``x`` and actions ``u``."""
    xu = torch.cat([x,u],1)
    return self._first_q(xu)
      



      



In [0]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Building the whole training process into a Class

class T3D(object):
  """Agent holding an actor, a twin critic, their target copies and the training loop.

  The actor and critic are trained by gradient descent. The ``*_target``
  copies start with the same weights and are moved towards the trained
  models by Polyak averaging inside ``train``.
  """

  def __init__(self,state_dims,action_dim,max_action):
    """Build the models, their targets and one Adam optimizer per trained model.

    Args:
      state_dims: number of features in a state vector.
      action_dim: number of components in an action vector.
      max_action: bound used by the actor and when clamping noisy target actions.
    """
    self.actor = Actor(state_dims,action_dim,max_action).to(device)
    self.actor_target = Actor(state_dims,action_dim,max_action).to(device)
    # state_dict must be *called*: load_state_dict expects the mapping of
    # weights, not the bound method.
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())

    self.critic = Critic(state_dims,action_dim).to(device)
    self.critic_target = Critic(state_dims,action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())

    self.max_action = max_action

  def select_action(self,state):
    """Return the actor's action for a single ``state`` as a flat NumPy array."""
    # The actor works on batches, so the state becomes a batch of one row.
    state = torch.Tensor(state.reshape(1,-1)).to(device)
    return self.actor(state).cpu().data.numpy().flatten()

  def train(self , replay_buffer,iterations,batch_size = 200 ,discount = 0.99 , tau = 0.005,policy_noise = 0.2,noise_clip = 0.5,policy_freq = 2):
    """Run ``iterations`` update steps on batches drawn from ``replay_buffer``.

    Args:
      replay_buffer: object whose ``sample(batch_size)`` returns
        ``(states, next_states, actions, rewards, dones)``.
      iterations: number of critic updates to perform.
      batch_size: number of transitions sampled per update.
      discount: factor applied to the target Q-value.
      tau: Polyak averaging rate for the target networks.
      policy_noise: standard deviation of the Gaussian noise added to target actions.
      noise_clip: bound the noise is clamped to.
      policy_freq: the actor and both targets are updated every ``policy_freq`` iterations.
    """
    for it in range(iterations):
      # Sample a batch of transitions (s, s', a, r, done) from the memory.
      batch_states,batch_next_states,batch_actions,batch_rewards,batch_dones = replay_buffer.sample(batch_size)
      state = torch.Tensor(batch_states).to(device)
      next_state = torch.Tensor(batch_next_states).to(device)
      action = torch.Tensor(batch_actions).to(device)
      reward = torch.Tensor(batch_rewards).to(device)
      done = torch.Tensor(batch_dones).to(device)

      # From the next state s', the actor target plays the next action a'.
      next_action = self.actor_target(next_state)

      # Add clipped Gaussian noise to a' and clamp the result to the range
      # of actions supported by the environment.
      noise = torch.Tensor(batch_actions).data.normal_(0,policy_noise).to(device)
      noise = noise.clamp(-noise_clip,noise_clip)
      next_action = (next_action + noise).clamp(-self.max_action,self.max_action)

      # The two critic targets score (s', a'); keep the smaller of the two values.
      target_Q1, target_Q2 = self.critic_target(next_state,next_action)
      target_Q = torch.min(target_Q1,target_Q2)

      # Detached so no gradient flows into the target networks.
      target_Q = reward + ((1 - done)*discount*target_Q).detach()

      # The two critic models score (s, a).
      current_Q1, current_Q2 = self.critic(state,action)

      # Critic loss: both critics regress towards the same target.
      critic_loss = F.mse_loss(current_Q1,target_Q) + F.mse_loss(current_Q2,target_Q)

      # Backpropagate the critic loss and update both critics.
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()

      # Delayed updates: every policy_freq iterations, update the actor by
      # gradient ascent on the first critic's output (descent on its negative).
      if it % policy_freq == 0:
        actor_loss = -(self.critic.Q1(state,self.actor(state)).mean())
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Move the actor target towards the actor by Polyak averaging.
        for param,target_param in zip(self.actor.parameters(),self.actor_target.parameters()):
          target_param.data.copy_(tau*param.data + (1 - tau)*target_param.data)

        # Move the critic target towards the critic by Polyak averaging.
        for param,target_param in zip(self.critic.parameters(),self.critic_target.parameters()):
          target_param.data.copy_(tau*param.data + (1 - tau)*target_param.data)







      



 


In [0]:
def train(replay_buffer,iterations,batch_size = 200 ,discount = 0.99 , tau = 0.005,policy_noise = 0.2,noise_clip = 0.5,policy_freq = 2): 
    """Module-level draft of the critic-loss computation for sampled batches.

    For each of ``iterations`` steps the body samples a batch from
    ``replay_buffer``, builds a noisy, clamped target action, takes the
    minimum of the two target critics' values, and computes ``critic_loss``.

    NOTE(review): the body reads ``self`` and ``batch_Size``, neither of which
    is a parameter or local of this function, so calling it as written raises
    ``NameError`` unless those names exist as globals. Within the visible
    lines the loss is never backpropagated and nothing is returned; ``tau``
    and ``policy_freq`` are unused. This looks like an unfinished copy of a
    training method - confirm intent before relying on it.
    """
    for it in range(iterations):
      # Sample a batch of transitions (s, s', a, r, done) from the memory.
      # NOTE(review): `batch_Size` differs in case from the `batch_size` parameter.
      batch_states,batch_next_states,batch_actions,batch_rewards,batch_dones = replay_buffer.sample(batch_Size)
      state = torch.Tensor(batch_states).to(device)
      next_state = torch.Tensor(batch_next_states).to(device)
      action = torch.Tensor(batch_actions).to(device)
      reward = torch.Tensor(batch_rewards).to(device)
      done = torch.Tensor(batch_dones).to(device)

      # From the next state s', the actor target plays the next action a'.
      # NOTE(review): `self` is not defined in this function's signature.

      next_action = self.actor_target.forward(next_state)

      # Add Gaussian noise to the next action a' and clamp it to the range
      # of values supported by the environment.
      noise = torch.Tensor(batch_actions).data.normal_(0,policy_noise).to(device)
      noise = noise.clamp(-noise_clip,noise_clip)
      next_action = (next_action + noise).clamp(-self.max_action,self.max_action)


      # The two critic targets each take the pair (s', a') as input and return
      # two Q-values, Qt1(s', a') and Qt2(s', a').

      target_Q1, target_Q2 = self.critic_target.forward(next_state,next_action)

      # Keep the minimum of these two Q-values.
      target_Q = torch.min(target_Q1,target_Q2)

      # The discounted term is detached from the autograd graph and zeroed
      # where `done` is 1.
      target_Q = reward + ((1 - done)*discount*target_Q).detach()

      # The two critic models take (s, a) and return two Q-values.

      current_Q1, current_Q2 = self.critic.forward(state,action)

      # Critic loss: sum of both critics' MSE against the same target.

      critic_loss = F.mse_loss(current_Q1,target_Q) + F.mse_loss(current_Q2,target_Q)

      # TODO: backpropagate this critic loss and update the parameters of the
      # two critic models (not implemented in the visible lines).

      



 