In [0]:
import os
import random
import time
import numpy as np
import matplotlib.pyplot as plt
import torch
from collections import deque
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

In [0]:
import gym
from gym import wrappers
import pybullet_envs


In [14]:
# !pip install pybullet

Collecting pybullet
[?25l  Downloading https://files.pythonhosted.org/packages/d8/ac/a422ab8d1c57ab3f43e573b5a5f532e6afd348d81308fe66a1ecb691548e/pybullet-2.7.1-cp36-cp36m-manylinux1_x86_64.whl (95.0MB)
[K     |████████████████████████████████| 95.0MB 47kB/s 
[?25hInstalling collected packages: pybullet
Successfully installed pybullet-2.7.1


In [0]:
class ReplayBuffer(object):
  """Fixed-capacity store of transitions for experience replay.

  Each stored transition is a 5-tuple
  ``(state, next_state, action, reward, done)``; that order is what
  ``sample`` unpacks. Once ``max_size`` transitions are held, every new
  transition overwrites the oldest one (ring buffer).
  """

  def __init__(self,max_size=1e6):
    """Create an empty buffer.

    Args:
      max_size: maximum number of transitions kept. Floats such as ``1e6``
        are accepted and truncated to an int.

    Raises:
      ValueError: if ``max_size`` is smaller than 1.
    """
    max_size=int(max_size)
    if max_size<1:
      raise ValueError("max_size must be at least 1")
    self.storage=[]
    self.max_size=max_size
    # Index of the slot that will be overwritten next once the buffer is full.
    self.ptr=0

  def add(self,transition):
    """Store one transition, evicting the oldest one when the buffer is full."""
    if len(self.storage)>=self.max_size:
      self.storage[self.ptr]=transition
      # Parentheses matter: `ptr+1 % max_size` would never wrap around.
      self.ptr=(self.ptr+1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self,batch_size):
    """Draw ``batch_size`` transitions uniformly at random, with replacement.

    Returns:
      A tuple ``(states, next_states, actions, rewards, dones)`` of NumPy
      arrays whose first axis has length ``batch_size``. ``rewards`` and
      ``dones`` are float arrays reshaped to ``(batch_size, 1)``.

    Raises:
      ValueError: if the buffer is empty.
    """
    if not self.storage:
      raise ValueError("cannot sample from an empty replay buffer")
    ind=np.random.randint(0,len(self.storage),size=batch_size)
    batch_states,batch_next_states,batch_actions,batch_rewards,batch_dones=[],[],[],[],[]
    for i in ind:
      state,next_state,action,reward,done=self.storage[i]
      batch_states.append(state)
      batch_next_states.append(next_state)
      batch_actions.append(action)
      batch_rewards.append(reward)
      batch_dones.append(done)
    return (np.array(batch_states),
            np.array(batch_next_states),
            np.array(batch_actions),
            np.array(batch_rewards,dtype=float).reshape(-1,1),
            np.array(batch_dones,dtype=float).reshape(-1,1))
  

In [0]:
class Actor(nn.Module):
  """Policy network: three linear layers mapping a state to a bounded action.

  The two hidden layers (400 and 300 units) use ReLU; the output layer is
  squashed with tanh and scaled by ``max_action``.
  """

  def __init__(self,state_dims,action_dim,max_action):
    """Build the layers.

    Args:
      state_dims: size of the input feature vector.
      action_dim: size of the output vector.
      max_action: factor applied to the tanh output in ``forward``.
    """
    super().__init__()
    first_width,second_width=400,300
    self.layer1=nn.Linear(state_dims,first_width)
    self.layer2=nn.Linear(first_width,second_width)
    self.layer3=nn.Linear(second_width,action_dim)
    self.max_action=max_action

  def forward(self,x):
    """Return ``max_action * tanh(layer3(relu(layer2(relu(layer1(x))))))``."""
    hidden=x
    for layer in (self.layer1,self.layer2):
      hidden=torch.relu(layer(hidden))
    squashed=torch.tanh(self.layer3(hidden))
    return self.max_action*squashed

  

In [0]:
class Critic(nn.Module):
  """Pair of independent Q-networks that share the same input.

  Both networks take the concatenation of a state batch and an action batch
  and pass it through 400- and 300-unit ReLU layers. Each ends in a single
  output unit, i.e. one scalar Q-value per (state, action) row.
  """

  def __init__(self,state_dims,action_dim):
    """Build both Q-networks.

    Args:
      state_dims: size of the state feature vector.
      action_dim: size of the action vector.
    """
    super(Critic,self).__init__()

    # First Q-network. The head has one output: a Q-value is a scalar,
    # not a vector of length action_dim.
    self.layer1=nn.Linear(state_dims+action_dim,400)
    self.layer2=nn.Linear(400,300)
    self.layer3=nn.Linear(300,1)

    # Second Q-network, same architecture, separate weights.
    self.layer4=nn.Linear(state_dims+action_dim,400)
    self.layer5=nn.Linear(400,300)
    self.layer6=nn.Linear(300,1)

  def _first_q(self,xu):
    """Run the first Q-network on an already concatenated input."""
    x1=F.relu(self.layer1(xu))
    x1=F.relu(self.layer2(x1))
    return self.layer3(x1)

  def _second_q(self,xu):
    """Run the second Q-network on an already concatenated input."""
    x2=F.relu(self.layer4(xu))
    x2=F.relu(self.layer5(x2))
    return self.layer6(x2)

  def forward(self,x,u):
    """Return the Q estimates of both networks.

    Args:
      x: state batch, concatenated with ``u`` along dimension 1.
      u: action batch with the same number of rows as ``x``.

    Returns:
      Tuple ``(q1, q2)``, each with one column per row of the input.
    """
    xu=torch.cat([x,u],1)
    return self._first_q(xu),self._second_q(xu)

  def Q1(self,x,u):
    """Return only the first network's Q estimate (same value as ``forward(x,u)[0]``)."""
    xu=torch.cat([x,u],1)
    return self._first_q(xu)
    

In [0]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [21]:
device

device(type='cuda')

In [0]:
class T3D(object):
  """Twin-delayed actor-critic agent built from ``Actor`` and ``Critic``.

  Keeps an online and a target copy of both networks. The online critic is
  trained on every iteration; the online actor and both target networks are
  updated only every ``policy_freq`` iterations.
  """

  def __init__(self,state_dims,action_dim,max_action):
    """Create the four networks and one Adam optimizer per online network.

    Args:
      state_dims: size of the state vector fed to the networks.
      action_dim: size of the action vector.
      max_action: bound used by the actor and for clamping target actions.
    """
    self.actor=Actor(state_dims,action_dim,max_action).to(device)
    self.actor_target=Actor(state_dims,action_dim,max_action).to(device)
    # state_dict is a method: it has to be called to obtain the weights.
    self.actor_target.load_state_dict(self.actor.state_dict())

    # Only the online networks get optimizers; targets are updated by
    # averaging (see _soft_update), not by backprop.
    self.actor_optimizer=torch.optim.Adam(self.actor.parameters())

    self.critic=Critic(state_dims,action_dim).to(device)
    self.critic_target=Critic(state_dims,action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())

    self.critic_optimizer=torch.optim.Adam(self.critic.parameters())
    self.max_action=max_action

  def select_action(self,state):
    """Return the online actor's action for one state as a flat NumPy array.

    Args:
      state: array-like object exposing ``reshape``; it is reshaped to a
        single-row batch before being passed to the actor.
    """
    state=torch.Tensor(state.reshape(1,-1)).to(device)
    return self.actor(state).cpu().data.numpy().flatten()

  @staticmethod
  def _soft_update(online,target,tau):
    """Move ``target``'s parameters a fraction ``tau`` toward ``online``'s, in place."""
    for param,target_param in zip(online.parameters(),target.parameters()):
      target_param.data.copy_(tau*param.data+(1-tau)*target_param.data)

  def train(self,replay_buffer,iterations,batch_size=100,discount=0.99,tau=0.005,policy_noise=0.2,noise_clip=0.5,policy_freq=2):
    """Run ``iterations`` training steps on batches drawn from ``replay_buffer``.

    Args:
      replay_buffer: object whose ``sample(batch_size)`` returns
        ``(states, next_states, actions, rewards, dones)``.
      iterations: number of critic updates to perform.
      batch_size: number of transitions per update.
      discount: factor applied to the bootstrapped target Q-value.
      tau: interpolation factor for the target-network updates.
      policy_noise: standard deviation of the noise added to target actions.
      noise_clip: absolute bound applied to that noise.
      policy_freq: the actor and the target networks are updated on
        iterations where ``it % policy_freq == 0``.
    """
    for it in range(iterations):
      batch_states,batch_next_states,batch_actions,batch_rewards,batch_dones=replay_buffer.sample(batch_size)
      state=torch.Tensor(batch_states).to(device)
      next_state=torch.Tensor(batch_next_states).to(device)
      action=torch.Tensor(batch_actions).to(device)
      # Force column vectors so reward/done broadcast row-wise against the
      # critic output instead of expanding to a (batch, batch) matrix.
      reward=torch.Tensor(batch_rewards).to(device).reshape(-1,1)
      done=torch.Tensor(batch_dones).to(device).reshape(-1,1)

      # Target action from the target actor, perturbed by clipped Gaussian noise.
      next_action=self.actor_target(next_state)
      noise=torch.Tensor(batch_actions).data.normal_(0,policy_noise).to(device)
      noise=noise.clamp(-noise_clip,noise_clip)
      next_action=(next_action+noise).clamp(-self.max_action,self.max_action)

      # Use the smaller of the two target estimates; detach so no gradient
      # flows into the target networks.
      target_Q1,target_Q2=self.critic_target(next_state,next_action)
      target_Q=torch.min(target_Q1,target_Q2)
      target_Q=reward+((1-done)*discount*target_Q).detach()

      current_Q1,current_Q2=self.critic(state,action)
      critic_loss=F.mse_loss(current_Q1,target_Q)+F.mse_loss(current_Q2,target_Q)

      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()

      if it % policy_freq==0:
        # Deterministic policy gradient step: raise Q1 of the actor's own actions.
        actor_loss=-(self.critic.Q1(state,self.actor(state)).mean())
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self._soft_update(self.actor,self.actor_target,tau)
        self._soft_update(self.critic,self.critic_target,tau)


![alt text](https://)