<a href="https://colab.research.google.com/github/rootAkash/reinforcement_learning/blob/master/Soft%20AC/SAC2_torch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from collections import deque
import random
from tqdm import tqdm
class Policy(nn.Module):
    """Gaussian policy head on top of a six-layer ReLU MLP.

    ``forward`` maps a batch of observations to a pair ``(mus, sigs)``:
    tanh-bounded means and softplus scales clamped to ``[0.001, 100]``,
    each with ``action_size`` columns.
    """

    def __init__(self, observation_size, action_size, hidden_units):
        super().__init__()
        self.observation_size = observation_size
        self.hidden_units = hidden_units
        self.action_size = action_size

        width = self.hidden_units
        # Shared trunk: one input projection followed by five square layers.
        self.h1 = nn.Linear(self.observation_size, width)
        self.h2 = nn.Linear(width, width)
        self.h3 = nn.Linear(width, width)
        self.h4 = nn.Linear(width, width)
        self.h5 = nn.Linear(width, width)
        self.h6 = nn.Linear(width, width)
        # Two output heads, one per distribution parameter.
        self.mu = nn.Linear(width, self.action_size)
        self.sigma = nn.Linear(width, self.action_size)

    def forward(self, x):
        """Return ``(mus, sigs)`` for the observation batch ``x``."""
        features = x
        for layer in (self.h1, self.h2, self.h3, self.h4, self.h5, self.h6):
            features = F.relu(layer(features))
        mus = torch.tanh(self.mu(features))
        # softplus keeps the scale positive; the clamp bounds it away from 0 and infinity
        sigs = torch.clamp(F.softplus(self.sigma(features)), min=0.001, max=100)
        return mus, sigs

    def predict(self, x):
        """Same as ``forward`` but evaluated under ``torch.no_grad()``."""
        with torch.no_grad():
            return self.forward(x)

class Value(nn.Module):
    """State-value network: six ReLU layers followed by a single linear output.

    ``forward`` maps a batch of observations to one scalar per row.
    """

    def __init__(self, observation_size, hidden_units):
        super().__init__()
        self.observation_size = observation_size
        self.hidden_units = hidden_units

        width = self.hidden_units
        self.h1 = nn.Linear(self.observation_size, width)
        self.h2 = nn.Linear(width, width)
        self.h3 = nn.Linear(width, width)
        self.h4 = nn.Linear(width, width)
        self.h5 = nn.Linear(width, width)
        self.h6 = nn.Linear(width, width)
        self.v = nn.Linear(width, 1)

    def forward(self, x):
        """Return the value estimate for the observation batch ``x``."""
        features = x
        for layer in (self.h1, self.h2, self.h3, self.h4, self.h5, self.h6):
            features = F.relu(layer(features))
        # No activation on the output: the value estimate is unbounded.
        return self.v(features)

    def predict(self, x):
        """Same as ``forward`` but evaluated under ``torch.no_grad()``."""
        with torch.no_grad():
            return self.forward(x)
class Q_net(nn.Module):
    """Action-value network Q(s, a).

    Observation and action are each embedded by their own ReLU layer, the two
    embeddings are concatenated along dim 1, and the result goes through five
    more ReLU layers and a single linear output.
    """

    def __init__(self, observation_size, action_size, hidden_units):
        super().__init__()
        self.observation_size = observation_size
        self.hidden_units = hidden_units
        self.action_size = action_size

        width = self.hidden_units
        # Separate input branches for state and action.
        self.h1 = nn.Linear(self.observation_size, width)
        self.a1 = nn.Linear(self.action_size, width)
        # First joint layer consumes both embeddings side by side.
        self.h2 = nn.Linear(width * 2, width)
        self.h3 = nn.Linear(width, width)
        self.h4 = nn.Linear(width, width)
        self.h5 = nn.Linear(width, width)
        self.h6 = nn.Linear(width, width)
        self.q = nn.Linear(width, 1)

    def forward(self, x, a):
        """Return one Q estimate per row for observations ``x`` and actions ``a``."""
        state_branch = F.relu(self.h1(x))
        action_branch = F.relu(self.a1(a))
        joint = torch.cat([state_branch, action_branch], dim=1)
        for layer in (self.h2, self.h3, self.h4, self.h5, self.h6):
            joint = F.relu(layer(joint))
        return self.q(joint)

    def predict(self, x, a):
        """Same as ``forward`` but evaluated under ``torch.no_grad()``."""
        with torch.no_grad():
            return self.forward(x, a)



In [5]:
def remember(s, a, r, ns, d):
    """Append one transition to the module-level replay buffer ``memory``.

    The reward ``r`` and done flag ``d`` are each wrapped in a one-element
    array; ``s``, ``a`` and ``ns`` are stored as given.
    """
    reward_column = np.array([r])
    done_column = np.array([d])
    transition = [s, a, reward_column, ns, done_column]
    memory.append(transition)
def sample_games(buffer, batch_size):
    """Return a list of ``batch_size`` random indices into ``buffer``.

    Indices are drawn uniformly with ``np.random.choice`` (its default samples
    with replacement, so an index may repeat).
    """
    population = len(buffer)
    drawn = np.random.choice(population, batch_size)
    return list(drawn)
def soft_update(target, source, tau):
    """Blend ``source`` parameters into ``target`` in place and return ``target``.

    Each target parameter becomes ``(1 - tau) * target + tau * source``, so
    ``tau=1`` is a hard copy and ``tau=0`` leaves the target unchanged.
    Parameters are paired by iteration order of ``.parameters()``.
    """
    keep = 1.0 - tau
    for dst, src in zip(target.parameters(), source.parameters()):
        blended = dst.data * keep + src.data * tau
        # Write through .data so the update is not recorded by autograd.
        dst.data.copy_(blended)
    return target
def get_log_pdf_multi(x, mean, std):
    """Log-density of ``x`` under a diagonal Gaussian ``N(mean, diag(std**2))``.

    The density is evaluated directly in log space:

        log p(x) = -0.5 * sum(((x - mean) / std) ** 2)
                   - sum(log(std))
                   - 0.5 * k * log(2 * pi)

    Evaluating the pdf first and taking ``log(pdf + eps)`` afterwards would
    floor the result (and zero its gradient) for low-probability samples, and
    the product of variances can underflow to 0 for small ``std`` in higher
    dimensions; the log-space form has neither problem.

    Args:
        x: tensor indexed as (batch, k); ``k = x.shape[1]`` is the action dim.
        mean: per-dimension means, broadcastable against ``x``.
        std: per-dimension standard deviations (must be > 0), reduced over dim 1.

    Returns:
        Tensor of shape (batch, 1) with the log-density of each row.
    """
    k = x.shape[1]  # action dim
    standardised = (x - mean) / std
    quad_form = torch.sum(torch.square(standardised), dim=1, keepdim=True)
    # log sqrt(det(cov)) for a diagonal covariance is the sum of log std
    log_sqrt_det = torch.sum(torch.log(std), dim=1, keepdim=True)
    # float() keeps the constant a plain Python scalar in the tensor arithmetic
    log_norm_const = 0.5 * k * float(np.log(2.0 * np.pi))
    return -0.5 * quad_form - log_sqrt_det - log_norm_const

def get_entropy_multi(x, mean, std, Act):
    """Negative log-density of a tanh-squashed Gaussian sample, one value per row.

    log pdf(squashed) = log pdf_gaussian(x; mean, std) - sum_i log(1 - Act_i**2),
    where ``x`` is the pre-squash Gaussian sample and ``Act`` the squashed
    action vector. A small 1e-07 is added inside the log so it stays finite
    when a component of ``Act`` reaches +/-1.
    """
    gaussian_log_pdf = get_log_pdf_multi(x, mean, std)
    per_dim_correction = torch.log(1 - torch.square(Act) + 1e-07)
    squash_correction = torch.sum(per_dim_correction, dim=1, keepdim=True)
    squashed_log_pdf = gaussian_log_pdf - squash_correction
    return -squashed_log_pdf



def replay_and_train(policy, value, t_value, q_1, q_2, popt, vopt, qopt, size=128,
                     gamma=0.99, alpha=1.0, tau=0.01):
    """Run one soft actor-critic update on a minibatch from the global ``memory``.

    Returns immediately (doing nothing) while ``memory`` holds fewer than
    ``size`` transitions. Otherwise it performs, in order: one optimiser step
    on both Q networks, one on the value network, one on the policy, and a
    soft update of the target value network.

    Args:
        policy: network returning ``(mean, std)`` for a batch of states.
        value: state-value network being trained.
        t_value: target value network; only read via ``predict`` and then
            moved towards ``value`` with ``soft_update``.
        q_1, q_2: the two Q networks, both stepped by ``qopt``.
        popt, vopt, qopt: optimisers for policy, value and Q networks.
        size: minibatch size.
        gamma: discount factor used in the Q target.
        alpha: weight of the entropy term in the value target and policy objective.
        tau: interpolation factor for the target-network soft update.
    """
    if len(memory) < size:
        return
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    mse = nn.MSELoss()

    samples = random.sample(memory, size)
    # Stack every column into a single ndarray before handing it to torch:
    # building a tensor from a tuple of small arrays is very slow.
    s, a, r, ns, d = (
        torch.as_tensor(np.array(column), dtype=torch.float32, device=device)
        for column in zip(*samples)
    )

    # Q target bootstraps from the target value net; (1 - d) masks terminal transitions.
    yq = r + gamma * (1 - d) * t_value.predict(ns)

    new_mu, new_sig = policy(s)
    # Draw the unit-Gaussian noise with torch on the policy output's own device.
    # Going through ``.numpy()`` here would raise for CUDA tensors.
    E = torch.randn_like(new_mu)
    Act_guassian = new_mu + E * new_sig  # gaussian action using reparametrisation
    new_Action = torch.tanh(Act_guassian)  # final squashed gaussian action
    entropy = get_entropy_multi(Act_guassian, new_mu, new_sig, new_Action)
    # The value target is bootstrapped to the current policy: min of both Qs on a
    # freshly sampled action, plus the (detached) entropy bonus.
    yv = torch.minimum(q_1.predict(s, new_Action), q_2.predict(s, new_Action)) + alpha * entropy.detach()

    # train q nets
    qloss = mse(q_1(s, a), yq) + mse(q_2(s, a), yq)
    qopt.zero_grad()
    qloss.backward()
    qopt.step()

    # train v net
    vloss = mse(value(s), yv)
    vopt.zero_grad()
    vloss.backward()
    vopt.step()

    # train policy: states come from the buffer, actions from the current policy,
    # so the gradient flows through new_Action into the policy parameters.
    policy_objective = q_1(s, new_Action)
    final_policy_objective = policy_objective + alpha * entropy  # maximise this
    final_policy_loss = -torch.mean(final_policy_objective)  # therefore minimise this
    popt.zero_grad()
    # This backward pass also leaves gradients on q_1's parameters; they are
    # discarded by qopt.zero_grad() at the start of the next Q update.
    final_policy_loss.backward()
    popt.step()

    # train t_value
    soft_update(target=t_value, source=value, tau=tau)

  


    

In [None]:
import gym

# Replay buffer shared (as a global) with remember() and replay_and_train().
memory = deque(maxlen=5000000)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# environment, networks and optimisers
env = gym.make('Pendulum-v0')
#env=gym.make('MountainCarContinuous-v0')

# Work on the raw environment; episode length is capped manually with max_steps below.
env = env.unwrapped

s_dim = env.observation_space.shape[0]
print(s_dim)
a_dim = env.action_space.shape[0]
print(a_dim)
a_bound = env.action_space.high[0]
print(a_bound)

policy = Policy(s_dim, a_dim, 100).to(device)
value = Value(s_dim, 100).to(device)
t_value = Value(s_dim, 100).to(device)
# tau=1 is a hard copy: the target starts out identical to the value network
soft_update(target=t_value, source=value, tau=1)
q_1 = Q_net(s_dim, a_dim, 100).to(device)
q_2 = Q_net(s_dim, a_dim, 100).to(device)
qopt = optim.Adam(list(q_1.parameters()) + list(q_2.parameters()), lr=0.001)
popt = optim.Adam(policy.parameters(), lr=0.001)
vopt = optim.Adam(value.parameters(), lr=0.001)
max_steps = 5000
episodes = 500
train_iter = 5000  # 1000, batch of 8/16 works best for now
for e in range(episodes):
    done = False
    s = env.reset()
    rew = 0
    stp = 0
    while not done:
        obs = torch.as_tensor(np.asarray(s), dtype=torch.float32, device=device).unsqueeze(0)
        new_mu, new_sig = policy.predict(obs)
        new_mu, new_sig = new_mu.cpu().numpy(), new_sig.cpu().numpy()
        # independent unit-Gaussian noise, one draw per action dimension
        E = np.random.standard_normal(new_mu[0].shape)
        Act_guassian = new_mu[0] + new_sig[0] * E  # gaussian action using reparametrisation
        act = np.tanh(Act_guassian)
        # the policy acts in [-1, 1]; scale to the environment's action range
        s_, r, terminal, _ = env.step(act * a_bound)
        # Hitting the step cap ends the rollout, but it is not a terminal state of
        # the task: store only the environment's own flag so the Q target keeps
        # bootstrapping from the next state on the cut-off transition.
        done = terminal or stp > max_steps
        remember(s, act, r, s_, terminal)
        s = s_
        rew += r
        stp += 1
    print(e, rew)
    # the first episode only fills the buffer; training starts from the second one
    if e > 0:
        print("training")
        for i in tqdm(range(train_iter)):
            replay_and_train(policy, value, t_value, q_1, q_2, popt, vopt, qopt, size=8)


3
1
2.0
0 -37123.447086588305


  0%|          | 5/5000 [00:00<01:44, 47.65it/s]

1 -27819.903587189565
training


100%|██████████| 5000/5000 [01:44<00:00, 47.93it/s]
  0%|          | 5/5000 [00:00<02:06, 39.41it/s]

2 -38596.47850594417
training


100%|██████████| 5000/5000 [01:46<00:00, 46.85it/s]
  0%|          | 5/5000 [00:00<01:49, 45.70it/s]

3 -31738.923493831062
training


100%|██████████| 5000/5000 [01:49<00:00, 45.57it/s]
  0%|          | 5/5000 [00:00<01:52, 44.23it/s]

4 -17144.314777526182
training


100%|██████████| 5000/5000 [01:49<00:00, 45.49it/s]
  0%|          | 5/5000 [00:00<01:56, 42.85it/s]

5 -25140.598987865826
training


100%|██████████| 5000/5000 [01:49<00:00, 45.66it/s]
  0%|          | 5/5000 [00:00<01:58, 42.21it/s]

6 -1055.7201506140902
training


100%|██████████| 5000/5000 [01:49<00:00, 45.61it/s]
  0%|          | 6/5000 [00:00<01:29, 55.97it/s]

7 -158.49423783744794
training


100%|██████████| 5000/5000 [01:48<00:00, 46.07it/s]
  0%|          | 5/5000 [00:00<01:53, 44.06it/s]

8 -166.3775650373026
training


100%|█████████▉| 4994/5000 [01:49<00:00, 48.61it/s]

In [6]:
import gym

# Replay buffer shared (as a global) with remember() and replay_and_train().
memory = deque(maxlen=5000000)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# environment, networks and optimisers
#env=gym.make('Pendulum-v0')
env = gym.make('MountainCarContinuous-v0')

# Work on the raw environment; rollout length is fixed by `steps` below.
env = env.unwrapped

s_dim = env.observation_space.shape[0]
print(s_dim)
a_dim = env.action_space.shape[0]
print(a_dim)
a_bound = env.action_space.high[0]
print(a_bound)

policy = Policy(s_dim, a_dim, 100).to(device)
value = Value(s_dim, 100).to(device)
t_value = Value(s_dim, 100).to(device)
# tau=1 is a hard copy: the target starts out identical to the value network
soft_update(target=t_value, source=value, tau=1)
q_1 = Q_net(s_dim, a_dim, 100).to(device)
q_2 = Q_net(s_dim, a_dim, 100).to(device)
qopt = optim.Adam(list(q_1.parameters()) + list(q_2.parameters()), lr=0.001, betas=(0.5, 0.999))
popt = optim.Adam(policy.parameters(), lr=0.001, betas=(0.5, 0.999))
vopt = optim.Adam(value.parameters(), lr=0.001, betas=(0.5, 0.999))
max_steps = 5000
episodes = 5000
steps = 3500
ctr = 0
render = False
train_iter = 1000  # 1000, batch of 8/16 works best for now

for ep in range(episodes):
    s = env.reset()
    done = False
    rews = 0
    if ep > 1500:
        render = True
    for step in range(steps):
        # a rollout spans a fixed number of steps, restarting the env whenever it finishes
        if done:
            s = env.reset()
        if render:
            env.render()

        new_mu, new_sig = policy.predict(torch.tensor([s]).float().to(device))

        mu, sig = new_mu.cpu().numpy(), new_sig.cpu().numpy()
        E = np.random.multivariate_normal(np.zeros_like(mu[0]), np.diag(np.ones_like(sig[0])))  # np.random.normal(0,1)

        Action = np.tanh(mu[0] + sig[0] * E)
        if ep < 10:
            # for some additional exploration, not necessarily needed
            # NOTE: this comparison relies on a one-dimensional action space.
            if E > 0.5:
                # Clip in normalised [-1, 1] space: the action is scaled by a_bound
                # exactly once, in env.step below, and the stored action stays in
                # the same range as the policy's tanh output.
                Action = np.clip(E, -1.0, 1.0)
        s_, r, done, _ = env.step(Action * a_bound)
        if done:
            r = r + 10000  # to encourage reaching target more
            print("reached")
        remember(s, Action, r, s_, done)
        rews += r
        ctr += 1
        s = s_
    print("episode: " + str(ep) + " rews: " + str(rews))
    print("training")
    for i in range(train_iter):
        replay_and_train(policy, value, t_value, q_1, q_2, popt, vopt, qopt, size=16)
        if i % (train_iter // 10) == 0:
            print('.', end='')  # tqdm sometimes has issues in colab so did this
    print('|')

2
1
1.0
reached
episode: 0 rews: 9960.348283908465
training
..........|
episode: 1 rews: -149.48956947605896
training
..........|
episode: 2 rews: -150.34566090881796
training
..........|
episode: 3 rews: -214.8940358008331
training
..........|
episode: 4 rews: -162.43216157926918
training
..........|
reached
episode: 5 rews: 9935.953813575446
training
..........|
reached
episode: 6 rews: 9959.867563747235
training
..........|
episode: 7 rews: -153.62652802219037
training
..........|
episode: 8 rews: -199.9902604543084
training
..........|
episode: 9 rews: -154.51613932437454
training
..........|
episode: 10 rews: -193.8129400390111
training
..........|
episode: 11 rews: -196.23978337629833
training
..........|
episode: 12 rews: -188.7714696405506
training
..........|
episode: 13 rews: -185.67991707265128
training
..........|
episode: 14 rews: -165.0688008852172
training
..........|
episode: 15 rews: -178.15057243338563
training
..........|
episode: 16 rews: -167.71689517726006
trainin

KeyboardInterrupt: ignored