In [None]:

import sys
import gym
import numpy as np
from scipy.stats import norm
from keras.layers import Dense, Input, Lambda
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K
import matplotlib.pyplot as plt
from collections import deque

import random

In [None]:
class act_2_critic:
    """Advantage actor-critic agent with a Gaussian policy for a 1-D Box action space.

    The actor network outputs the mean and the variance of a normal
    distribution per action dimension; the critic network outputs the state
    value V(s). Transitions are kept in a bounded replay memory and
    ``train`` replays randomly sampled ones, applying a one-step TD update to
    both networks.

    Parameters
    ----------
    env : gym environment whose observation and action spaces are 1-D boxes.
    gamma : discount factor (stored as ``self.g``).
    actor_lr, critic_lr : Adam learning rates of the two networks.
    memory_size : maximum number of stored transitions.
    batch_size : number of stored transitions replayed per ``train`` call.
    """

    def __init__(self, env, gamma=0.9, actor_lr=0.0001, critic_lr=0.001,
                 memory_size=2000, batch_size=1):
        self.env = env
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        # Action limits come from the environment instead of a hard-coded 2.
        self.action_low = env.action_space.low
        self.action_high = env.action_space.high
        # The tanh mean is scaled by this bound; assumes symmetric, uniform
        # bounds across action dimensions (true for Pendulum) -- TODO confirm
        # before using with other environments.
        self.action_bound = float(self.action_high[0])
        self.g = gamma
        self.lr = critic_lr  # kept for backward compatibility
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.batch_size = batch_size
        self.memory = deque(maxlen=memory_size)
        # The optimizers read self.actor / self.critic, so build the models first.
        self.actor = self.actor_nn_model()
        self.critic = self.critic_nn_model()
        self.optimizer = [self.actor_optimizer(), self.critic_optimizer()]

    def store(self, s, a, r, _s, d):
        """Append the transition (state, action, reward, next state, done) to memory."""
        self.memory.append((s, a, r, _s, d))

    def actor_nn_model(self):
        """Build the policy network: state -> [mean, variance] of the action."""
        bound = self.action_bound
        state = Input(batch_shape=(None, self.state_size))
        hidden = Dense(30, input_dim=self.state_size, activation='relu',
                       kernel_initializer='he_uniform')(state)
        mu_0 = Dense(self.action_size, activation='tanh',
                     kernel_initializer='he_uniform')(hidden)
        sigma_0 = Dense(self.action_size, activation='softplus',
                        kernel_initializer='he_uniform')(hidden)
        # tanh gives (-1, 1); stretch it to the action range.
        mu = Lambda(lambda x: x * bound)(mu_0)
        # softplus is >= 0; the offset keeps the variance strictly positive.
        sigma = Lambda(lambda x: x + 0.0001)(sigma_0)
        model = Model(inputs=state, outputs=[mu, sigma])
        # NOTE(review): private Keras API, kept from the original code.
        model._make_predict_function()
        return model

    def act(self, S):
        """Sample an action for state ``S`` from the current policy.

        Returns an array of shape (1, action_size), clipped to the action
        limits of the environment.
        """
        mu, sigma_sq = self.actor.predict(np.reshape(S, [1, self.state_size]))
        noise = np.random.randn(self.action_size)
        # The second actor output is a variance, hence the square root.
        action = mu + np.sqrt(sigma_sq) * noise
        return np.clip(action, self.action_low, self.action_high)

    def critic_nn_model(self):
        """Build the value network: state -> V(state)."""
        state = Input(batch_shape=(None, self.state_size))
        hidden = Dense(30, input_dim=self.state_size, activation='relu',
                       kernel_initializer='he_uniform')(state)
        state_value = Dense(1, activation='linear',
                            kernel_initializer='he_uniform')(hidden)
        model = Model(inputs=state, outputs=state_value)
        # NOTE(review): private Keras API, kept from the original code.
        model._make_predict_function()
        return model

    def actor_optimizer(self):
        """Return a backend function ``f([states, actions, advantages])`` that
        performs one Adam step on the policy-gradient loss.
        """
        # Placeholders are fed by train(); width follows the action dimension.
        action = K.placeholder(shape=(None, self.action_size))
        advantages = K.placeholder(shape=(None, self.action_size))
        mu, sigma_sq = self.actor.output
        pdf = 1.0 / K.sqrt(2.0 * np.pi * sigma_sq) * K.exp(-K.square(action - mu) / (2.0 * sigma_sq))
        log_pdf = K.log(pdf + K.epsilon())
        entropy = K.sum(0.5 * (K.log(2.0 * np.pi * sigma_sq) + 1.0))
        # The entropy is already a summed scalar, so it is added once rather
        # than once per element of the batch.
        exp_v = K.sum(log_pdf * advantages) + 0.01 * entropy
        actor_loss = -exp_v
        optimizer = Adam(lr=self.actor_lr)
        # NOTE(review): legacy (params, constraints, loss) call order, kept
        # from the original code; verify against the installed Keras version.
        updates = optimizer.get_updates(self.actor.trainable_weights, [], actor_loss)
        train = K.function([self.actor.input, action, advantages], [], updates=updates)
        return train

    def critic_optimizer(self):
        """Return a backend function ``f([states, targets])`` that performs one
        Adam step on the mean squared error between V(state) and the target.
        """
        discounted_reward = K.placeholder(shape=(None, 1))
        value = self.critic.output
        loss = K.mean(K.square(discounted_reward - value))
        optimizer = Adam(lr=self.critic_lr)
        # NOTE(review): legacy (params, constraints, loss) call order, kept
        # from the original code; verify against the installed Keras version.
        updates = optimizer.get_updates(self.critic.trainable_weights, [], loss)
        train = K.function([self.critic.input, discounted_reward], [], updates=updates)
        return train

    def train(self, S, A, R, N_S, done):
        """Replay ``batch_size`` random transitions from memory.

        The positional arguments are accepted for interface compatibility but
        are not used: the caller is expected to have passed the transition to
        ``store`` beforehand, and the update is drawn from memory. Does
        nothing while the memory holds fewer than ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return
        batch = random.sample(self.memory, self.batch_size)
        for state, action, reward, next_state, terminal in batch:
            state = np.reshape(state, (1, self.state_size))
            action = np.reshape(action, (1, self.action_size))
            # The reward may arrive as a size-1 array; reduce it to a scalar.
            reward = float(np.ravel(reward)[0])
            value = float(np.ravel(self.critic.predict(state))[0])

            if terminal:
                # No bootstrapping past the end of an episode.
                td_target = reward
            else:
                next_state = np.reshape(next_state, (1, self.state_size))
                next_value = float(np.ravel(self.critic.predict(next_state))[0])
                td_target = reward + self.g * next_value

            advantage = np.full((1, self.action_size), td_target - value)
            target = np.full((1, 1), td_target)

            self.optimizer[0]([state, action, advantage])
            self.optimizer[1]([state, target])
        


In [None]:

# Train the agent on Pendulum-v0. Training stops early once 50 episodes
# (not necessarily consecutive) have finished with a scaled score above -20.
env = gym.make("Pendulum-v0")
a2c = act_2_critic(env)
scores_arr = []  # scaled score of every finished episode
epochs = 10000
e = []  # index of every finished episode
done = False
A = env.action_space.sample()
end = 0
count = 0  # number of episodes that scored above -20
solved = False

for i in range(epochs):
    S = env.reset()
    S = np.reshape(S, [1, env.observation_space.shape[0]])
    done = False
    score = 0.0
    while not done:
        # env.render()
        A = a2c.act(S)
        # The agent returns a (1, action_size) array; the environment expects
        # the shape of its action space. Passing the 2-D array through made
        # the reward (and therefore the score) come back as size-1 arrays.
        N_S, R, done, info = env.step(np.reshape(A, env.action_space.shape))
        N_S = np.reshape(N_S, (1, env.observation_space.shape[0]))
        # Scale the reward down before it is stored and learned from.
        R = float(R) / 10
        a2c.store(S, A, R, N_S, done)
        a2c.train(S, A, R, N_S, done)
        score += R
        S = N_S

    scores_arr.append(score)
    if score > -20:
        count += 1
    e.append(i)
    print("epoch: ", i, "score: ", score)

    # Checked after the episode so that reaching the target on the very last
    # epoch is still reported.
    if count == 50:
        print("The agent has been trained at - ", i + 1, " epochs")
        solved = True
        break

if not solved:
    # Previously a "trained" message was printed unconditionally, even when
    # the target was never reached.
    print("the model did not reach the target within ", epochs, " epochs")

env.close()


In [None]:
# Learning curve: one point per finished episode. The x axis is the episode
# (epoch) index and the y axis is the scaled episode score.
plt.plot(scores_arr)
plt.xlabel("epochs")
plt.ylabel("rewards")
plt.show()