In [1]:
import gym
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import copy
import collections 

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
class neural_net:
  def __init__(self, name: str, obs, action):
    self.obs_shape = obs
    self.act_space = action

  # def build_policy(self, name: str):
    with tf.variable_scope(name):
      self.obs = tf.placeholder(dtype = tf.float32, shape = [None] + list(self.obs_shape.shape), name = 'obs' )
      with tf.variable_scope('policy_net'):
        l_1 = tf.layers.dense(inputs = self.obs, units = 20, activation = tf.tanh)
        l_2 = tf.layers.dense(inputs = l_1, units = 20, activation = tf.tanh)
        l_3 = tf.layers.dense(inputs = l_2, units = 20, activation = tf.tanh)
        self.p_of_A = tf.layers.dense(inputs = tf.divide(l_3, 0.1), units = self.act_space.n, activation = tf.nn.softmax)

      with tf.variable_scope('val_net'):
        f_l = tf.layers.dense(inputs = self.obs, units = 20, activation = tf.tanh)
        s_l = tf.layers.dense(inputs = f_l, units = 20, activation = tf.tanh)
        self.val_prediction = tf.layers.dense(inputs = s_l, units = 1, activation = None)

      self.stochastic = tf.multinomial(tf.log(self.p_of_A), num_samples = 1)
      self.stochastic = tf.reshape(self.stochastic, shape = [-1])
      self.deterministic = tf.argmax(self.p_of_A, axis = 1)
      self.scope = tf.get_variable_scope().name

  def act(self, obs, stochastic = True):
    if stochastic:
      return tf.get_default_session().run([self.stochastic, self.val_prediction], feed_dict = {self.obs : obs})
    return tf.get_default_session().run([self.deterministic, self.val_prediction], feed_dict = {self.obs : obs})

  def get_trainable_variables(self):
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)



In [3]:
class ppo_agent:
  def __init__(self, env, p, op):
    self.Policy_net = p
    self.Old_policy_net = op
    self.gamma = 0.95

    p = self.Policy_net.get_trainable_variables()

    op = self.Old_policy_net.get_trainable_variables()

    with tf.variable_scope('assign_old'): #assigning the parameters for the Old Policy Network
      self.assign_old = []
      for old_val, val in zip(op, p):
        self.assign_old.append(tf.assign(old_val, val))
    
    with tf.variable_scope('train_ip'):
      self.A = tf.placeholder(dtype = tf.int32, shape = [None], name = 'actions')
      self.R = tf.placeholder(dtype = tf.float32, shape = [None], name = 'rewards')
      self.Val_N = tf.placeholder(dtype = tf.float32, shape = [None], name = 'next_val_prediction')
      self.gaes = tf.placeholder(dtype = tf.float32, shape = [None], name = 'gaes') #generative advantage estimator

    A_predict = self.Policy_net.p_of_A #output layer of the nn
    old_A_predict = self.Old_policy_net.p_of_A #output layer of the nn

    #Prediction from the policy network
    A_predict = A_predict * tf.one_hot(indices = self.A, depth = A_predict.shape[1])
    A_predict = tf.reduce_sum(A_predict, axis = 1)

    #Prediction for the old network
    old_A_predict = old_A_predict * tf.one_hot(indices = self.A, depth = old_A_predict.shape[1])
    old_A_predict = tf.reduce_sum(old_A_predict, axis = 1)

    with tf.variable_scope('loss/clip'):
      ratios = tf.exp(tf.log(A_predict) - tf.log(old_A_predict))
      clipped_ratios = tf.clip_by_value(ratios, clip_value_min = 1 - 0.2, clip_value_max = 1 + 0.2)#0.2 is the cliping value(epsilon in the clipping formula)
      loss_clip = tf.minimum(tf.multiply(self.gaes, ratios), tf.multiply(self.gaes, clipped_ratios))
      loss_clip = tf.reduce_mean(loss_clip)
      tf.summary.scalar('loss/clip', loss_clip)

    with tf.variable_scope('loss/valf'):
      v_preds = self.Policy_net.val_prediction
      loss_val_f = tf.squared_difference(self.R + self.gamma * self.Val_N, v_preds)
      loss_val_f = tf.reduce_mean(loss_val_f)
      tf.summary.scalar('loss/valf', loss_val_f)

    with tf.variable_scope('loss/entropy'):
      enthropy = -tf.reduce_sum(self.Policy_net.p_of_A*tf.log(tf.clip_by_value(self.Policy_net.p_of_A, 1e-10, 1.0)), axis = 1)
      enthropy = tf.reduce_mean(enthropy, axis = 0)
      tf.summary.scalar('loss/enthropy', enthropy)

    with tf.variable_scope('loss'):
      loss = loss_clip - (1.0) * loss_val_f + (0.01) * enthropy
      loss = -loss
      tf.summary.scalar('loss', loss)
     

    self.merge_summary = tf.summary.merge_all()
    optimizer = tf.train.AdamOptimizer(learning_rate = 0.000057)


    self.train_op = optimizer.minimize(loss, var_list = p) #computing and updating the gradients

  def assign_policy_params(self):
    return tf.get_default_session().run(self.assign_old)

  def get_gaes(self, R, p_val, p_val_next):
    # print(type(R))
    # print(type(p_val))
    # print(type(self.gamma))
    delta = [r_t + self.gamma * v_next - v for r_t, v_next, v in zip(R, p_val, p_val_next)] #δt = rt + γV (st+1) − V (st) 
    # delta = [self.gamma * v_next  for r_t, v_next, v in zip(R, p_val, p_val_next)] #δt = rt + γV (st+1) − V (st) 
    gaes = copy.deepcopy(delta)
    for t in reversed(range(len(gaes) - 1)):
      gaes[t] = gaes[t] + self.gamma * gaes[t + 1]
    return gaes



  def summary(self,S, A, R, Val_N, gaes ):
    return tf.get_default_session().run([self.merge_summary], feed_dict = {self.Policy_net.obs: S, self.Old_policy_net.obs : S, self.A: A, self.R: R, self.Val_N: Val_N, self.gaes: gaes})

  def train(self,S, A, R, Val_N, gaes ):
    return tf.get_default_session().run([self.train_op], feed_dict = {self.Policy_net.obs: S, self.Old_policy_net.obs : S, self.A: A, self.R: R, self.Val_N: Val_N, self.gaes: gaes})

In [4]:

epochs = 10000
tf.reset_default_graph()
env = gym.make('Acrobot-v1')
env.seed(0)
tf.set_random_seed(0)
obs_space = env.observation_space
Policy_net = neural_net('new', obs_space, env.action_space)
Old_policy_net = neural_net('old', obs_space, env.action_space)
agent = ppo_agent(env, Policy_net, Old_policy_net)
save = tf.train.Saver()
stoc = True
mov_avg = collections.deque(maxlen = 100)

with tf.Session() as sess:
  writer = tf.summary.FileWriter('./logacro/21', sess.graph)
  sess.run(tf.global_variables_initializer())
  S = env.reset()
  R = 0
  num = 0

  for e in range(epochs):
    observations = []
    actions = []
    rewards = []
    v_preds = []
    steps = 0
    while True:
#       if e % 200 == 0:
#         stoc = False
#       else:
#         stoc = True
      steps += 1
      S = np.stack([S]).astype(dtype = np.float32)
      A, v_pred = Policy_net.act(obs = S, stochastic = True)
      A = np.asscalar(A)
      v_pred = np.asscalar(v_pred)

      observations.append(S)
      rewards.append(R)
      actions.append(A)
      v_preds.append(v_pred)

      N_S, R, done, info = env.step(A)

      if done:
        v_preds_next = v_preds[1:] + [0]
        S = env.reset()
        reward = -1
        break
      S = N_S
    
    mov_avg.append(sum(rewards))
    writer.add_summary(tf.Summary(value = [tf.Summary.Value(tag = 'episode_length', simple_value = steps)]),e)
    writer.add_summary(tf.Summary(value = [tf.Summary.Value(tag = 'moving avg', simple_value = np.mean(mov_avg))]),e)
    print(np.mean(mov_avg))
    
    if np.mean(mov_avg) > -100 and e > 99:
        save.save(sess, './model/acro21.ckpt')
        print("Saved!!!!!!!!!!!!!!!!!!!!!!!!!!!")
        break
    else: 
      num = 0
    gaes = agent.get_gaes(rewards, v_preds, v_preds_next) 

    #converting all the lists into array for using it in tensorflow
    observations = np.reshape(observations, newshape = [-1] + list(obs_space.shape))
    actions = np.array(actions).astype(dtype = np.int32)
    rewards = np.array(rewards).astype(dtype = np.float32)
    v_preds_next = np.array(v_preds_next).astype(dtype = np.float32)
    gaes = np.array(gaes).astype(dtype = np.float32)
    gaes = (gaes - gaes.mean()) / gaes.std()

    agent.assign_policy_params()

    ip = [observations, actions, rewards, v_preds_next, gaes]

    for epochs in range(4):
      sample = np.random.randint(low = 0, high = observations.shape[0], size = 64)
      sampled_stuff = [np.take( a = i, indices = sample, axis = 0) for i in ip]
      agent.train(sampled_stuff[0], sampled_stuff[1], sampled_stuff[2], sampled_stuff[3], sampled_stuff[4])
      summary = agent.summary(ip[0], ip[1], ip[2], ip[3], ip[4] )[0]
      writer.add_summary(summary, e)
  writer.close()

Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use `tf.random.categorical` instead.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




-161.0
-179.0
-179.33333333333334
-178.0
-173.8
-171.83333333333334
-172.0
-170.0
-165.88888888888889
-164.5
-167.1818181818182
-169.58333333333334
-169.15384615384616
-170.35714285714286
-168.4
-167.5
-165.11764705882354
-164.05555555555554
-164.52631578947367
-164.2
-163.28571428571428
-163.27272727272728
-162.04347826086956
-162.54166666666666
-162.8
-162.84615384615384
-163.37037037037038
-164.60714285714286
-165.24137931034483
-165.2
-164.2258064516129
-164.0625
-163.5151515151515
-164.02941176470588
-165.22857142857143
-166.38888888888889
-167.32432432432432
-167.60526315789474
-169.76923076923077
-170.275
-169.46341463414635
-170.4047619047619
-172.25581395348837
-172.0
-171.33333333333334
-170.69565217391303
-169.85106382978722
-168.75
-168.3469387755102
-168.72
-168.37254901960785
-167.80769230769232
-167.60377358490567
-167.07407407407408
-166.92727272727274
-167.53571428571428
-167.80701754385964
-167.51724137931035
-166.88135593220338
-166.8
-166.68852459016392
-165.9677419