In [12]:
# %pip install gym==0.22
# %pip install tensorflow_probability
# %pip install tensorflow
%pip install pygame

Collecting pygame
  Downloading pygame-2.5.0-cp311-cp311-win_amd64.whl (10.5 MB)
                                              0.0/10.5 MB ? eta -:--:--
                                              0.1/10.5 MB 1.1 MB/s eta 0:00:10
                                              0.2/10.5 MB 1.8 MB/s eta 0:00:06
     -                                        0.4/10.5 MB 2.8 MB/s eta 0:00:04
     ---                                      0.9/10.5 MB 4.5 MB/s eta 0:00:03
     ------                                   1.8/10.5 MB 7.5 MB/s eta 0:00:02
     -------                                  2.1/10.5 MB 8.3 MB/s eta 0:00:02
     ---------                                2.5/10.5 MB 7.5 MB/s eta 0:00:02
     -----------------                        4.6/10.5 MB 11.4 MB/s eta 0:00:01
     -----------------------                  6.3/10.5 MB 13.8 MB/s eta 0:00:01
     ------------------------------           8.1/10.5 MB 16.2 MB/s eta 0:00:01
     -------------------------------------   10.0/10.5

In [8]:
import tensorflow as tf
import numpy as np
import gym
import tensorflow_probability as tfp
import tensorflow.keras.losses as kls

In [13]:
env= gym.make("CartPole-v1")
low = env.observation_space.low
high = env.observation_space.high

In [16]:
class critic(tf.keras.Model):
  def __init__(self):
    super().__init__()
    self.d1 = tf.keras.layers.Dense(128,activation='relu')
    self.v = tf.keras.layers.Dense(1, activation = None)

  def call(self, input_data):
    x = self.d1(input_data)
    v = self.v(x)
    return v
  
class actor(tf.keras.Model):
  def __init__(self):
    super().__init__()
    self.d1 = tf.keras.layers.Dense(128,activation='relu')
    self.a = tf.keras.layers.Dense(2,activation='softmax')

  def call(self, input_data):
    x = self.d1(input_data)
    a = self.a(x)
    return a
  
class agent():
    def __init__(self):
        self.a_opt = tf.keras.optimizers.Adam(learning_rate=7e-3)
        self.c_opt = tf.keras.optimizers.Adam(learning_rate=7e-3)
        self.actor = actor()
        self.critic = critic()
        self.clip_pram = 0.2

    # agent's action function
    def act(self,state):
        # convert state into tensor
        prob = self.actor(np.array([state]))
        prob = prob.numpy()
        dist = tfp.distributions.Categorical(probs=prob, dtype=tf.float32)
        action = dist.sample()
        return int(action.numpy()[0])
    
    def actor_loss(self, probs, actions, adv, old_probs, closs):
        probability = probs      
        entropy = tf.reduce_mean(tf.math.negative(tf.math.multiply(probability,tf.math.log(probability))))
        #print(probability)
        #print(entropy)
        sur1 = []
        sur2 = []
        for pb, t, op,a  in zip(probability, adv, old_probs, actions):
                        t =  tf.constant(t)
                        #op =  tf.constant(op)
                        #print(f"t{t}")
                        #ratio = tf.math.exp(tf.math.log(pb + 1e-10) - tf.math.log(op + 1e-10))
                        ratio = tf.math.divide(pb[a],op[a])
                        #print(f"ratio{ratio}")
                        s1 = tf.math.multiply(ratio,t)
                        #print(f"s1{s1}")
                        s2 =  tf.math.multiply(tf.clip_by_value(ratio, 1.0 - self.clip_pram, 1.0 + self.clip_pram),t)
                        #print(f"s2{s2}")
                        sur1.append(s1)
                        sur2.append(s2)

        sr1 = tf.stack(sur1)
        sr2 = tf.stack(sur2)
        #closs = tf.reduce_mean(tf.math.square(td))
        loss = tf.math.negative(tf.reduce_mean(tf.math.minimum(sr1, sr2)) - closs + 0.001 * entropy)
        #print(loss)
        return loss
      
    def learn(self, states, actions,  adv , old_probs, discnt_rewards):
        discnt_rewards = tf.reshape(discnt_rewards, (len(discnt_rewards),))
        adv = tf.reshape(adv, (len(adv),))
        old_p = old_probs
        old_p = tf.reshape(old_p, (len(old_p),2))
        with tf.GradientTape() as tape1, tf.GradientTape() as tape2:
            p = self.actor(states, training=True)
            v =  self.critic(states,training=True)
            v = tf.reshape(v, (len(v),))
            td = tf.math.subtract(discnt_rewards, v)
            c_loss = 0.5 * kls.mean_squared_error(discnt_rewards, v)
            a_loss = self.actor_loss(p, actions, adv, old_probs, c_loss)
            
        grads1 = tape1.gradient(a_loss, self.actor.trainable_variables)
        grads2 = tape2.gradient(c_loss, self.critic.trainable_variables)
        self.a_opt.apply_gradients(zip(grads1, self.actor.trainable_variables))
        self.c_opt.apply_gradients(zip(grads2, self.critic.trainable_variables))
        return a_loss, c_loss

In [18]:
def test_reward(env):
  total_reward = 0
  state = env.reset()
  done = False
  while not done:
    action = np.argmax(agentoo7.actor(np.array([state])).numpy())
    next_state, reward, done, _ = env.step(action)
    state = next_state
    total_reward += reward

  return total_reward

In [19]:
def preprocess1(states, actions, rewards, done, values, gamma):
    g = 0
    lmbda = 0.95
    returns = []
    for i in reversed(range(len(rewards))):
       delta = rewards[i] + gamma * values[i + 1] * done[i] - values[i]
       g = delta + gamma * lmbda * dones[i] * g
       returns.append(g + values[i])
    returns.reverse()
    adv = np.array(returns, dtype=np.float32) - values[:-1]
    adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-10)
    states = np.array(states, dtype=np.float32)
    actions = np.array(actions, dtype=np.int32)
    returns = np.array(returns, dtype=np.float32)
    return states, actions, returns, adv    


tf.random.set_seed(336699)
agentoo7 = agent()
steps = 5000
ep_reward = []
total_avgr = []
target = False 
best_reward = 0
avg_rewards_list = []

# stepping through
for s in range(steps):
  if target == True:
      break
  
  done = False
  state = env.reset()
  all_aloss = []
  all_closs = []
  rewards = []
  states = []
  actions = []
  probs = []
  dones = []
  values = []
  print("new episode")

  for e in range(128):
    action = agentoo7.act(state)
    value = agentoo7.critic(np.array([state])).numpy()
    next_state, reward, done, _ = env.step(action)
    dones.append(1-done)
    rewards.append(reward)
    states.append(state)
    #actions.append(tf.one_hot(action, 2, dtype=tf.int32).numpy().tolist())
    actions.append(action)
    prob = agentoo7.actor(np.array([state]))
    probs.append(prob[0])
    values.append(value[0][0])
    state = next_state
    if done:
      env.reset()
  
  value = agentoo7.critic(np.array([state])).numpy()
  values.append(value[0][0])
  np.reshape(probs, (len(probs),2))
  probs = np.stack(probs, axis=0)
  states, actions,returns, adv  = preprocess1(states, actions, rewards, dones, values, 1)
  for epocs in range(10):
      al,cl = agentoo7.learn(states, actions, adv, probs, returns)
      # print(f"al{al}") 
      # print(f"cl{cl}")   

  avg_reward = np.mean([test_reward(env) for _ in range(5)])
  print(f"total test reward is {avg_reward}")
  avg_rewards_list.append(avg_reward)
  if avg_reward > best_reward:
        print('best reward=' + str(avg_reward))
        agentoo7.actor.save('model_actor_{}_{}'.format(s, avg_reward), save_format="tf")
        agentoo7.critic.save('model_critic_{}_{}'.format(s, avg_reward), save_format="tf")
        best_reward = avg_reward
  if best_reward == 200:
        target = True
  env.reset()

env.close()
    

new episode
total test reward is 27.8
best reward=27.8
INFO:tensorflow:Assets written to: model_actor_0_27.8\assets


INFO:tensorflow:Assets written to: model_actor_0_27.8\assets


INFO:tensorflow:Assets written to: model_critic_0_27.8\assets


INFO:tensorflow:Assets written to: model_critic_0_27.8\assets


new episode
total test reward is 26.4
new episode
total test reward is 180.6
best reward=180.6
INFO:tensorflow:Assets written to: model_actor_2_180.6\assets


INFO:tensorflow:Assets written to: model_actor_2_180.6\assets


INFO:tensorflow:Assets written to: model_critic_2_180.6\assets


INFO:tensorflow:Assets written to: model_critic_2_180.6\assets


new episode
total test reward is 345.8
best reward=345.8
INFO:tensorflow:Assets written to: model_actor_3_345.8\assets


INFO:tensorflow:Assets written to: model_actor_3_345.8\assets


INFO:tensorflow:Assets written to: model_critic_3_345.8\assets


INFO:tensorflow:Assets written to: model_critic_3_345.8\assets


new episode
total test reward is 500.0
best reward=500.0
INFO:tensorflow:Assets written to: model_actor_4_500.0\assets


INFO:tensorflow:Assets written to: model_actor_4_500.0\assets


INFO:tensorflow:Assets written to: model_critic_4_500.0\assets


INFO:tensorflow:Assets written to: model_critic_4_500.0\assets


new episode
total test reward is 62.2
new episode
total test reward is 45.0
new episode
total test reward is 45.2
new episode
total test reward is 210.2
new episode
total test reward is 194.8
new episode
total test reward is 155.6
new episode
total test reward is 157.4
new episode
total test reward is 283.4
new episode
total test reward is 500.0
new episode
total test reward is 240.6
new episode
total test reward is 196.0
new episode
total test reward is 113.8
new episode
total test reward is 123.0
new episode
total test reward is 129.8
new episode
total test reward is 97.6
new episode
total test reward is 108.0
new episode
total test reward is 83.6
new episode
total test reward is 125.2
new episode
total test reward is 107.0
new episode
total test reward is 112.6
new episode
total test reward is 119.8
new episode
total test reward is 158.8
new episode
total test reward is 214.6
new episode
total test reward is 248.6
new episode
total test reward is 241.4
new episode
total test reward 

: 

: 