In [2]:
import gym
from tensorflow import keras
import tensorflow_probability as tfp
from tensorflow.keras.optimizers import Adam
import numpy as np
import tensorflow as tf

In [3]:
class PolicyGradientNetwork(keras.Model):
    def __init__(self, l1_dims=62, l2_dims=32):
        super(PolicyGradientNetwork, self).__init__()
        self.l1_dims = l1_dims
        self.l2_dims = l2_dims

        # input shape is equivalent to the shape of observation space
        self.d1 = tf.keras.layers.Dense(self.l1_dims, activation='relu', input_shape=[8,])
        self.d2 = tf.keras.layers.Dense(self.l2_dims, activation='relu')

        # output layer is equivalent to the shape of action space, because network will provide the best possible policy which is best action in any given state.
        self.o = tf.keras.layers.Dense(4, activation='softmax')

    ## used for forward network and it will take the state of game as an input
    def call(self, state):
        value = self.d1(state)
        value = self.d2(value)

        o = self.o(value)

        return o

In [4]:
lr = 0.003
gamma = 0.95
pgn = PolicyGradientNetwork()
pgn.compile(optimizer=Adam(learning_rate=lr))

Metal device set to: Apple M1


2021-12-11 15:58:33.374796: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-11 15:58:33.375026: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
def take_action(observation):

  # converting the observation to tensor, to pass in out Polic Gradient Network
    state = tf.convert_to_tensor([observation], dtype=tf.float32)
  
  # it will return the probability of each 4 actions
    probs = pgn(state)

  # converting the probability to categorical variable
    action_probs = tfp.distributions.Categorical(probs=probs)

  # selecting the action
    action = action_probs.sample()
  
  # action return from sampling will be a array of one element i.e., the best possible action
    return action.numpy()[0]

In [14]:
def learn(state_memory, action_memory, reward_memory, update_wts):
    
    actions = tf.convert_to_tensor(action_memory, dtype=tf.float32)
    rewards = np.array(reward_memory)
    
    '''
      It will take action memory, state memory and reward memory as an
      input. These memory are build using random action for a random state, 
      and what is the reward for the action taken in given state.
      The rewards, actions and state are saved in array, thenn passed as n input 
      parameter in this function. Using the random actions, random states and rewards,
      our policy gradient network is going to find the best possible action for each 
      state, so that network can maximise the reward.
    '''
    

  # calculating reward for all time till the end of game.
    G = np.zeros_like(rewards)
    for t in range(len(rewards)):
        G_sum = 0
        discount = 1
        for k in range(t, len(rewards)):
            G_sum += rewards[k] * discount
            discount *= gamma
            G[t] = G_sum

  # Gradient tape is added in tensorflow2.0, which will increase the computation speed.    
    with tf.GradientTape() as tape:
        loss = 0

    # using the given state memory for an episode, network is trying to minimize 
    #the loss and maximize the reward
        for idx, (g, state) in enumerate(zip(G, state_memory)):
        

      # converting the state/observation to tensor so that we can pass into out network.
            state = tf.convert_to_tensor([state], dtype=tf.float32)
      
      # predicting the probability for each action
            probs = pgn(state)
      
            action_probs = tfp.distributions.Categorical(probs=probs)
      
            log_prob = action_probs.log_prob(actions[idx])
      
      # minus sign means we are trying to maximize the reward, becasue gradient descent will bydefault try to minimize it.
            loss += -g * tf.squeeze(log_prob)

  # calcuating the gradient for losses over each action taken in given state during single episode.
    gradient = tape.gradient(loss, pgn.trainable_variables)
  
    pgn.optimizer.apply_gradients(zip(gradient, pgn.trainable_variables))

    pgn.save("best_model")
  
    if update_wts:
        # updatung wts
        print("Updating Wts")
        wts = np.array(pgn.get_weights())
        pgn.set_weights(wts + 0.001)
  


  

In [16]:
if __name__ == '__main__':
    
    env = gym.make("LunarLander-v2")
    
    score_history = []

    n_games = 200


    # looping over each episode and each episode consist of 100 rounds
    for i in range(n_games):
        done = False
        score = 0
        update_wts = False

      # it will contains the state sampled in each round of a given episodes
        state_memory = []

      # it will contains the action samples for a given state in each round of a given episodes
        action_memory = []

      # it will contains the reward for a given action and state in each round of a given episodes.
        reward_memory = []

      # resetting the environment on each episode, so that we start with fresh slate
        observation = env.reset()

        j = 0
      # looping over for max of 1000 round in episode
        while j < 1000 and not done:

        # calling take action function get one random action
            action = take_action(observation)

        # perfoming the action in a given state of a game
            observation_, reward, done, info = env.step(action)

        # saving the state, action and reward in an array, so that we can use the states, memory and reward from each episode to learn the best policy
            state_memory.append(observation)
            action_memory.append(action)
            reward_memory.append(reward)

            observation = observation_

        # cummalating the rewards
        score += reward
        score_history.append(score)
        j+=1
        if i == 14:
            update_wts = True

      # learning the best possible action for a particular episode
        learn(state_memory, action_memory, reward_memory, update_wts)
        avg_score = np.mean(score_history[-100:])
        print('episode: ', i,'score: %.1f' % score,'average score %.1f' % avg_score)

INFO:tensorflow:Assets written to: best_model/assets


2021-12-11 16:02:49.703772: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


episode:  0 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  1 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  2 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  3 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  4 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  5 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  6 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  7 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  8 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  9 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets


  wts = np.array(pgn.get_weights())


INFO:tensorflow:Assets written to: best_model/assets
episode:  15 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  16 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  17 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  18 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  19 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  20 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  21 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  22 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  23 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  24 score: -100.0 average sco

episode:  96 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  97 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  98 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  99 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  100 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  101 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  102 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  103 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  104 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  105 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: be

INFO:tensorflow:Assets written to: best_model/assets
episode:  177 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  178 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  179 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  180 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  181 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  182 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  183 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  184 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  185 score: -100.0 average score -100.0
INFO:tensorflow:Assets written to: best_model/assets
episode:  186 score: -100.0 a