# Import statements

In [0]:
import gym
import numpy as np
from skimage.transform import resize
import tensorflow as tf
from skimage.color import rgb2gray

## Implementation Details
1. Initialize the Parameters of Actor Network and Critic Network
2. For number of episodes:
  1. Sample Reward by taking a step from state s to s1
  2. The actor loss for vanilla policy gradients is the negative log-probability of the chosen action multiplied by the advantage function
    1. Advantage Function = r + gamma * V(s1) - V(s)
  3. Traditional temporal-difference update for the critic (Q network), which uses the action taken by the policy to form its target value

  Details taken from [link](https://towardsdatascience.com/understanding-actor-critic-methods-931b97b6df3f)

# Setting up game environment

In [0]:
# Create the Space Invaders environment and keep the unwrapped base env,
# then seed it and grab the first frame.
env = gym.make("SpaceInvaders-v0").unwrapped
env.seed(1)
state = env.reset()

# Hyperparameters

In [0]:
# Number of discrete actions; sizes the output layer of both networks.
action_number = env.action_space.n

# Adam step sizes for the critic and the actor optimizers.
critic_learning_rate = 2.5e-4
action_learning_rate = 2.5e-4

# Number of episodes to train for.
total_episodes = 50

# Discount factor applied to the bootstrapped next-state value.
gamma = 0.9

# Preprocessing

In [0]:
def frame_preprocessing(image_frame):
  """Convert a raw RGB game frame into an 84x84 grayscale image.

  Args:
    image_frame: RGB frame (height x width x 3) as returned by the environment.

  Returns:
    A 2-D float array of shape (84, 84).
  """
  # rgb2gray already rescales integer images to floats in [0, 1]; dividing by
  # 255 a second time would squash every pixel into [0, 1/255] and leave the
  # networks with an almost-constant input.
  # NOTE(review): assumes the environment yields integer (uint8) frames - confirm.
  gray = rgb2gray(image_frame)
  preprocessed_frame = resize(gray, [84,84])
  return preprocessed_frame

# A2C Vanilla Policy Actor

In [0]:
# Actor (policy) network, built as a TF1-style static graph.
tf.compat.v1.disable_eager_execution()

# Inputs: a batch of preprocessed frames, the one-hot encoding of the action
# that was taken, and the advantage estimate for that action.
actor_input_state=tf.compat.v1.placeholder(tf.float32, shape=[None,84,84,1],name="actor_input_state")
# One-hot labels are fed as float32 so they share the logits' dtype, as
# softmax_cross_entropy_with_logits_v2 documents (and as the critic already does).
actor_action_space=tf.compat.v1.placeholder(tf.float32, shape=[None, action_number],name="actor_action_space")
actor_advantage_function=tf.compat.v1.placeholder(tf.float32, shape=[None,],name="actor_advantage_funtion")

# Two convolutions followed by a dense layer produce one logit per action.
actor_cnn_layer_1=tf.keras.layers.Conv2D(filters=16,kernel_size=(8,8),strides=(4,4),activation="relu")(actor_input_state)
actor_cnn_layer_2=tf.keras.layers.Conv2D(filters=32,kernel_size=(4,4),strides=(2,2),activation="relu")(actor_cnn_layer_1)
actor_flatten_layer=tf.keras.layers.Flatten()(actor_cnn_layer_2)
actor_fully_connected_layer_1=tf.keras.layers.Dense(256,activation='relu',name="actor_fully_connected_layer_1")(actor_flatten_layer)
actor_output_layer=tf.keras.layers.Dense(action_number,name="actor_output_layer")(actor_fully_connected_layer_1)

# Action distribution used for sampling while interacting with the environment.
actor_action_probability=tf.keras.layers.Softmax(name="actor_action_probability")(actor_output_layer)

# Cross-entropy against the one-hot action equals -log pi(a|s); weighting it by
# the advantage gives the policy-gradient loss.
actor_neg_loss_prob=tf.compat.v1.nn.softmax_cross_entropy_with_logits_v2(logits = actor_output_layer, labels = actor_action_space)
actor_loss=tf.math.reduce_mean(actor_advantage_function*actor_neg_loss_prob)
actor_training=tf.compat.v1.train.AdamOptimizer(action_learning_rate).minimize(actor_loss)

# A2C Value Network Critic

In [0]:
# Critic (action-value) network, built as a TF1-style static graph.
tf.compat.v1.disable_eager_execution()

# Inputs: a batch of preprocessed frames, the scalar regression target for each
# sample, and the one-hot encoding of the action whose value is being trained.
critic_input_state=tf.compat.v1.placeholder(tf.float32, shape=[None, 84,84,1],name="critic_input_state")
critic_value_target=tf.compat.v1.placeholder(tf.float32, shape=[None],name="critic_value_target")
critic_action_space=tf.compat.v1.placeholder(tf.float32, shape=[None, action_number],name="critic_action_space")

# Same convolutional trunk shape as the actor, with separate weights.
critic_cnn_layer_1=tf.keras.layers.Conv2D(filters=16,kernel_size=(8,8),strides=(4,4),activation="relu")(critic_input_state)
critic_cnn_layer_2=tf.keras.layers.Conv2D(filters=32,kernel_size=(4,4),strides=(2,2),activation="relu")(critic_cnn_layer_1)
critic_flatten_layer=tf.keras.layers.Flatten()(critic_cnn_layer_2)
critic_fully_connected_layer_1=tf.keras.layers.Dense(256,activation='relu',name="critic_fully_connected_layer_1")(critic_flatten_layer)
# Linear head: one unnormalised value per action (no softmax is applied).
critic_output_layer=tf.keras.layers.Dense(action_number,name="critic_output_layer")(critic_fully_connected_layer_1)

# Select the value of the taken action for each sample. Summing over axis 1
# only keeps one value per row, so the result lines up with critic_value_target
# instead of collapsing the whole batch into a single scalar.
critic_Q_value=tf.math.reduce_sum(tf.math.multiply(critic_output_layer, critic_action_space), axis=1)
critic_loss=tf.math.reduce_mean(tf.math.square(critic_value_target-critic_Q_value))
critic_training=tf.compat.v1.train.AdamOptimizer(critic_learning_rate).minimize(critic_loss)

# Integration of Actor and Critic to form A2C

In [0]:
# Online actor-critic training: after every environment step the critic is
# regressed towards a one-step TD target and the actor is updated with the
# resulting advantage estimate.
checkpoint_dir = "./models"
# Saver.save fails when the parent directory is missing, so create it up front
# (a no-op if it already exists).
tf.io.gfile.makedirs(checkpoint_dir)
saver = tf.compat.v1.train.Saver()
with tf.compat.v1.Session() as sess:
  sess.run(tf.compat.v1.global_variables_initializer())
  for i in range(total_episodes):
    episode_rewards = []
    # Each frame is preprocessed exactly once and carried forward as a
    # single-sample batch of shape (1, 84, 84, 1).
    state = frame_preprocessing(env.reset()).reshape((1,84,84,1))
    while True:
      # Sample an action from the current policy.
      action_probability=sess.run(actor_action_probability,feed_dict={actor_input_state:state})
      action = np.random.choice(range(action_probability.shape[1]), p=action_probability.ravel())
      # One-hot encoding of the sampled action, sized from action_number rather
      # than a hard-coded action count.
      action_ = np.zeros((1, action_number))
      action_[0, action] = 1

      next_state, reward, done, _ = env.step(action)
      episode_rewards.append(reward)
      next_state = frame_preprocessing(next_state).reshape((1,84,84,1))

      present_state_value=sess.run(critic_output_layer,feed_dict={critic_input_state:state})
      if done:
        # Nothing follows a terminal state, so there is nothing to bootstrap from.
        bootstrap_value = 0.0
      else:
        next_state_value=sess.run(critic_output_layer,feed_dict={critic_input_state:next_state})
        bootstrap_value = gamma*np.max(next_state_value)
      critic_update=reward+bootstrap_value
      actor_adv_func=critic_update-np.max(present_state_value)

      # Both updates use the state in which the action was actually taken;
      # the next state only contributes through the bootstrapped target.
      _,agent_loss=sess.run([actor_training,actor_loss],feed_dict={actor_input_state:state,
                                                                  actor_action_space:action_,
                                                                  actor_advantage_function:np.array([actor_adv_func])})
      _,teacher_loss=sess.run([critic_training,critic_loss],feed_dict={critic_input_state:state,
                                                                      critic_action_space:action_,
                                                                      critic_value_target:np.array([critic_update])})

      # The terminal transition is trained on above before the episode ends.
      if done:
        total_reward = np.sum(episode_rewards)
        print('Episode: {}'.format(i),
              'Total reward: {}'.format(total_reward))
        break
      state=next_state
    if i % 10 == 0:
      saver.save(sess, checkpoint_dir + "/model.ckpt")
      print("Model saved")

Episode: 0 Total reward: 25.0
Model saved
