# Tutorial for A2C Reinforcement Learning using Keras

### MD Muhaimin Rahman
sezan92[at]gmail[dot]com

#### Target Readers

If you already have a working knowledge of Q-Learning, Deep Q-Learning, and deep neural networks, then this tutorial is for you. Otherwise, you should learn them first.

<a id ="libraries"></a>
### Importing Libraries


In [None]:
from __future__ import print_function,division
import gym
import keras
from keras import layers
from keras import backend as K
from collections import deque
from tqdm import tqdm
import random
import numpy as np
import copy
SEED =123
np.random.seed(SEED)

Important constants

In [None]:
# Training schedule.
num_episodes = 1000  # number of episodes the training loop runs
steps_per_episode = 200

# Hyperparameters.
BATCH_SIZE = 256  # not referenced by the code visible in this notebook
TAU = 1e-3  # not referenced by the code visible in this notebook
GAMMA = 0.99  # discount factor used when accumulating returns
actor_lr = critic_lr = 1e-3  # learning rates handed to the two RMSprop optimizers

SHOW = False  # when True, the training loop calls env.render() after every step
action_list = [0, 1]  # action ids the policy samples from (a third id, 2, is disabled)

In [None]:
from keras.models import Model

<a id ="model"></a>
### Model Definition

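After some trial and error, I selected this network. The actor network is an MLP with three hidden layers of 320 nodes each, followed by a softmax output layer. The critic network is also an MLP with three hidden layers, of 640 nodes each, followed by a single linear output. Notice the return values of ```create_critic_network```: besides the model itself, it returns its state input tensor and an extra ```R_tensor``` input, both of which are used later to build the actor's loss.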

In [None]:
def create_actor_network(state_shape,action_shape):
    """Build the actor (policy) network and an input for chosen-action indices.

    Args:
        state_shape: shape tuple of a single observation, without the batch axis.
        action_shape: number of units in the softmax output layer.

    Returns:
        A pair ``(actor, action_index_input)``: the Keras ``Model`` mapping a
        state to the softmax output, and a standalone int64 ``Input`` of shape
        ``(1,)`` named ``"action_index"``. The second item is not wired into
        the model; it is only created here and handed back to the caller.
    """
    state_input = layers.Input(shape=state_shape, name="state")
    action_index_input = layers.Input(shape=(1,), name="action_index", dtype="int64")

    # Three identical fully connected ReLU layers stacked on the state input.
    hidden = state_input
    for _ in range(3):
        hidden = layers.Dense(320, activation="relu")(hidden)

    probabilities = layers.Dense(action_shape, activation="softmax")(hidden)
    return Model(state_input, probabilities), action_index_input

def create_critic_network(state_shape):
    """Build the critic (state-value) network.

    Args:
        state_shape: shape tuple of a single observation, without the batch axis.

    Returns:
        A triple ``(critic, state_input, return_input)``: the Keras ``Model``
        mapping a state to one linear output, the model's own state ``Input``,
        and a standalone ``Input`` of shape ``(1,)`` named ``"R_tensor"`` that
        is not wired into the model.
    """
    state_input = layers.Input(shape=state_shape, name="state")
    return_input = layers.Input(shape=(1,), name="R_tensor")

    # Three identical fully connected ReLU layers stacked on the state input.
    hidden = state_input
    for _ in range(3):
        hidden = layers.Dense(640, activation="relu")(hidden)

    state_value = layers.Dense(1)(hidden)
    return Model(inputs=state_input, outputs=state_value), state_input, return_input

I am choosing the ```CartPole-v0``` environment, mainly because my GPU is not good enough to work on higher-dimensional state spaces.

In [None]:
env = gym.make("CartPole-v0")

In [None]:
state_shape= env.observation_space.sample().shape

In [None]:
action_shape=(env.action_space.n,)

In [None]:
action_shape

In [None]:
actor,action_index = create_actor_network(state_shape,action_shape[0])
critic,state_tensor,R_tensor = create_critic_network(state_shape)

In [None]:
R_tensor

I have chosen the ```RMSprop``` optimizer because it was more stable than Adam in my experiments. I found this by trial and error; there is no theoretical background behind choosing this optimizer.

In [None]:
actor_optimizer = keras.optimizers.RMSprop(actor_lr)

critic_optimizer = keras.optimizers.RMSprop(critic_lr)

In [None]:
critic.compile(loss="mse",optimizer=critic_optimizer)

#### Actor training

I think this is the most critical part of implementing A2C in Keras. The ```critic``` and ```actor``` objects each have a ```__call__``` method, which returns an output tensor when you give it an input tensor. So, to get the tensor object of the critic's value estimate, we will use this functionality.

In [None]:
CriticValues = critic([state_tensor])


In [None]:
advantage = R_tensor-CriticValues

In [None]:
action_prob=actor(state_tensor)

In [None]:
action_prob

In [None]:
action_index

In [None]:
# Log-probability of the action taken in EACH sample of the batch.
# A one-hot mask built from action_index selects, row by row, the probability
# of the chosen action; indexing with [0][...] would only ever read the first
# sample, although the training function is fed a whole episode at once.
# K.epsilon() keeps the log finite when a probability underflows to 0.
chosen_action_mask = K.one_hot(action_index[:, 0], K.int_shape(action_prob)[-1])
logp = K.log(K.sum(action_prob * chosen_action_mask, axis=-1, keepdims=True) + K.epsilon())

In [None]:
logp

In [None]:
TD=logp*advantage

In [None]:
advantage

In [None]:
logp

In [None]:
TD

In [None]:
# Policy entropy per sample: H = -sum_a pi(a|s) * log pi(a|s).
# The sum runs over the action axis; keepdims=True keeps one column per sample
# so the term combines with the per-sample policy-gradient term in the loss.
# K.epsilon() keeps the log finite when a probability underflows to 0.
entropy = -K.sum(action_prob * K.log(action_prob + K.epsilon()), axis=-1, keepdims=True)

In [None]:
action_loss = -TD-0.01*entropy

In [None]:
action_loss

In [None]:
actor.trainable_weights

In [None]:
K.mean(action_loss)

In [None]:
updates = actor_optimizer.get_updates(params=actor.trainable_weights,loss=action_loss)

Now we will create a function which will train the actor network.

In [None]:
# Compile a backend function that applies one actor update per call.
# Inputs, in order: the batch fed to state_tensor, the values fed to R_tensor,
# and the values fed to action_index.
# Outputs: the actor's output for the given states and the summed action_loss.
# `updates` (produced by actor_optimizer.get_updates) is applied on every call.
actor_train = K.function(inputs=[state_tensor,R_tensor,action_index],outputs=[actor(state_tensor),
                                                                      K.sum(action_loss)],
                   updates=updates)


<a id ="training"></a>
### Training

In [None]:
# Training: one on-policy update of the critic and the actor per episode.
#
# For every episode the current policy is rolled out, the discounted return of
# each visited state is computed, the critic is regressed onto those returns
# and the actor takes one policy-gradient step through `actor_train`.
steps_per_episodes = steps_per_episode
for episode in range(num_episodes):
    states = []
    rewards = []
    action_probs = []  # index of the action taken at each step
    total_reward = 0

    state = env.reset()
    state = state.reshape((-1),)

    # Roll out the current policy until the environment reports `done` or the
    # step budget is used up.
    for step in range(steps_per_episodes):
        action_probability = actor.predict(state.reshape(1, -1))
        action = np.random.choice(action_list, p=action_probability[0])
        next_state, reward, done, _ = env.step(action)

        states.append(state)
        action_probs.append(action)
        rewards.append(reward)
        total_reward = total_reward + reward

        if SHOW:
            env.render()
        if done or step == (steps_per_episodes - 1):
            break

        state = next_state

    # Value used to bootstrap the return of the last transition.
    # CartPole pays a positive reward per step, so an episode that ends before
    # the step budget ended in a real terminal state: nothing follows it.
    # Otherwise the rollout was merely cut off and the critic's estimate of
    # the state reached after the last action stands in for the missing tail.
    if done and step < steps_per_episodes - 1:
        print("Failed!", end=" ")
        R = 0.0
    else:
        print("Passed!", end=" ")
        R = float(critic.predict(next_state.reshape(1, -1))[0, 0])

    print("Episode %d Total Reward %f" % (episode, total_reward))

    # Discounted returns, accumulated backwards in time ...
    R_list = []
    for reward in reversed(rewards):
        R = reward + GAMMA * R
        R_list.append(R)
    # ... then flipped so that R_list[i] is the return of states[i].
    R_list.reverse()

    states = np.vstack(states)
    R_list = np.vstack(R_list)
    action_probs = np.vstack(action_probs)

    loss = critic.train_on_batch(x=states, y=R_list)
    _, action_loss = actor_train(inputs=[states, R_list, action_probs])
    print("action loss %f" % action_loss)
    
        

In [None]:
action_probability

In [None]:
action_probability

In [None]:
action_loss

### Video

Please watch at 2x speed. I fixed some simple mistakes after recording the video, so the rewards are not exactly the same.

[![](http://img.youtube.com/vi/9Fe_n-ovIaA/0.jpg)](http://www.youtube.com/watch?v=9Fe_n-ovIaA "Keras tutorial DDPG")