# Reinforcement Learning - Discrete and Continuous action spaces

R. Vasudevan

Here, we will explore two different RL algorithms: 

1. Deep Q Learning, which is useful for discrete action spaces,
2. Advantage Actor-Critic (A2C), a policy-gradient method that is well suited to continuous action spaces.

# 1. Deep Q Learning

Here we will attempt to 'solve' the game of 'CartPole', a very basic control task in which the goal is to keep a pole balanced upright on a moving cart. In every state there are two possible actions: push the cart to the left or push it to the right.

In [3]:
#Let's import some necessary packages
import numpy as np
import matplotlib.pyplot as plt
import math

#And then for the agent
from tensorflow.keras import Model, layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.initializers import GlorotUniform
import tensorflow as tf
import gym
import sys
#tf.device('/cpu:0') # Put it on cpu



## Define the necessary classes

In [4]:
# create agent class
class Agent():
    """Epsilon-greedy deep Q-learning agent with a small replay memory.

    The agent owns a single Q-network (``self.q_eval``, a ``DQN_Keras``
    instance) which is trained with hand-written ``tf.GradientTape`` updates
    on mini-batches sampled from ``self.episode_history``.

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial probability of picking a random action.
        lr: Learning rate of the Adam optimizer that updates the Q-network.
        batchSize: Maximum number of transitions sampled per ``learn`` call.
        nActions: Number of discrete actions (size of the Q-network output).
        maxMem: Maximum number of transitions kept in the replay memory.
        epsEnd: Lower bound for ``epsilon``.
        epsDec: Amount subtracted from ``epsilon`` after every ``learn`` call.
        ddqn: Accepted for interface compatibility; not used by this class.
    """

    def __init__(self, gamma, epsilon, lr,
                 batchSize,
                 nActions=5, maxMem=45, epsEnd=0.01,
                 epsDec=0.005, ddqn=False):

        self.gamma = gamma  # discount factor
        self.epsilon = epsilon  # exploration rate of the epsilon-greedy policy
        self.lr = lr
        self.frame_history = []
        self.batchSize = batchSize
        self.epsMin = epsEnd
        self.epsDec = epsDec
        self.memsize = maxMem
        self.nActions = nActions
        self.qvals_list = []
        self.episode_history = []  # replay memory, newest transition last
        self.episode_number = 0

        self.q_eval = DQN_Keras(dim_actions=self.nActions, actor_lr=self.lr)
        # A bare compile() installs Keras' default optimizer, which would
        # silently discard the requested learning rate; pass Adam explicitly.
        self.q_eval.compile(
            optimizer=tf.keras.optimizers.Adam(learning_rate=self.lr))

    def choose_action(self, obsv):
        """Pick an action for a single observation with an epsilon-greedy rule.

        Args:
            obsv: One un-batched observation, passed as-is to the Q-network.

        Returns:
            A length-1 sequence holding the chosen action index: a list for
            the greedy branch, a NumPy array for the random branch.
        """
        if np.random.random() > self.epsilon:
            qvals = self.q_eval(obsv)
            action = [np.argmax(qvals)]
        else:
            action = np.random.choice(np.arange(self.nActions), size=1)
        self.action = action
        return action

    def store_transitions(self, transition):
        """Append one transition and drop the oldest beyond ``memsize``.

        Args:
            transition: ``[state, action, reward, continue_flag, next_state]``.
                ``continue_flag`` multiplies the bootstrap term in ``learn``
                (the training loop passes 0 on the terminal step, 1 otherwise).
        """
        self.episode_history.append(transition)
        self.episode_history = self.episode_history[-self.memsize:]

    def learn(self):
        """Run one gradient step on a random mini-batch from the replay memory.

        For every sampled transition the target vector equals the network's
        current Q-values, except for the taken action, whose entry becomes
        ``reward + gamma * max_a' Q(s', a') * continue_flag``.

        Returns:
            The element-wise squared error tensor of the batch, or ``None``
            when the replay memory is empty.
        """
        if not self.episode_history:
            return None

        n_samples = min(len(self.episode_history), self.batchSize)

        # experience replay: sample distinct transitions uniformly at random
        batch = np.random.choice(len(self.episode_history), n_samples,
                                 replace=False)

        state_buffer, q_targets = [], []
        for idx in batch:
            state, action, reward, not_done, next_state = \
                self.episode_history[idx]

            # Start from the current prediction so that only the taken
            # action contributes a non-zero error.
            target = self.q_eval(state)[0].numpy()
            max_next_q = np.max(self.q_eval(next_state)[0])
            target[action] = reward + self.gamma * max_next_q * not_done

            state_buffer.append(state)
            q_targets.append(target)

        q_targets = tf.stack(q_targets)

        with tf.GradientTape() as tape:
            # Each forward pass yields (1, nActions); drop only that axis so
            # the result is (batch, nActions) even for a batch of one.
            q_output = tf.squeeze(
                tf.stack([self.q_eval(state) for state in state_buffer]),
                axis=1)
            loss = tf.square(q_output - q_targets)

        # tape.gradient on a non-scalar target differentiates the sum of
        # its elements.
        valueGradient = tape.gradient(loss, self.q_eval.trainable_variables)
        self.q_eval.optimizer.apply_gradients(
            zip(valueGradient, self.q_eval.trainable_variables))

        # Linear decay, never dropping below the configured floor.
        self.epsilon = max(self.epsilon - self.epsDec, self.epsMin)

        return loss
    
# create DQN class
class DQN_Keras(Model):
    """Fully connected Q-network: state in, one Q-value per action out.

    The wrapped network (``self.actor``) has two ReLU hidden layers and a
    linear output layer of width ``dim_actions``, all He-uniform initialised.

    Args:
        input_dim: Length of a single state vector.
        dim_actions: Number of discrete actions (output width).
        num_hidden_nodes_1: Units in the first hidden layer.
        num_hidden_nodes_2: Units in the second hidden layer.
        actor_lr: Learning rate of the Adam optimizer stored on the model.
    """

    def __init__(self, input_dim=4,
                 dim_actions=2, num_hidden_nodes_1=128,
                 num_hidden_nodes_2=128,
                 actor_lr=0.001):
        # The Keras base class must be initialised before attributes are
        # assigned, otherwise its attribute tracking is not set up yet.
        super(DQN_Keras, self).__init__()

        self.initializer = tf.keras.initializers.he_uniform()
        self.num_hidden_nodes_1 = num_hidden_nodes_1
        self.num_hidden_nodes_2 = num_hidden_nodes_2
        self.dim_actions = dim_actions
        self.input_dim = input_dim
        self.actor_lr = actor_lr
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
        self.actor = self.build_actor()
        # No 'accuracy' metric: the outputs are regressed Q-values, for which
        # classification accuracy carries no meaning.
        self.actor.compile(loss=tf.keras.losses.mean_squared_error,
                           optimizer=self.optimizer)

    def build_actor(self):
        """Build and return the dense network mapping a state to Q-values."""
        # Shape is given as a tuple; a bare int is not accepted by every
        # Keras version.
        inputs = layers.Input(shape=(self.input_dim,))
        hidden = layers.Dense(self.num_hidden_nodes_1, activation='relu',
                              kernel_initializer=self.initializer)(inputs)

        hidden = layers.Dense(self.num_hidden_nodes_2, activation='relu',
                              kernel_initializer=self.initializer)(hidden)

        qvalues = layers.Dense(self.dim_actions, activation='linear',
                               kernel_initializer=self.initializer,
                               name='output_qvalues')(hidden)

        return Model(inputs=inputs, outputs=qvalues)

    # define forward pass
    def call(self, states):
        """Return the Q-values for one un-batched state.

        A leading batch axis is added before the network is evaluated, so the
        result has a leading axis of length 1.
        """
        actions_output = self.actor(states[None, :])

        # this returns the qvalues
        return actions_output
    
        

In [5]:
# --- Hyper-parameters for the DQN run ---
gamma = 0.90            # discount factor
epsilon_start = 0.99    # initial exploration rate
epsilon_dec = 0.0003    # how much epsilon shrinks after each learning step
batch_size = 32         # transitions sampled per experience-replay update
agent_lr = 0.0001       # learning rate handed to the agent

# --- Environment ---
env = gym.make('CartPole-v1')
env.reset()

# --- Agent: CartPole offers exactly two discrete actions ---
agent = Agent(
    gamma=gamma,
    epsilon=epsilon_start,
    lr=agent_lr,
    batchSize=batch_size,
    nActions=2,
    epsDec=epsilon_dec,
)


## DQN Agent Training

In [6]:
## DQN Agent Training
#Here we train the agent
NUM_EPISODES = 100

scores, epsHistory = [], []

for ep in range(NUM_EPISODES):
    state = env.reset()
    score = 0
    done = False

    while not done:
        # choose_action returns a length-1 sequence; unwrap the action index
        action = agent.choose_action(state)[0]
        next_state, reward, done, _ = env.step(action)

        # 0 on the step that ends the episode, 1 on every other step
        done_val = 0 if done else 1

        agent.store_transitions([state, action, reward, done_val, next_state])
        score += reward

        # one Q-network update per environment step
        agent.learn()

        state = next_state

    scores.append(score)
    if ep > 5:
        print('ep {} has reward {:.2f}, mean reward from last 5 episodes is {:.2f}, epsilon is {:.2f}'.format(
            ep, score, np.mean(scores[-5:]), agent.epsilon))
    epsHistory.append(agent.epsilon)
    agent.episode_number += 1
    

ep 6 has reward 23.00, mean reward from last 5 episodes is 23.20, epsilon is 0.95
ep 7 has reward 16.00, mean reward from last 5 episodes is 21.40, epsilon is 0.94
ep 8 has reward 15.00, mean reward from last 5 episodes is 20.20, epsilon is 0.94
ep 9 has reward 12.00, mean reward from last 5 episodes is 16.40, epsilon is 0.93
ep 10 has reward 18.00, mean reward from last 5 episodes is 16.80, epsilon is 0.93
ep 11 has reward 15.00, mean reward from last 5 episodes is 15.20, epsilon is 0.92
ep 12 has reward 11.00, mean reward from last 5 episodes is 14.20, epsilon is 0.92
ep 13 has reward 27.00, mean reward from last 5 episodes is 16.60, epsilon is 0.91
ep 14 has reward 13.00, mean reward from last 5 episodes is 16.80, epsilon is 0.91
ep 15 has reward 17.00, mean reward from last 5 episodes is 16.60, epsilon is 0.90
ep 16 has reward 25.00, mean reward from last 5 episodes is 18.60, epsilon is 0.89
ep 17 has reward 21.00, mean reward from last 5 episodes is 20.60, epsilon is 0.89
ep 18 ha

## Plot Results

In [1]:
# Let's plot the results
%matplotlib notebook
def smooth_window(data, window_size):
    """Return the moving average of ``data`` over ``window_size`` samples.

    Only positions where the window fully overlaps the data are kept
    (``mode='valid'``), so the output is shorter than the input.
    """
    box_kernel = np.ones(window_size) / window_size
    return np.convolve(data, box_kernel, mode='valid')
 
# Smoothed episode reward on the left axis, smoothed epsilon on the right one
reward_ax = plt.figure().add_subplot(111)
reward_ax.plot(smooth_window(scores, 5))
reward_ax.set_xlabel('Episode')
reward_ax.set_ylabel('Average Reward')
ax2 = reward_ax.twinx()
ax2.plot(smooth_window(epsHistory, 5), 'r--')
ax2.set_ylabel('Epsilon')

NameError: ignored

### Exercise: 
1. See how the training changes when you reduce the size of the network
2. Can you now move to a different environment, such as the MountainCar example, and train the agent there?

# 2. Actor-Critic Algorithm

Here we employ an actor-critic algorithm to tackle a task with a continuous action space: the classic pendulum swing-up.

In [None]:
import sys
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential
import tensorflow_probability as tfp
from collections import namedtuple
import matplotlib.pyplot as plt
import math

#And then for the agent
from tensorflow.keras import Model, layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.initializers import GlorotUniform
import tensorflow as tf


# Number of episodes run by the A2C training loop below
EPISODES = 500

# A2C(Advantage Actor-Critic) agent for Pendulum
class CL_NeuralNet_A2C(Model):
    """Advantage actor-critic (A2C) agent made of two dense networks.

    * ``self.actor`` maps a state to ``2 * dim_actions`` linear outputs that
      are read as the mean and the pre-softplus standard deviation of a
      Gaussian policy.
    * ``self.critic`` maps a state to a scalar state-value estimate.

    Transitions are collected with ``store`` and consumed by ``learn``.
    ``learn`` reshapes the actor output to two columns (mean, raw std), so it
    supports a one-dimensional action only.

    Args:
        input_dim: Length of a single state vector.
        dim_actions: Number of action dimensions.
        num_hidden_nodes_1: Units in the first hidden layer of both networks.
        num_hidden_nodes_2: Units in the second hidden layer of both networks.
        actor_lr: Learning rate of the actor's Adam optimizer.
        critic_lr: Learning rate of the critic's Adam optimizer.
        gamma: Discount factor used in the TD target.
    """

    # Floor added to the softplus standard deviation; the sampling code in
    # the training loop uses the same value.
    MIN_STD = 1E-2

    def __init__(self, input_dim=3, dim_actions=1, num_hidden_nodes_1=128,
                 num_hidden_nodes_2=64,
                 actor_lr=0.0002, critic_lr=0.0005, gamma=0.90):
        # Initialise the Keras base class before assigning any attribute.
        super(CL_NeuralNet_A2C, self).__init__()
        self.num_hidden_nodes_1 = num_hidden_nodes_1
        self.num_hidden_nodes_2 = num_hidden_nodes_2
        self.input_dim = input_dim
        self.dim_actions = dim_actions
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.gamma = gamma
        self.initializer = tf.keras.initializers.glorot_uniform()
        self.actor = self.build_actor()
        self.critic = self.build_critic()
        self.actor.optimizer = tf.keras.optimizers.Adam(learning_rate=self.actor_lr)
        self.critic.optimizer = tf.keras.optimizers.Adam(learning_rate=self.critic_lr)
        self.transitions = []

    def build_actor(self):
        """Build the policy network (state -> mean and raw std outputs)."""
        actor = Sequential()
        actor.add(layers.Input(shape=(self.input_dim,)))
        actor.add(layers.Dense(self.num_hidden_nodes_1, activation=tf.nn.relu,
                               kernel_initializer=self.initializer,
                               name='fc_1'))
        actor.add(layers.Dense(self.num_hidden_nodes_2, activation=tf.nn.relu,
                               kernel_initializer=self.initializer,
                               name='fc_2'))
        # The actor outputs a mean and a standard-deviation parameter from
        # which the action is sampled.
        actor.add(layers.Dense(self.dim_actions * 2, activation='linear',
                               kernel_initializer=self.initializer,
                               name='output_actions_layer'))
        return actor

    def build_critic(self):
        """Build the value network (state -> scalar value estimate)."""
        critic = Sequential()
        critic.add(layers.Input(shape=(self.input_dim,)))
        critic.add(layers.Dense(self.num_hidden_nodes_1, activation=tf.nn.relu,
                                kernel_initializer=self.initializer,
                                name='fcc_1'))
        critic.add(layers.Dense(self.num_hidden_nodes_2, activation=tf.nn.relu,
                                kernel_initializer=self.initializer,
                                name='fc_2'))
        critic.add(layers.Dense(1, activation='linear',
                                kernel_initializer=self.initializer,
                                name='output_actions_layer_critic'))
        return critic

    # define forward pass
    def call(self, states):
        """Return ``(actor_output, critic_value)`` for a batch of states."""
        actions_output = self.actor(states)
        value_estimate = self.critic(states)
        return actions_output, value_estimate

    def store(self, transition_tuple):
        """Queue one ``[state, action, reward, done, next_state]`` transition."""
        self.transitions.append(transition_tuple)

    def learn(self):
        """Update actor and critic from the stored transitions, then clear them.

        The TD target is ``r + gamma * V(s') * (1 - done)``; the advantage is
        the target minus ``V(s)``. The actor minimises
        ``-mean(log pi(a|s) * advantage)`` and the critic minimises the mean
        squared difference between the target and ``V(s)``.

        Returns:
            ``(actor_loss, critic_loss)`` as scalar tensors.

        Raises:
            ValueError: If no transitions have been stored.
        """
        if not self.transitions:
            raise ValueError("learn() called with no stored transitions")

        states, actions, rewards, dones, next_states = zip(*self.transitions)

        states_tf = tf.convert_to_tensor(np.asarray(states, dtype=np.float32))
        next_states_tf = tf.convert_to_tensor(
            np.asarray(next_states, dtype=np.float32))
        actions_tf = tf.convert_to_tensor(
            np.asarray(actions, dtype=np.float32).reshape(-1))
        rewards_np = np.asarray(rewards, dtype=np.float32).reshape(-1, 1)
        # Truthiness rather than `is True`, so NumPy booleans work as well.
        not_done = np.asarray([0.0 if flag else 1.0 for flag in dones],
                              dtype=np.float32).reshape(-1, 1)

        # Every array below is (N, 1) until the advantage is flattened, so
        # no accidental (N, N) broadcasting can occur.
        next_values = self.critic(next_states_tf).numpy()
        td_targets = (rewards_np
                      + self.gamma * next_values * not_done).astype(np.float32)
        advantage = (td_targets - self.critic(states_tf).numpy()).reshape(-1)

        td_targets_tf = tf.convert_to_tensor(td_targets)
        advantage_tf = tf.convert_to_tensor(advantage)

        with tf.GradientTape() as tape:
            # Column 0 is the mean, column 1 the raw standard deviation.
            actor_out = tf.reshape(self.actor(states_tf), (-1, 2))
            # A per-sample Normal gives one log-probability per transition,
            # so each action is weighted by its own advantage.
            policy_dist = tfp.distributions.Normal(
                loc=actor_out[:, 0],
                scale=tf.nn.softplus(actor_out[:, 1]) + self.MIN_STD)
            log_probs = policy_dist.log_prob(actions_tf)
            loss = -tf.reduce_mean(log_probs * advantage_tf)
        gradients = tape.gradient(loss, self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(
            zip(gradients, self.actor.trainable_variables))

        # Critic loss
        with tf.GradientTape() as tape:
            predicted_values = self.critic(states_tf)
            td_error = tf.reduce_mean((td_targets_tf - predicted_values) ** 2)
        valueGradient = tape.gradient(td_error, self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(
            zip(valueGradient, self.critic.trainable_variables))

        self.transitions = []

        return loss, td_error

## Agent Training

In [None]:

env = gym.make('Pendulum-v0')

# The policy is kept as a module-level variable named `policy`.
policy = CL_NeuralNet_A2C(actor_lr=0.0001, critic_lr=0.0002, input_dim=3, dim_actions=1)

train_int = 64  # not read anywhere in this cell
MAX_STEPS_PER_EPISODE = 32  # roll-out length collected before each update
scores_total = []

for ep in range(EPISODES):
    done = False
    state = env.reset()
    score = 0
    step = 0
    while not done:

        # The actor output holds (mean, raw std) in its two columns
        actions_output, value_estimate = policy(state[None, :])

        step += 1

        # Sample the policy to get the action
        output_action = tfp.distributions.MultivariateNormalDiag(
            actions_output[:, 0], tf.nn.softplus(actions_output[:, 1]) + 1E-2).sample(1)

        # Keep the sampled action inside [-2, 2] before handing it to the env
        action = np.squeeze(tf.clip_by_value(output_action, -2, 2).numpy())

        # Take the selected action in the environment
        next_state, reward, terminal, _ = env.step([action])

        # Store the environment's own termination flag. Hitting the step cap
        # only truncates the roll-out; recording it as terminal would zero
        # the critic's bootstrap term for a state that is not an end state.
        Transition = [state, action, reward, terminal, next_state]
        policy.store(Transition)

        state = next_state
        score += reward

        done = terminal or step >= MAX_STEPS_PER_EPISODE

    policy.learn()

    # Exponentially smoothed score (10% new episode, 90% running value)
    if ep == 0:
        scores_total.append(score)
    else:
        scores_total.append(0.1*score + 0.9*scores_total[-1])

    if ep >= 2:
        print("ep {} with reward {} and average score {}".format(ep, score, scores_total[-1]))



In [None]:

def smooth_window(data, window_size):
    """Box-filter ``data`` with a window of ``window_size`` equal weights.

    Uses a 'valid' convolution, i.e. only fully overlapping window positions
    contribute to the returned array.
    """
    weights = np.ones(window_size) / window_size
    smoothed = np.convolve(data, weights, mode='valid')
    return smoothed
 
# Running score of the A2C agent, smoothed over a 20-episode window
score_ax = plt.figure().add_subplot(111)
score_ax.plot(smooth_window(scores_total, 20))
score_ax.set_xlabel('Episode')
score_ax.set_ylabel('Average Reward')