In [1]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense

class ActorNetwork(Model):
    def __init__(self, state_dim, action_dim):
        super(ActorNetwork, self).__init__()
        self._first_layer = Dense(256, activation='relu', input_shape= state_dim)
        self._second_layer = Dense(128, activation='relu')
        # Output 2 dimensional continous action [-1, 1]
        self._output_layer = Dense(action_dim[0], activation='tanh')
        self._state_dim = state_dim
        self._action_dim = action_dim

    # Return action tensorflow with shape (1, state_dim)
    def call(self, state):
        first_input = self._first_layer(state)
        second_input = self._second_layer(first_input)
        output_action = self._output_layer(second_input)
        return output_action

class CriticNetwork(Model):
    def __init__(self, state_dim, action_dim):
        super(CriticNetwork, self).__init__()
        self._first_state_layer = Dense(64, activation='relu', input_shape=(state_dim[0] + action_dim[0],))
        self._second_state_layer = Dense(32, activation='relu')
        # Output a single value which is the action-value
        self._output_layer = Dense(1, activation=None)
    
    # Output a value of dimension 1 shape (1, state_dim + action_dim)
    def call(self, state_action):
        first_state_input = self._first_state_layer(state_action)
        second_state_input = self._second_state_layer(first_state_input)
        output = self._output_layer(second_state_input)
        return output

In [2]:
import numpy as np

class ReplayMemory():
    def __init__(self, max_mem_size):
        self._max_mem_size = max_mem_size
        self._memory = []
        self._last_transition = ()
        self._mem_ctr = 0

    def store_transition(self, state, action, new_state, reward, terminal):
        transition = (state, action, new_state, reward, terminal)
        # If buffer is not full
        if len(self._memory) < self._max_mem_size:
            self._memory.append(transition)
        else:
            self._memory[self._mem_ctr] = transition
            # Circular buffer counter
            self._mem_ctr = (self._mem_ctr + 1) % self._max_mem_size

    def store_last_transition(self,state, action, new_state, reward, terminal):
        self._last_transition = (state, action, new_state, reward, terminal)

    def sample_transition(self, batch_size):
        last_transition = []
        if len(self._memory) < batch_size:
            last_transition.append(self._last_transition)
            return last_transition
        batch_indexes = np.random.choice(len(self._memory), batch_size, replace=False)
        samples = [self._memory[i] for i in batch_indexes]
        return samples


In [3]:
class OrnsteinUhlenbeckNoise:
    def __init__(self, action_dim, mu=0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim[0]
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.state = np.ones(self.action_dim) * self.mu
        self.reset()

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def sample(self):
        state = self.state
        dx = self.theta * (self.mu - state) + self.sigma * np.random.randn(self.action_dim)
        self.state = state + dx
        return self.state

In [5]:
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

class Agent():
    def __init__(self, state_dim, action_dim, gamma= 0.5, beta= 0.01,
                 capacity=100, mem_size= 100, learning_rate= 0.1, batch_size = 5):
        self._learning_rate = learning_rate
        self._capacity = capacity
        self._batch_size = batch_size
        self._gamma = gamma
        self._beta = beta
        self._mem_size = mem_size
        self._min_action = -1
        self._max_action = 1

        self._state_dim = state_dim
        self._action_dim = action_dim
        self._state_dim_net = state_dim[0] + action_dim[0]
        self._noise = OrnsteinUhlenbeckNoise(action_dim)

        self._replay_memory = ReplayMemory(self._mem_size)

        self._actor = ActorNetwork(state_dim, action_dim)
        self._target_actor = ActorNetwork(state_dim, action_dim)
        self._critic = CriticNetwork(state_dim, action_dim)
        self._target_critic = CriticNetwork(state_dim, action_dim)

        self._actor.compile(optimizer=Adam(self._learning_rate))
        self._critic.compile(optimizer=Adam(self._learning_rate))
        self._target_actor.compile(optimizer=Adam(self._learning_rate))
        self._target_critic.compile(optimizer=Adam(self._learning_rate))

    def get_action(self, state):
        # Convert state into tensor and reshape to feed into network
        numpy_state = np.array(state)
        reshaped_state = numpy_state.reshape(1, self._state_dim[0])
        tf_state = tf.convert_to_tensor(reshaped_state)
        action = self._actor(tf_state)
        clipped_action = tf.clip_by_value(action, self._min_action, self._max_action)

        return clipped_action.numpy()[0] + self._noise.sample() # add noise
 
    def store_transition(self, state, action, new_state, reward, terminal):
        self._replay_memory.store_transition(state, action, new_state, reward, terminal)
    
    def store_last_transition(self, state, action, new_state, reward, terminal):
        self._replay_memory.store_last_transition(state, action, new_state, reward, terminal)

    def update_target_actor_weights(self):
        actor_weights = self._actor.weights
        target_actor_weights = self._target_actor.weights
        updated_weights = []
        for i, weight in enumerate(actor_weights):
            updated_weights.append((weight*self._beta) + (1-self._beta)*target_actor_weights[i])
        self._target_actor.set_weights(updated_weights)

    def update_target_critic_weights(self):
        critic_weights = self._critic.weights
        target_critic_weights = self._target_critic.weights
        updated_weights = []
        for i, weight in enumerate(critic_weights):
            updated_weights.append((weight*self._beta) + (1-self._beta)*target_critic_weights[i])
        self._target_critic.set_weights(updated_weights)

    def gradient_descent_critic(self, state, new_state, reward):
        with tf.GradientTape() as tape:
            # Convert to tensors and reshape to feed to network
            action = self.get_action(state)
            state_action = np.concatenate((state, action))
            new_state_action = np.concatenate((new_state, action))
            tf_state_action = tf.convert_to_tensor(state_action.reshape(1, self._state_dim_net))
            tf_new_state_action = tf.convert_to_tensor(new_state_action.reshape(1, self._state_dim_net))

            # Feed to network and get target
            target_critic_value = self._target_critic(tf_new_state_action)
            critic_value = self._critic(tf_state_action)
            target = reward + self._gamma * target_critic_value
            critic_loss = tf.keras.losses.MSE(target, critic_value)

        # Perform gradient descent for critic with respect to critic weights
        critic_gradients = tape.gradient(critic_loss, self._critic.trainable_variables)
        self._critic.optimizer.apply_gradients(zip(critic_gradients, self._critic.trainable_variables))

    def gradient_ascent_actor(self, state):
        with tf.GradientTape() as tape:
            # Convert to tensors and reshape to feed to network
            action = self.get_action(state)
            state_action = np.concatenate((state, action))
            tf_state_action = tf.convert_to_tensor(state_action.reshape(1, self._state_dim_net))
            critic_value = self._critic(tf_state_action)
            actor_loss = - tf.reduce_mean(critic_value)
        
        # Perform gradient ascent for actor with respect to actor weights
        actor_gradients = tape.gradient(actor_loss, self._actor.trainable_variables)
        self._actor.optimizer.apply_gradients(zip(actor_gradients, self._actor.trainable_variables))

    def learn(self):
        samples = self._replay_memory.sample_transition(self._batch_size)
        for transition in samples:
            state, action, new_state, reward, terminal = transition

            self.gradient_ascent_actor(state)
            self.update_target_actor_weights()
            self.gradient_descent_critic(state, new_state, reward)
            self.update_target_critic_weights()


In [6]:
import gym

env = gym.make("LunarLander-v2", continuous= True, render_mode="human")
n_games = 1000
best_score = float('-inf')
episode_score_history = []
state_dim = env.observation_space.shape
action_dim = env.action_space.shape
agent = Agent(state_dim, action_dim)

for i in range(n_games):
    current_state = env.reset()[0]
    episode_score = 0
    terminal = False
    while not terminal:
        action = agent.get_action(current_state)
        new_state, reward, terminal, truncated, info = env.step(action)
        terminal = terminal or truncated
        agent.store_transition(current_state, action, new_state, reward, terminal)
        agent.store_last_transition(current_state, action, new_state, reward, terminal)
        current_state = new_state
        episode_score += reward
        agent.learn()

    episode_score_history.append(episode_score)
    avg_score = np.mean(episode_score_history[-100:])

    if avg_score > best_score:
        best_avg_score = avg_score
    print(f"Episode: {i}, score: {episode_score}, avg_score: {avg_score}, best_score: {max(episode_score_history)}")


  if not isinstance(terminated, (bool, np.bool8)):


ValueError: No gradients provided for any variable: (['actor_network/dense/kernel:0', 'actor_network/dense/bias:0', 'actor_network/dense_1/kernel:0', 'actor_network/dense_1/bias:0', 'actor_network/dense_2/kernel:0', 'actor_network/dense_2/bias:0'],). Provided `grads_and_vars` is ((None, <tf.Variable 'actor_network/dense/kernel:0' shape=(8, 256) dtype=float32, numpy=
array([[-0.1270363 , -0.0531676 , -0.13230993, ..., -0.01138888,
        -0.02405429, -0.10280645],
       [-0.10561301,  0.03511579,  0.1497595 , ..., -0.05364694,
        -0.12176427,  0.09937266],
       [-0.02251211, -0.1274388 , -0.05422207, ...,  0.05141737,
        -0.09575436, -0.10565147],
       ...,
       [-0.09025693, -0.07140909, -0.14514272, ...,  0.08167745,
        -0.11591394,  0.11755019],
       [-0.11041129, -0.02023545, -0.12238231, ...,  0.12690616,
         0.01047863, -0.01740056],
       [ 0.11758155, -0.09517828, -0.10438947, ..., -0.0165707 ,
         0.11047366,  0.05663836]], dtype=float32)>), (None, <tf.Variable 'actor_network/dense/bias:0' shape=(256,) dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.], dtype=float32)>), (None, <tf.Variable 'actor_network/dense_1/kernel:0' shape=(256, 128) dtype=float32, numpy=
array([[-0.0300518 ,  0.015643  ,  0.07542905, ...,  0.04001945,
        -0.10457742,  0.03369015],
       [-0.06714979, -0.02667472,  0.09278625, ...,  0.02379847,
         0.04356837, -0.03836837],
       [-0.02699602,  0.10133684, -0.00965819, ..., -0.01510745,
        -0.04964533, -0.03063947],
       ...,
       [-0.12273344, -0.08878615,  0.02668807, ...,  0.0195525 ,
        -0.00891274, -0.00022659],
       [ 0.03654724,  0.01671699,  0.10452443, ...,  0.10211813,
         0.05836168, -0.08880129],
       [ 0.06880128, -0.03116229,  0.11223871, ...,  0.10326082,
         0.06823215,  0.09043971]], dtype=float32)>), (None, <tf.Variable 'actor_network/dense_1/bias:0' shape=(128,) dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>), (None, <tf.Variable 'actor_network/dense_2/kernel:0' shape=(128, 2) dtype=float32, numpy=
array([[ 0.2104189 , -0.06193571],
       [-0.15620187, -0.18151766],
       [-0.12786575, -0.09091183],
       [-0.16448392, -0.1799414 ],
       [ 0.0561692 , -0.05567451],
       [-0.18651181,  0.15266936],
       [ 0.08773519, -0.17882366],
       [ 0.20089383, -0.20972393],
       [-0.20931663,  0.02897422],
       [ 0.19764926,  0.11250891],
       [-0.09808239,  0.17983334],
       [-0.08618766, -0.07100728],
       [-0.09277057,  0.04640959],
       [ 0.09992529, -0.01525603],
       [-0.08278923, -0.16503936],
       [-0.17280018, -0.1611178 ],
       [ 0.19953535, -0.11632996],
       [ 0.1368403 ,  0.03042483],
       [ 0.02570789, -0.06985962],
       [-0.16039155, -0.09970168],
       [-0.04587464, -0.06774135],
       [-0.14311743, -0.09364624],
       [ 0.2003598 ,  0.17305444],
       [-0.04770203, -0.10845855],
       [ 0.15506862,  0.13613014],
       [-0.12207855,  0.19159381],
       [-0.04918599, -0.14556864],
       [-0.13570726,  0.09617977],
       [ 0.09478866, -0.11523057],
       [ 0.04827176, -0.1854518 ],
       [ 0.11848046,  0.17097731],
       [-0.02855657,  0.09161149],
       [ 0.16683234, -0.06785235],
       [-0.00742292,  0.11613257],
       [ 0.03517653, -0.13065666],
       [ 0.01251589,  0.12283538],
       [-0.07370792, -0.18396282],
       [-0.10769331, -0.02715032],
       [ 0.20861937,  0.00615644],
       [-0.11255343, -0.18039225],
       [ 0.13437362, -0.06259984],
       [-0.01841091,  0.12524368],
       [-0.04200795,  0.04671918],
       [-0.14494078, -0.07877815],
       [ 0.05666582,  0.09269927],
       [-0.15735987, -0.01129131],
       [-0.09721144,  0.14731117],
       [-0.06804165,  0.12763347],
       [-0.19588374,  0.02132116],
       [ 0.04019378, -0.20885831],
       [-0.15812448,  0.00179826],
       [-0.16353983, -0.01286726],
       [ 0.09075265, -0.04199381],
       [ 0.03707512, -0.20680304],
       [-0.1985655 ,  0.2025655 ],
       [-0.16397832,  0.01340482],
       [ 0.09977446,  0.17556538],
       [-0.01030603, -0.1251356 ],
       [ 0.19725369,  0.1268874 ],
       [ 0.13151218, -0.0802304 ],
       [-0.06090716,  0.11034413],
       [-0.01832598,  0.18641193],
       [-0.12308574, -0.12609327],
       [-0.0125203 , -0.17612635],
       [ 0.14178677, -0.03896181],
       [-0.10895292,  0.09257604],
       [ 0.14794354,  0.14491592],
       [-0.21475568,  0.19028364],
       [ 0.1903279 ,  0.03049327],
       [-0.01174815,  0.05858456],
       [ 0.02351841,  0.1502295 ],
       [-0.1922032 , -0.0507661 ],
       [ 0.02278566, -0.09402675],
       [ 0.11820872,  0.15498011],
       [-0.06845722, -0.03530943],
       [ 0.14011405, -0.04976505],
       [-0.01579979, -0.14832896],
       [ 0.16611074, -0.17413273],
       [-0.09930534,  0.09429161],
       [ 0.07011567, -0.16412635],
       [ 0.01957265,  0.14947893],
       [-0.10209916, -0.03394061],
       [ 0.11449535,  0.04774834],
       [ 0.02869733, -0.10418516],
       [ 0.1642098 ,  0.20109113],
       [ 0.09517758,  0.20614685],
       [-0.07989956, -0.07451977],
       [-0.00726752,  0.14963688],
       [ 0.19123708, -0.01501581],
       [-0.10373995,  0.13790144],
       [-0.05393793, -0.05884701],
       [ 0.08632334, -0.14812368],
       [ 0.05047779,  0.07870926],
       [ 0.2108656 ,  0.08646174],
       [ 0.08188479, -0.11224324],
       [-0.10149655,  0.17598431],
       [ 0.02309734,  0.01309966],
       [-0.15668876, -0.01092345],
       [ 0.07611407, -0.00452349],
       [ 0.02974622, -0.15777172],
       [ 0.00780575,  0.05215122],
       [-0.00980268,  0.01483588],
       [ 0.06411053,  0.07712366],
       [ 0.18163137,  0.07664974],
       [ 0.07115258,  0.00402834],
       [-0.15086919,  0.06656601],
       [-0.08345233, -0.16285628],
       [-0.10763871, -0.12026795],
       [-0.10686672, -0.08370392],
       [-0.18356013, -0.09586086],
       [-0.07052058,  0.11541532],
       [-0.0546083 , -0.11053953],
       [ 0.02560627, -0.15222377],
       [-0.11686634, -0.10999752],
       [ 0.16588588, -0.10976616],
       [-0.04989715, -0.0440852 ],
       [ 0.02371772, -0.16428041],
       [-0.0271205 ,  0.21350713],
       [-0.13895252, -0.1395449 ],
       [-0.14183046,  0.16596259],
       [ 0.11900802, -0.14726123],
       [-0.10930589,  0.17486344],
       [-0.09747308,  0.04053976],
       [ 0.13350327, -0.16310444],
       [-0.19545016,  0.09368823],
       [ 0.03905495, -0.06966729],
       [ 0.11901627,  0.04875718],
       [-0.16432631, -0.19607782]], dtype=float32)>), (None, <tf.Variable 'actor_network/dense_2/bias:0' shape=(2,) dtype=float32, numpy=array([0., 0.], dtype=float32)>)).