In [1]:
!pip install gymnasium[atari] --quiet
!pip install gymnasium --quiet
!pip install -U gymnasium[atari] --quiet
!pip install imageio_ffmpeg --quiet
!pip install npy_append_array --quiet
!pip install pyTelegramBotAPI --quiet
!pip install gymnasium[accept-rom-license] --quiet
!pip install gymnasium[box2d] --quiet

[0m

In [329]:
import tensorflow as tf 
from tensorflow.keras.layers import Dense, Conv2D, Input, Lambda
import numpy as np 
 
class Actor:
    def __init__(self, input_dims, action_dim, action_bound, std_bound, actor_lr): 
        self.input_dims = input_dims 
        self.action_dim = action_dim
        self.action_bound = action_bound 
        self.std_bound = std_bound
        self.actor_lr = actor_lr 
        self.actor_network = self.build_network() 
        
    def build_network(self): 
        inputs = Input((self.input_dims, ))
        x = Dense(64, activation="relu", kernel_initializer="he_uniform")(inputs)
        x = Dense(64, activation="relu", kernel_initializer="he_uniform")(x)
        
        out_mu = Dense(self.action_dim, activation='tanh')(x)
        mu_output = Lambda(lambda x: x * self.action_bound)(out_mu)
        std_output = Dense(self.action_dim, activation='softplus')(x)
        
        model = tf.keras.Model(inputs=[inputs], outputs=[mu_output, std_output])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.actor_lr))
        return model

    def get_action(self, state): 
        state = np.reshape(state, [1, self.input_dims])
        mu, std = self.actor_network.predict(state, verbose=False)
        mu, std = mu[0], std[0]
        return np.random.normal(mu, std, size=self.action_dim)

    def log_pdf(self, mu, std, action):
        std = tf.clip_by_value(std, self.std_bound[0], self.std_bound[1])
        var = std ** 2
        log_policy_pdf = -0.5 * (action - mu) ** 2 / var - 0.5 * tf.math.log(
            var * 2 * np.pi
        )
        return tf.reduce_sum(log_policy_pdf, 1, keepdims=True)
    
    
    def compute_loss(self, mu, std, actions, advantages):
        log_policy_pdf = self.log_pdf(mu, std, actions)
        loss_policy = log_policy_pdf * advantages
        return tf.reduce_sum(-loss_policy)
    
    
    def train(self, states, actions, advantages): 
        
        with tf.GradientTape() as tape: 
            mu, std = self.actor_network(states, training=True)
            mu, std = mu[0], std[0]
            loss = self.compute_loss(mu, std, actions, advantages)
        
        params = self.actor_network.trainable_variables
        grads = tape.gradient(loss, params)
        
        self.actor_network.optimizer.apply_gradients(zip(grads, params))
        return loss
        
    def save_models(self): 
        pass 
    
    
    def load_models(self):
        pass
        

In [330]:
class Critic:
    def __init__(self, input_dims, action_dim, critic_lr): 
        self.input_dims = input_dims 
        self.action_dim = action_dim
        self.critic_lr = critic_lr
        self.critic_network = self.build_network() 
        
    def build_network(self): 
        inputs = Input((self.input_dims, ))
        x = Dense(64, activation="relu", kernel_initializer="he_uniform")(inputs)
        x = Dense(64, activation="relu", kernel_initializer="he_uniform")(x)

        output = Dense(1)(x)
        
        model = tf.keras.Model(inputs=[inputs], outputs=[output])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.critic_lr))
        return model
    
    def load_model(self): 
        self.model = tf.keras.models.load_model(self.fname + "_actor_network")
        print("loaded the model")
        
    def compute_loss(self, v_pred, td_targets):
        mse = tf.keras.losses.MeanSquaredError()
        return mse(td_targets, v_pred)

    def save_model(self): 
        self.model.save(self.fname + "_actor_network") 
    
    def train(self, states, td_targets): 
        with tf.GradientTape() as tape:
            values = self.critic_network(states, training=True)
            loss = self.compute_loss(values, td_targets)
        
        params = self.critic_network.trainable_variables
        grads = tape.gradient(loss, params)
        
        self.critic_network.optimizer.apply_gradients(zip(grads, params))
        return loss
        

In [331]:
class GlobalNetwork: 
    def __init__(self, input_dims, action_dims,  action_bound, std_bound, actor_lr, critic_lr): 
        self.actor_builder = Actor(input_dims, action_dims, action_bound, std_bound, actor_lr)
        self.actor = self.actor_builder.actor_network
        
        self.critic_builder = Critic(input_dims, action_dims, critic_lr)
        self.critic = self.critic_builder.critic_network
        
    def update_global_params(self, states, actions, advantages, td_targets):
        actor_l = self.actor_builder.train(states, actions, advantages)
        critic_l = self.critic_builder.train(states, td_targets)
        return actor_l , critic_l
        
    def pull_global_params(self): 
        actor_params = self.actor.get_weights()
        critic_params = self.critic.get_weights()
        
        return actor_params, critic_params        
    
    def save_models(self): 
        pass 
    
    def load_models(self):
        pass 

class WorkerNetwork: 
    def __init__(self , input_dims, action_dims,  action_bound, std_bound, actor_lr, critic_lr): 
        self.actor_builder = Actor(input_dims, action_dims, action_bound, std_bound, actor_lr)        
        self.critic_builder = Critic(input_dims, action_dims, critic_lr)


In [340]:
from multiprocessing import cpu_count

# global_networks will be created in the A3C Agent
# global_network = GlobalNetwork()

CURR_EPISODE = 0
class A3CWorker: 
    def __init__(self, env, noe, action_bound, std_bound, gamma, 
                     update_interval, global_network, local_network, input_dims, out_dims): 
        self.env = env 
        self.noe = noe
        self.gamma= gamma
        self.update_interval = update_interval
        self.action_bound = action_bound
        self.std_bound = std_bound 
        self.input_dims = input_dims
        
        self.global_network = global_network
        self.global_actor, self.global_critic = self.global_network.actor, self.global_network.critic
        
        self.worker_actor_builder, self.worker_critic_builder = local_network.actor_builder, local_network.critic_builder
        
        self.worker_actor, self.worker_critic = self.worker_actor_builder.actor_network, self.worker_critic_builder.critic_network
        
        actor_weights, critic_weights = global_network.pull_global_params()
        self.worker_actor.set_weights(actor_weights)
        self.worker_critic.set_weights(critic_weights)
        
        
    def n_step_td_target(self, rewards, next_v_value, done):
        td_targets = np.zeros_like(rewards)
        cumulative = 0
        if not done:
            cumulative = next_v_value

        for k in reversed(range(0, len(rewards))):
            cumulative = self.gamma * cumulative + rewards[k]
            td_targets[k] = cumulative
        return td_targets
    
    def list_to_batch(self, list):
        batch = list[0]
        for elem in list[1:]:
            batch = np.append(batch, elem, axis=0)
        return batch
    
    def advatnage(self, td_targets, baselines):
        return td_targets - baselines
    
    def learn(self): 
        
        global CURR_EPISODE
        
        while CURR_EPISODE <= self.noe: 
            state = self.env.reset()
            buffer_states, buffer_actions, buffer_rewards = [], [], []
            done = False 
            episodic_reward = 0
            
            while not done:  
                if type(state) == tuple: 
                    state = state[0]
                    
                action = self.worker_actor_builder.get_action(state)
                action = np.clip(action, -self.action_bound, self.action_bound)
                next_state_info = self.env.step(action)
                    
                next_state, reward_prob, terminated, truncated, _ = next_state_info 
                done = terminated or truncated 
                    
                   
                state = np.reshape(state, [1, self.input_dims])
                action = np.reshape(action, [1, 1])
                next_state = np.reshape(next_state, [1, self.input_dims])
                reward_prob = np.reshape(reward_prob, [1, 1])
                    
                buffer_states.append(state)
                buffer_actions.append(action)
                buffer_rewards.append(reward_prob)

                    
                if len(buffer_states) > self.update_interval or done: 
                    states =  self.list_to_batch(buffer_states)
                    actions = self.list_to_batch(buffer_actions)
                    rewards = self.list_to_batch(buffer_rewards)
                    
                    next_v_value = self.worker_critic.predict(next_state, verbose=False)
                    td_targets = self.n_step_td_target(rewards, next_v_value, done)
                    advantages = td_targets - self.worker_critic.predict(states, verbose=False)
                        
                    actor_loss, critic_loss = self.global_network.update_global_params(states, actions, advantages, td_targets)
                        
                    actor_weights, critic_weights = self.global_network.pull_global_params()
                    self.worker_actor.set_weights(actor_weights)
                    self.worker_critic.set_weights(critic_weights)
                        
                    buffer_states, buffer_actions, buffer_rewards = [], [], []
                
                episodic_reward += reward_prob[0][0]
                state  = next_state[0]
            
            print(f"Episode: {CURR_EPISODE}, Reward: {episodic_reward}")
            CURR_EPISODE += 1 
                

In [341]:
from threading import Thread 
from multiprocessing import cpu_count 

class ActorCriticAgent:
    def __init__(self, env, actor_lr, critic_lr, gamma, update_interval, noe, worker_count=cpu_count()):
        self.input_dims = env.observation_space.shape[0]
        self.out_dims = env.action_space.shape[0]
        self.action_bound = env.action_space.high[0]
        self.std_bound = [1e-2, 1.0]
        self.worker_count = worker_count
        self.noe = noe
        self.update_interval = update_interval
        self.env = env
        self.gamma = gamma
        
        self.global_network = GlobalNetwork(self.input_dims, self.out_dims, 
                                                self.action_bound, self.std_bound, actor_lr, critic_lr)
        self.worker_network = WorkerNetwork(self.input_dims, self.out_dims, 
                                                self.action_bound, self.std_bound, actor_lr, critic_lr)
        
        
        
    def learn(self): 
        workers = []
        for _ in range(self.worker_count): 
            a3c_worker = A3CWorker(self.env, self.noe, self.action_bound, self.std_bound,
                                        self.gamma, self.update_interval,  self.global_network,
                                        self.worker_network, self.input_dims, self.out_dims
                                    )
            workers.append(Thread(target=a3c_worker.learn()))
        
        print(len(workers))
        for worker in workers: 
            worker.start()
        
        for worker in workers: 
            worker.join()
        

In [None]:
import gymnasium as gym 

if __name__ == "__main__": 
    env = gym.make("MountainCarContinuous-v0")
    agent = ActorCriticAgent(env, 0.005, 0.005, 0.99, 30, 50)
    agent.learn()

Episode: 2, Reward: -71.76229691049484
Episode: 3, Reward: 39.27725910630954
Episode: 4, Reward: -80.58688811422674
Episode: 5, Reward: -80.45210806325507
Episode: 6, Reward: 26.283470737207907
Episode: 7, Reward: -75.83938066688273
Episode: 8, Reward: -81.64035857301405
Episode: 9, Reward: 49.221613840810136
