In [None]:
!pip install gymnasium[atari] --quiet
!pip install gymnasium --quiet
!pip install -U gymnasium[atari] --quiet
!pip install imageio_ffmpeg --quiet
!pip install npy_append_array --quiet
!pip install pyTelegramBotAPI --quiet
!pip install gymnasium[accept-rom-license] --quiet
!!pip install gymnasium[box2d] --quiet
!pip install gym-super-mario-bros --quiet
!pip install minigrid --quiet
!pip install miniworld --quiet

In [1]:
import tensorflow_probability as tfp
import tensorflow as tf 

tf.keras.backend.set_floatx('float32')

In [2]:
import numpy as np 

class ExperienceReplayBuffer:
    """List-backed transition store that callers fill, read and then drain.

    Transitions are appended to parallel Python lists. `max_memory` is kept
    for reference but is not enforced: callers in this notebook drain the
    buffer with `make_empty()` after every update, and they may legitimately
    hold one more transition than `max_memory` between drains.

    Args:
        max_memory: nominal capacity, stored as `mem_size`.
        input_shape: accepted for interface compatibility; not used.
        batch_size: threshold used by `is_sufficient()`.
        cer: when True, `sample_experience` draws a random subset and always
            appends the most recently stored transition as the final row.
    """

    def __init__(self, max_memory, input_shape, batch_size, cer=False):
        self.mem_size = max_memory
        self.mem_counter = 0
        self.state_memory = []
        self.next_state_memory = []
        self.action_memory = []
        self.reward_memory = []
        self.terminal_memory = []
        self.batch_size = batch_size
        self.cer = cer

    def store_experience(self, state, action, reward, next_state, done):
        """Append one transition and bump the stored-item counter."""
        self.state_memory.append(state)
        self.next_state_memory.append(next_state)
        self.reward_memory.append(reward)
        self.action_memory.append(action)
        self.terminal_memory.append(done)
        self.mem_counter += 1

    def sample_experience(self, batch_size):
        """Return stored transitions as five NumPy arrays.

        Without CER every stored transition is returned in insertion order
        (the callers use the buffer as a rollout buffer and rely on that), so
        asking for more than is stored no longer raises.

        With CER, up to `batch_size - 1` distinct older transitions are drawn
        uniformly at random and the newest transition is appended last.

        Returns:
            (states, actions, rewards, next_states, terminals)
        """
        stored = len(self.state_memory)

        if self.cer and stored > 0:
            newest = stored - 1
            # Never request more distinct samples than exist.
            n_draw = min(max(batch_size - 1, 0), newest)
            drawn = np.random.choice(newest, n_draw, replace=False) if n_draw else []
            indices = [int(i) for i in drawn] + [newest]
        else:
            indices = range(stored)

        def gather(memory):
            # Index the Python list element-wise; lists cannot be indexed by an ndarray.
            return np.array([memory[i] for i in indices])

        return (
            gather(self.state_memory),
            gather(self.action_memory),
            gather(self.reward_memory),
            gather(self.next_state_memory),
            gather(self.terminal_memory),
        )

    def is_sufficient(self):
        """True once strictly more than `batch_size` transitions are stored."""
        return self.mem_counter > self.batch_size

    def make_empty(self):
        """Drop every stored transition and reset the counter to match."""
        self.state_memory = []
        self.next_state_memory = []
        self.action_memory = []
        self.reward_memory = []
        self.terminal_memory = []
        self.mem_counter = 0
                                                   

In [3]:
import tensorflow as tf 
from tensorflow.keras.layers import Dense, Conv2D, Input, Flatten
 
class ActorNetwork(tf.keras.Model):
    """Subclassed Keras model producing `action_dim` un-activated outputs.

    Two convolution layers and a flatten layer are constructed, but `call`
    only runs the three dense layers, so the input is fed to `fc1` directly.
    """

    def __init__(self, input_dims, action_dim=1):
        super().__init__()
        relu = "relu"
        self.conv1 = Conv2D(64, 3, activation=relu, input_shape=input_dims)
        self.conv2 = Conv2D(64, 3, activation=relu)
        self.flatten = Flatten()
        self.fc1 = Dense(64, activation=relu)
        self.fc2 = Dense(32, activation=relu)
        self.fc3 = Dense(action_dim, activation=None)

    def call(self, x):
        """Apply fc1 -> fc2 -> fc3 (conv1/conv2/flatten are intentionally skipped)."""
        hidden = x
        for dense_layer in (self.fc1, self.fc2, self.fc3):
            hidden = dense_layer(hidden)
        return hidden


class CriticNetwork(tf.keras.Model):
    """Subclassed Keras model producing a single un-activated value output.

    The hidden layers use He-uniform initialisation. As in `ActorNetwork`,
    the convolution/flatten layers exist but are not applied in `call`.
    """

    def __init__(self, input_dims, action_dim=1):
        super().__init__()
        he_relu = dict(activation="relu", kernel_initializer="he_uniform")
        self.conv1 = Conv2D(64, 3, input_shape=input_dims, **he_relu)
        self.conv2 = Conv2D(64, 3, **he_relu)
        self.flatten = Flatten()
        self.fc1 = Dense(64, **he_relu)
        self.fc2 = Dense(32, **he_relu)
        self.fc3 = Dense(1, activation=None)

    def call(self, x):
        """Apply fc1 -> fc2 -> fc3 (conv1/conv2/flatten are intentionally skipped)."""
        hidden = x
        for dense_layer in (self.fc1, self.fc2, self.fc3):
            hidden = dense_layer(hidden)
        return hidden

In [97]:
import tensorflow as tf 
from tensorflow.keras.layers import Dense, Conv2D, Input, Lambda
import numpy as np 
from tensorflow.keras.optimizers import Adam
 
class Actor:
    """Policy network wrapper: builds the model, samples actions, applies updates.

    Args:
        input_dims: observation shape without the batch axis, e.g. (7, 7, 3).
        action_dim: number of discrete actions (size of the logits vector).
        actor_lr: learning rate for the Adam optimizer attached in `build_network`.
    """

    def __init__(self, input_dims, action_dim, actor_lr):
        self.input_dims = input_dims
        self.action_dim = action_dim
        self.actor_lr = actor_lr
        self.model = None
        self.entropy_beta = 0.001

    def build_network(self):
        """Create the conv + dense policy model and store it on `self.model`."""
        input_ = tf.keras.layers.Input(self.input_dims)
        x = Conv2D(64, 3, activation="relu")(input_)
        x = Conv2D(64, 3, activation="relu")(x)
        x = Conv2D(32, 3, activation="relu")(x)

        # Collapse the spatial axes so the model emits (batch, action_dim)
        # instead of (batch, h, w, action_dim). Flatten has no weights, so the
        # weight list stays compatible with get_weights()/set_weights().
        x = tf.keras.layers.Flatten()(x)

        x = Dense(64, activation="relu")(x)
        x = Dense(64, activation="relu")(x)
        x = Dense(32, activation="relu")(x)

        output_ = Dense(self.action_dim, activation=None)(x)
        # `tf.keras` is used explicitly: this cell does not import `keras`.
        model = tf.keras.models.Model(input_, output_)
        model.compile(Adam(learning_rate=self.actor_lr))

        self.model = model

    def get_action(self, state):
        """Sample one action index (Python int) from the policy for a single state."""
        batched = np.reshape(state, (1, *self.input_dims))
        state = tf.convert_to_tensor(batched, dtype=tf.float32)
        # Reshape guards against models whose output still carries spatial axes.
        logits = tf.reshape(self.model(state), (1, self.action_dim))
        dist = tfp.distributions.Categorical(logits=logits)
        return int(dist.sample().numpy()[0])

    def learn(self, states, action, td_target, baselines):
        """Run one policy-gradient step on a batch and return the loss tensor."""
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        with tf.GradientTape() as tape:
            logits = self.model(states)
            actor_loss = self.actor_loss(logits, action, td_target, baselines)

        actor_params = self.model.trainable_variables
        actor_grads = tape.gradient(actor_loss, actor_params)

        self.model.optimizer.apply_gradients(zip(actor_grads, actor_params))
        return actor_loss

    def actor_loss(self, logits, action, td, baselines):
        """Advantage-weighted cross-entropy minus an entropy bonus.

        Args:
            logits: raw policy outputs for the batch.
            action: action indices taken.
            td: TD targets for the batch.
            baselines: critic value estimates, subtracted from `td`.
        """
        # Flatten everything to a consistent per-sample layout so a batch of
        # one does not collapse to a scalar.
        logits = tf.reshape(logits, (-1, self.action_dim))
        baselines = tf.reshape(tf.cast(baselines, dtype=tf.float32), [-1])
        td = tf.reshape(tf.cast(td, dtype=tf.float32), [-1])
        advantages = tf.stop_gradient(td - baselines)
        actions = tf.reshape(tf.cast(action, tf.int32), [-1])

        ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(
                from_logits=True)
        entropy_loss = tf.keras.losses.CategoricalCrossentropy(
                from_logits=True)

        policy_loss = ce_loss(actions, logits, sample_weight=advantages)
        # Entropy is the cross-entropy of the policy with itself, so the
        # "true" distribution must be probabilities, not raw logits.
        probs = tf.nn.softmax(logits)
        entropy = entropy_loss(probs, logits)
        return policy_loss - self.entropy_beta * entropy

    def save_model(self):
        """Save the model under models/a3c/actor_network."""
        self.model.save("models/a3c/" + "actor_network")
        print("Model Saved Successfully.")

    def load_model(self):
        """Placeholder; loading is not implemented."""
        pass
        

In [90]:
from tensorflow.keras.optimizers import Adam 
import tensorflow as tf 
import tensorflow.keras as keras 

class Critic:
    """State-value network wrapper: builds the model, computes TD loss, updates.

    Args:
        input_dims: observation shape without the batch axis.
        action_dim: stored for symmetry with `Actor`; not used by the critic.
        critic_lr: learning rate for the Adam optimizer attached in `build_network`.
    """

    def __init__(self, input_dims, action_dim, critic_lr):
        self.input_dims = input_dims
        self.action_dim = action_dim
        self.critic_lr = critic_lr
        self.model = None

    def build_network(self):
        """Create the conv + dense value model and store it on `self.model`."""
        input_ = tf.keras.layers.Input(self.input_dims)
        x = Conv2D(64, 3, activation="relu")(input_)
        x = Conv2D(64, 3, activation="relu")(x)
        x = Conv2D(32, 3, activation="relu")(x)

        # Collapse the spatial axes so the output is (batch, 1). Flatten has
        # no weights, so get_weights()/set_weights() stay compatible.
        x = tf.keras.layers.Flatten()(x)

        x = Dense(64, activation="relu")(x)
        x = Dense(64, activation="relu")(x)
        x = Dense(32, activation="relu")(x)

        output_ = Dense(1, activation=None)(x)
        model = keras.models.Model(input_, output_)
        model.compile(Adam(learning_rate=self.critic_lr))
        self.model = model

    def load_model(self):
        """Load the critic from the same path `save_model` writes to."""
        self.model = tf.keras.models.load_model("models/a3c/" + "critic_network")
        print("loaded the model")

    def critic_loss(self, value, reward, next_state_value, gamma, done):
        """Return (MSE between TD target and value, TD target).

        The TD target is `reward + gamma * next_state_value * (1 - done)`.
        All inputs are flattened to one entry per sample.
        """
        # reshape(-1) instead of squeeze keeps a batch of one as shape (1,).
        value = tf.reshape(tf.cast(value, dtype=tf.float32), [-1])
        next_state_value = tf.reshape(tf.cast(next_state_value, dtype=tf.float32), [-1])
        reward = tf.reshape(tf.cast(reward, dtype=tf.float32), [-1])
        not_done = 1.0 - tf.reshape(tf.cast(tf.convert_to_tensor(done), tf.float32), [-1])

        td_target = reward + gamma * next_state_value * not_done
        delta = tf.keras.losses.MSE(td_target, value)
        return tf.convert_to_tensor(delta, dtype=tf.float32), tf.convert_to_tensor(td_target, dtype=tf.float32)

    def save_model(self):
        """Save the model under models/a3c/critic_network."""
        self.model.save("models/a3c/" + "critic_network")

    def learn(self, state, next_state_v_value, reward, done, gamma):
        """Run one value-function update.

        Returns:
            (critic_loss, td_target, v_value) where `v_value` is the model
            output for `state` computed before the weights were updated.
        """
        state = tf.convert_to_tensor(state, dtype=tf.float32)

        with tf.GradientTape() as tape:
            v_value = self.model(state)
            critic_loss, td_target = self.critic_loss(
                v_value, reward, next_state_v_value, gamma, done)

        critic_params = self.model.trainable_variables
        critic_grads = tape.gradient(critic_loss, critic_params)
        self.model.optimizer.apply_gradients(zip(critic_grads, critic_params))

        return critic_loss, td_target, v_value
        

In [91]:
class GlobalNetwork:
    """Holds the shared actor/critic pair that receives the gradient updates."""

    def __init__(self, input_dims, action_dims, actor_lr, critic_lr):
        self.actor = Actor(input_dims, action_dims, actor_lr)
        self.critic = Critic(input_dims, action_dims, critic_lr)

    def update_global_params(self, state, next_state_v_value, actions, reward, done, gamma):
        """Update the critic first, then the actor using the critic's TD target
        and value estimate; returns (actor_loss, critic_loss)."""
        critic_result = self.critic.learn(state, next_state_v_value, reward, done, gamma)
        critic_l, td_target, v_value = critic_result
        actor_l = self.actor.learn(state, actions, td_target, baselines=v_value)
        return actor_l, critic_l

    def pull_global_params(self):
        """Return (actor_weights, critic_weights) from the two Keras models."""
        return self.actor.model.get_weights(), self.critic.model.get_weights()

    def save_models(self):
        """Placeholder; does nothing."""

    def load_models(self):
        """Placeholder; does nothing."""

class WorkerNetwork:
    """Actor/critic pair owned by a worker; built with the same arguments as
    the pair in `GlobalNetwork`."""

    def __init__(self, input_dims, action_dims, actor_lr, critic_lr):
        self.actor, self.critic = (
            Actor(input_dims, action_dims, actor_lr),
            Critic(input_dims, action_dims, critic_lr),
        )

In [92]:
from multiprocessing import cpu_count

# global_networks will be created in the A3C Agent
# global_network = GlobalNetwork()

CURR_EPISODE = 0
BEST_REWARD = float("-inf")
EPISODE_REWARDS = []

class A3CWorker:
    """Runs episodes in one environment and pushes updates to the global network.

    Args:
        env: environment whose observations are dicts with an "image" entry.
        noe: episode budget; the loop runs while the shared CURR_EPISODE <= noe.
        gamma: discount factor forwarded to the critic update.
        update_interval: number of stored transitions between updates.
        global_network: object exposing `pull_global_params`,
            `update_global_params` and `.actor` / `.critic`.
        worker_network: local actor/critic used for acting and bootstrapping.
        input_dims: observation shape, forwarded to the ICM agent and buffer.
        out_dims: number of discrete actions.
    """

    def __init__(self, env, noe, gamma, update_interval, global_network, worker_network, input_dims, out_dims):
        self.env = env
        self.noe = noe
        self.gamma = gamma
        self.update_interval = update_interval
        self.input_dims = input_dims
        self.out_dims = out_dims

        self.icm_agent = ICMAgent(input_dims, out_dims, 0.1, 0.1, 0.002)
        self.global_network = global_network
        self.worker_network = worker_network

        self._sync_with_global()

        self.memory = ExperienceReplayBuffer(self.update_interval, input_dims, self.update_interval, False)

    def _sync_with_global(self):
        """Copy the global actor/critic weights into the worker's models."""
        actor_weights, critic_weights = self.global_network.pull_global_params()
        self.worker_network.actor.model.set_weights(actor_weights)
        self.worker_network.critic.model.set_weights(critic_weights)

    def _update_from_rollout(self):
        """Consume the buffered transitions: ICM update, global update, re-sync."""
        states, actions, rewards, next_states, dones = self.memory.sample_experience(self.update_interval)
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)

        next_state_v_value = self.worker_network.critic.model(next_states)

        intrinsic_rewards = self.icm_agent.update(states, next_states, actions, self.out_dims)

        # Combine explicitly in float32 rather than relying on in-place
        # ndarray/Tensor arithmetic (extrinsic rewards may be an int array).
        rewards = tf.cast(rewards, tf.float32) + tf.cast(intrinsic_rewards, tf.float32)

        actor_l, critic_l = self.global_network.update_global_params(
            states, next_state_v_value, actions, rewards, dones, self.gamma)

        self._sync_with_global()
        self.memory.make_empty()
        return actor_l, critic_l

    def learn(self):
        """Run episodes until the shared episode counter exceeds `noe`.

        Updates the module-level CURR_EPISODE, BEST_REWARD and EPISODE_REWARDS
        and saves the global models whenever an episode beats BEST_REWARD.
        """
        global CURR_EPISODE
        global BEST_REWARD
        global EPISODE_REWARDS

        while CURR_EPISODE <= self.noe:
            state, _ = self.env.reset()
            state = state.get("image")

            done = False
            episodic_reward = 0
            step = 0
            while not done:
                action = self.worker_network.actor.get_action(state)
                next_state, reward_prob, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated
                next_state = next_state.get("image")

                # Store first so the terminal transition is part of the update
                # triggered by `done`, rather than leaking into the next episode.
                self.memory.store_experience(state, action, reward_prob, next_state, done)
                step += 1
                episodic_reward += reward_prob
                state = next_state

                if step % self.update_interval == 0 or done:
                    self._update_from_rollout()

            EPISODE_REWARDS.append(episodic_reward)
            avg_reward = np.mean(EPISODE_REWARDS[-100: ])

            if episodic_reward > BEST_REWARD:
                self.global_network.actor.save_model()
                self.global_network.critic.save_model()
                BEST_REWARD = episodic_reward

            print(f"Episode: {CURR_EPISODE}, Reward: {episodic_reward} Average Reward: {avg_reward} Best Reward: {BEST_REWARD}")

            CURR_EPISODE += 1
                

In [93]:
from threading import Thread 
from multiprocessing import cpu_count 

class ActorCriticAgent:
    """Builds the global/worker networks and runs A3C workers in threads.

    Args:
        env: template environment; must expose `action_space.n`.
        actor_lr, critic_lr: learning rates for the actor and critic.
        gamma: discount factor.
        update_interval: transitions collected between updates.
        noe: episode budget shared by all workers.
        worker_count: number of worker threads (defaults to the CPU count).
    """

    def __init__(self, env, actor_lr, critic_lr, gamma, update_interval, noe, worker_count=cpu_count()):
        self.input_dims = (7, 7, 3)
        self.out_dims = int(env.action_space.n)
        self.worker_count = worker_count
        self.noe = noe
        self.update_interval = update_interval
        self.env = env
        self.gamma = gamma
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.global_network = GlobalNetwork(self.input_dims, self.out_dims, actor_lr, critic_lr)
        self.global_network.actor.build_network()
        self.global_network.critic.build_network()
        self.worker_network = self._make_worker_network()

    def _make_worker_network(self):
        """Build a fresh local actor/critic pair."""
        network = WorkerNetwork(self.input_dims, self.out_dims, self.actor_lr, self.critic_lr)
        network.actor.build_network()
        network.critic.build_network()
        return network

    def _make_worker_env(self):
        """Return an independent copy of the template environment."""
        import copy

        spec = getattr(self.env, "spec", None)
        if spec is not None:
            try:
                return spec.make()
            except Exception:
                # Re-creating from the spec can fail (e.g. unregistered id);
                # fall back to copying the instance.
                pass
        return copy.deepcopy(self.env)

    def learn(self):
        """Start one thread per worker and wait for all of them to finish."""
        workers = []
        for worker_id in range(self.worker_count):
            # Worker 0 reuses the objects built in __init__; the others get
            # their own env and local network so threads do not step or
            # overwrite each other's state.
            if worker_id == 0:
                env, network = self.env, self.worker_network
            else:
                env, network = self._make_worker_env(), self._make_worker_network()

            a3c_worker = A3CWorker(env, self.noe, self.gamma,
                                   self.update_interval, self.global_network, network,
                                   self.input_dims, self.out_dims)
            # Pass the bound method itself; calling it here would run the whole
            # training loop synchronously and give the thread a `None` target.
            workers.append(Thread(target=a3c_worker.learn))

        for worker in workers:
            worker.start()

        for worker in workers:
            worker.join()
        

In [94]:
import tensorflow.keras as keras 
import tensorflow as tf 

class ICMNetwork(tf.keras.Model):
    """Intrinsic Curiosity Module: shared encoder, inverse head, forward head.

    Args:
        input_dims: observation shape as (height, width, channels).
        out_dims: number of discrete actions (width of the inverse head).
    """

    def __init__(self, input_dims, out_dims):
        super(ICMNetwork, self).__init__()
        self.input_dims = input_dims
        self.out_dims = out_dims
        self.conv1 = keras.layers.Conv2D(64, 3, padding="same", input_shape=input_dims,
                                         activation="relu", kernel_initializer="he_uniform")
        self.conv2 = keras.layers.Conv2D(32, 3, padding="same", activation="relu", kernel_initializer="he_uniform")
        phi_channels = 3
        self.phi = keras.layers.Conv2D(phi_channels, 3, padding="same", activation="relu",
                                       kernel_initializer="he_uniform")

        self.inverse = keras.layers.Dense(256, activation='relu', kernel_initializer="he_uniform")
        # Softmax output: these are action probabilities despite the name.
        self.pi_logits = keras.layers.Dense(out_dims, activation="softmax")

        self.dense1 = keras.layers.Dense(256)
        # All convolutions use "same" padding, so the flattened feature size is
        # height * width * phi_channels (7*7*3 for the default observation).
        feature_size = int(input_dims[0]) * int(input_dims[1]) * phi_channels
        self.phi_hat_next = keras.layers.Dense(feature_size)

        self.flatten = keras.layers.Flatten()

    def _encode(self, observation):
        """Run the shared conv encoder and flatten the result."""
        features = self.conv1(observation)
        features = self.conv2(features)
        features = self.phi(features)
        return self.flatten(features)

    def call(self, state, next_state, action):
        """Return (phi_next, pi_logits, phi_hat_next).

        phi_next: encoded next state.
        pi_logits: inverse-model action probabilities from [phi, phi_next].
        phi_hat_next: forward-model prediction of phi_next from [phi, action],
            where `action` is fed as a single float column.
        """
        phi = self._encode(state)
        phi_next = self._encode(next_state)

        inverse = self.inverse(tf.concat([phi, phi_next], axis=1))
        pi_logits = self.pi_logits(inverse)

        # -1 lets the batch size be inferred instead of read from a static shape.
        action = tf.reshape(tf.cast(action, tf.float32), (-1, 1))
        forward_input = tf.concat([phi, action], axis=1)
        dense = self.dense1(forward_input)
        phi_hat_next = self.phi_hat_next(dense)

        return phi_next, pi_logits, phi_hat_next

In [95]:
import tensorflow as tf 

class ICMAgent:
    """Trains an `ICMNetwork` and turns its forward-model error into a reward.

    Args:
        input_dims: observation shape forwarded to `ICMNetwork`.
        out_dims: number of discrete actions forwarded to `ICMNetwork`.
        alpha: scale applied to the intrinsic reward.
        beta: weight of the forward loss; the inverse loss gets `1 - beta`.
        lr: Adam learning rate for the ICM model.
    """

    def __init__(self, input_dims, out_dims, alpha, beta, lr):
        self.input_dims = input_dims
        self.out_dims = out_dims
        self.alpha = alpha
        self.beta = beta

        self.icm_model = ICMNetwork(self.input_dims, self.out_dims)
        self.icm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr))

    def update(self, states, next_states, actions, n_action):
        """Take one gradient step on the ICM and return per-sample intrinsic rewards.

        The intrinsic reward is `alpha * 0.5 * sum((phi_hat_next - phi_next)**2)`
        over the feature axis, computed from the pre-update forward pass.
        """
        states = tf.convert_to_tensor(states, dtype=tf.float32)
        next_states = tf.convert_to_tensor(next_states, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions, dtype=tf.int32)
        actions_one_hot = tf.one_hot(actions, depth=n_action)

        inverse_loss_func = tf.keras.losses.CategoricalCrossentropy()

        with tf.GradientTape() as tape:
            phi_next, pi_logits, phi_hat_next = \
                            self.icm_model(states, next_states, actions)

            # Keras losses take (y_true, y_pred): the one-hot actions are the
            # labels and the network's softmax output is the prediction.
            inverse_loss = (1 - self.beta) * inverse_loss_func(actions_one_hot, pi_logits)

            forward_loss = self.beta * tf.reduce_mean(
                tf.keras.losses.MSE(phi_next, phi_hat_next))

            loss = inverse_loss + forward_loss

        params = self.icm_model.trainable_variables
        grads = tape.gradient(loss, params)
        self.icm_model.optimizer.apply_gradients(zip(grads, params))

        squared = tf.math.pow(phi_hat_next - phi_next, 2)
        intrinsic_reward = tf.reduce_sum(self.alpha * 0.5 * squared, axis=1)

        return intrinsic_reward

In [None]:
import gymnasium as gym
from minigrid.wrappers import RGBImgPartialObsWrapper, ImgObsWrapper
import minigrid

# Driver cell: train the threaded actor-critic agent on a MiniGrid task.
#env = gym.make("MiniGrid-FourRooms-v0")
env = gym.make("MiniGrid-Empty-5x5-v0")
noe = 10000  # episode budget passed to the agent
actor_lr = 0.003
critic_lr = 0.003
gamma = 0.99  # discount factor
update_interval = 2  # passed to the agent as its update interval
num_workers = 15  # passed as worker_count

if __name__ == "__main__": 
    agent = ActorCriticAgent(env, actor_lr, critic_lr, gamma, update_interval, noe, num_workers)
    agent.learn()

Model Saved Successfully.
Episode: 0, Reward: 0 Average Reward: 0.0 Best Reward: 0
Episode: 1, Reward: 0 Average Reward: 0.0 Best Reward: 0
Episode: 2, Reward: 0 Average Reward: 0.0 Best Reward: 0
Episode: 3, Reward: 0 Average Reward: 0.0 Best Reward: 0
Episode: 4, Reward: 0 Average Reward: 0.0 Best Reward: 0
Model Saved Successfully.
Episode: 5, Reward: 0.613 Average Reward: 0.10216666666666667 Best Reward: 0.613
Episode: 6, Reward: 0 Average Reward: 0.08757142857142856 Best Reward: 0.613
Episode: 7, Reward: 0 Average Reward: 0.076625 Best Reward: 0.613
Episode: 8, Reward: 0 Average Reward: 0.06811111111111111 Best Reward: 0.613
Episode: 9, Reward: 0.487 Average Reward: 0.11000000000000001 Best Reward: 0.613
Episode: 10, Reward: 0.478 Average Reward: 0.14345454545454547 Best Reward: 0.613
Episode: 11, Reward: 0.44199999999999995 Average Reward: 0.16833333333333333 Best Reward: 0.613
Episode: 12, Reward: 0.42399999999999993 Average Reward: 0.188 Best Reward: 0.613
Episode: 13, Reward: 

In [None]:
import gymnasium as gym 
# NOTE(review): ActorCriticAgent in this notebook fixes input_dims to (7, 7, 3)
# and A3CWorker reads observations via state.get("image"); confirm that this
# environment's observations fit that before running the cell.
if __name__ == "__main__": 
    env = gym.make("LunarLander-v2")
    agent = ActorCriticAgent(env, 0.0005, 0.005, 0.99, 4, 1000, 10)
    agent.learn()

###### n 

In [None]:
88854253

In [None]:
import collections
import cv2
import numpy as np
import matplotlib.pyplot as plt
import gymnasium as gym
import tensorflow as tf
from gymnasium.wrappers import *


def manage_memory():
    """Enable memory growth on every physical GPU TensorFlow reports.

    Does nothing when no GPU is listed. A RuntimeError raised while
    configuring the devices is printed instead of propagated.
    """
    devices = tf.config.list_physical_devices('GPU')
    if not devices:
        return
    try:
        for device in devices:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as err:
        print(err)


def plot_learning_curve(scores, figure_file):
    """Plot raw scores and their 100-episode running average, then save the figure.

    Args:
        scores: sequence of per-episode rewards.
        figure_file: path handed to `plt.savefig`.
    """
    episodes = list(range(len(scores)))
    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        # Window of the current score plus the 99 before it: exactly 100
        # entries, matching the plot title.
        running_avg[i] = np.mean(scores[max(0, i - 99):(i + 1)])
    plt.plot(episodes, running_avg, label="Avg reward for agent", color="black")
    plt.plot(scores, label="Reward for agent", color="red")
    plt.xlabel("episodes")
    plt.ylabel("rewards")
    plt.title('Running average of previous 100 scores')
    plt.legend()
    plt.savefig(figure_file)

    
class RepeatAction(gym.Wrapper):
    """Repeat each action `repeat` times and sum the rewards.

    Args:
        env: environment whose observation space has an "image" entry.
        repeat: number of times every action is applied.
        fire_first: when True, `reset` presses action 1 (asserted to be FIRE).
    """

    def __init__(self, env=None, repeat=4, fire_first=False):
        super(RepeatAction, self).__init__(env)
        self.repeat = repeat
        self.shape = env.observation_space.get("image").low[0][0][0]
        self.fire_first = fire_first

    def step(self, action):
        """Apply `action` up to `repeat` times, stopping early when the episode ends.

        Returns the last observation, the summed reward and the last
        terminated/truncated/info values.
        """
        t_reward = 0.0
        done = False
        for i in range(self.repeat):
            obs, reward, terminated, truncated, info = self.env.step(action)
            done = terminated or truncated
            t_reward += reward
            if done:
                break
        return obs, t_reward, terminated, truncated, info

    def reset(self, *, seed=None, options=None):
        """Reset the wrapped env and return `(obs, info)`.

        `seed` and `options` are forwarded so callers can seed through the
        wrapper. With `fire_first`, the observation and info come from the
        FIRE step taken right after the reset.
        """
        obs, info = self.env.reset(seed=seed, options=options)
        if self.fire_first:
            assert self.env.unwrapped.get_action_meanings()[1] == 'FIRE'
            # Was `self.env_step(1)`, an attribute that does not exist.
            obs, _, _, _, info = self.env.step(1)
        return obs, info

    
class PreprocessFrame(gym.ObservationWrapper):
    """Convert RGB frames to resized, channel-first grayscale in [0, 1].

    Args:
        shape: target shape given as (height, width, channels); stored
            internally as (channels, height, width).
        env: environment to wrap.
    """

    def __init__(self, shape, env=None):
        super(PreprocessFrame, self).__init__(env)
        self.shape = (shape[2], shape[0], shape[1])
        self.observation_space = gym.spaces.Box(low=0.0, high=1.0,
                                                shape=self.shape,
                                                dtype=np.float32)

    def observation(self, obs):
        """Return the grayscale, resized frame as float32 scaled to [0, 1]."""
        new_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        _, height, width = self.shape
        # cv2.resize takes dsize as (width, height), the reverse of the
        # (height, width) order stored in self.shape.
        resized_screen = cv2.resize(new_frame, (width, height),
                                    interpolation=cv2.INTER_AREA)
        new_obs = np.array(resized_screen, dtype=np.uint8).reshape(self.shape)
        # Cast before scaling so the dtype matches the declared float32 space.
        new_obs = new_obs.astype(np.float32) / 255.0

        return new_obs
    
class StackFrames(gym.ObservationWrapper):
    """Keep the last `repeat` observations and expose them as one array.

    The observation space is the wrapped Box space with `low`/`high`
    repeated `repeat` times along axis 0.
    """

    def __init__(self, env, repeat):
        super(StackFrames, self).__init__(env)
        self.observation_space = gym.spaces.Box(
                env.observation_space.low.repeat(repeat, axis=0),
                env.observation_space.high.repeat(repeat, axis=0),
                dtype=np.float32)
        self.stack = collections.deque(maxlen=repeat)

    def reset(self, *, seed=None, options=None):
        """Reset the env, fill the stack with the first frame, return `(obs, info)`."""
        self.stack.clear()
        # Gymnasium's reset returns (observation, info); only the observation
        # belongs in the frame stack.
        observation, info = self.env.reset(seed=seed, options=options)
        for _ in range(self.stack.maxlen):
            self.stack.append(observation)

        stacked = np.array(self.stack).reshape(self.observation_space.low.shape)
        return stacked, info

    def observation(self, observation):
        """Push the new frame and return the stacked array."""
        self.stack.append(observation)

        return np.array(self.stack).reshape(self.observation_space.low.shape)
    
    
def make_env(env_name):
    """Create `env_name` with render_mode="rgb_array" and wrap it in RepeatAction.

    Frame preprocessing, stacking and normalisation are not applied.
    """
    base_env = gym.make(env_name, render_mode="rgb_array")
    return RepeatAction(base_env)

In [None]:
class Writer:
    """Append lines to one file and read back the contents of any file."""

    def __init__(self, fname):
        self.fname = fname

    def write_to_file(self, content):
        """Append `content` followed by a newline to `self.fname`."""
        sink = open(self.fname, "a")
        try:
            sink.write(content + "\n")
        finally:
            sink.close()

    def read_file(self, fname):
        """Return the full text of `fname` (not necessarily `self.fname`)."""
        source = open(fname, "r")
        try:
            return source.read()
        finally:
            source.close()
            

In [None]:
import time
from telebot import TeleBot
import datetime
import telebot

import os

# Credentials are read from the environment so they are never stored in the
# notebook. The bot token that was previously committed here should be revoked.
token = os.environ.get("TELEGRAM_BOT_TOKEN")
if not token:
    raise RuntimeError("Set the TELEGRAM_BOT_TOKEN environment variable before running this cell.")
chat_id = os.environ.get("TELEGRAM_CHAT_ID", "1055055706")
bot = TeleBot(token=token)

def telegram_send(message, bot):
    """Send `message` through `bot` to the chat configured in the module-level `chat_id`."""
    # Use the single module-level chat id instead of a second hard-coded copy.
    bot.send_message(chat_id=chat_id, text=message)

def welcome_msg(multi_step, double_dqn, dueling):
    """Send the start-of-training message describing the DQN variant flags."""
    template = 'Hi! Starting learning with DQN Multi-step = %d, Double DQN = %r, Dueling DQN = %r'
    flags = (multi_step, double_dqn, dueling)
    telegram_send(template % flags, bot)
    
def info_msg(episode, max_episode, reward, best_score, loss):
    """Send a one-line progress report for the current episode."""
    report = "Current Episode: {}, Current Reward: {}, Max Episode: {}, Best Score: {}, loss: {}".format(
        episode, reward, max_episode, best_score, loss)
    telegram_send(report, bot)

def end_msg(learning_time):
    """Send and print the total learning time, formatted as a timedelta."""
    elapsed = datetime.timedelta(seconds=int(learning_time))
    st = 'Finished! Learning time: ' + str(elapsed)
    telegram_send(st, bot)
    print(st)


In [None]:
import numpy as np 
import imageio


class RecordVideo:
    """Collect frames in memory and write them out as an .mp4 per episode.

    Args:
        prefix_fname: file-name prefix; files are `<prefix>_<episode>.mp4`.
        out_directory: directory the videos are written to.
        fps: frames per second passed to `imageio.mimsave`.
    """

    def __init__(self, prefix_fname,  out_directory="videos/", fps=10):
        self.prefix_fname = prefix_fname
        self.out_directory = out_directory
        self.fps = fps
        self.images = []

    def add_image(self, image):
        """Queue one frame for the next `save`."""
        self.images.append(image)

    def save(self, episode_no):
        """Write the queued frames to disk and clear the queue."""
        import os

        # Create the target directory on first use so saving does not fail
        # when it does not exist yet.
        if self.out_directory:
            os.makedirs(self.out_directory, exist_ok=True)
        name = os.path.join(self.out_directory, self.prefix_fname + "_" + str(episode_no) + ".mp4")
        imageio.mimsave(name, [np.array(img) for img in self.images], fps=self.fps)
        self.images = []

In [None]:
import cv2 

class Trainer:
    """Single-process training loop combining an agent with an ICM reward.

    NOTE(review): the agent is constructed with nine positional arguments and
    is expected to expose `get_action`, `learn(states, actions, rewards,
    next_states, dones)` and `save_models`; that does not match the
    `ActorCriticAgent` defined earlier in this notebook. Confirm which agent
    class this cell is meant to run against.
    """

    def __init__(self, env, action_space, input_dims, out_dims, video_prefix, is_tg,
                                     noe, max_steps, record, lr1, lr2, gamma, chkpt, algo_name, update_interval):
        self.env = env
        self.noe = noe
        self.max_steps = max_steps
        self.update_interval = update_interval
        self.out_dims = out_dims

        self.recorder = RecordVideo(video_prefix)
        self.is_tg = is_tg
        self.record = record
        self.agent = ActorCriticAgent(input_dims, out_dims, gamma, lr1, lr2, action_space, 32, chkpt, algo_name)
        self.memory = ExperienceReplayBuffer(update_interval, input_dims, update_interval, False)
        self.icm_agent = ICMAgent(input_dims, out_dims, 0.1, 0.1, 0.001)

    def _update_from_rollout(self):
        """Feed the buffered transitions to the ICM and the agent, then clear the buffer."""
        states, actions, rewards, next_states, dones = self.memory.sample_experience(self.update_interval)

        intrinsic_reward = self.icm_agent.update(states, next_states, actions, self.out_dims)

        self.agent.learn(states, actions, rewards+intrinsic_reward, next_states, dones)
        self.memory.make_empty()

    def train(self):
        """Run `noe` episodes and return (episode_rewards, running_average_rewards).

        The running average covers the last 100 episodes. Models are saved
        whenever an episode reward exceeds the best seen so far.
        """
        ep_rewards = []
        avg_rewards = []
        best_reward = float("-inf")

        for episode in range(self.noe):
            record_episode = self.record and episode % 50 == 0

            state, _ = self.env.reset()
            state = state.get('image')
            ep_reward = 0
            steps = 0

            if record_episode:
                img = self.env.render()
                self.recorder.add_image(img)

            for step in range(self.max_steps):

                action = self.agent.get_action(state)

                next_info = self.env.step(action)

                next_state, reward_prob, terminated, truncated, _ = next_info
                next_state = next_state.get("image")
                done = terminated or truncated
                ep_reward += reward_prob

                # Store before updating so the terminal transition is learned
                # from in this episode instead of leaking into the next one.
                self.memory.store_experience(state, action, reward_prob, next_state, done)

                # Use the instance's interval rather than a notebook global.
                if (step + 1) % self.update_interval == 0 or done:
                    self._update_from_rollout()

                state = next_state
                steps += 1
                if record_episode:
                    img = self.env.render()
                    self.recorder.add_image(img)

                if done:
                    break

            if record_episode:
                self.recorder.save(episode)

            ep_rewards.append(ep_reward)
            avg_reward = np.mean(ep_rewards[-100:])
            avg_rewards.append(avg_reward)

            print(f"Episode: {episode} Steps: {steps} Reward: {ep_reward} Best Score: {best_reward}, Average Reward: {avg_reward}")

            if ep_reward > best_reward:
                self.agent.save_models()
                best_reward = ep_reward

        return ep_rewards, avg_rewards


In [None]:
import gymnasium as gym
from minigrid.wrappers import RGBImgPartialObsWrapper, ImgObsWrapper
import minigrid

# Driver cell: configure and run the single-process Trainer, then plot rewards.
#env = gym.make("MiniGrid-FourRooms-v0")
env = gym.make("MiniGrid-Empty-5x5-v0")
action_space = [_ for _ in range((env.action_space.n))]
print(env.observation_space)
n_actions = len(action_space)
input_dims = (7, 7, 3)
noe = 10000  # number of episodes
max_steps = 200  # per-episode step cap
video_prefix = "actor_critic"
is_tg = True 
record = False
lr1 = 0.003
lr2 = 0.0005
gamma = 0.99
chpkt = 'models/'  # passed to Trainer as its `chkpt` argument
algo_name = "actor_critic"
update_interval = 2

if __name__ == "__main__": 
    trainer = Trainer(env, action_space, input_dims, n_actions, video_prefix, is_tg, 
                                          noe, max_steps, record, lr1, lr2, gamma, chpkt, algo_name, update_interval)
    # train() returns (episode_rewards, running_averages); only the first is kept.
    ep_rewards, _ = trainer.train()
    plot_learning_curve(ep_rewards, "actor_critic.png")

In [None]:
import pickle 

# Persist the training results to disk with pickle.
# NOTE(review): the training cell binds `ep_rewards, _ = trainer.train()`, so here
# `ep_rewards[0]` / `ep_rewards[1]` would be the first two per-episode rewards rather
# than the full reward and average-reward histories the file names suggest.
# Confirm which binding of `ep_rewards` this cell is meant to run against.
with open("actor_critic_eps_rewards.obj", "wb") as f: 
    pickle.dump(ep_rewards[0], f)

with open("actor_critic_avg_rewards.obj", "wb") as f: 
    pickle.dump(ep_rewards[1], f)

In [None]:
# Plot the learning curve and save it to actor_critic.png.
# NOTE(review): this indexes `ep_rewards[0]`, while the training cell passes
# `ep_rewards` itself to the same function — confirm which shape is intended.
plot_learning_curve(ep_rewards[0], "actor_critic.png")

In [None]:
def greedy_policy(observation, q_val_network, action_space): 
    """Pick the highest-scoring action for a single observation.

    The observation is wrapped into a batch of one, passed through
    `q_val_network`, and the index of the largest output along axis 1 is
    returned. `action_space` is accepted for signature compatibility but is
    not consulted.
    """
    batch = tf.convert_to_tensor([observation])
    scores = q_val_network(batch)
    best_per_row = tf.math.argmax(scores, axis=1)
    # Only one row was fed in, so take its argmax.
    return best_per_row.numpy()[0]

In [None]:
import random 

class Eval: 
    """Greedy evaluation of a saved Keras model on an environment.

    Every tenth episode (0, 10, 20, ...) is rendered frame by frame into
    `self.recorder` and saved when the episode ends.
    """

    def __init__(self, env, model_path, number_of_episode=50):
        """
        Args:
            env: environment exposing `reset(seed=...)`, `step(action)` returning a
                5-tuple, and `render()`.
            model_path: path passed to `tf.keras.models.load_model`.
            number_of_episode: how many evaluation episodes `test` runs.
        """
        self.env = env 
        self.model = tf.keras.models.load_model(model_path)
        # NOTE(review): the recorder arguments are hard-coded and look copied from a
        # different experiment ('dqn_lunarlander') — confirm against RecordVideo.
        self.recorder = RecordVideo('dqn_lunarlander', 'test_videos/', 15)
        self.number_of_episode = number_of_episode
        
    def test(self): 
        """Run the evaluation episodes.

        Returns:
            (rewards, steps): two lists with the total reward and the step count
            of each episode, in order.
        """
        rewards = []
        steps = []
        for episode in range(self.number_of_episode): 
            record_episode = episode % 10 == 0
            done = False
            reward = 0
            step = 0
            # Use the environment given to the constructor rather than whatever
            # global `env` happens to exist in the notebook at call time.
            state = self.env.reset(seed=random.randint(0,500))
            if isinstance(state, tuple):
                # reset() may return (observation, info); keep the observation.
                state = state[0]
            if record_episode: 
                img = self.env.render()
                self.recorder.add_image(img) 

            while not done:
                # NOTE(review): `action_space` is still read from the notebook's
                # globals; greedy_policy as defined in this notebook ignores it.
                action = greedy_policy(state, self.model, action_space)
                state, reward_prob, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated 
                reward += reward_prob
                step += 1 
                if record_episode:
                    img = self.env.render()
                    self.recorder.add_image(img)
            
            rewards.append(reward)
            steps.append(step)
            if record_episode:
                # NOTE(review): always saved under id 1, so presumably each recorded
                # episode overwrites the previous video — confirm this is intended.
                self.recorder.save(1)
        
        return rewards, steps


In [None]:
# Load the saved actor network and run 10 greedy evaluation episodes on `env`.
# NOTE(review): the model path is an absolute "/content/..." location, so this
# cell presumably only works where the models were saved there (e.g. Colab).
evaluator = Eval(env, "/content/models/_actor_critic_actor_network", 10)
evaluator.test()

In [None]:
class Memory:
    """Rollout buffer holding six parallel lists, one entry per stored step."""

    def __init__(self):
        # Starting empty is the same as having just been cleared.
        self.clear_memory()

    def remember(self, state, action, new_state, reward, value, log_prob):
        """Append one transition to the end of every list."""
        for buffer, item in (
            (self.rewards, reward),
            (self.values, value),
            (self.log_probs, log_prob),
            (self.states, state),
            (self.new_states, new_state),
            (self.actions, action),
        ):
            buffer.append(item)

    def clear_memory(self):
        """Rebind every buffer to a fresh empty list (old lists are left untouched)."""
        self.rewards, self.values, self.log_probs = [], [], []
        self.actions, self.new_states, self.states = [], [], []

    def sample_memory(self):
        """Return the live lists as
        (states, actions, new_states, rewards, values, log_probs)."""
        return (self.states, self.actions, self.new_states,
                self.rewards, self.values, self.log_probs)

In [None]:
import torch.multiprocessing as mp



class ParallelEnv:
    """Builds the shared global networks and runs one `worker` process per thread name.

    Construction blocks until every worker process has exited, because the
    processes are joined inside `__init__`.
    """

    def __init__(self, env_id, global_idx,
                 input_shape, n_actions, num_threads, icm=False):
        print("in")
        worker_names = list(map(str, range(num_threads)))

        # Global actor-critic whose parameters live in shared memory.
        global_actor_critic = ActorCritic(input_shape, n_actions)
        global_actor_critic.share_memory()
        global_optim = SharedAdam(global_actor_critic.parameters(), lr=1e-4)

        # The curiosity module and its optimizer exist only when requested.
        global_icm, global_icm_optim = None, None
        if icm:
            global_icm = ICM(input_shape, n_actions)
            global_icm.share_memory()
            global_icm_optim = SharedAdam(global_icm.parameters(), lr=1e-4)

        self.ps = []
        for name in worker_names:
            process = mp.Process(
                target=worker,
                args=(name, input_shape, n_actions,
                      global_actor_critic, global_optim, env_id,
                      num_threads, global_idx, global_icm,
                      global_icm_optim, icm),
            )
            self.ps.append(process)

        # Launch everything first, then wait for all of them.
        for process in self.ps:
            process.start()
        for process in self.ps:
            process.join()

In [None]:
import torch as T
import torch.nn as nn
import torch.nn.functional as F


class ICM(nn.Module):
    """Curiosity module: a conv encoder shared by an inverse and a forward model.

    The encoder maps a state to a flat feature vector `phi`. The inverse model
    predicts action logits from `(phi, phi_new)`; the forward model predicts
    `phi_new` from `(phi, action)`. The forward-model error is used as the
    intrinsic reward.

    The flat feature size is derived from `input_dims` by a dummy pass through
    the encoder (it is 288 for `[C, 42, 42]` inputs), so other input
    resolutions work as well.
    """

    def __init__(self, input_dims, n_actions=3, alpha=0.1, beta=0.2):
        """
        Args:
            input_dims: `[channels, height, width]` of one state.
            n_actions: number of discrete actions (size of the inverse-model output).
            alpha: scale applied to the intrinsic reward.
            beta: weight of the forward loss; the inverse loss gets `1 - beta`.
        """
        super(ICM, self).__init__()
        self.alpha = alpha
        self.beta = beta

        self.conv1 = nn.Conv2d(input_dims[0], 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.phi = nn.Conv2d(32, 32, 3, stride=2, padding=1)

        feature_dims = self._calc_feature_dims(input_dims)

        self.inverse = nn.Linear(feature_dims*2, 256)
        self.pi_logits = nn.Linear(256, n_actions)

        # +1 for the scalar action that is concatenated to phi.
        self.dense1 = nn.Linear(feature_dims+1, 256)
        self.phi_hat_new = nn.Linear(256, feature_dims)

        device = T.device('cpu')
        self.to(device)

    def _calc_feature_dims(self, input_dims):
        """Return the flattened encoder output size for one state of `input_dims`."""
        with T.no_grad():
            dims = T.zeros(1, *input_dims)
            for layer in (self.conv1, self.conv2, self.conv3, self.phi):
                dims = layer(dims)
        return int(dims.numel())

    def _encode(self, state):
        """Run the conv encoder and flatten to `[batch, features]`."""
        conv = F.elu(self.conv1(state))
        conv = F.elu(self.conv2(conv))
        conv = F.elu(self.conv3(conv))
        phi = self.phi(conv)
        return phi.view(phi.size()[0], -1).to(T.float)

    def forward(self, state, new_state, action):
        """
        Args:
            state, new_state: float tensors of shape `[batch, *input_dims]`.
            action: tensor with one action per batch row.

        Returns:
            (phi_new, pi_logits, phi_hat_new): encoded next state, inverse-model
            action logits, and forward-model prediction of `phi_new`.
        """
        phi = self._encode(state)
        phi_new = self._encode(new_state)

        inverse = self.inverse(T.cat([phi, phi_new], dim=1))
        pi_logits = self.pi_logits(inverse)

        # from [T] to [T, 1]
        action = action.reshape((action.size()[0], 1))
        forward_input = T.cat([phi, action], dim=1)
        dense = self.dense1(forward_input)
        phi_hat_new = self.phi_hat_new(dense)

        return phi_new, pi_logits, phi_hat_new

    def calc_loss(self, states, new_states, actions):
        """Compute the intrinsic reward and both ICM losses for a batch.

        Args:
            states, new_states, actions: sequences (or arrays) convertible with
                `T.tensor(..., dtype=T.float)`.

        Returns:
            (intrinsic_reward, L_I, L_F): per-row intrinsic reward, the inverse
            loss scaled by `1 - beta`, and the forward loss scaled by `beta`.
        """
        # don't need [] b/c these are lists of states
        states = T.tensor(states, dtype=T.float)
        actions = T.tensor(actions, dtype=T.float)
        new_states = T.tensor(new_states, dtype=T.float)

        phi_new, pi_logits, phi_hat_new = \
            self.forward(states, new_states, actions)

        inverse_loss = nn.CrossEntropyLoss()
        L_I = (1 - self.beta) * inverse_loss(pi_logits, actions.to(T.long))

        forward_loss = nn.MSELoss()
        L_F = self.beta * forward_loss(phi_hat_new, phi_new)

        intrinsic_reward = self.alpha*0.5*((phi_hat_new-phi_new).pow(2)).mean(dim=1)
        return intrinsic_reward, L_I, L_F

In [None]:
import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


class ActorCritic(nn.Module):
    """Recurrent actor-critic: conv encoder -> GRU cell -> policy and value heads."""

    def __init__(self, input_dims, n_actions, gamma=0.99, tau=1.0):
        """
        Args:
            input_dims: `[channels, height, width]` of one state.
            n_actions: number of discrete actions.
            gamma: discount factor.
            tau: GAE decay; advantages are discounted by `gamma * tau` per step.
        """
        super(ActorCritic, self).__init__()

        self.gamma = gamma
        self.tau = tau

        self.conv1 = nn.Conv2d(input_dims[0], 32, 3, stride=2, padding=1)
        self.conv2 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv3 = nn.Conv2d(32, 32, 3, stride=2, padding=1)
        self.conv4 = nn.Conv2d(32, 32, 3, stride=2, padding=1)

        conv_shape = self.calc_conv_output(input_dims)

        self.gru = nn.GRUCell(conv_shape, 256)
        self.pi = nn.Linear(256, n_actions)
        self.v = nn.Linear(256, 1)

    def calc_conv_output(self, input_dims):
        """Return the flattened size of the conv stack's output for one state."""
        # Shape probe only; no need to record an autograd graph.
        with T.no_grad():
            dims = T.zeros(1, *input_dims)
            for layer in (self.conv1, self.conv2, self.conv3, self.conv4):
                dims = layer(dims)
        return int(np.prod(dims.size()))

    def forward(self, state, hx):
        """Sample an action for `state` given the recurrent state `hx`.

        Returns:
            (action, v, log_prob, hx): the sampled action of the first batch row
            as a numpy scalar, the value estimate, the log-probability of the
            sampled action, and the updated GRU hidden state.
        """
        conv = F.elu(self.conv1(state))
        conv = F.elu(self.conv2(conv))
        conv = F.elu(self.conv3(conv))
        conv = F.elu(self.conv4(conv))

        conv_state = conv.view((conv.size()[0], -1))

        hx = self.gru(conv_state, (hx))

        pi = self.pi(hx)
        v = self.v(hx)

        probs = T.softmax(pi, dim=1)
        dist = Categorical(probs)
        action = dist.sample()
        log_prob = dist.log_prob(action)

        return action.numpy()[0], v, log_prob, hx

    def calc_R(self, done, rewards, values):
        """Discounted returns for a rollout, bootstrapped from the last value.

        Args:
            done: whether the rollout ended the episode (no bootstrap if so).
            rewards: per-step rewards (list or array).
            values: list of value tensors, one per step.

        Returns:
            Float tensor of returns with the same shape as the squeezed values.
        """
        values = T.cat(values).squeeze()

        # Works for both a batch (rank 1) and a single state (rank 0).
        R = values.reshape(-1)[-1].item() * (1 - int(done))

        batch_return = []
        for reward in reversed(rewards):
            R = float(reward) + self.gamma * R
            batch_return.append(R)
        batch_return.reverse()
        return T.tensor(batch_return, dtype=T.float).reshape(values.size())

    def calc_cost(self, new_state, hx, done,
                  rewards, values, log_probs, intrinsic_reward=None):
        """Total actor-critic loss for one rollout.

        Args:
            new_state: state following the last step (used to bootstrap when
                the episode is not done).
            hx: GRU hidden state to evaluate `new_state` with.
            done: whether the rollout ended the episode.
            rewards: per-step extrinsic rewards; not modified.
            values: list of per-step value tensors; not modified.
            log_probs: list of per-step log-probability tensors.
            intrinsic_reward: optional tensor with one curiosity bonus per step,
                added elementwise to `rewards`.

        Returns:
            Scalar tensor: actor loss + critic loss - 0.01 * entropy term.
        """
        if intrinsic_reward is not None:
            # `list += ndarray` would EXTEND the list; add elementwise on a copy
            # so the caller's rewards stay untouched.
            rewards = np.asarray(rewards, dtype=np.float32) \
                + intrinsic_reward.detach().numpy()

        returns = self.calc_R(done, rewards, values)

        next_v = T.zeros(1, 1) if done else self.forward(T.tensor(
                                        [new_state], dtype=T.float), hx)[1]
        # Build a new list instead of appending to the caller's.
        all_values = T.cat(list(values) + [next_v.detach()]).squeeze()
        log_probs = T.cat(log_probs)
        rewards = T.tensor(rewards, dtype=T.float)

        # The advantages are treated as constants in the actor loss.
        delta_t = (rewards + self.gamma * all_values[1:]
                   - all_values[:-1]).detach()

        # GAE via the backward recursion gae[t] = delta[t] + gamma*tau*gae[t+1],
        # which equals sum_k (gamma*tau)^k * delta[t+k].
        discount = self.gamma * self.tau
        gae = T.zeros_like(delta_t)
        running = T.zeros(())
        for t in reversed(range(len(delta_t))):
            running = delta_t[t] + discount * running
            gae[t] = running

        actor_loss = -(log_probs * gae).sum()
        # if single then values is rank 1 and returns rank 0
        # want to have same shape to avoid a warning
        critic_loss = F.mse_loss(all_values[:-1].squeeze(), returns)

        entropy_loss = (-log_probs * T.exp(log_probs)).sum()

        total_loss = actor_loss + critic_loss - 0.01 * entropy_loss
        return total_loss


In [None]:
import torch as T


class SharedAdam(T.optim.Adam):
    """Adam whose per-parameter state is pre-allocated and placed in shared memory.

    The state is created eagerly in `__init__` (instead of lazily on the first
    `step`) so that it can be moved to shared memory before worker processes
    are started.
    """

    def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)

        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                # Recent PyTorch releases require `step` to be a tensor; a plain
                # int makes `step()` raise "state_steps argument must contain a
                # list of singleton tensors".
                state['step'] = T.tensor(0.0)
                state['exp_avg'] = T.zeros_like(p.data)
                state['exp_avg_sq'] = T.zeros_like(p.data)

                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()

In [None]:
import numpy as np
import torch as T



def worker(name, input_shape, n_actions, global_agent,
           optimizer, env_id, n_threads, global_idx, global_icm,
           icm_optimizer, icm):
    """A3C worker loop: act with a local network, push gradients to the global one.

    Args:
        name: worker label; only the worker named '1' logs scores and plots.
        input_shape: `[channels, height, width]` used to build the local networks.
        n_actions: number of discrete actions.
        global_agent: shared ActorCritic that receives the gradients.
        optimizer: optimizer over `global_agent`'s parameters.
        env_id: environment id handed to `make_env`.
        n_threads: total worker count (used in the log line only).
        global_idx: not used by the active code (only by the commented-out
            counter update below).
        global_icm: shared ICM, or None when `icm` is False.
        icm_optimizer: optimizer over `global_icm`'s parameters, or None.
        icm: whether to use curiosity-driven intrinsic rewards.
    """
    # A learning update runs every T_MAX episode steps and at episode end.
    T_MAX = 20

    local_agent = ActorCritic(input_shape, n_actions)

    if icm:
        local_icm = ICM(input_shape, n_actions)
    else:
        local_icm = None
        intrinsic_reward = None

    memory = Memory()

    frame_buffer = [input_shape[1], input_shape[2], 1]
    # NOTE(review): make_env is defined outside this cell; the loop below assumes
    # it yields an env whose reset() returns the observation alone and whose
    # step() returns a 4-tuple — confirm.
    env = make_env(env_id, shape=frame_buffer)

    # `max_steps` bounds the number of EPISODES, despite its name.
    episode, max_steps, t_steps, scores = 0, 5e5, 0, []

    while episode < max_steps:
        obs = env.reset()
        score, done, ep_steps = 0, False, 0
        # Fresh recurrent state at the start of every episode.
        hx = T.zeros(1, 256)
        while not done:
            state = T.tensor([obs], dtype=T.float)
            action, value, log_prob, hx = local_agent(state, hx)
            obs_, reward, done, info = env.step(action)
            memory.remember(obs, action, obs_, reward, value, log_prob)
            score += reward
            obs = obs_
            ep_steps += 1
            t_steps += 1
            if ep_steps % T_MAX == 0 or done:
                states, actions, new_states, rewards, values, log_probs = \
                        memory.sample_memory()
                if icm:
                    intrinsic_reward, L_I, L_F = \
                            local_icm.calc_loss(states, new_states, actions)

                # `obs` already holds the newest state here, so it serves as the
                # bootstrap state.
                loss = local_agent.calc_cost(obs, hx, done, rewards,
                                             values, log_probs,
                                             intrinsic_reward)
                optimizer.zero_grad()
                # Also clear the LOCAL gradients: when the optimizer resets grads
                # to None (the default in recent PyTorch), the local .grad tensors
                # would otherwise keep accumulating across updates.
                local_agent.zero_grad()
                hx = hx.detach_()
                if icm:
                    icm_optimizer.zero_grad()
                    local_icm.zero_grad()
                    (L_I + L_F).backward()
                loss.backward()
                T.nn.utils.clip_grad_norm_(local_agent.parameters(), 40)
                # Hand the local gradients to the global network, step it, then
                # pull the updated weights back into the local copy.
                for local_param, global_param in zip(
                                        local_agent.parameters(),
                                        global_agent.parameters()):
                    global_param._grad = local_param.grad
                optimizer.step()
                local_agent.load_state_dict(global_agent.state_dict())

                if icm:
                    for local_param, global_param in zip(
                                        local_icm.parameters(),
                                        global_icm.parameters()):
                        global_param._grad = local_param.grad
                    icm_optimizer.step()
                    local_icm.load_state_dict(global_icm.state_dict())

                memory.clear_memory()
        episode += 1
        # with global_idx.get_lock():
        #    global_idx.value += 1
        if name == '1':
            scores.append(score)
            avg_score = np.mean(scores[-100:])
            avg_score_5000 = np.mean(scores[max(0, episode-5000): episode+1])
            print('ICM episode {} thread {} of {} steps {:.2f}M score {:.2f} '
                  'avg score (100) (5000) {:.2f} {:.2f}'.format(
                                                episode, name, n_threads,
                                                t_steps/1e6, score,
                                                avg_score, avg_score_5000))
    if name == '1':
        x = [z for z in range(episode)]
        plot_learning_curve(x, scores, 'ICM_hallway_final.png')

In [None]:
import numpy as np
import matplotlib.pyplot as plt


def plot_learning_curve(x, scores, figure_file):
    """Plot the running average of `scores` against `x` and save the figure.

    Args:
        x: x-axis values, one per score.
        scores: sequence of per-game scores.
        figure_file: path the figure is written to with `plt.savefig`.
    """
    window = 100
    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        # Average over at most the last `window` games up to and including i
        # (a lower bound of i - window would span window + 1 games).
        running_avg[i] = np.mean(scores[max(0, i - window + 1):(i+1)])
    plt.plot(x, running_avg)
    plt.title('Running average of previous 100 games')
    plt.savefig(figure_file)


In [None]:
import os
import torch.multiprocessing as mp

# NOTE(review): torch is presumably already imported by earlier cells when this
# runs; confirm that setting OMP_NUM_THREADS this late still has the intended effect.
os.environ['OMP_NUM_THREADS'] = '1'

# force=True keeps the cell re-runnable: without it a second execution raises
# RuntimeError because the start method has already been set.
mp.set_start_method('spawn', force=True)
global_ep = mp.Value('i', 0)
# env_id = 'PongNoFrameskip-v4'
# env_id = 'MiniWorld-Hallway-v0'
env_id = 'MiniWorld-FourRooms-v0'
n_threads = 12
n_actions = 3
input_shape = [4, 42, 42]
# Constructing ParallelEnv starts the worker processes and blocks until they finish.
env = ParallelEnv(env_id=env_id, num_threads=n_threads,
                  n_actions=n_actions, global_idx=global_ep,
                  input_shape=input_shape, icm=True)

In [None]:
import wandb
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense

import gym
import argparse
import numpy as np
from threading import Thread, Lock
from multiprocessing import cpu_count
# Switch the Keras default float type to float64 for everything built from here on.
# NOTE(review): the first code cell of the notebook sets 'float32'; this call
# overrides that setting for the rest of the kernel session.
tf.keras.backend.set_floatx('float64')


# Episode counter shared by all WorkerAgent threads (incremented in WorkerAgent.train).
CUR_EPISODE = 0


class Actor:
    """Policy network: maps a state vector to a probability for each action."""

    def __init__(self, state_dim, action_dim):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(0.001)
        # Weight of the entropy bonus in the loss.
        self.entropy_beta = 0.01

    def create_model(self):
        """Build the MLP; the final layer is a softmax, so outputs are probabilities."""
        return tf.keras.Sequential([
            Input((self.state_dim,)),
            Dense(32, activation='relu'),
            Dense(16, activation='relu'),
            Dense(self.action_dim, activation='softmax')
        ])

    def compute_loss(self, actions, logits, advantages):
        """Advantage-weighted policy loss minus an entropy bonus.

        Args:
            actions: actions that were taken, one per row.
            logits: the model output for the same rows. Despite the name these
                are softmax PROBABILITIES (see `create_model`).
            advantages: per-row advantage estimates, used as sample weights and
                treated as constants.
        """
        # The model already applies softmax, so the losses must not apply it a
        # second time: from_logits has to be False.
        ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=False)
        entropy_loss = tf.keras.losses.CategoricalCrossentropy(
            from_logits=False)
        actions = tf.cast(actions, tf.int32)
        policy_loss = ce_loss(
            actions, logits, sample_weight=tf.stop_gradient(advantages))
        # Cross-entropy of the distribution with itself is its entropy.
        entropy = entropy_loss(logits, logits)
        return policy_loss - self.entropy_beta * entropy

    def train(self, states, actions, advantages):
        """Run one gradient step on the batch and return the loss."""
        with tf.GradientTape() as tape:
            logits = self.model(states, training=True)
            loss = self.compute_loss(
                actions, logits, advantages)
        grads = tape.gradient(loss, self.model.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.model.trainable_variables))
        return loss


class Critic:
    """State-value network trained by regression onto TD targets."""

    def __init__(self, state_dim):
        self.state_dim = state_dim
        self.model = self.create_model()
        self.opt = tf.keras.optimizers.Adam(0.0005)

    def create_model(self):
        """Build the MLP: three ReLU layers (32, 16, 16) and a linear scalar output."""
        stack = [Input((self.state_dim,))]
        stack += [Dense(units, activation='relu') for units in (32, 16, 16)]
        stack.append(Dense(1, activation='linear'))
        return tf.keras.Sequential(stack)

    def compute_loss(self, v_pred, td_targets):
        """Mean squared error between the targets and the predicted values."""
        return tf.keras.losses.MeanSquaredError()(td_targets, v_pred)

    def train(self, states, td_targets):
        """Run one gradient step on the batch and return the loss.

        Predictions and targets must have identical shapes.
        """
        with tf.GradientTape() as tape:
            v_pred = self.model(states, training=True)
            assert v_pred.shape == td_targets.shape
            loss = self.compute_loss(v_pred, tf.stop_gradient(td_targets))
        weights = self.model.trainable_variables
        self.opt.apply_gradients(zip(tape.gradient(loss, weights), weights))
        return loss


class Agent:
    """Owns the global actor/critic and fans training out to one worker per CPU."""

    def __init__(self, env_name):
        # A throwaway environment is created just to read the space sizes.
        probe_env = gym.make(env_name)
        self.env_name = env_name
        self.state_dim = probe_env.observation_space.shape[0]
        self.action_dim = probe_env.action_space.n

        self.global_actor = Actor(self.state_dim, self.action_dim)
        self.global_critic = Critic(self.state_dim)
        self.num_workers = cpu_count()

    def train(self, max_episodes=1000):
        """Start one WorkerAgent thread per worker slot and wait for all of them."""
        workers = [
            WorkerAgent(gym.make(self.env_name), self.global_actor,
                        self.global_critic, max_episodes)
            for _ in range(self.num_workers)
        ]

        for thread in workers:
            thread.start()

        for thread in workers:
            thread.join()


class WorkerAgent(Thread):
    """A3C worker thread: acts with local networks, trains the shared global ones.

    Every 5 collected steps (or at episode end) the worker computes n-step TD
    targets and advantages, trains the global actor and critic on them, and
    copies the updated global weights back into its local networks.
    """

    # One lock shared by ALL workers. A lock created per instance would never be
    # contended, so it would not serialize updates to the global networks.
    _global_update_lock = Lock()

    def __init__(self, env, global_actor, global_critic, max_episodes):
        Thread.__init__(self)
        self.lock = WorkerAgent._global_update_lock
        self.env = env
        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.n

        self.max_episodes = max_episodes
        self.global_actor = global_actor
        self.global_critic = global_critic
        self.actor = Actor(self.state_dim, self.action_dim)
        self.critic = Critic(self.state_dim)

        # Start from the current global weights.
        self.actor.model.set_weights(self.global_actor.model.get_weights())
        self.critic.model.set_weights(self.global_critic.model.get_weights())

    def n_step_td_target(self, rewards, next_v_value, done):
        """Discounted (gamma = 0.99) n-step targets for a rollout.

        Args:
            rewards: array of per-step rewards, first axis is time.
            next_v_value: value estimate of the state after the last step
                (a single-element array or scalar); ignored when `done`.
            done: whether the rollout ended the episode.

        Returns:
            float64 array with the same shape as `rewards`.
        """
        # Force a float dtype: with integer rewards zeros_like would otherwise
        # produce an integer array and truncate every target.
        td_targets = np.zeros_like(rewards, dtype=np.float64)
        cumulative = 0
        if not done:
            cumulative = np.asarray(next_v_value, dtype=np.float64).reshape(-1)[0]

        for k in reversed(range(0, len(rewards))):
            cumulative = 0.99 * cumulative + rewards[k]
            td_targets[k] = cumulative
        return td_targets

    def advatnage(self, td_targets, baselines):
        """Advantage estimate: targets minus the critic's baseline values."""
        return td_targets - baselines

    def list_to_batch(self, list):
        """Stack a non-empty list of arrays along axis 0."""
        return np.concatenate(list, axis=0)

    def train(self):
        """Run episodes until the shared CUR_EPISODE counter exceeds `max_episodes`."""
        global CUR_EPISODE

        while self.max_episodes >= CUR_EPISODE:
            state_batch = []
            action_batch = []
            reward_batch = []
            episode_reward, done = 0, False

            reset_result = self.env.reset()
            # Newer gym versions return (observation, info); older ones return
            # the observation alone.
            state = reset_result[0] if isinstance(reset_result, tuple) \
                else reset_result

            while not done:
                # self.env.render()
                probs = self.actor.model.predict(
                    np.reshape(state, [1, self.state_dim]), verbose=False)
                action = np.random.choice(self.action_dim, p=probs[0])

                step_result = self.env.step(action)
                if len(step_result) == 5:
                    # Newer API: separate terminated / truncated flags.
                    next_state, reward, terminated, truncated, _ = step_result
                    done = terminated or truncated
                else:
                    next_state, reward, done, _ = step_result

                state = np.reshape(state, [1, self.state_dim])
                action = np.reshape(action, [1, 1])
                next_state = np.reshape(next_state, [1, self.state_dim])
                reward = np.reshape(reward, [1, 1])

                state_batch.append(state)
                action_batch.append(action)
                reward_batch.append(reward)

                if len(state_batch) >= 5 or done:
                    states = self.list_to_batch(state_batch)
                    actions = self.list_to_batch(action_batch)
                    rewards = self.list_to_batch(reward_batch)

                    next_v_value = self.critic.model.predict(next_state, verbose=False)
                    td_targets = self.n_step_td_target(
                        rewards, next_v_value, done)
                    baselines = self.critic.model.predict(states, verbose=False)
                    advantages = self.advatnage(td_targets, baselines)
                    
                    # Train the global networks and refresh the local copies
                    # while holding the lock shared by all workers.
                    with self.lock:
                        actor_loss = self.global_actor.train(
                            states, actions, advantages)
                        critic_loss = self.global_critic.train(
                            states, td_targets)

                        self.actor.model.set_weights(
                            self.global_actor.model.get_weights())
                        self.critic.model.set_weights(
                            self.global_critic.model.get_weights())

                    state_batch = []
                    action_batch = []
                    reward_batch = []

                episode_reward += reward[0][0]
                state = next_state[0]

            # The counter is shared by every worker thread; update it under the lock.
            with self.lock:
                print('EP{} EpisodeReward={}'.format(CUR_EPISODE, episode_reward))
                CUR_EPISODE += 1

    def run(self):
        """Thread entry point."""
        self.train()


def main():
    """Build an A3C Agent for CartPole-v1 and train it with the default episode budget."""
    Agent('CartPole-v1').train()



main()
