# DQN algorithms

In [None]:
import tensorflow as tf
import numpy as np

## replay buffer

In [None]:
class Replay_buffer():
    """Fixed-capacity circular store of transitions, one flattened row each.

    A row is ``np.hstack((obs, action, reward, obs_, done))`` cast to
    float32, so the column count given in ``buffer_size`` has to equal the
    flattened length of that concatenation.
    """

    def __init__(self, buffer_size=(1024, 7)):
        """Allocate a zero-filled buffer.

        Args:
            buffer_size: Two-item sequence ``(rows, columns)``. The default
                is an immutable tuple so it cannot be mutated between
                instances; lists are still accepted.
        """
        self.row_cap, self.col_cap = buffer_size
        self.buffer = np.zeros(shape=(self.row_cap, self.col_cap), dtype=np.float32)
        # Number of transitions ever added; never wrapped, so it also tells
        # how many rows currently hold real data.
        self.pointer = 0

    def add_exp(self, obs, action, reward, obs_, done):
        """Store one transition, overwriting the oldest row once full.

        Args:
            obs: Observation before the action.
            action: Action taken.
            reward: Reward received.
            obs_: Observation after the action.
            done: Episode-termination flag.

        Returns:
            bool: True as soon as every row holds a real transition, i.e.
            from the ``row_cap``-th insertion onwards.
        """
        exp = np.hstack((obs, action, reward, obs_, done)).astype(np.float32)
        self.buffer[self.pointer % self.row_cap, :] = exp
        # Count the insertion before testing fullness so the call that
        # fills the last free row already reports the buffer as ready.
        self.pointer += 1
        return self.pointer >= self.row_cap

    def get_minibatch(self, batch_size):
        """Sample ``batch_size`` stored rows uniformly, with replacement.

        Only rows that have actually been written are eligible, so a
        partially filled buffer never yields its zero-initialised rows.

        Args:
            batch_size: Number of rows to draw.

        Returns:
            np.ndarray: Array of shape ``(batch_size, col_cap)``.

        Raises:
            ValueError: If no transition has been added yet.
        """
        filled = min(self.pointer, self.row_cap)
        if filled == 0:
            raise ValueError("cannot sample from an empty replay buffer")
        idx = np.random.randint(filled, size=batch_size)
        return self.buffer[idx, :]
    

## network

In [None]:
class Net_model(tf.keras.Model):
    """Three-layer fully connected network: 128 -> 64 -> output_layer_dim.

    The two hidden layers are followed by ReLU; the output layer is left
    linear.
    """

    def __init__(self, output_layer_dim):
        """Create the layers.

        Args:
            output_layer_dim: Number of units in the final Dense layer.
        """
        super().__init__()

        self.layer0 = tf.keras.layers.Dense(units=128)
        self.layer1 = tf.keras.layers.Dense(units=64)
        self.layer2 = tf.keras.layers.Dense(units=output_layer_dim)

    def call(self, inputs):
        """Run the forward pass and return the output layer's activations."""
        hidden = inputs
        # Hidden stack: Dense followed by ReLU, applied in declaration order.
        for dense in (self.layer0, self.layer1):
            hidden = tf.nn.relu(dense(hidden))
        return self.layer2(hidden)
    
    

## DQN agent

In [None]:
class DQN():
    """DQN agent: epsilon-greedy policy, replay buffer and a target network.

    ``eval_net`` is trained on minibatches drawn from the replay buffer;
    ``target_net`` supplies the bootstrap values and receives a copy of
    ``eval_net``'s weights every ``sync_weights_per_times`` learning steps.
    """

    def __init__(self, action_n, observation_dim, buffer_size=(1024, 7)):
        """Build the networks, replay buffer, optimizer and hyperparameters.

        Args:
            action_n: Number of discrete actions (output units of both nets).
            observation_dim: Length of one flattened observation.
            buffer_size: Two-item sequence ``(rows, columns)`` forwarded to
                ``Replay_buffer``. ``learning`` slices rows as
                ``[obs | action | reward | obs_ | done]``, so columns should
                be ``2 * observation_dim + 3``.
        """
        self.replay_buffer = Replay_buffer(buffer_size)
        self.eval_net = Net_model(action_n)
        self.target_net = Net_model(action_n)

        # hyperparameters
        self.gamma = 0.8
        self.epsilon = 1.0
        self.eps_decrs = 1e-6
        self.min_eps = 0.05
        self.lr = 1e-3
        # One minibatch is 1/32 of the buffer rows, but never empty.
        self.batch_size = max(1, buffer_size[0] // 32)
        self.obs_dim = observation_dim
        self.action_n = action_n
        self.learning_times = 0
        self.sync_weights_per_times = 100
        self.sync_weights_times = 0

        # A single optimizer instance is kept for the agent's lifetime so
        # that Adam's moment estimates and step counter accumulate across
        # updates instead of being reset on every learning step.
        # Note: the learning rate is read from self.lr here, once.
        self.optimizer = tf.optimizers.Adam(self.lr)

        self.agent_status = {"buffer_ready": False,
                             "learn_times": self.learning_times,
                             "sync_times": self.sync_weights_times,
                             "loss": None,
                             "eps_cur": None}

    def get_action(self, obs, eps_greedy=True):
        """Choose an action for ``obs``.

        Epsilon starts at ``self.epsilon``, decreases by ``self.eps_decrs``
        per completed learning step and is floored at ``self.min_eps``.

        Args:
            obs: A single observation (no batch axis).
            eps_greedy: When False, always take the greedy action.

        Returns:
            int: Index of the chosen action.
        """
        eps = max(self.epsilon - self.eps_decrs * self.learning_times, self.min_eps)
        self.agent_status["eps_cur"] = eps
        if np.random.uniform() < eps and eps_greedy:
            action = np.random.randint(self.action_n)
        else:
            obs = tf.constant(obs, dtype=tf.float32)
            # The network expects a batch axis; add one and strip it again.
            obs = tf.expand_dims(obs, axis=0)
            action = tf.argmax(self.eval_net(obs)[0])
        return int(action)

    def add_exp(self, obs, action, reward, obs_, done):
        """Store a transition and return the buffer's readiness flag."""
        ready = self.replay_buffer.add_exp(obs, action, reward, obs_, done)
        self.agent_status["buffer_ready"] = ready
        return ready

    def learning(self, ):
        """Run one gradient step on ``eval_net`` from a sampled minibatch.

        Also copies the weights to ``target_net`` every
        ``sync_weights_per_times`` calls (including the first) and refreshes
        ``agent_status``.
        """
        minibatch = self.replay_buffer.get_minibatch(self.batch_size)
        # Row layout: [obs | action | reward | obs_ | done].
        obs = minibatch[:, 0:self.obs_dim]
        action = minibatch[:, self.obs_dim]
        reward = minibatch[:, self.obs_dim+1]
        obs_ = minibatch[:, -self.obs_dim-1:-1]
        done = minibatch[:, -1]

        # Bootstrapped target; (1 - done) removes the future term for
        # terminal transitions. Computed outside the tape, so no gradient
        # flows into the target network.
        q_target = reward + self.gamma * tf.reduce_max(self.target_net(obs_), axis=1) * (1 - done)

        action = tf.cast(action, dtype=tf.int32)
        action_onehot = tf.one_hot(action, depth=self.action_n, dtype=tf.float32)

        with tf.GradientTape() as tape:
            q_value = self.eval_net(obs)
            # Keep only the Q-value of the action that was actually taken.
            q_value = tf.reduce_sum(q_value * action_onehot, axis=1)

            loss = tf.losses.mean_squared_error(q_target, q_value)

        grads = tape.gradient(target=loss, sources=self.eval_net.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.eval_net.trainable_variables))

        # Checked before the counter is incremented, so the first sync
        # happens on the very first call; both nets have been called above
        # and therefore have weights to copy.
        if self.learning_times % self.sync_weights_per_times == 0:
            self.__sync_weights_to_target()

        self.learning_times += 1
        # Report the number of completed learning steps.
        self.agent_status["learn_times"] = self.learning_times
        self.agent_status["loss"] = float(loss)

    def __sync_weights_to_target(self, ):
        """Copy ``eval_net``'s weights into ``target_net`` and count the sync."""
        self.target_net.set_weights(self.eval_net.get_weights())
        self.sync_weights_times += 1
        # Report the number of completed syncs.
        self.agent_status["sync_times"] = self.sync_weights_times
        
    

# Test: CartPole-v1

In [None]:
import gym

# Train a DQN agent on CartPole-v1 indefinitely, evaluating it greedily
# every `evaluate_per_episode` episodes once the replay buffer is ready.
env = gym.make("CartPole-v1")
action_n = env.action_space.n
obs_dim = env.observation_space.shape[0]
# One buffer row holds obs + action + reward + next obs + done.
agent = DQN(action_n=action_n, observation_dim=obs_dim, buffer_size=[2048, obs_dim*2+3])

step = 0
learning_per_step = 1
episode = 0
evaluate_per_episode = 10
evaluate_times = 10

# The loop has no exit condition of its own; it stops only when interrupted
# or on an error. try/finally makes sure the environment is closed then.
try:
    while True:
        # train
        # NOTE(review): this unpacks a bare observation from reset() and a
        # 4-tuple from step(); newer gym/gymnasium releases return
        # (obs, info) and a 5-tuple -- confirm the installed version.
        obs = env.reset()
        done = False
        while not done:
            a = agent.get_action(obs)
            obs_, reward, done, _ = env.step(a)
            ready = agent.add_exp(obs, a, reward, obs_, done)
            obs = obs_

            if ready and step % learning_per_step == 0:
                agent.learning()
            step += 1

            print("\r** agent status: {}".format(agent.agent_status), end='', flush=True)

        episode += 1

        # evaluate
        if ready and episode % evaluate_per_episode == 0:
            print("\n-------------------- evaluating -----------------------\n")
            total_reward = 0
            for _ in range(evaluate_times):
                obs = env.reset()
                done = False
                while not done:
                    # Greedy policy only: no exploration during evaluation.
                    a = agent.get_action(obs, eps_greedy=False)
                    env.render()
                    obs_, reward, done, _ = env.step(a)
                    total_reward += reward
                    obs = obs_
            # Mean episode return over the evaluation runs.
            agent.agent_status["reward"] = total_reward / evaluate_times
finally:
    env.close()