# DQN

* 기존의 Q Table은 잘 수렴하지만 Q-Network는 수렴하지 않는다. 이유는 아래와 같다.
    * Correlations between samples : state에 따라 받아오는 결과값은 다들 유사하다. 샘플 간 상호 연관성이 크다 보니 큰 특징을 찾아내지 못한다.
    * Non-stationary targets : target 값인 y가 q_pred와 같은 네트워크를 공유하기 때문에 가중치인 w가 바뀌다보면 target 값 y도 바뀌어버린다.
<br><br>
* 어떻게 해결했는가?
    * Go deep
    * Capture and replay : agent의 행동에 따라 돌려받는 값들을 바로 학습하지 않고 버퍼에 저장해 둔 후에 랜덤으로 샘플을 추출하여 학습한다. 랜덤 샘플링을 통해서 전체 그래프의 양상을 파악할 수 있다.
    * Separate networks (create a target network)

In [1]:
import numpy as np
import gym
import tensorflow as tf
import matplotlib.pyplot as plt
from collections import deque
import random

# CartPole environment shared by the training and playback functions below.
env = gym.make('CartPole-v0')
# Override the episode step cap; the training loops stop an episode themselves
# once step_count exceeds 10000. NOTE(review): this writes a private attribute
# of the gym wrapper - confirm it is honoured by the installed gym version.
env._max_episode_steps = 10001
# Network dimensions: observation vector length and number of discrete actions.
input_size = env.observation_space.shape[0]
output_size = env.action_space.n

# Discount factor applied to the best next-state Q-value when building targets.
dis = 0.9
# Maximum number of transitions kept in the replay buffer.
REPLAY_MEMORY = 50000 

In [2]:
class DQN:
    """Small two-layer Q-network built with the TensorFlow 1.x graph API.

    The network maps a state vector of length ``input_size`` to one Q-value
    per action (``output_size`` values). Both weight matrices are created
    inside a variable scope named ``name`` and use no bias terms.
    """

    def __init__(self, session, input_size, output_size, name='main'):
        """Store the session and dimensions, then build the graph.

        Args:
            session: object whose ``run`` method executes graph ops.
            input_size: length of a single state vector.
            output_size: number of actions (width of the Q output).
            name: variable scope under which the weights are created.
        """
        self.session, self.net_name = session, name
        self.input_size, self.output_size = input_size, output_size

        self._build_network()

    def _build_network(self, h_size=10, l_rate=1e-1):
        """Create the placeholders, weights, loss and training op.

        Args:
            h_size: number of units in the hidden (tanh) layer.
            l_rate: learning rate handed to the Adam optimizer.
        """
        n_in, n_out = self.input_size, self.output_size

        with tf.variable_scope(self.net_name):
            # Batch of states, one row per state.
            self._X = tf.placeholder(tf.float32, [None, n_in], name='input_x')

            # Hidden layer: Xavier-initialised weights followed by tanh.
            hidden_w = tf.get_variable(
                'W1', shape=[n_in, h_size],
                initializer=tf.contrib.layers.xavier_initializer())
            hidden = tf.nn.tanh(tf.matmul(self._X, hidden_w))

            # Output layer: linear projection to one Q-value per action.
            out_w = tf.get_variable(
                'W2', shape=[h_size, n_out],
                initializer=tf.contrib.layers.xavier_initializer())
            self._Qpred = tf.matmul(hidden, out_w)

        # Target Q-values, same width as the prediction.
        self._Y = tf.placeholder(shape=[None, n_out], dtype=tf.float32)

        # Mean squared error between target and predicted Q-values.
        squared_error = tf.square(self._Y - self._Qpred)
        self._loss = tf.reduce_mean(squared_error)

        optimizer = tf.train.AdamOptimizer(learning_rate=l_rate)
        self._train = optimizer.minimize(self._loss)

    def predict(self, state):
        """Return the Q-values for a single state.

        ``state`` is reshaped to ``[1, input_size]`` before being fed, so the
        result of running the prediction op covers exactly one row.
        """
        batch_of_one = np.reshape(state, [1, self.input_size])
        feed = {self._X: batch_of_one}
        return self.session.run(self._Qpred, feed_dict=feed)

    def update(self, x_stack, y_stack):
        """Run one optimisation step on a batch.

        Args:
            x_stack: stacked states fed to the input placeholder.
            y_stack: stacked target Q-values fed to the target placeholder.

        Returns:
            The result of running ``[loss, train_op]`` in the session.
        """
        feed = {self._X: x_stack, self._Y: y_stack}
        return self.session.run([self._loss, self._train], feed_dict=feed)

In [3]:
def simple_replay_train(DQN, train_batch):
    """Fit the network on a batch of stored transitions.

    For every ``(state, action, reward, next_state, done)`` tuple the current
    Q-row for ``state`` is predicted and the entry for ``action`` is replaced
    by the learning target: the raw reward for terminal transitions, otherwise
    ``reward + dis * max(Q(next_state))`` using the same network.

    Args:
        DQN: network object exposing ``input_size``, ``output_size``,
            ``predict`` and ``update``.
        train_batch: iterable of transition tuples.

    Returns:
        Whatever ``DQN.update`` returns for the stacked states and targets.
    """
    states, targets = [], []

    for state, action, reward, next_state, done in train_batch:
        q_row = DQN.predict(state)

        # Non-terminal transitions bootstrap from the best next-state value.
        if not done:
            reward = reward + dis * np.max(DQN.predict(next_state))
        q_row[0, action] = reward

        targets.append(q_row)
        states.append(state)

    # The empty seed rows keep the (0, width) shape and float64 dtype even
    # when the batch is empty.
    x_stack = np.vstack([np.empty((0, DQN.input_size))] + states)
    y_stack = np.vstack([np.empty((0, DQN.output_size))] + targets)

    return DQN.update(x_stack, y_stack)

def bot_play(mainDQN):
    """Play one rendered episode greedily and print the accumulated reward.

    Uses the module-level ``env``. At every step the action with the highest
    predicted Q-value is taken; the loop ends when the environment reports
    that the episode is done.
    """
    state = env.reset()
    total_reward = 0
    finished = False

    while not finished:
        env.render()
        q_values = mainDQN.predict(state)
        state, step_reward, finished, _ = env.step(np.argmax(q_values))
        total_reward += step_reward

    print('Total score : {}'.format(total_reward))
            
def main():
    """Train a single-network DQN on the module-level ``env``, then replay it.

    Each episode is played with an epsilon-greedy policy and every transition
    is stored in a bounded replay buffer. On episodes 1, 11, 21, ... the
    network is trained on 50 randomly sampled mini-batches. After the last
    episode the learned policy is rendered once with ``bot_play``.
    """
    max_episodes = 5000
    batch_size = 10

    # A bounded deque drops the oldest transition by itself once it is full,
    # so no manual popleft bookkeeping is needed.
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size)
        tf.global_variables_initializer().run()

        for episode in range(max_episodes):
            # Exploration rate: 1.0 on the first episode, decaying as
            # 1 / (episode / 10 + 1).
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0

            state = env.reset()

            while not done:
                if np.random.rand() < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)
                if done:
                    # Penalise the transition that ended the episode.
                    # NOTE(review): this also fires if `done` is caused by the
                    # step cap rather than a failure - confirm that is intended.
                    reward = -60

                replay_buffer.append((state, action, reward, next_state, done))

                state = next_state
                step_count += 1
                # Safety stop for episodes that never terminate on their own.
                if step_count > 10000:
                    break

            print("Episode : {}  steps : {}".format(episode, step_count))

            if episode % 10 == 1:
                for _ in range(50):
                    # Never ask for more samples than are stored;
                    # random.sample raises ValueError otherwise.
                    sample_size = min(batch_size, len(replay_buffer))
                    minibatch = random.sample(replay_buffer, sample_size)
                    loss, _ = simple_replay_train(mainDQN, minibatch)

                print("Loss : ", loss)

        bot_play(mainDQN)
        
# Entry point: start the single-network training run when this code is
# executed as the main module.
if __name__ == '__main__':
    main()

W1204 22:04:14.614483 18868 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



Episode : 0  steps : 11
Episode : 1  steps : 30
Loss :  11.291983
Episode : 2  steps : 38
Episode : 3  steps : 52
Episode : 4  steps : 57
Episode : 5  steps : 18
Episode : 6  steps : 61
Episode : 7  steps : 77
Episode : 8  steps : 76
Episode : 9  steps : 44
Episode : 10  steps : 78
Episode : 11  steps : 74
Loss :  181.77788
Episode : 12  steps : 11
Episode : 13  steps : 12
Episode : 14  steps : 12
Episode : 15  steps : 10
Episode : 16  steps : 12
Episode : 17  steps : 10
Episode : 18  steps : 13
Episode : 19  steps : 12
Episode : 20  steps : 13
Episode : 21  steps : 12
Loss :  4.1722465
Episode : 22  steps : 13
Episode : 23  steps : 11
Episode : 24  steps : 11
Episode : 25  steps : 11
Episode : 26  steps : 13
Episode : 27  steps : 10
Episode : 28  steps : 9
Episode : 29  steps : 10
Episode : 30  steps : 8
Episode : 31  steps : 10
Loss :  1.7010094
Episode : 32  steps : 34
Episode : 33  steps : 16
Episode : 34  steps : 48
Episode : 35  steps : 27
Episode : 36  steps : 24
Episode : 37  s

Loss :  1.3983499
Episode : 302  steps : 24
Episode : 303  steps : 53
Episode : 304  steps : 25
Episode : 305  steps : 24
Episode : 306  steps : 36
Episode : 307  steps : 26
Episode : 308  steps : 29
Episode : 309  steps : 21
Episode : 310  steps : 26
Episode : 311  steps : 30
Loss :  181.6988
Episode : 312  steps : 81
Episode : 313  steps : 46
Episode : 314  steps : 40
Episode : 315  steps : 29
Episode : 316  steps : 24
Episode : 317  steps : 28
Episode : 318  steps : 41
Episode : 319  steps : 45
Episode : 320  steps : 109
Episode : 321  steps : 38
Loss :  1.1291823
Episode : 322  steps : 10
Episode : 323  steps : 9
Episode : 324  steps : 8
Episode : 325  steps : 8
Episode : 326  steps : 10
Episode : 327  steps : 9
Episode : 328  steps : 10
Episode : 329  steps : 10
Episode : 330  steps : 9
Episode : 331  steps : 9
Loss :  187.00397
Episode : 332  steps : 28
Episode : 333  steps : 22
Episode : 334  steps : 62
Episode : 335  steps : 31
Episode : 336  steps : 17
Episode : 337  steps : 2

KeyboardInterrupt: 

## Separate networks

In [3]:
def replay_train(mainDQN, targetDQN, train_batch):
    """Fit ``mainDQN`` on a batch, bootstrapping from ``targetDQN``.

    For every ``(state, action, reward, next_state, done)`` tuple the Q-row
    predicted by ``mainDQN`` gets its ``action`` entry replaced by the target:
    the raw reward for terminal transitions, otherwise
    ``reward + dis * max(targetDQN(next_state))``. Only ``mainDQN`` is updated.

    Args:
        mainDQN: network being trained; must expose ``input_size``,
            ``output_size``, ``predict`` and ``update``.
        targetDQN: network used only to evaluate ``next_state``.
        train_batch: iterable of transition tuples.

    Returns:
        Whatever ``mainDQN.update`` returns for the stacked states and targets.
    """
    states, targets = [], []

    for state, action, reward, next_state, done in train_batch:
        q_row = mainDQN.predict(state)

        # Non-terminal transitions bootstrap from the target network.
        if not done:
            reward = reward + dis * np.max(targetDQN.predict(next_state))
        q_row[0, action] = reward

        targets.append(q_row)
        states.append(state)

    # The empty seed rows keep the (0, width) shape and float64 dtype even
    # when the batch is empty.
    x_stack = np.vstack([np.empty((0, mainDQN.input_size))] + states)
    y_stack = np.vstack([np.empty((0, mainDQN.output_size))] + targets)

    return mainDQN.update(x_stack, y_stack)

def get_copy_var_ops(*, dest_scope_name='target', src_scope_name='main'):
    """Build the assign ops that copy one scope's weights into another.

    Trainable variables are collected from both scopes and paired by
    position; each destination variable is assigned the value of its source
    counterpart. Pairing stops at the shorter of the two collections.

    Args:
        dest_scope_name: scope whose variables are overwritten.
        src_scope_name: scope whose variable values are copied.

    Returns:
        A list of assign ops, to be executed with ``session.run``.
    """
    # Only trainable variables (the weights) are fetched from each scope.
    trainable_key = tf.GraphKeys.TRAINABLE_VARIABLES
    sources = tf.get_collection(trainable_key, scope=src_scope_name)
    destinations = tf.get_collection(trainable_key, scope=dest_scope_name)

    return [dst.assign(src.value()) for src, dst in zip(sources, destinations)]

def bot_play(mainDQN):
    """Render one greedy episode with ``mainDQN`` and print its total reward.

    The module-level ``env`` is reset, then stepped with the arg-max action of
    the predicted Q-values until the environment signals the episode is over.
    """
    observation = env.reset()
    score = 0
    episode_over = False

    while not episode_over:
        env.render()
        best_action = np.argmax(mainDQN.predict(observation))
        observation, gained, episode_over, _ = env.step(best_action)
        score += gained

    print('Total score : {}'.format(score))

def main_s():
    """Train a DQN with a separate target network, then replay the policy.

    Two networks are built under the scopes ``main`` and ``target``. Episodes
    are played epsilon-greedily with the main network and every transition is
    stored in a bounded replay buffer. On episodes 1, 11, 21, ... the main
    network is trained on 50 random mini-batches whose targets come from the
    target network, after which the main weights are copied into the target
    network. After the last episode the policy is rendered with ``bot_play``.
    """
    max_episodes = 2000
    batch_size = 10

    # A bounded deque drops the oldest transition by itself once it is full,
    # so no manual popleft bookkeeping is needed.
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size, name='main')
        targetDQN = DQN(sess, input_size, output_size, name='target')
        tf.global_variables_initializer().run()

        copy_ops = get_copy_var_ops(dest_scope_name='target',
                                    src_scope_name='main')
        # Start with both networks holding identical weights.
        sess.run(copy_ops)

        for episode in range(max_episodes):
            # Exploration rate: 1.0 on the first episode, decaying as
            # 1 / (episode / 10 + 1).
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0

            state = env.reset()

            while not done:
                if np.random.rand() < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)
                if done:
                    # Penalise the transition that ended the episode.
                    # NOTE(review): this also fires if `done` is caused by the
                    # step cap rather than a failure - confirm that is intended.
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))

                state = next_state
                step_count += 1
                # Safety stop for episodes that never terminate on their own.
                if step_count > 10000:
                    break

            print("Episode : {}  steps : {}".format(episode, step_count))

            if episode % 10 == 1:
                for _ in range(50):
                    # Never ask for more samples than are stored;
                    # random.sample raises ValueError otherwise.
                    sample_size = min(batch_size, len(replay_buffer))
                    minibatch = random.sample(replay_buffer, sample_size)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)

                print("Loss : ", loss)
                # Refresh the target network only after a full training round
                # so the targets stay fixed while the main network is fitted.
                sess.run(copy_ops)
        bot_play(mainDQN)
        
# Entry point: start the target-network training run when this code is
# executed as the main module.
if __name__ == '__main__':
    main_s()

W1204 23:13:45.326853  7452 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



Episode : 0  steps : 24
Episode : 1  steps : 12
Loss :  28.92364
Episode : 2  steps : 34
Episode : 3  steps : 35
Episode : 4  steps : 55
Episode : 5  steps : 29
Episode : 6  steps : 18
Episode : 7  steps : 37
Episode : 8  steps : 38
Episode : 9  steps : 48
Episode : 10  steps : 105
Episode : 11  steps : 93
Loss :  6.790284
Episode : 12  steps : 25
Episode : 13  steps : 55
Episode : 14  steps : 21
Episode : 15  steps : 31
Episode : 16  steps : 35
Episode : 17  steps : 27
Episode : 18  steps : 37
Episode : 19  steps : 24
Episode : 20  steps : 40
Episode : 21  steps : 22
Loss :  3.163361
Episode : 22  steps : 19
Episode : 23  steps : 28
Episode : 24  steps : 50
Episode : 25  steps : 134
Episode : 26  steps : 37
Episode : 27  steps : 31
Episode : 28  steps : 35
Episode : 29  steps : 49
Episode : 30  steps : 33
Episode : 31  steps : 47
Loss :  1.8652792
Episode : 32  steps : 73
Episode : 33  steps : 28
Episode : 34  steps : 37
Episode : 35  steps : 19
Episode : 36  steps : 46
Episode : 37  

Loss :  1532.7422
Episode : 302  steps : 40
Episode : 303  steps : 57
Episode : 304  steps : 27
Episode : 305  steps : 41
Episode : 306  steps : 71
Episode : 307  steps : 30
Episode : 308  steps : 49
Episode : 309  steps : 33
Episode : 310  steps : 44
Episode : 311  steps : 40
Loss :  988.2869
Episode : 312  steps : 25
Episode : 313  steps : 49
Episode : 314  steps : 23
Episode : 315  steps : 29
Episode : 316  steps : 40
Episode : 317  steps : 29
Episode : 318  steps : 30
Episode : 319  steps : 25
Episode : 320  steps : 26
Episode : 321  steps : 41
Loss :  9.574621
Episode : 322  steps : 29
Episode : 323  steps : 28
Episode : 324  steps : 40
Episode : 325  steps : 32
Episode : 326  steps : 45
Episode : 327  steps : 44
Episode : 328  steps : 46
Episode : 329  steps : 49
Episode : 330  steps : 24
Episode : 331  steps : 22
Loss :  1.6887375
Episode : 332  steps : 33
Episode : 333  steps : 58
Episode : 334  steps : 25
Episode : 335  steps : 42
Episode : 336  steps : 25
Episode : 337  steps

Episode : 596  steps : 2489
Episode : 597  steps : 749
Episode : 598  steps : 71
Episode : 599  steps : 49
Episode : 600  steps : 67
Episode : 601  steps : 115
Loss :  6.806156
Episode : 602  steps : 9
Episode : 603  steps : 9
Episode : 604  steps : 8
Episode : 605  steps : 8
Episode : 606  steps : 9
Episode : 607  steps : 9
Episode : 608  steps : 9
Episode : 609  steps : 9
Episode : 610  steps : 10
Episode : 611  steps : 10
Loss :  2.678418
Episode : 612  steps : 40
Episode : 613  steps : 34
Episode : 614  steps : 42
Episode : 615  steps : 28
Episode : 616  steps : 21
Episode : 617  steps : 31
Episode : 618  steps : 26
Episode : 619  steps : 34
Episode : 620  steps : 28
Episode : 621  steps : 33
Loss :  2.5745702
Episode : 622  steps : 65
Episode : 623  steps : 70
Episode : 624  steps : 44
Episode : 625  steps : 67
Episode : 626  steps : 44
Episode : 627  steps : 53
Episode : 628  steps : 35
Episode : 629  steps : 62
Episode : 630  steps : 50
Episode : 631  steps : 48
Loss :  2.183765

Episode : 891  steps : 71
Loss :  1.1378552
Episode : 892  steps : 265
Episode : 893  steps : 221
Episode : 894  steps : 155
Episode : 895  steps : 82
Episode : 896  steps : 98
Episode : 897  steps : 147
Episode : 898  steps : 256
Episode : 899  steps : 368
Episode : 900  steps : 187
Episode : 901  steps : 181
Loss :  2.15186
Episode : 902  steps : 63
Episode : 903  steps : 113
Episode : 904  steps : 97
Episode : 905  steps : 64
Episode : 906  steps : 142
Episode : 907  steps : 107
Episode : 908  steps : 77
Episode : 909  steps : 81
Episode : 910  steps : 111
Episode : 911  steps : 70
Loss :  1.2014258
Episode : 912  steps : 56
Episode : 913  steps : 108
Episode : 914  steps : 49
Episode : 915  steps : 52
Episode : 916  steps : 60
Episode : 917  steps : 66
Episode : 918  steps : 62
Episode : 919  steps : 51
Episode : 920  steps : 106
Episode : 921  steps : 66
Loss :  546.93713
Episode : 922  steps : 39
Episode : 923  steps : 34
Episode : 924  steps : 25
Episode : 925  steps : 65
Episod