In [None]:
# -*- coding: utf-8 -*-

import numpy as np
import tensorflow as tf
import random
from collections import deque
import gym

# Shared Gym environment, used by both main() (training) and bot_play() (evaluation)
env = gym.make('CartPole-v0')
class DQN:
    """Small fully connected Q-network built on a TensorFlow 1.x graph.

    The graph maps a batch of states of width ``input_size`` through one
    tanh hidden layer to ``output_size`` Q values (one per action). Both
    layers are bias-free matrix multiplications. All variables are created
    under the variable scope ``name``.
    """

    def __init__(self, session, input_size, output_size, name="main"):
        """Store the session and layer sizes, then build the graph.

        Args:
            session: Session object used to run the graph in ``predict``
                and ``update``.
            input_size: Width of a single state vector.
            output_size: Number of Q values produced per state.
            name: Variable scope under which the weights are created.
        """
        self.session = session
        self.input_size = input_size
        self.output_size = output_size
        self.net_name = name
        self._build_network()

    def _build_network(self, h_size=10, l_rate=0.1):
        """Create placeholders, weights, the loss and the training op.

        Args:
            h_size: Number of units in the hidden layer.
            l_rate: Learning rate handed to the Adam optimizer.
        """
        with tf.variable_scope(self.net_name):
            # Batch of input states, one row per state
            self._X = tf.placeholder(
                dtype=tf.float32,
                shape=[None, self.input_size],
                name="input_x")

            # Hidden layer: input -> h_size units with tanh activation
            hidden_weights = tf.get_variable(
                "W1",
                shape=[self.input_size, h_size],
                initializer=tf.contrib.layers.xavier_initializer())
            hidden = tf.nn.tanh(tf.matmul(self._X, hidden_weights))

            # Output layer: hidden -> one linear Q value per action
            output_weights = tf.get_variable(
                "W2",
                shape=[h_size, self.output_size],
                initializer=tf.contrib.layers.xavier_initializer())
            self._Qpred = tf.matmul(hidden, output_weights)

        # Target Q values supplied by the caller during training
        self._Y = tf.placeholder(
            dtype=tf.float32, shape=[None, self.output_size])

        # Mean squared error between targets and predictions
        error = self._Y - self._Qpred
        self._loss = tf.reduce_mean(tf.square(error))

        # Single optimization step on the loss
        optimizer = tf.train.AdamOptimizer(learning_rate=l_rate)
        self._train = optimizer.minimize(self._loss)

    def predict(self, state):
        """Return the network's Q values for one state.

        ``state`` is reshaped to a single row of width ``input_size``
        before being fed to the graph.
        """
        batch_of_one = np.reshape(state, (1, self.input_size))
        feed = {self._X: batch_of_one}
        return self.session.run(self._Qpred, feed_dict=feed)

    def update(self, x_stack, y_stack):
        """Run one training step and return ``[loss, train_op_result]``.

        Args:
            x_stack: Batch of states fed to the input placeholder.
            y_stack: Matching batch of target Q values.
        """
        fetches = [self._loss, self._train]
        feed = {self._X: x_stack, self._Y: y_stack}
        return self.session.run(fetches, feed_dict=feed)
        

# Constant defining our neural network
input_size = env.observation_space.shape[0] # 4 states
output_size = env.action_space.n # 2 actions

# Discount factor applied to the best next-state Q value when building targets
dis = 0.9
# Maximum number of transitions kept in the replay buffer
REPLAY_MEMORY = 50000

def simple_replay_train(DQN, train_batch):
    """Build Q-learning targets for a batch of transitions and train once.

    For every transition the network's current prediction for ``state`` is
    used as the target, except for the entry of the taken ``action``: that
    entry becomes ``reward`` for terminal transitions, and
    ``reward + dis * max(Q(next_state))`` otherwise.

    Args:
        DQN: Network object exposing ``input_size``, ``output_size``,
            ``predict(state)`` and ``update(x_stack, y_stack)``.
        train_batch: Iterable of ``(state, action, reward, next_state,
            done)`` tuples.

    Returns:
        Whatever ``DQN.update`` returns for the stacked states and targets.
    """
    # Rows are collected and stacked once at the end; calling np.vstack
    # inside the loop would re-copy everything gathered so far on each
    # transition. The lists are seeded with empty (0, n) arrays so that an
    # empty batch still produces correctly shaped stacks and the dtype
    # promotion is the same as stacking onto an empty float array.
    state_rows = [np.empty(0).reshape(0, DQN.input_size)]
    target_rows = [np.empty(0).reshape(0, DQN.output_size)]

    # Get stored information from the buffer
    for state, action, reward, next_state, done in train_batch:
        Q = DQN.predict(state)

        if done:  # terminal: no future reward to bootstrap from
            Q[0, action] = reward
        else:
            # Bootstrap from the best Q value predicted for the next state
            Q[0, action] = reward + dis * np.max(DQN.predict(next_state))

        state_rows.append(state)
        target_rows.append(Q)

    x_stack = np.vstack(state_rows)
    y_stack = np.vstack(target_rows)

    # Train our network using target and predicted Q values
    return DQN.update(x_stack, y_stack)

def bot_play(mainDQN):
    """Play one rendered episode greedily and print the total reward.

    At every step the action with the highest predicted Q value is taken;
    the loop stops as soon as the environment reports ``done``.

    Args:
        mainDQN: Network object whose ``predict(state)`` returns Q values.
    """
    observation = env.reset()
    total_reward = 0
    finished = False
    while not finished:
        env.render()
        q_values = mainDQN.predict(observation)
        greedy_action = np.argmax(q_values)
        observation, step_reward, finished, _ = env.step(greedy_action)
        total_reward += step_reward

    print("Total score {} ".format(total_reward))

def main():
    """Train a DQN on the shared environment, then play one greedy episode.

    Each episode is played with an epsilon-greedy policy whose epsilon
    decays with the episode index, and every transition is stored in a
    bounded replay buffer. On every episode where ``episode % 10 == 1`` the
    network is trained on a series of random minibatches drawn from that
    buffer. After the last episode ``bot_play`` is called once.
    """
    max_episode = 5000
    batch_size = 10         # transitions per replay minibatch
    train_iterations = 50   # minibatch updates per training round

    # store the previous observations in the replay memory; maxlen makes the
    # deque discard the oldest transition automatically once it is full
    replay_buffer = deque(maxlen=REPLAY_MEMORY)

    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size)
        tf.global_variables_initializer().run()

        for episode in range(max_episode):
            # Exploration rate shrinks as training progresses
            e = 0.1 / ((episode / 10) + 1)
            done = False
            step_count = 0
            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()
                else:
                    action = np.argmax(mainDQN.predict(state))

                # Get new state and reward from environment
                next_state, reward, done, _ = env.step(action)
                if done:
                    # NOTE(review): this penalizes every episode end, which
                    # presumably includes endings caused by an environment
                    # step cap rather than failure - confirm intended.
                    reward = -100  # big penalty

                # Save the experience to our buffer
                replay_buffer.append(
                    (state, action, reward, next_state, done))

                state = next_state
                step_count += 1
                # good enough (number of steps the pole stayed up)
                if step_count > 10000:
                    break

            print("Episode: {} steps {}".format(episode, step_count))

            if episode % 10 == 1:  # train every 10 episodes
                # Never request more samples than the buffer holds;
                # random.sample raises ValueError otherwise.
                sample_size = min(batch_size, len(replay_buffer))
                for _ in range(train_iterations):
                    # Minibatch works better
                    minibatch = random.sample(replay_buffer, sample_size)
                    loss, _ = simple_replay_train(mainDQN, minibatch)
                print("Loss : ", loss)

        bot_play(mainDQN)

# Start training only when this file is executed as a script
if __name__ == "__main__":
    main()


  result = entry_point.load(False)


Episode: 0 steps 8
Episode: 1 steps 9
Loss :  719.8277
Episode: 2 steps 63
Episode: 3 steps 83
Episode: 4 steps 45
Episode: 5 steps 75
Episode: 6 steps 57
Episode: 7 steps 70
Episode: 8 steps 106
Episode: 9 steps 46
Episode: 10 steps 100
Episode: 11 steps 47
Loss :  8.106441
Episode: 12 steps 58
Episode: 13 steps 50
Episode: 14 steps 41
Episode: 15 steps 44
Episode: 16 steps 81
Episode: 17 steps 82
Episode: 18 steps 95
Episode: 19 steps 49
Episode: 20 steps 48
Episode: 21 steps 102
Loss :  11.816924
Episode: 22 steps 78
Episode: 23 steps 52
Episode: 24 steps 67
Episode: 25 steps 41
Episode: 26 steps 61
Episode: 27 steps 29
Episode: 28 steps 74
Episode: 29 steps 49
Episode: 30 steps 30
Episode: 31 steps 44
Loss :  482.64615
Episode: 32 steps 51
Episode: 33 steps 46
Episode: 34 steps 58
Episode: 35 steps 21
Episode: 36 steps 29
Episode: 37 steps 32
Episode: 38 steps 37
Episode: 39 steps 21
Episode: 40 steps 50
Episode: 41 steps 28
Loss :  3.7721772
Episode: 42 steps 68
Episode: 43 steps 

Loss :  517.1702
Episode: 362 steps 31
Episode: 363 steps 29
Episode: 364 steps 37
Episode: 365 steps 39
Episode: 366 steps 40
Episode: 367 steps 39
Episode: 368 steps 29
Episode: 369 steps 59
Episode: 370 steps 39
Episode: 371 steps 43
Loss :  11.95347
Episode: 372 steps 40
Episode: 373 steps 33
Episode: 374 steps 25
Episode: 375 steps 29
Episode: 376 steps 31
Episode: 377 steps 27
Episode: 378 steps 31
Episode: 379 steps 43
Episode: 380 steps 21
Episode: 381 steps 38
Loss :  470.28238
Episode: 382 steps 80
Episode: 383 steps 40
Episode: 384 steps 50
Episode: 385 steps 85
Episode: 386 steps 91
Episode: 387 steps 48
Episode: 388 steps 60
Episode: 389 steps 38
Episode: 390 steps 36
Episode: 391 steps 44
Loss :  0.8066845
Episode: 392 steps 29
Episode: 393 steps 22
Episode: 394 steps 34
Episode: 395 steps 82
Episode: 396 steps 53
Episode: 397 steps 27
Episode: 398 steps 20
Episode: 399 steps 69
Episode: 400 steps 35
Episode: 401 steps 76
Loss :  1.7399378
Episode: 402 steps 36
Episode: 4

Loss :  507.08344
Episode: 712 steps 70
Episode: 713 steps 79
Episode: 714 steps 34
Episode: 715 steps 37
Episode: 716 steps 59
Episode: 717 steps 32
Episode: 718 steps 29
Episode: 719 steps 24
Episode: 720 steps 47
Episode: 721 steps 49
Loss :  12.162249
Episode: 722 steps 55
Episode: 723 steps 91
Episode: 724 steps 54
Episode: 725 steps 109
Episode: 726 steps 35
Episode: 727 steps 35
Episode: 728 steps 91
Episode: 729 steps 80
Episode: 730 steps 55
Episode: 731 steps 70
Loss :  4.089474
Episode: 732 steps 126
Episode: 733 steps 109
Episode: 734 steps 56
Episode: 735 steps 47
Episode: 736 steps 49
Episode: 737 steps 200
Episode: 738 steps 46
Episode: 739 steps 34
Episode: 740 steps 50
Episode: 741 steps 70
Loss :  1.5010885
Episode: 742 steps 200
Episode: 743 steps 90
Episode: 744 steps 200
Episode: 745 steps 104
Episode: 746 steps 52
Episode: 747 steps 81
Episode: 748 steps 200
Episode: 749 steps 137
Episode: 750 steps 200
Episode: 751 steps 83
Loss :  3.9482503
Episode: 752 steps 38

Episode: 1059 steps 65
Episode: 1060 steps 71
Episode: 1061 steps 82
Loss :  15.812009
Episode: 1062 steps 107
Episode: 1063 steps 99
Episode: 1064 steps 78
Episode: 1065 steps 66
Episode: 1066 steps 88
Episode: 1067 steps 84
Episode: 1068 steps 66
Episode: 1069 steps 59
Episode: 1070 steps 87
Episode: 1071 steps 83
Loss :  15.8179455
Episode: 1072 steps 43
Episode: 1073 steps 46
Episode: 1074 steps 42
Episode: 1075 steps 48
Episode: 1076 steps 59
Episode: 1077 steps 63
Episode: 1078 steps 47
Episode: 1079 steps 38
Episode: 1080 steps 47
Episode: 1081 steps 79
Loss :  11.496169
Episode: 1082 steps 38
Episode: 1083 steps 53
Episode: 1084 steps 51
Episode: 1085 steps 51
Episode: 1086 steps 45
Episode: 1087 steps 63
Episode: 1088 steps 45
Episode: 1089 steps 80
Episode: 1090 steps 63
Episode: 1091 steps 71
Loss :  472.0372
Episode: 1092 steps 23
Episode: 1093 steps 15
Episode: 1094 steps 14
Episode: 1095 steps 18
Episode: 1096 steps 20
Episode: 1097 steps 18
Episode: 1098 steps 16
Episode