In [1]:
import os
import tensorflow as tf
import numpy as np

In [2]:
class DQN:
    """Fully connected Q-network built with the TensorFlow 1.x graph API.

    The network maps an observation to one Q-value per action through two
    ReLU hidden layers and is trained with Adam on the mean squared error
    between its output and a caller-supplied target.

    Parameters
    ----------
    learning_rate : float
        Step size handed to ``tf.train.AdamOptimizer``.
    num_actions : int
        Width of the output layer (one Q-value per action).
    name : str
        Variable scope under which all graph nodes are created.
    input_dims : int or sequence of int
        Shape of a single observation, without the batch dimension.
    fc1_dims, fc2_dims : int
        Number of units in the first / second hidden layer.
    chkpt_dir : str
        Directory in which ``dqn.ckpt`` is written and read.
    """

    def __init__(self, learning_rate, num_actions, name, input_dims,
                 fc1_dims=256, fc2_dims=256, chkpt_dir='tmp/dqn'):
        self.learning_rate = learning_rate
        self.num_actions = num_actions
        self.name = name
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.chkpt_dir = chkpt_dir
        self.checkpoint_file = os.path.join(chkpt_dir, 'dqn.ckpt')

        self.sess = tf.Session()
        self.build_network()
        # Variables exist only after build_network(), so initialise afterwards.
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

    def build_network(self):
        """Create placeholders, the dense layers, the loss and the train op."""
        # Accept both a bare int (flat observation) and a sequence of ints.
        try:
            feature_shape = list(self.input_dims)
        except TypeError:
            feature_shape = [self.input_dims]

        with tf.variable_scope(self.name):
            self.input = tf.placeholder(tf.float32, shape=[None, *feature_shape], name="inputs")
            self.actions = tf.placeholder(tf.float32, shape=[None, self.num_actions], name="actions")
            self.q_target = tf.placeholder(tf.float32, shape=[None, self.num_actions], name="q_target")

            flatten = tf.layers.flatten(self.input)
            dense_1 = tf.layers.dense(flatten, units=self.fc1_dims, activation=tf.nn.relu)
            dense_2 = tf.layers.dense(dense_1, units=self.fc2_dims, activation=tf.nn.relu)
            # Linear output layer: raw Q-value estimates, no activation.
            self.q_values = tf.layers.dense(dense_2, units=self.num_actions)
            self.loss = tf.reduce_mean(tf.square(self.q_values - self.q_target))
            self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    def load_checkpoint(self):
        """Restore the session's variables from ``self.checkpoint_file``."""
        print("... Loading Checkpoint ...")
        self.saver.restore(self.sess, self.checkpoint_file)

    def save_checkpoint(self):
        """Write the session's variables to ``self.checkpoint_file``."""
        # Saver.save() does not create missing parent directories.
        os.makedirs(self.chkpt_dir, exist_ok=True)
        self.saver.save(self.sess, self.checkpoint_file)

In [None]:
class Agent:
    def __init__(self, learning_rate, discount, memory_size, num_actions, epsilon, batch_size, input_dims, epsilon_start=0.99, epsilon_end=0.01, q_dir='tmp/q'):
        self.action_space = [i for i in range(num_actions)]
        self.num_actions = num_actions
        self.discount = discount
        self.memory_size = memory_size
        self.memory_counter = 0
        self.epsilon = epsilon
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.batch_size = batch_size
        self.q_eval = DQN(learning_rate, num_actions, input_dims, name='q_eval', chkpt_dir=q_dir)
        self.state_memory = np.zeros((self.memory_size, *input_dims))
        self.new_state_memory = np.zeros((self.memory_size, *input_dims))
        self.action_memory = np.zeros((self.memory_size, self.num_actions), dtype=np.int8)
        self.reward_memory = np.zeros(self.memory_size)
        self.terminal_memory = np.zeros(self.memory_size, dtype=np.int8)
    
    def store_transition(self, state, action, reward, new_state, terminal):
        index = self.memory_counter % self.memory_size
        self.state_memory[index] = state
        self.new_state_memory[index] = new_state
        self.reward_memory[index] = reward
        actions = np.zeros(self.num_actions)
        actions[action] = 1.0
        self.action_memory[index] = actions
        self.terminal_memory = 1 - terminal