In [10]:
import gym
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from collections import deque

ENV_NAME = "CartPole-v0"

# FEAR MODEL
fear_on = False
fear_radius = 5
fear_factor = 0.5
fear_linear = 1000000
fear_warmup = 20000

# DQN
GAMMA = 0.9 # Discount factor
INITIAL_EPSILON = 0.5 # Initial value for epsilon-greedy
FINAL_EPSILON = 0.01 # Final value for epsilon-greedy
EXPLORATION_STEPS = 10000 # Number of steps till epsilon reaches its final value
MEMORY_REPLAY_SIZE = 10000
BATCH_SIZE = 32
OPTIMIZER_LEARNING_RATE = 0.0001

class Agent():
    def __init__(self, env):
        self.num_actions = env.action_space.n
        self.num_observations = env.observation_space.shape[0]
        self.epsilon = INITIAL_EPSILON
        self.epsilon_step = (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORATION_STEPS
        self.time_step = 0
        
        # Create memory buffer for replay
        self.memory_replay = deque()
        
        # Create Q network and define loss
        self.state_input, self.q_values = self.build_network()
        self.action_input, self.y_input, self.loss, self.optimizer = self.build_training_ops()
        
        # Create session
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())
        
    def build_network(self):
        W1 = tf.Variable(tf.truncated_normal([self.num_observations,20]))
        b1 = tf.Variable(0.01, dtype=tf.float32)
        W2 = tf.Variable(tf.truncated_normal([20,self.num_actions]))
        b2 = tf.Variable(0.01, dtype=tf.float32)
        
        state_input = tf.placeholder(dtype=tf.float32)
        hidden_layer = tf.nn.relu(tf.matmul(state_input,W1) + b1)
        q_values = tf.matmul(hidden_layer, W2) + b2
        
        return state_input, q_values
        
    def build_training_ops(self): #action_input, y_input
        action_input = tf.placeholder(dtype=tf.int32)
        y_input = tf.placeholder(dtype=tf.float32)
        
        action_one_hot = tf.one_hot(action_input, self.num_actions, on_value=1.0, off_value=0.0)
        q_action = tf.reduce_sum(tf.multiply(self.q_values, action_one_hot), axis=1)
        
        loss = tf.reduce_mean(tf.square(y_input - q_action))
        optimizer = tf.train.AdamOptimizer(OPTIMIZER_LEARNING_RATE).minimize(loss)
        
        return action_input, y_input, loss, optimizer
        
    def run(self, state, action, reward, next_state, terminal):        
        # Store transition in memory
        self.memory_replay.append((state, action, reward, next_state, terminal))
        
        if len(self.memory_replay) > MEMORY_REPLAY_SIZE:
            self.memory_replay.popleft()
            
        if len(self.memory_replay) > BATCH_SIZE:
            self.train_network()
            
    def get_action(self, state):
        if random.random() <= self.epsilon:
            return random.randint(0, self.num_actions-1)
        else:
            return np.argmax(self.q_values.eval(feed_dict={self.state_input:[state]}))
        
        self.epsilon -= self.epsilon_step
        
    def train_network(self):
        mini_batch = pd.DataFrame(random.sample(self.memory_replay, BATCH_SIZE), columns = ['state','action','reward','next_state','terminal'])
        
        q_values_batch = self.q_values.eval(feed_dict={self.state_input:mini_batch['next_state'].tolist()})
    
        terminal_batch = mini_batch['terminal'] + 0 # convert True to 1, False to 0
        
        y_batch = mini_batch['reward'] + (1 - terminal_batch) * GAMMA * np.max(q_values_batch, axis=1)
        
        loss, _ = self.sess.run([self.loss,self.optimizer], feed_dict={
            self.y_input : y_batch.tolist(),
            self.action_input : mini_batch['action'].tolist(),
            self.state_input : mini_batch['state'].tolist()
        })
        
        self.time_step += 1
        
        

In [9]:
EPISODES = 10000
env = gym.make(ENV_NAME)
agent = Agent(env)

for episode in xrange(EPISODES):
    state = env.reset()
    done = False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.run(state, action, reward, next_state, done)
        state = next_state
        
    if episode % 100 == 0:
        total_reward = 0
        for i in xrange(10): #10 test
            state = env.reset()
            done = False
            while not done:
#                 env.render()
                action = agent.get_action(state)
                state,reward,done,_ = env.step(action)
                total_reward += reward
        avg_reward = total_reward / 10
        print('episode : ', episode, "avg_reward:", avg_reward)
        
    

('episode : ', 0, 'avg_reward:', 43.6)
[array([ 0.05355016, -0.02047127, -0.00641891, -0.01344002]), array([ 0.10347478, -0.1868039 , -0.1805973 , -0.36140569]), array([ 0.11113688, -0.00283495, -0.14220991, -0.40632803]), array([ 0.11586891, -0.20669189, -0.11969814,  0.08356857]), array([ 0.06403475,  0.56525806, -0.02490905, -0.89962851]), array([ 0.04740886,  0.36942301,  0.00579298, -0.59113394]), array([ 0.11513017, -0.19966479, -0.14075475, -0.07275773]), array([ 0.0997387 ,  0.0103638 , -0.18782542, -0.70515476]), array([ 0.04391943,  0.17447175,  0.01183713, -0.30220757]), array([ 0.03340466, -0.01942256,  0.03100186, -0.03658879]), array([ 0.05828173, -0.02081504, -0.01196234, -0.00585669]), array([ 0.1152639 , -0.00668645, -0.13435002, -0.32023654]), array([ 0.03652104,  0.36991948,  0.02388346, -0.60231663]), array([ 0.11108018, -0.19568416, -0.15033647, -0.16164331]), array([ 0.11253642,  0.18003418, -0.10774949, -0.42714945]), array([ 0.05314074,  0.17474215, -0.00668771,

ValueError: Cannot feed value of shape (32,) for Tensor u'Placeholder_10:0', which has shape '(?, 2)'