In [1]:
import gym
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from collections import deque

ENV_NAME = "CartPole-v0"

# FEAR MODEL
fear_on = True
fear_radius = 5
fear_factor = 0.5
fear_linear = 10000 # Number of steps for the adjusted fear_factor reach the value of fear factor (same as epsilon)
fear_warmup = 1000
FEAR_LEARNING_RATE = 0.0001

# DQN
GAMMA = 0.9 # Discount factor
INITIAL_EPSILON = 0.5 # Initial value for epsilon-greedy
FINAL_EPSILON = 0.01 # Final value for epsilon-greedy
EXPLORATION_STEPS = 10000 # Number of steps till epsilon reaches its final value
MEMORY_REPLAY_SIZE = 10000
BATCH_SIZE = 32
OPTIMIZER_LEARNING_RATE = 0.0001

class Agent():
    def __init__(self, env):
        """ Initilizes the agent: replay memory, neural network """
        self.num_actions = env.action_space.n
        self.num_observations = env.observation_space.shape[0]
        self.epsilon = INITIAL_EPSILON
        self.epsilon_step = (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORATION_STEPS
        self.time_step = 0
        
        # Create memory buffer for replay
        self.memory_replay = deque()
        
        # Create Q network and define loss
        self.state_input, self.q_values = self.build_network()
        self.action_input, self.y_input, self.loss, self.optimizer = self.build_training_ops() 
        
        # Create Fear network and define loss
        self.avg_fear = 0
        self.danger_states = []
        self.safe_states = []
        
        self.fear_state_input, self.fear_scores = self.build_fear_network()
        self.fear_y_input, self.fear_loss, self.fear_optimizer = self.build_fear_training_ops()
        
        # Create session
        self.sess = tf.InteractiveSession()
        
        # Create summaries
        self.total_reward = 0 
        self.test_total_reward = 0
        self.total_loss = 0
        self.total_fear_loss = 0
        self.duration = 0
        self.episode = 0
        
        self.summary_placeholders, self.update_ops, self.summary_op = self.setup_summary()
        self.summary_writer = tf.summary.FileWriter("logs/", self.sess.graph)
        
        self.sess.run(tf.global_variables_initializer())
        
    def build_network(self):
        """ Constructs the neural network with 1 hidden layer to approximate the Q function """
        W1 = tf.Variable(tf.truncated_normal([self.num_observations,16]))
        b1 = tf.Variable(tf.constant(0.01, shape=[16]))
        W2 = tf.Variable(tf.truncated_normal([16,self.num_actions]))
        b2 = tf.Variable(tf.constant(0.01, shape=[self.num_actions]))

        state_input = tf.placeholder(dtype=tf.float32)
        hidden_layer = tf.nn.relu(tf.matmul(state_input,W1) + b1)
        q_values = tf.matmul(hidden_layer, W2) + b2

        return state_input, q_values
        
    def build_training_ops(self):
        """ Defines the loss function """
        y_input = tf.placeholder(dtype=tf.float32)
        action_input = tf.placeholder(dtype=tf.int32)
        action_one_hot = tf.one_hot(action_input, self.num_actions, on_value=1.0, off_value=0.0)

        q_action = tf.reduce_sum(tf.multiply(self.q_values, action_one_hot), axis=1)
        
        loss = tf.reduce_mean(tf.square(y_input - q_action))
        optimizer = tf.train.AdamOptimizer(OPTIMIZER_LEARNING_RATE).minimize(loss)
        
        return action_input, y_input, loss, optimizer
    
    def build_fear_network(self):
        W1 = tf.Variable(tf.truncated_normal([self.num_observations,16]))
        b1 = tf.Variable(tf.constant(0.01, shape=[16]))
        W2 = tf.Variable(tf.truncated_normal([16,1]))
        b2 = tf.Variable(tf.constant(0.01, shape=[1]))

        fear_state_input = tf.placeholder(dtype=tf.float32)
        hidden_layer = tf.nn.relu(tf.matmul(fear_state_input,W1) + b1)
        fear_scores = tf.matmul(hidden_layer, W2) + b2

        return fear_state_input, fear_scores
        
    def build_fear_training_ops(self):
        fear_y_input = tf.placeholder(dtype=tf.float32)
        
        yhat = tf.minimum(tf.maximum(tf.reduce_sum(self.fear_scores, axis=1), .01), 0.99)
        
        fear_loss = tf.reduce_mean( - (fear_y_input * tf.log(yhat) + (1-fear_y_input) * tf.log(1-yhat)))
        fear_optimizer = tf.train.AdamOptimizer(FEAR_LEARNING_RATE).minimize(fear_loss)
        
        return fear_y_input, fear_loss, fear_optimizer
        
        
    def run(self, state, action, reward, next_state, terminal):  
        """ Updates knowledge """
        # Store transition in memory
        self.memory_replay.append((state, action, reward, next_state, terminal))
        
        if len(self.memory_replay) > MEMORY_REPLAY_SIZE:
            self.memory_replay.popleft()
            
        if len(self.memory_replay) > BATCH_SIZE:
            self.train_network()
            
        self.total_reward += reward
        self.duration += 1.0
        
        if terminal:
            stats = [self.total_reward, self.total_loss / self.duration, self.test_total_reward, self.total_fear_loss / self.duration]
            
            for i in range(len(stats)):
                self.sess.run(self.update_ops[i], feed_dict={
                    self.summary_placeholders[i]: float(stats[i])
                })
            summary_str = self.sess.run(self.summary_op)
            self.summary_writer.add_summary(summary_str, self.episode + 1)
            
            self.total_reward = 0
            self.total_loss = 0
            self.total_fear_loss = 0
            self.duration = 0
            self.episode += 1
        
    def explore(self, state):
        """ Explores possible actions using epsilon-greedy with epsilon linearly decreasing """
        
        if fear_on and self.time_step > fear_warmup: 
            fear_score = self.fear_scores.eval(feed_dict={self.fear_state_input:[state]})
            self.avg_fear = self.avg_fear * .99 + fear_score * .01
        
        if random.random() <= self.epsilon:
            return random.randint(0, self.num_actions-1)
        else:
            return np.argmax(self.q_values.eval(feed_dict={self.state_input:[state]}))
        
        self.epsilon -= self.epsilon_step
        
    def get_action(self, state):
        """ Get the best action : max Q """
        return np.argmax(self.q_values.eval(feed_dict={self.state_input:[state]}))
        
    def train_network(self):
        """ Trains the network using mini batch """
        mini_batch = pd.DataFrame(random.sample(self.memory_replay, BATCH_SIZE), columns = ['state','action','reward','next_state','terminal'])
        
        q_values_batch = self.q_values.eval(feed_dict={self.state_input:mini_batch['next_state'].tolist()})
        terminal_batch = mini_batch['terminal'] + 0 # convert True to 1, False to 0
        
        if fear_on and self.time_step > fear_warmup:
            adjusted_fear_factor = np.min([fear_factor * float(self.time_step)/fear_linear, fear_factor])
            fear_penalty = (1 - terminal_batch) * adjusted_fear_factor * np.max(self.fear_scores.eval(feed_dict={self.fear_state_input:mini_batch['next_state'].tolist()}), axis=1) + terminal_batch * 10 * adjusted_fear_factor
            y_batch = mini_batch['reward'] + (1 - terminal_batch) * GAMMA * np.max(q_values_batch, axis=1) - fear_penalty
        else:
            y_batch = mini_batch['reward'] + (1 - terminal_batch) * GAMMA * np.max(q_values_batch, axis=1)
        
        loss, _ = self.sess.run([self.loss,self.optimizer], feed_dict={
            self.y_input : y_batch.tolist(),
            self.action_input : mini_batch['action'].tolist(),
            self.state_input : mini_batch['state'].tolist()
        })
        
        self.total_loss += loss
        
        if fear_on and self.time_step > fear_warmup:
            if (len(self.danger_states) >= int(BATCH_SIZE/2)) and (len(self.safe_states) > int(BATCH_SIZE/2)):
                fear_minibatch = random.sample(self.danger_states, int(BATCH_SIZE)/2) + random.sample(self.safe_states, int(BATCH_SIZE)/2)
                fear_y_batch = [1] * int(BATCH_SIZE/2) + [0] * int(BATCH_SIZE/2)
                fear_loss, _ = self.sess.run([self.fear_loss, self.fear_optimizer], feed_dict={
                    self.fear_y_input: fear_y_batch,
                    self.fear_state_input: fear_minibatch
                })
                self.total_fear_loss += fear_loss
        self.time_step += 1
        
    def log_danger(self, new_danger_states):
        self.danger_states += new_danger_states
        
    def log_safe(self, new_safe_states):
        self.safe_states += new_safe_states
        
    def setup_summary(self):
        episode_total_reward = tf.Variable(0.)
        tf.summary.scalar(ENV_NAME + "/Total_Reward/Episode", episode_total_reward)
        episode_avg_loss = tf.Variable(0.)
        tf.summary.scalar(ENV_NAME + "/Avg_Loss/Episode", episode_avg_loss)
        episode_test_total_reward = tf.Variable(0.)
        tf.summary.scalar(ENV_NAME + "/Test_Total_Reward/Episode", episode_test_total_reward)
        episode_avg_fear_loss = tf.Variable(0.)
        tf.summary.scalar(ENV_NAME + "/Avg_Fear_Loss/Episode", episode_avg_fear_loss)
        
        summary_vars = [episode_total_reward, episode_avg_loss, episode_test_total_reward, episode_avg_fear_loss]
        summary_placeholders = [tf.placeholder(tf.float32) for _ in range(len(summary_vars))]
        update_ops = [summary_vars[i].assign(summary_placeholders[i]) for i in range(len(summary_vars))]
        summary_op = tf.summary.merge_all()
        
        return summary_placeholders, update_ops, summary_op
    def record_test_reward(self, reward):
        self.test_total_reward = reward
        

In [2]:
EPISODES = 10000
env = gym.make(ENV_NAME)
agent = Agent(env)

reached = False
for episode in xrange(EPISODES):
    state = env.reset()
    done = False
    episode_observations = []
    while not done:
        action = agent.explore(state)
        next_state, reward, done, _ = env.step(action)
        agent.run(state, action, reward, next_state, done)
        state = next_state
        episode_observations.append(state)
        
    if fear_on:
        agent.log_danger(episode_observations[-fear_radius:])
        agent.log_safe(episode_observations[:-fear_radius])
        
        for e in episode_observations:
            del e
        del episode_observations
        episode_observations = []
    
    if episode % 100 == 0:
        total_reward = 0
        for i in xrange(10): #10 test
            state = env.reset()
            done = False
            while not done:
#                 env.render()
                action = agent.get_action(state)
                state,reward,done,_ = env.step(action)
                total_reward += reward

        avg_reward = total_reward / 10
        agent.record_test_reward(avg_reward)
        print('episode : ', episode, "avg_reward:", avg_reward)
        if avg_reward >= 200:
            if reached:
                break
            reached = True
        


('episode : ', 0, 'avg_reward:', 10.6)
('episode : ', 100, 'avg_reward:', 9.1)
('episode : ', 200, 'avg_reward:', 9.1)
('episode : ', 300, 'avg_reward:', 14.3)
('episode : ', 400, 'avg_reward:', 11.4)
('episode : ', 500, 'avg_reward:', 15.0)
('episode : ', 600, 'avg_reward:', 16.8)
('episode : ', 700, 'avg_reward:', 21.5)
('episode : ', 800, 'avg_reward:', 22.3)
('episode : ', 900, 'avg_reward:', 36.1)
('episode : ', 1000, 'avg_reward:', 41.1)
('episode : ', 1100, 'avg_reward:', 67.2)
('episode : ', 1200, 'avg_reward:', 157.9)
('episode : ', 1300, 'avg_reward:', 200.0)
('episode : ', 1400, 'avg_reward:', 200.0)
