In [None]:
import gym
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from collections import deque

# Gym environment id handed to gym.make().
ENV_NAME = "CartPole-v0"

# ---------------------------------------------------------------------------
# Fear model settings
# NOTE(review): none of the fear_* values are read by the code visible in
# this file -- confirm whether they are still needed.
# ---------------------------------------------------------------------------
fear_on = False
fear_radius, fear_factor = 5, 0.5
fear_linear = 1000 * 1000
fear_warmup = 20 * 1000

# ---------------------------------------------------------------------------
# DQN hyperparameters
# ---------------------------------------------------------------------------
# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.9
# Epsilon-greedy schedule: start value, end value, and the number of steps
# over which epsilon is meant to move from the former to the latter.
INITIAL_EPSILON, FINAL_EPSILON = 0.5, 0.01
EXPLORATION_STEPS = 10 * 1000
# Maximum number of transitions kept in the replay buffer.
MEMORY_REPLAY_SIZE = 10 * 1000
# Number of transitions sampled per training step.
BATCH_SIZE = 32
# Learning rate passed to the Adam optimizer.
OPTIMIZER_LEARNING_RATE = 1e-4

class Agent():
    """DQN agent: a one-hidden-layer Q-network trained from a replay buffer.

    The network and its training ops are built with the TensorFlow 1.x graph
    API and are always executed in the session owned by this agent
    (``self.sess``), never through the implicit default session.
    """

    def __init__(self, env):
        """Initializes the agent: epsilon schedule, replay memory, network, session.

        Args:
            env: Environment object; ``env.action_space.n`` and
                ``env.observation_space.shape[0]`` are read to size the
                network's output and input layers.
        """
        self.num_actions = env.action_space.n
        self.num_observations = env.observation_space.shape[0]

        # Linear epsilon schedule: explore() lowers epsilon by epsilon_step
        # per call until it reaches final_epsilon.
        self.epsilon = INITIAL_EPSILON
        self.final_epsilon = FINAL_EPSILON
        self.epsilon_step = (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORATION_STEPS

        # Number of training steps performed so far (see train_network).
        self.time_step = 0

        # Bounded replay buffer: once MEMORY_REPLAY_SIZE transitions are
        # stored, appending a new one drops the oldest automatically.
        self.memory_replay = deque(maxlen=MEMORY_REPLAY_SIZE)

        # Create Q network and define loss
        self.state_input, self.q_values = self.build_network()
        self.action_input, self.y_input, self.loss, self.optimizer = self.build_training_ops()

        # Create session and initialize the variables declared above
        self.sess = tf.InteractiveSession()
        self.sess.run(tf.global_variables_initializer())

    def build_network(self):
        """Constructs the neural network with 1 hidden layer to approximate the Q function.

        Returns:
            Tuple ``(state_input, q_values)``: the float32 placeholder that
            states are fed through, and the tensor holding one Q-value per
            action for each fed state.
        """
        # Hidden layer: num_observations -> 16 units
        W1 = tf.Variable(tf.truncated_normal([self.num_observations, 16]))
        b1 = tf.Variable(tf.constant(0.01, shape=[16]))
        # Output layer: 16 units -> one Q-value per action
        W2 = tf.Variable(tf.truncated_normal([16, self.num_actions]))
        b2 = tf.Variable(tf.constant(0.01, shape=[self.num_actions]))

        state_input = tf.placeholder(dtype=tf.float32)
        hidden_layer = tf.nn.relu(tf.matmul(state_input, W1) + b1)
        q_values = tf.matmul(hidden_layer, W2) + b2

        return state_input, q_values

    def build_training_ops(self):
        """Defines the loss function and the optimizer step.

        The loss is the mean squared difference between the fed targets and
        the Q-value the network assigns to the action that was taken.

        Returns:
            Tuple ``(action_input, y_input, loss, optimizer)``: placeholders
            for the taken actions and the target values, the scalar loss
            tensor, and the Adam minimization op.
        """
        y_input = tf.placeholder(dtype=tf.float32)
        action_input = tf.placeholder(dtype=tf.int32)
        action_one_hot = tf.one_hot(action_input, self.num_actions, on_value=1.0, off_value=0.0)

        # The one-hot mask zeroes every Q-value except the taken action's,
        # so the row sum is exactly Q(state, action).
        q_action = tf.reduce_sum(tf.multiply(self.q_values, action_one_hot), axis=1)

        loss = tf.reduce_mean(tf.square(y_input - q_action))
        optimizer = tf.train.AdamOptimizer(OPTIMIZER_LEARNING_RATE).minimize(loss)

        return action_input, y_input, loss, optimizer

    def run(self, state, action, reward, next_state, terminal):
        """Stores one transition and trains once enough transitions are stored.

        Args:
            state: Observation before the action.
            action: Action that was taken.
            reward: Reward received for the action.
            next_state: Observation after the action.
            terminal: True if the episode ended with this transition.
        """
        # Store transition in memory (the deque evicts the oldest when full)
        self.memory_replay.append((state, action, reward, next_state, terminal))

        # Train only when the buffer holds more than one mini batch
        if len(self.memory_replay) > BATCH_SIZE:
            self.train_network()

    def explore(self, state):
        """Picks an action using epsilon-greedy with epsilon linearly decreasing.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the greedy action for ``state``. Every call then lowers
        ``epsilon`` by ``epsilon_step``, never going below ``final_epsilon``.

        Args:
            state: Current observation.

        Returns:
            Index of the chosen action.
        """
        if random.random() <= self.epsilon:
            action = random.randint(0, self.num_actions - 1)
        else:
            action = self.get_action(state)

        # Decay must happen before returning; placed after the return
        # statements it would be unreachable and epsilon would never change.
        self.epsilon = max(self.final_epsilon, self.epsilon - self.epsilon_step)

        return action

    def get_action(self, state):
        """Gets the best action for ``state``: the one with the max Q-value.

        Args:
            state: Current observation.

        Returns:
            Index of the action with the highest predicted Q-value.
        """
        # The state is wrapped in a list so it is fed as a batch of one.
        q_values = self.sess.run(self.q_values, feed_dict={self.state_input: [state]})
        return np.argmax(q_values)

    def train_network(self):
        """Trains the network on one mini batch sampled from replay memory.

        Targets are ``reward + GAMMA * max_a Q(next_state, a)``, with the
        bootstrapped term dropped for terminal transitions.
        """
        mini_batch = pd.DataFrame(
            random.sample(self.memory_replay, BATCH_SIZE),
            columns=['state', 'action', 'reward', 'next_state', 'terminal'])

        q_values_batch = self.sess.run(
            self.q_values,
            feed_dict={self.state_input: mini_batch['next_state'].tolist()})
        terminal_batch = mini_batch['terminal'] + 0 # convert True to 1, False to 0

        y_batch = mini_batch['reward'] + (1 - terminal_batch) * GAMMA * np.max(q_values_batch, axis=1)

        self.sess.run(self.optimizer, feed_dict={
            self.y_input : y_batch.tolist(),
            self.action_input : mini_batch['action'].tolist(),
            self.state_input : mini_batch['state'].tolist()
        })

        self.time_step += 1
        
        

In [None]:
# Train the agent with epsilon-greedy exploration and periodically evaluate
# the greedy policy; stop early once the evaluation average reaches 200.
EPISODES = 10000
TEST_INTERVAL = 100  # evaluate every this many training episodes
TEST_EPISODES = 10   # greedy episodes averaged per evaluation
env = gym.make(ENV_NAME)
agent = Agent(env)

# range() instead of xrange() so the loop runs on both Python 2 and Python 3.
for episode in range(EPISODES):
    # One training episode: act, store the transition, learn from replay.
    state = env.reset()
    done = False
    while not done:
        action = agent.explore(state)
        next_state, reward, done, _info = env.step(action)
        agent.run(state, action, reward, next_state, done)
        state = next_state

    if episode % TEST_INTERVAL == 0:
        # Evaluation: act greedily (no exploration, no learning).
        total_reward = 0
        for _test in range(TEST_EPISODES):
            state = env.reset()
            done = False
            while not done:
                # Call env.render() here to watch the evaluation episodes.
                action = agent.get_action(state)
                state, reward, done, _info = env.step(action)
                total_reward += reward
        # float() keeps this a true average even if rewards are integers
        # under Python 2's integer division.
        avg_reward = total_reward / float(TEST_EPISODES)
        print('episode : ', episode, "avg_reward:", avg_reward)
        if avg_reward >= 200:
            break
        


('episode : ', 0, 'avg_reward:', 9.7)
('episode : ', 100, 'avg_reward:', 27.7)
('episode : ', 200, 'avg_reward:', 29.8)
('episode : ', 300, 'avg_reward:', 11.6)
('episode : ', 400, 'avg_reward:', 9.2)
('episode : ', 500, 'avg_reward:', 9.4)
('episode : ', 600, 'avg_reward:', 14.8)
('episode : ', 700, 'avg_reward:', 40.9)
('episode : ', 800, 'avg_reward:', 57.2)
('episode : ', 900, 'avg_reward:', 138.3)
('episode : ', 1000, 'avg_reward:', 124.5)
('episode : ', 1100, 'avg_reward:', 134.4)
('episode : ', 1200, 'avg_reward:', 153.0)
('episode : ', 1300, 'avg_reward:', 200.0)
('episode : ', 1400, 'avg_reward:', 200.0)
('episode : ', 1500, 'avg_reward:', 150.8)
