In [2]:
import os
import sys
# Extra package directory added to the import search path before `gym` is imported.
# NOTE(review): hard-coded, machine-specific path — presumably where gym is installed
# on the author's machine; adjust or remove on other setups.
new_path = r'/usr/local/lib/python2.7/dist-packages'
# Guard the append so that re-running this cell does not pile up duplicate entries.
if new_path not in sys.path:
    sys.path.append(new_path)

import gym
# Create the CartPole-v0 environment; `env` is the single environment instance
# that the seeding, model-building and training cells below all operate on.
env = gym.make('CartPole-v0')

In [3]:
import numpy as np
import random
import tensorflow as tf

# Seed every source of randomness from one base value, using a distinct offset
# for each library, with the aim of making a fresh run repeatable.
seed_val = 111
# NumPy's global random generator.
np.random.seed(seed_val)
# Python's `random` module (used for epsilon-greedy choices and minibatch sampling).
random.seed(seed_val+1)
# TensorFlow's random seed.
tf.set_random_seed(seed_val+2)
# The gym environment; left as the last bare expression so the notebook echoes
# the value it returns.
env.seed(seed_val+3)

  from ._conv import register_converters as _register_converters


[114]

In [4]:
from collections import deque
from keras.layers import Dense
from keras.models import Sequential, clone_model
from keras.utils import to_categorical
from keras import optimizers
from keras import backend as K
import time

Using TensorFlow backend.


In [5]:
# Discount factor applied to the bootstrapped next-state value in replay().
gamma = 0.99
# Capacity (maxlen) of each replay deque, and the number of random warm-up steps.
replay_buffer_length = 6000
# Step size handed to the Adam optimizer in create_q_model().
learning_rate = 0.002
# Number of transitions sampled from the replay memory per training step.
batch_size = 32
# Number of discrete actions, as reported by the environment's action space.
N_ACTIONS = env.action_space.n
# Final (smallest) exploration rate of the epsilon schedule.
eps_min = 0.01
# Number of training episodes; also the length of the epsilon schedule.
n_episodes = 3000
# Neural net architecture, each layer must specify number of neurons and activation function
neural_net_arch = [[16, 'relu'], [16, 'relu']]

In [6]:
# Epsilon schedule with one entry per episode: a linear ramp from 1 down to
# eps_min over the first 40% of the episodes, then held flat at eps_min.
n_decay_episodes = int(0.4*n_episodes)
eps_schedule = np.concatenate((
    np.linspace(1, eps_min, n_decay_episodes),
    np.full(n_episodes - n_decay_episodes, eps_min, dtype=float),
))

# Replay memory: five parallel bounded deques. Entries at the same position
# together describe one transition; once full, the oldest entries are evicted.
states, actions, rewards, next_states, done_flags = (
    deque(maxlen=replay_buffer_length) for _ in range(5)
)

In [7]:
def huber_loss(target, prediction):
    """Smooth Huber-style loss: mean over the last axis of sqrt(1 + e^2) - 1.

    `e` is the element-wise difference `prediction - target`. The expression
    behaves like e^2 / 2 for small errors and like |e| for large ones (this is
    the pseudo-Huber form with a scale of 1), so outliers are penalised less
    than under squared error.

    Args:
        target: backend tensor of target values.
        prediction: backend tensor of predicted values, same shape as `target`.

    Returns:
        Backend tensor with the last axis reduced by the mean.
    """
    residual = prediction - target
    smooth_abs = K.sqrt(1 + K.square(residual))
    return K.mean(smooth_abs - 1, axis=-1)

def preprocess(img):
    """Reshape a single observation into a one-row batch for the Q-network.

    Args:
        img: array-like observation. Despite the name it is the flat state
            vector returned by the environment (4 values for CartPole), not
            an image.

    Returns:
        np.ndarray of shape (1, n_features).
    """
    # -1 lets NumPy infer the feature count instead of hard-coding CartPole's 4,
    # matching create_q_model(), which also reads the size from the environment.
    return np.reshape(img, (1, -1))

def select_action(q_model, cur_state, eps):
    """Pick an action with an epsilon-greedy policy.

    Args:
        q_model: model whose `predict(cur_state)` returns the Q-values.
        cur_state: state batch in the form the model expects.
        eps: exploration probability; one uniform draw from `random.random()`
            is compared against it.

    Returns:
        A uniformly random action index in [0, N_ACTIONS) when exploring,
        otherwise the index of the largest predicted Q-value.
    """
    explore = random.random() < eps
    if not explore:
        # Greedy branch: act on the model's current Q estimates.
        return np.argmax(q_model.predict(cur_state))
    return random.choice(range(N_ACTIONS))

def store_memory(states, actions, rewards, next_states, done_flags,
                 state, action, reward, next_state, done):
    """Append one transition to the five parallel replay buffers.

    Args:
        states, actions, rewards, next_states, done_flags: containers with an
            `append` method; each receives the matching field of the transition.
        state, action, reward, next_state, done: the transition's fields.

    Returns:
        None. The buffers are modified in place.
    """
    buffers = (states, actions, rewards, next_states, done_flags)
    transition = (state, action, reward, next_state, done)
    for buffer, item in zip(buffers, transition):
        buffer.append(item)
    
# def q_learning_update(gamma, alpha, cur_state, reward, next_state, action, q_vals):
#     target = reward + gamma * np.max(q_vals[next_state])
#     q_vals[cur_state][action] = (1 - alpha) * q_vals[cur_state][action] + alpha * target
    

def replay(q_target_model, q_model, states, actions, rewards, next_states, done_flags):
    """Run one Double-DQN training step on a random minibatch from the replay memory.

    The online network (`q_model`) chooses the best next action and the target
    network (`q_target_model`) supplies the value of that action. The regression
    target for the action actually taken is
    `reward + (1 - done) * gamma * Q_target(next_state, argmax_a Q_online(next_state, a))`.

    Reads the module-level `batch_size` and `gamma`.

    Args:
        q_target_model: model used only for `predict` on the next states.
        q_model: online model; used for `predict` and updated through `fit`.
        states, actions, rewards, next_states, done_flags: parallel indexable
            buffers holding the stored transitions.

    Returns:
        None. Does nothing when fewer than `batch_size` transitions are stored.
    """
    # Sample from what is actually stored rather than from the nominal capacity,
    # so a partially filled buffer cannot yield out-of-range indices.
    n_stored = len(states)
    if n_stored < batch_size:
        return

    minibatch_indices = random.sample(range(n_stored), batch_size)
    rows = np.arange(batch_size)

    # Gather only the sampled transitions instead of converting every whole
    # buffer to an array on each step; -1 infers the state width.
    states_minibatch = np.reshape(
        np.array([states[i] for i in minibatch_indices]), (batch_size, -1))
    next_states_minibatch = np.reshape(
        np.array([next_states[i] for i in minibatch_indices]), (batch_size, -1))
    actions_minibatch = np.array([actions[i] for i in minibatch_indices], dtype=int)
    rewards_minibatch = np.array([rewards[i] for i in minibatch_indices], dtype=float)
    # 1.0 for non-terminal transitions, 0.0 for terminal ones (no bootstrapping past the end).
    not_done_minibatch = 1. - np.array([done_flags[i] for i in minibatch_indices], dtype=float)

    # The loss needs a target of shape (batch_size, n_actions), but only the taken
    # action has a real target. Two options for the remaining entries: zeros or the
    # online model's own predictions. Zeros would push the Q-values of actions that
    # were not taken towards 0; starting from the predictions gives those entries
    # zero loss, so only the taken action drives the update.
    y_minibatch = q_model.predict(states_minibatch)

    # Terminal reward could be -1 or 1; a terminal reward of -10 generally
    # trained faster to solve the game (hypothesis: with a +1 terminal reward a gamma
    # of 1 would work better, since it sees further into the future).
    best_action_non_target = np.argmax(q_model.predict(next_states_minibatch), axis=1)  # shape: (batch_size,)
    next_q_target = q_target_model.predict(next_states_minibatch)[rows, best_action_non_target]
    y_minibatch[rows, actions_minibatch] = rewards_minibatch + not_done_minibatch * (gamma * next_q_target)

    # Pass batch_size explicitly so the whole minibatch is one gradient step even
    # when batch_size differs from Keras' default of 32.
    q_model.fit(x=states_minibatch, y=y_minibatch, batch_size=batch_size, verbose=0)

def reset_q_target(q_model):
    """Return a fresh copy of `q_model` to serve as the target network.

    `clone_model` rebuilds the architecture; the online model's current
    weights are then copied into the clone.

    Args:
        q_model: the online Q-network.

    Returns:
        A new model object carrying the same weights as `q_model`.
    """
    target = clone_model(q_model)
    online_weights = q_model.get_weights()
    target.set_weights(online_weights)
    return target

def create_q_model():
    """Build and compile the Q-network described by `neural_net_arch`.

    Every entry of the module-level `neural_net_arch` is `[units, activation]`
    and becomes one hidden Dense layer, in order and for any number of entries.
    A linear Dense layer with `N_ACTIONS` outputs is appended as the Q-value
    head. The first layer of the network (the head itself when no hidden layers
    are configured) receives the input size from `env.observation_space`.

    Returns:
        The model, compiled with Adam (`learning_rate`) and mean-squared-error loss.
    """
    n_inputs = env.observation_space.shape[0]
    layer_specs = [[spec[0], spec[1]] for spec in neural_net_arch]
    layer_specs.append([N_ACTIONS, 'linear'])

    model = Sequential()
    for depth, (n_units, activation) in enumerate(layer_specs):
        # Only the first layer needs to be told the input dimensionality.
        extra = {'input_dim': n_inputs} if depth == 0 else {}
        model.add(Dense(n_units, activation=activation, **extra))

    opt = optimizers.Adam(lr=learning_rate)
    model.compile(optimizer=opt, loss='mse')
    # Alternative loss: model.compile(optimizer=opt, loss=huber_loss)
    return model

In [8]:
# Online network: the one that selects actions and is trained by replay().
q_model = create_q_model()
# Target network: starts as a weight-for-weight copy of the online network and
# is used by replay() to evaluate next-state values.
q_target_model = reset_q_target(q_model)

Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [9]:
# Total number of environment steps taken during training, across all episodes.
global_steps = 0
# Per-episode logs, each appended to once when an episode ends:
#   ret      - index of the final time step of the episode
#   q_vals   - predicted Q-value of the last action taken
#   eps_vals - exploration rate used for the episode
ret, q_vals, eps_vals = [], [], []

In [None]:
# Warm-up: fill the replay memory to capacity with transitions gathered by a
# purely random policy, so replay() has a full buffer to sample from.
cur_state = preprocess(env.reset())
for i in range(replay_buffer_length):
    action = select_action(q_model, cur_state, 1)  # eps = 1 -> always a random action
    next_state, reward, done, info = env.step(action)
    # Apply the same terminal-step reward shaping as the training loop so that
    # warm-up and training transitions carry consistent rewards.
    reward = -10 if done else reward
    next_state = preprocess(next_state)
    store_memory(states, actions, rewards, next_states, done_flags,
                 cur_state, action, reward, next_state, done)
    if done:
        # Start a new episode; the reset state must not be overwritten by the
        # terminal observation of the episode that just ended.
        cur_state = preprocess(env.reset())
    else:
        cur_state = next_state

# Main training loop: one pass of the outer loop per episode.
for episode in range(1,n_episodes+1):
    cur_state = preprocess(env.reset())
    # `episode` is 1-based while eps_schedule is 0-based with n_episodes entries,
    # so shift by one; indexing with `episode` would skip the first entry and
    # run off the end on the final episode.
    eps = eps_schedule[episode - 1]

    for t in range(300):
        env.render()
        global_steps += 1

        # Every 100 environment steps, re-sync the target network with the online one.
        if global_steps % 100 == 0:
            q_target_model = reset_q_target(q_model)

        action = select_action(q_model, cur_state, eps)
        next_state, reward, done, info = env.step(action)
        # Penalise the step that ends the episode.
        reward = -10 if done else reward
        next_state = preprocess(next_state)

        # Every transition is stored, terminal or not.
        store_memory(states, actions, rewards, next_states, done_flags,
                     cur_state, action, reward, next_state, done)

        if done:
            # `t` is the zero-based index of the final step of the episode.
            ret.append(t)
            q_vals.append(q_model.predict(cur_state)[0,action])
            eps_vals.append(eps)

            if (t > 180):
                print("Reward:", t)
            if (episode % 10 == 0):
                print("Episode:", episode)
                # Slices include the episode that just finished, so these are
                # true 10- and 100-episode averages.
                print("Avg Reward (Last 10):", np.mean(ret[-10:]))
                if episode % 100 == 0:
                    print("Avg Reward (Last 100):", np.mean(ret[-100:]))
            break

        cur_state = next_state

        # One gradient step on a random minibatch after every non-terminal step.
        replay(q_target_model, q_model, states, actions, rewards, next_states, done_flags)