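We make two updates to our basic DQN: 
*  add a target network, whose weights are copied from the Q-network every C steps 
*  add error clipping: the network is trained with a Huber loss instead of squared error (the rewards themselves are not clipped) 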

In [49]:
import gym
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
from keras import backend as K  
import tensorflow as tf 
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from math import exp 


In [50]:
# Create the Gym environment used throughout the notebook: the network builder
# reads its action/observation spaces and the training loop resets and steps it.
env = gym.make("CartPole-v0")

In [51]:
def huber_loss1(y, yhat, delta = 3):
    """Mean Huber loss between `y` (true values) and `yhat` (predictions).

    Adapted from https://jaromiru.com/2016/10/21/lets-make-a-dqn-full-dqn/

    Elements whose absolute error is below `delta` contribute the quadratic
    term 0.5 * error**2; all others contribute the linear term
    delta * (|error| - 0.5 * delta). The element-wise losses are averaged
    into a single value with the Keras backend.
    """
    residual = y - yhat
    magnitude = K.abs(residual)
    quadratic_branch = 0.5 * K.square(residual)
    linear_branch = delta * (magnitude - 0.5 * delta)
    # Pick the branch element-wise, then reduce to the mean
    return K.mean(tf.where(magnitude < delta, quadratic_branch, linear_branch))

In [52]:
def huber_loss(a, b, in_keras=True):
    """Element-wise Huber loss (threshold 1.0) between `a` and `b`.

    Errors with magnitude above 1.0 are penalised linearly (|error| - 1/2),
    the rest quadratically (error**2 / 2). The result is not reduced; it has
    whatever shape `a - b` has.

    in_keras: when True the selection mask is cast to float32 with the Keras
    backend so it can be multiplied with the loss terms; pass False when
    calling with plain numbers or numpy arrays.
    """
    residual = a - b
    magnitude = abs(residual)
    is_large = magnitude > 1.0
    if in_keras:
        # Keras cannot multiply floats by booleans, so turn the mask into floats first
        is_large = K.cast(is_large, 'float32')
    linear_branch = magnitude - 1/2
    quadratic_branch = residual*residual / 2
    # The mask selects the linear branch, its complement the quadratic one
    return is_large * linear_branch + (1-is_large) * quadratic_branch

In [53]:
def make_neural_network(): 
    """Build and compile a small fully connected network for the global `env`.

    The input width is taken from the environment's observation space and the
    output width from its action space, so the network emits one linear value
    per action. Two hidden ReLU layers of 16 units sit in between. The model
    is compiled with a default Adam optimizer and `huber_loss`.
    """
    state_size = env.observation_space.shape[0]
    action_count = env.action_space.n
    layers = [
        Dense(16, input_dim=state_size, activation='relu'),
        Dense(16, activation='relu'),
        Dense(action_count, activation='linear'),
    ]
    network = Sequential(layers)
    # Huber loss instead of plain squared error
    network.compile(optimizer=Adam(), loss=huber_loss)
    return network

## Neural net 1: online network that approximates the q-values and is fitted after every step
nnet_q = make_neural_network()
## Neural net 2: target network, updated periodically with the weights of nnet 1
nnet_target = make_neural_network()
# Start the target network from the same weights as the online network.
# Without this the two networks have independent random initialisations, so the
# bootstrap targets come from an unrelated network until the first periodic sync.
nnet_target.set_weights(nnet_q.get_weights())

In [54]:
def replay(replay_memory, minibatch_size=32):
    """Fit the online network `nnet_q` on one minibatch drawn from `replay_memory`.

    replay_memory: sequence of transition dicts with keys 's', 'a', 'r',
        'sprime' and 'done'.
    minibatch_size: number of transitions to draw (sampled with replacement).

    Next-state values come from the global `nnet_target`; the regression
    targets start as `nnet_q`'s own predictions and only the entry of the
    action actually taken is overwritten with the q-update (discounted by the
    global `gamma`). Returns `nnet_q` after a single epoch of fitting.
    """
    batch = np.random.choice(replay_memory, minibatch_size, replace=True)
    s_l = np.array([transition['s'] for transition in batch])
    a_l = np.array([transition['a'] for transition in batch])
    r_l = np.array([transition['r'] for transition in batch])
    sprime_l = np.array([transition['sprime'] for transition in batch])
    done_l = np.array([transition['done'] for transition in batch])
    # Next-state values from the target network
    qvals_sprime_l = nnet_target.predict(sprime_l)
    # Current-state values from the online network; used as the base for the targets
    target_f = nnet_q.predict(s_l)
    # q-update: terminal transitions use the reward alone, others bootstrap
    for row, (a, r, qvals_sprime, done) in enumerate(zip(a_l, r_l, qvals_sprime_l, done_l)):
        target_f[row][a] = r if done else r + gamma * np.max(qvals_sprime)
    nnet_q.fit(s_l, target_f, epochs=1, verbose=0)
    return nnet_q

In [55]:
# Hyperparameters and run-wide state

# Training schedule
n_episodes = 1000   # number of episodes to run
C = 100             # update the target network after this many steps

# Learning
gamma = 0.99        # discount applied to the next-state value in the q-update
minibatch_size = 32 # transitions sampled from the replay memory per fit

# Exploration: epsilon decays from max_epsilon towards min_epsilon
min_epsilon, max_epsilon = 0.1, 1
LAMBDA = 0.001      # epsilon decay parameter, capital letter to avoid clash with lambda function

# Replay memory: holds s, a, r, s' (and the done flag) for each step
mem_max_size = 100000
replay_memory = []

# Total reward of each episode
r_sums = []

In [None]:
# Training loop: act epsilon-greedily, store every transition in the replay
# memory, fit the online network on a sampled minibatch after each step, and
# copy its weights into the target network every C steps.
steps = 0
epsilon = max_epsilon
for n in range(n_episodes):
    s = env.reset()
    done = False
    r_sum = 0
    while not done:
        # Uncomment this to see the agent learning
        #env.render()
        # Choose action to be epsilon-greedy
        if np.random.random() < epsilon:
            a = env.action_space.sample()
        else:
            # Feedforward pass for current state to get predicted q-values for all actions.
            # reshape(1, -1) forms a batch of one without hard-coding the observation size.
            qvals_s = nnet_q.predict(s.reshape(1, -1))
            a = np.argmax(qvals_s)
        # Take step, store results
        sprime, r, done, info = env.step(a)
        r_sum += r
        # Add to memory, respecting the memory buffer limit. The check runs before the
        # append, so it must use >= for the buffer to never hold more than mem_max_size items.
        if len(replay_memory) >= mem_max_size:
            replay_memory.pop(0)
        replay_memory.append({"s":s,"a":a,"r":r,"sprime":sprime,"done":done})
        # Update target weights every C steps
        steps += 1
        if steps % C == 0:
            nnet_target.set_weights(nnet_q.get_weights())
        # Update state
        s = sprime
        # Train the nnet that approximates q(s,a), using the replay memory
        nnet_q = replay(replay_memory, minibatch_size = minibatch_size)
        # Decrease epsilon as we go: exponential decay in the total step count
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * exp(-LAMBDA * steps )
    # Record the episode's total reward and report progress every 100 episodes
    r_sums.append(r_sum)
    if n % 100 == 0: print(n)
        

0
100
200
300
400
500
600
700


In [None]:
# Total reward per episode, drawn on the current axes
axes = plt.gca()
axes.plot(r_sums)
axes.set_xlabel("Episode Number")
axes.set_ylabel("Reward")

In [15]:
# Mean reward over a trailing window of 100 episodes (undefined until 100 episodes exist)
rolling_average = pd.DataFrame(r_sums).rolling(window=100, min_periods=100).mean()
# Position of the first window whose mean exceeds 195; argmax yields 0 if none does
above_threshold = (rolling_average > 195).values
np.argmax(above_threshold)

265