We make two updates to our basic dqn: 
*  add a target network 
*  add reward clipping 

We use logcosh loss instead of huber loss function because it is easy to use in Keras. The two are  almost the same for low delta values (of huber loss). 

In [1]:
import gym
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
%matplotlib inline
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

  from ._conv import register_converters as _register_converters
Using Theano backend.


In [2]:
# Setting up the environment 
env = gym.make("CartPole-v1")

In [3]:
def make_neural_network(): 
    n_actions = env.action_space.n
    input_dim = env.observation_space.shape[0]
    model = Sequential()
    model.add(Dense(64, input_dim = input_dim , activation = 'relu'))
    model.add(Dense(32, activation = 'relu'))
    model.add(Dense(n_actions, activation = 'linear'))
    # Add the huber/logcosh loss function 
    model.compile(optimizer=Adam(), loss = 'logcosh')
    return model 

## Neural net 1: network that approximates the q-value 
nnet_q = make_neural_network()
## Neural net 2: target network, updated periodocally with values from nnet 1  
nnet_target = make_neural_network()

In [17]:
def replay(replay_memory, minibatch_size=32):
    minibatch = np.random.choice(replay_memory, minibatch_size, replace=True)
    s_l =      np.array(list(map(lambda x: x['s'], minibatch)))
    a_l =      np.array(list(map(lambda x: x['a'], minibatch)))
    r_l =      np.array(list(map(lambda x: x['r'], minibatch)))
    sprime_l = np.array(list(map(lambda x: x['sprime'], minibatch)))
    done_l   = np.array(list(map(lambda x: x['done'], minibatch)))
    # Predict the next state values with target network
    qvals_sprime_l = nnet_target.predict(sprime_l)
    # Predict current state values with realtime q network
    target_f = nnet_q.predict(s_l) 
    # q-update
    for i,(s,a,r,qvals_sprime, done) in enumerate(zip(s_l,a_l,r_l,qvals_sprime_l, done_l)): 
        if not done:  target = r + gamma * np.max(qvals_sprime)
        else:         target = r
    target_f[i][a] = target
    nnet_q.fit(s_l,target_f, epochs=1, verbose=0)
    return nnet_q

In [None]:
# Parameters 
n_episodes = 1200
gamma = 0.99
epsilon = 1
minibatch_size = 32
r_sums = []  # stores rewards of each epsiode 
replay_memory = [] # replay memory holds s, a, r, s'
mem_max_size = 100000
C = 1000 # update the target network after this many steps 

In [19]:
steps = 0 
for n in range(n_episodes): 
    s = env.reset()
    done=False
    r_sum = 0
    while not done: 
        # Uncomment this to see the agent learning
        env.render()
        # Feedforward pass for current state to get predicted q-values for all actions 
        qvals_s = nnet_q.predict(s.reshape(1,4))
        # Choose action to be epsilon-greedy
        if np.random.random() < epsilon:  a = env.action_space.sample()
        else:                             a = np.argmax(qvals_s); 
        # Take step, store results 
        sprime, r, done, info = env.step(a)
        r_sum += r 
        # add to memory, respecting memory buffer limit 
        if len(replay_memory) > mem_max_size:
            replay_memory.pop(0)
        replay_memory.append({"s":s,"a":a,"r":r,"sprime":sprime,"done":done})
        # Update target weights every C steps 
        steps +=1 
        if steps % C == 0: nnet_target.set_weights(nnet_q.get_weights())
            
        # Update state
        s=sprime
        
        # Train the nnet that approximates q(s,a), using the replay memory
        nnet_q = replay(replay_memory, minibatch_size = minibatch_size)
        # Decrease epsilon until we hit a target threshold 
        if epsilon > 0.01:      epsilon -= 0.001
    print("Total reward:", r_sum)
    r_sums.append(r_sum)
        

Total reward: 23.0
Total reward: 15.0
Total reward: 13.0
Total reward: 37.0
Total reward: 30.0
Total reward: 35.0
Total reward: 28.0
Total reward: 48.0
Total reward: 56.0
Total reward: 14.0
Total reward: 16.0
Total reward: 62.0
Total reward: 49.0
Total reward: 72.0
Total reward: 203.0
Total reward: 91.0
Total reward: 190.0
Total reward: 204.0
Total reward: 180.0
Total reward: 169.0
Total reward: 170.0
Total reward: 135.0
Total reward: 144.0
Total reward: 153.0
Total reward: 115.0
Total reward: 142.0
Total reward: 108.0
Total reward: 115.0
Total reward: 160.0
Total reward: 122.0
Total reward: 137.0
Total reward: 247.0
Total reward: 287.0
Total reward: 91.0
Total reward: 164.0
Total reward: 162.0
Total reward: 73.0
Total reward: 128.0
Total reward: 67.0
Total reward: 44.0
Total reward: 168.0
Total reward: 159.0
Total reward: 154.0
Total reward: 150.0
Total reward: 93.0
Total reward: 82.0
Total reward: 69.0
Total reward: 61.0
Total reward: 121.0
Total reward: 64.0
Total reward: 67.0
Total

KeyboardInterrupt: 

In [None]:
r_sum.plot()