In [1]:
import matplotlib.pyplot as plt
import numpy as np
import time
import random
import gym
from memory import replay_buffer
from DQN import DDQN, huber_loss
from keras.models import load_model
from keras.optimizers import RMSprop, Adam
import imageio

%matplotlib inline
%load_ext autoreload
%autoreload 2

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Imports specifically so we can render outputs in Jupyter.
from JSAnimation.IPython_display import display_animation
from matplotlib import animation
from IPython.display import display

def display_frames_as_gif(frames):
    """
    Displays a list of frames as a gif, with controls
    """
    plt.figure(figsize=(4*frames[0].shape[1] / 72.0, 4*frames[0].shape[0] / 72.0), dpi = 72)
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames = len(frames), interval = 50)
    display(display_animation(anim, default_mode = 'loop'))
    
    return anim

In [3]:
def test_agent():
    #------------------------------------------------
    #
    #  Test agent by playing Atari from policy
    #
    #------------------------------------------------

    # restart environment
    frame_buffer = []
    state_buffer = []

    env.reset()
    num_no_op_actions = np.random.randint(4, NUM_NO_OP_ACTIONS_MAX)
    reward = 0
    done = False

    # release the ball!
    image_, reward_, done_, _ = env.step(FIRE_ACTION)
    frame_buffer.append(image_)
    prev_lives = 5
    alive_frames = 0     
    actions = []
    prev_livess = []
    lives = []
    # take a number of no-op actions to randomise start
    for ii in range(num_no_op_actions):
        del image_

        image_, reward_, done_, info = env.step(NO_OP_ACTION)


        frame_buffer.append(image_)
        state_buffer.append(preprocess(image_))

        reward += reward_
        done = done | done_

        # count loss of life as terminal (agent must learn to not die)
        if info['ale.lives'] != prev_lives:
            image_, reward_, done_, _ = env.step(FIRE_ACTION)
            prev_lives = info['ale.lives']
    while not done_:      
        q_predict = dqn.model.predict([np.dstack(state_buffer[-4:][::1]).reshape((1,) + ATARI_SHAPE) ]
                                      + [np.ones((1, NUM_ACTIONS))], 
                                      batch_size = 1)
        action = np.argmax(q_predict)
        if random.random() < 0.05:
            action = np.random.randint(NUM_ACTIONS)
        del image_
        image_, reward_, done_, info = env.step(action)
        reward += reward_
        frame_buffer.append(image_.copy())  
        state_buffer.append(preprocess(image_))
        alive_frames += 1
        actions.append(action)
        lives.append(info['ale.lives'])
        prev_livess.append(prev_lives)
        if info['ale.lives'] != prev_lives:
            image_, reward_, done_, _ = env.step(FIRE_ACTION)
            prev_lives = info['ale.lives']
    
    return reward

In [4]:
def preprocess(image):
    # Pretty crude, truncating, but fast, gray-scale preprocessing.
    # Something better with a decent time-trade-off would be welcome
    return (  image[::2, ::2, 0]//3
            + image[::2, ::2, 1]//3
            + image[::2, ::2, 2]//3).astype('uint8')

In [5]:
class stats:
    """Store some statistics as the agent learns"""
    def __init__(self):
        self.alive_frames         = 0
        self.total_reward         = 0
        self.game_number          = 0
        self.sum_predicted_reward = 0
        self.policy_actions_taken = [0 for _ in range(NUM_ACTIONS)]
        self.results              = []

In [6]:
# Get the Atari emulator fired up
env = gym.make('BreakoutDeterministic-v4')
env.reset()
new_episode = True

NUM_ACTIONS = env.action_space.n
NUM_FRAMES  = 4 # Number of frames to stack for each state
ATARI_SHAPE = (105, 80, NUM_FRAMES)

# Define the DQN model
dqn         = DDQN(NUM_ACTIONS, ATARI_SHAPE)

In [7]:
# Constants
NUM_ITERATIONS         = 50000000 # How many frames the agent will experience
MIN_OBSERVATION        = 50000    # Preload the experience replay buffer with some observations
BUFFER_SIZE            = 1000000  # Experience replay buffer size
MINIBATCH_SIZE         = 32       # Mnih et al. uses 32. Some other implementations experiment with larger batches

EPSILON_DECAY          = 1000000  # Define annealing schedule (how many iterations to get to intermediate, then final eps)
FINAL_EPSILON          = 0.1
INTERMEDIATE_EPSILON   = 0.1
INITIAL_EPSILON        = 1.0

DECAY_RATE             = 0.99     # Discounted reward factor
TARGET_NETWORK_PERIOD  = 10000    # How often to clone the Q network for predicting actions
SAVE_PERIOD            = 250000   # Save the network weights periodically
REPORT_EPISODE_PERIOD  = 100      # How many episodes before reporting stats

NUM_NO_OP_ACTIONS_MAX  = 7        # Maximum number of random actions to take after launching the ball
FIRE_ACTION            = 1        # We're gonna have to fire to start each episode moving
NO_OP_ACTION           = 0        # Do nothing (no operation) action

UPDATE_FREQUENCY       = 4        # Number of actions agent takes between SGD updates

In [8]:
# Prepare to begin learning process
prev_time              = 0
prev_iteration         = 0
iteration_             = 0
epsilon                = INITIAL_EPSILON
t0                     = time.time()

In [9]:
# Initalise experience replay memory
experience_buffer  = replay_buffer(BUFFER_SIZE)

# Initalise statistics container
learning_stats = stats()

In [10]:
# dqn.model        = load_model('saved_2250000.h5', custom_objects={'huber_loss': huber_loss})
# dqn.target_model = load_model('saved_2250000.h5', custom_objects={'huber_loss': huber_loss})

In [11]:
# dqn.model.compile(optimizer=Adam(lr=0.000025), loss=huber_loss)

In [12]:
# results_file.close()

In [13]:
new_episode = True
output_filename = 'results.txt'
results_file = open(output_filename, 'a')

In [14]:
results_file.write("Iteration, time, FPS, Average Reward, Average Q predicted, Average alive frames, epsilon, test_reward\n")

102

In [15]:
prev_lives = 5

In [16]:
# Well, here we go...
while iteration_ < NUM_ITERATIONS:
    
    #------------------------------------------------
    #
    #         Initialise Atari to random state
    #
    #------------------------------------------------
    if new_episode:
        new_episode = False
        done = False
        
        # First of all, launch the ball as the first action of the episode
        image_, reward_, done, info = env.step(FIRE_ACTION)
        
        # Then do a random number of no_op actions
        num_no_op_actions = np.random.randint(4, NUM_NO_OP_ACTIONS_MAX)
        for ii in range(num_no_op_actions):
            image_, reward_, done, info = env.step(NO_OP_ACTION)
            
            # Count loss of life as terminal (please agent, learn to not die)
            # But don't reset until all lives are lost
            if info['ale.lives'] != prev_lives:
                done = True   
            
            # The last NUM_FRAMES observations can go into the experience buffer
            if ii >= num_no_op_actions - NUM_FRAMES:
                experience_buffer.append(preprocess(image_), NO_OP_ACTION, reward_, done)
            del image_
        
            
    else:
        
        #------------------------------------------------
        #
        #              Agent plays Atari
        #
        #------------------------------------------------
        
        # Recall latest state from observations 
        initial_state = experience_buffer.get_last_state()
        action = NO_OP_ACTION # default action - will get overwritten in a moment
        if iteration_ < MIN_OBSERVATION:
            # Start off by playing randomly
            action = np.random.randint(NUM_ACTIONS)
        else:
            if random.random() < epsilon:
                # Occasionally explore with a random action
                action = np.random.randint(NUM_ACTIONS)
            else:
                # Play from policy
                q_predict = dqn.model.predict([initial_state.reshape((1,) + ATARI_SHAPE)] + [np.ones((1, NUM_ACTIONS))], 
                                              batch_size = 1)

                action = np.argmax(q_predict)

                # Save some stats for later
                learning_stats.sum_predicted_reward += np.max(q_predict)
                learning_stats.policy_actions_taken[action] += 1  

        # Take action, observe image, are we done?
        image_, reward_, done, info = env.step(action)
        
        # Count loss of life as terminal (please agent, learn to not die)
        # But don't reset until all lives are lost
        if info['ale.lives'] != prev_lives:
            done = True    

        # Save experience to buffer
        experience_buffer.append(preprocess(image_), action, reward_, done)
        del image_
            
        learning_stats.alive_frames += 1
        learning_stats.total_reward += reward_   
            
        iteration_ += 1

        #------------------------------------------------
        #
        #  Agent recalls previous experiences and learns
        #
        #------------------------------------------------

        
        if (iteration_ > MIN_OBSERVATION) and (iteration_ % UPDATE_FREQUENCY == 0):

            s_now, a_batch, r_batch, d_batch, s_next = experience_buffer.get_batch(MINIBATCH_SIZE)      
            
            # For the next state, what does the target model currently predict as the maximum discounted reward 
            target_q_values_next = dqn.target_model.predict([s_next] + [np.ones((MINIBATCH_SIZE, NUM_ACTIONS))], 
                                                            batch_size = MINIBATCH_SIZE)
            
            # Use the main network to predict the action to take
            q_values_next = dqn.model.predict([s_next] + [np.ones((MINIBATCH_SIZE, NUM_ACTIONS))], 
                                                            batch_size = MINIBATCH_SIZE)        
            actions_batch = np.argmax(q_values_next, axis = 1)
            
            # Use the target network to predict the Q-value of that action
            target_q_values = target_q_values_next[np.arange(MINIBATCH_SIZE), actions_batch]
            
            targets = np.zeros((MINIBATCH_SIZE, NUM_ACTIONS))
            for ii in range(MINIBATCH_SIZE):
                targets[ii, a_batch[ii]] = r_batch[ii] # for the observed action, set reward
                if d_batch[ii] == False:
                    targets[ii, a_batch[ii]] += DECAY_RATE * target_q_values[ii]

            action_mask = np.zeros((MINIBATCH_SIZE, NUM_ACTIONS))
            action_mask[np.arange(MINIBATCH_SIZE), a_batch] = 1.0

            dqn.model.train_on_batch([s_now] + [action_mask], targets.astype('float32'))
            
        # Ocassionally we'll copy the weights from the backprop'd model to the target model
        # used for predicting the discounted future reward for an action
        if iteration_ % TARGET_NETWORK_PERIOD == TARGET_NETWORK_PERIOD-1:
            model_weights = dqn.model.get_weights()
            dqn.target_model.set_weights(model_weights)
            
            print('Cloning model @ iteration', iteration_)
            print()
    
    if done:
        
        learning_stats.results.append((learning_stats.total_reward, 
                                       learning_stats.sum_predicted_reward, 
                                       learning_stats.alive_frames, 
                                       epsilon))
        
        if learning_stats.game_number % REPORT_EPISODE_PERIOD == REPORT_EPISODE_PERIOD - 1:
            
            result = np.mean(np.array(learning_stats.results[-REPORT_EPISODE_PERIOD:]), axis = 0)
            print('Played 100 episodes...')
            print('Iteration', iteration_)
            print('Total time', int(time.time() - t0))
            print('FPS',  (iteration_ - prev_iteration)/(time.time() - prev_time))
            print('Average reward', result[0])
            print('Average Q predicted', result[1]/result[2])
            print('Average alive frames', result[2])
            print('Epsilon', result[3])
            print('Policy actions', learning_stats.policy_actions_taken)
            
            test_rewards = []
            for ii in range(10):
                test_rewards.append(test_agent())
            test_reward = np.mean(test_rewards)
            test_min    = np.min(test_rewards)
            test_max    = np.max(test_rewards)
            print('Test Reward', test_reward) 
            print('Test Reward [min-max] [', test_min, test_max, ']') 
            
            print()
            prev_time = time.time()
            prev_iteration = iteration_
            learning_stats.policy_actions_taken = [0 for _ in range(NUM_ACTIONS)]
            
            results_file = open(output_filename, 'a')
            
            results_file.write("%f, %f, %f, %f, %f, %f, %f, %f\n" % (iteration_, 
                                                 int(time.time() - t0), 
                                                 (iteration_ - prev_iteration)/(time.time() - prev_time),
                                                 result[0],
                                                 result[1]/result[2],
                                                 result[2],
                                                 result[3],
                                                 test_reward))
        
        # Reset for new episode
        if info['ale.lives'] == 0:
            env.reset()
        new_episode = True
        learning_stats.alive_frames = 0
        learning_stats.total_reward = 0
        learning_stats.game_number += 1
        learning_stats.sum_predicted_reward = 0
        
        # Kick off the next episode if necessary (FIRE action launches the ball)
        if prev_lives != info['ale.lives']:
            _, _, _, info = env.step(FIRE_ACTION)
            prev_lives = info['ale.lives']
        
    # Annealing schedule    
    if epsilon > INTERMEDIATE_EPSILON:
        epsilon -= (INITIAL_EPSILON-INTERMEDIATE_EPSILON)/EPSILON_DECAY  
    elif epsilon > FINAL_EPSILON:
        epsilon -= (INTERMEDIATE_EPSILON-FINAL_EPSILON)/EPSILON_DECAY  
      
    # Save model every so often so we can interrupt learning and do warm restarts
    if iteration_ % SAVE_PERIOD == SAVE_PERIOD - 1:
        dqn.model.save('saved_' + str(iteration_+1) + '.h5')

Played 100 episodes...
Iteration 2470
Total time 6
FPS 1.6156518478712096e-06
Average reward 0.19
Average Q predicted 0.0
Average alive frames 24.7
Epsilon 0.9988014610000395
Policy actions [0, 0, 0, 0]


NameError: name 'num_random_actions' is not defined

In [None]:
#------------------------------------------------
#
#  Test agent by playing Atari from policy
#
#------------------------------------------------

# restart environment
frame_buffer = []
state_buffer = []

env.reset()
num_no_op_actions = np.random.randint(4, NUM_NO_OP_ACTIONS_MAX)
reward = 0
done = False

# release the ball!
image_, reward_, done_, _ = env.step(FIRE_ACTION)
frame_buffer.append(image_)
prev_lives = 5
alive_frames = 0     
actions = []
prev_livess = []
lives = []
# take a number of no-op actions to randomise start
for ii in range(num_no_op_actions):
    del image_
    
    image_, reward_, done_, info = env.step(NO_OP_ACTION)


    frame_buffer.append(image_)
    state_buffer.append(preprocess(image_))

    reward += reward_
    done = done | done_

    # count loss of life as terminal (agent must learn to not die)
    if info['ale.lives'] != 5:
        done = True
while not done_:      
    q_predict = dqn.model.predict([np.dstack(state_buffer[-4:][::1]).reshape((1,) + ATARI_SHAPE) ]
                                  + [np.ones((1, NUM_ACTIONS))], 
                                  batch_size = 1)
    action = np.argmax(q_predict)
    if random.random() < 0.05:
        action = np.random.randint(NUM_ACTIONS)
    del image_
    image_, reward_, done_, info = env.step(action)
    frame_buffer.append(image_.copy())  
    state_buffer.append(preprocess(image_))
    alive_frames += 1
    actions.append(action)
    lives.append(info['ale.lives'])
    prev_livess.append(prev_lives)
    if info['ale.lives'] != prev_lives:
        print('Launching Ball')
        image_, reward_, done_, _ = env.step(FIRE_ACTION)
        prev_lives = info['ale.lives']

In [None]:
plt.plot(actions)

In [None]:
plt.plot(lives)
plt.plot(prev_livess)

In [None]:
# Show video of agent playing Atari
anim = display_frames_as_gif(frame_buffer)

In [None]:
# Save video as an animated gif
plt.rcParams["animation.convert_path"] = 'C:\Program Files\ImageMagick-7.0.7-Q16\magick.exe'
writer = animation.ImageMagickFileWriter(fps = 30)
anim.save(filename = "breakout_4.gif", writer=writer)