In [None]:
import matplotlib.pyplot as plt
import numpy as np
import time
import random
import gym
from memory import replay_buffer
from DQN import DDQN

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
def preprocess(image):
    """Turn an RGB frame into a half-resolution grey-scale uint8 image.

    Every second row and column is kept, then the three colour channels are
    each floor-divided by 3 and added together (a cheap, truncating
    approximation of the channel mean).

    Parameters
    ----------
    image : array indexable as image[rows, cols, channel] with channels 0..2.

    Returns
    -------
    Array of dtype uint8 with half as many rows and columns (rounded up).
    """
    # Sub-sample once and reuse the view for all three channels.
    halved = image[::2, ::2]
    red, green, blue = (halved[:, :, channel] // 3 for channel in range(3))

    # Dividing before summing keeps a uint8 input from overflowing (3 * 85 = 255).
    grey = red + green + blue
    return grey.astype('uint8')

In [None]:
class stats:
    """Counters for the episode in progress plus a log of finished episodes.

    Reads the module-level NUM_ACTIONS when instantiated, so that global must
    exist before the first stats() call.
    """
    def __init__(self):
        # Per-episode counters; the training loop zeroes these after each episode.
        self.alive_frames = self.total_reward = 0
        # Number of episodes completed so far.
        self.game_number = 0
        # Running sum of the max predicted Q-value over policy-chosen actions.
        self.sum_predicted_reward = 0
        # How many times the policy (not random exploration) chose each action.
        self.policy_actions_taken = [0] * NUM_ACTIONS
        # One tuple appended per finished episode by the training loop.
        self.results = []

In [None]:
# Create the Breakout environment and put it into its initial state
env = gym.make('BreakoutDeterministic-v4')
env.reset()

# Tells the training loop that its first step must launch the ball
new_episode = True

# Dimensions shared by the network and the training loop
NUM_FRAMES  = 4                        # Number of frames to stack for each state
NUM_ACTIONS = env.action_space.n       # Size of the environment's action set
# NOTE(review): (105, 80) presumably matches the half-resolution frames
# produced by preprocess() for 210x160 Atari images - confirm.
ATARI_SHAPE = (105, 80) + (NUM_FRAMES,)

# Build the DQN model from the action count and the state shape
dqn = DDQN(NUM_ACTIONS, ATARI_SHAPE)

In [None]:
# Hyper-parameters and other constants used by the training loop

# --- Run length and replay memory ---------------------------------------
# How many frames the agent will experience
NUM_ITERATIONS         = 10 ** 7
# Iterations of purely random play before the policy is used and training starts
MIN_OBSERVATION        = 50 * 1000
# Size handed to the experience replay buffer
BUFFER_SIZE            = 10 ** 6
# Transitions per training batch (Mnih et al. use 32; some implementations go larger)
MINIBATCH_SIZE         = 32

# --- Epsilon-greedy annealing schedule ----------------------------------
# Epsilon falls linearly from INITIAL to INTERMEDIATE over EPSILON_DECAY
# iterations, then from INTERMEDIATE to FINAL over another EPSILON_DECAY.
INITIAL_EPSILON        = 1.0
INTERMEDIATE_EPSILON   = 0.1
FINAL_EPSILON          = 0.01
EPSILON_DECAY          = 500 * 1000

# --- Learning and book-keeping ------------------------------------------
# Discount factor applied to the predicted future reward
DECAY_RATE             = 0.99
# Iterations between copies of the trained network's weights into the target network
TARGET_NETWORK_PERIOD  = 10 * 1000
# Iterations between saves of the network to disk
SAVE_PERIOD            = 250 * 1000
# Episodes between printed progress reports
REPORT_EPISODE_PERIOD  = 100

# --- Episode start-up ---------------------------------------------------
# Upper bound on the number of random actions taken after launching the ball
NUM_RANDOM_ACTIONS_MAX = 7
# Action sent as the first step of every episode to get the ball moving
FIRE_ACTION            = 1

In [None]:
# Prepare to begin learning process
t0                     = time.time()      # Wall-clock start, used for the 'Total time' report
# Start the FPS clock at the real current time. Starting it at 0 would make
# the first report divide by the number of seconds since the Unix epoch.
prev_time              = t0
prev_iteration         = 0                # Iteration at which the last report was printed
iteration_             = 0                # Loop counter; kept at module level so the loop can be resumed
epsilon                = INITIAL_EPSILON  # Current exploration probability

In [None]:
# Initialise the statistics container updated while the agent plays
learning_stats = stats()

# Initialise the experience replay memory, sized by BUFFER_SIZE
experience_buffer = replay_buffer(BUFFER_SIZE)

In [None]:
# Well, here we go...
#
# Main training loop. Every iteration does exactly one of:
#   * episode start-up: launch the ball, then take a few random actions so
#     the replay buffer holds enough frames to form a state, or
#   * a regular step: pick an action (random or from the policy), store the
#     transition and, once past MIN_OBSERVATION, train on a sampled batch.
# After either, episode-end book-keeping, epsilon annealing, target-network
# syncing and periodic saving run at loop level.

# The networks take [states, action mask] as input; for prediction the mask
# is all ones. These never change, so build them once outside the loop.
predict_mask_single = np.ones((1, NUM_ACTIONS))
predict_mask_batch  = np.ones((MINIBATCH_SIZE, NUM_ACTIONS))

for iteration_ in range(iteration_, NUM_ITERATIONS + iteration_):

    #------------------------------------------------
    #
    #         Initialise Atari to random state
    #
    #------------------------------------------------
    if new_episode:
        new_episode = False
        done = False

        # First of all, launch the ball as the first action of the episode
        image_, reward_, done_, _ = env.step(FIRE_ACTION)

        # Then do a random number of random actions. randint's upper bound is
        # exclusive, hence the +1 so NUM_RANDOM_ACTIONS_MAX itself can occur.
        # The lower bound is NUM_FRAMES so a full state can always be stored.
        num_random_actions = np.random.randint(NUM_FRAMES, NUM_RANDOM_ACTIONS_MAX + 1)
        for ii in range(num_random_actions):
            action = np.random.randint(NUM_ACTIONS)

            image_, reward_, done_, info = env.step(action)

            # Count loss of life as terminal (please agent, learn to not die).
            # This is decided *before* storing the transition so the replay
            # buffer records the same terminal flag the loop acts on.
            # NOTE(review): 5 is presumably Breakout's starting life count - confirm.
            if info['ale.lives'] != 5:
                done_ = True

            # The last NUM_FRAMES observations can go into the experience buffer
            if ii >= num_random_actions - NUM_FRAMES:
                experience_buffer.append(preprocess(image_), action, reward_, done_)
            del image_
            done = done | done_
    else:

        #------------------------------------------------
        #
        #              Agent plays Atari
        #
        #------------------------------------------------

        # Recall latest state from observations
        initial_state = experience_buffer.get_last_state()

        if iteration_ < MIN_OBSERVATION:
            # Start off by playing randomly
            action = np.random.randint(NUM_ACTIONS)
        elif random.random() < epsilon:
            # Occasionally explore with a random action
            action = np.random.randint(NUM_ACTIONS)
        else:
            # Play from policy
            q_predict = dqn.model.predict([initial_state.reshape((1,) + ATARI_SHAPE),
                                           predict_mask_single],
                                          batch_size = 1)

            action = np.argmax(q_predict)

            # Save some stats for later
            learning_stats.sum_predicted_reward += np.max(q_predict)
            learning_stats.policy_actions_taken[action] += 1

        # Take action, observe image, are we done?
        image_, reward, done, info = env.step(action)

        # Count loss of life as terminal (please agent, learn to not die).
        # Done before the append so the stored transition is marked terminal;
        # the environment is reset right after, so the next stored frame
        # belongs to a different game.
        if info['ale.lives'] != 5:
            done = True

        # Save experience to buffer
        experience_buffer.append(preprocess(image_), action, reward, done)
        del image_

        # Recall latest state from observations
        # NOTE(review): `state` is not read anywhere in this cell; the call is
        # kept in case get_last_state() has side effects - confirm and remove.
        state = experience_buffer.get_last_state()

        #------------------------------------------------
        #
        #  Agent recalls previous experiences and learns
        #
        #------------------------------------------------

        if iteration_ > MIN_OBSERVATION:

            indexes = experience_buffer.get_batch_indexes(MINIBATCH_SIZE)

            s_now, a_batch, r_batch, d_batch, s_next = experience_buffer.get_batch(indexes)

            # For the next state, what does the target model currently predict as the maximum discounted reward
            target_q_values_next = dqn.target_model.predict([s_next, predict_mask_batch],
                                                            batch_size = MINIBATCH_SIZE)

            # Only the column of the action actually taken gets a target:
            # reward, plus the discounted best next-state value unless terminal.
            targets = np.zeros((MINIBATCH_SIZE, NUM_ACTIONS))
            for ii in range(MINIBATCH_SIZE):
                targets[ii, a_batch[ii]] = r_batch[ii] # for the observed action, set reward
                if not d_batch[ii]:
                    targets[ii, a_batch[ii]] += DECAY_RATE * np.max(target_q_values_next[ii, :])

            # One-hot mask selecting the taken action for each sample
            action_mask = np.zeros((MINIBATCH_SIZE, NUM_ACTIONS))
            action_mask[np.arange(MINIBATCH_SIZE), a_batch] = 1.0

            dqn.model.train_on_batch([s_now, action_mask], targets.astype('float32'))

        #------------------------------------------------
        #
        # Record some more stats and reset the game once done
        #
        #------------------------------------------------

        learning_stats.alive_frames += 1
        learning_stats.total_reward += reward

    # Occasionally we'll copy the weights from the backprop'd model to the target model
    # used for predicting the discounted future reward for an action.
    # This lives at loop level so a sync is not skipped when its iteration
    # happens to be an episode start-up iteration.
    if iteration_ % TARGET_NETWORK_PERIOD == TARGET_NETWORK_PERIOD-1:
        model_weights = dqn.model.get_weights()
        dqn.target_model.set_weights(model_weights)

    if done:

        # Log the finished episode: (reward, summed predicted Q, frames alive, epsilon)
        learning_stats.results.append((learning_stats.total_reward,
                                       learning_stats.sum_predicted_reward,
                                       learning_stats.alive_frames,
                                       epsilon))

        if learning_stats.game_number % REPORT_EPISODE_PERIOD == REPORT_EPISODE_PERIOD - 1:

            # Column-wise mean over the most recent REPORT_EPISODE_PERIOD episodes
            result = np.mean(np.array(learning_stats.results[-REPORT_EPISODE_PERIOD:]), axis = 0)
            print('Played', REPORT_EPISODE_PERIOD, 'episodes...')
            print('Iteration', iteration_)
            print('Total time', int(time.time() - t0))
            print('FPS',  (iteration_ - prev_iteration)/(time.time() - prev_time))
            print('Average reward', result[0])
            print('Average Q predicted', result[1]/result[2])
            print('Average alive frames', result[2])
            print('Epsilon', result[3])
            print('Policy actions', learning_stats.policy_actions_taken)
            print()
            prev_time = time.time()
            prev_iteration = iteration_
            learning_stats.policy_actions_taken = [0 for _ in range(NUM_ACTIONS)]

        # Reset for new episode
        env.reset()
        new_episode = True
        learning_stats.alive_frames = 0
        learning_stats.total_reward = 0
        learning_stats.game_number += 1
        learning_stats.sum_predicted_reward = 0

    # Annealing schedule: two linear phases, each EPSILON_DECAY iterations long
    if epsilon > INTERMEDIATE_EPSILON:
        epsilon -= (INITIAL_EPSILON-INTERMEDIATE_EPSILON)/EPSILON_DECAY
    elif epsilon > FINAL_EPSILON:
        epsilon -= (INTERMEDIATE_EPSILON-FINAL_EPSILON)/EPSILON_DECAY

    # Save model every so often so we can interrupt learning and do warm restarts.
    # Uses the `dqn` instance; no name `DQN` is defined in this notebook.
    if iteration_ % SAVE_PERIOD == SAVE_PERIOD - 1:
        dqn.model.save('saved_' + str(iteration_+1) + '.h5')