# Final exam Bubble Bobble

- Team: TToBoT
- Member: { Sejun, Steve, Victor } @kaist

## Import libraries and configure levels to eval on

In [1]:
# Import general libraries
import gin.tf
import seaborn as sns
import matplotlib.pyplot as plt
from absl import flags
import numpy as np
import tensorflow as tf
import os
import sys
import gin
import numpy as np
import tensorflow.compat.v1 as tf
from tensorflow.contrib import slim as contrib_slim
import retro
import gym
import cv2

# Import dopamine libraries
import bubble
from dopamine.colab import utils as colab_utils
from dopamine.agents.dqn import dqn_agent
from dopamine.agents.implicit_quantile import implicit_quantile_agent
from dopamine.agents.rainbow import rainbow_agent
from dopamine.discrete_domains import atari_lib
from dopamine.discrete_domains import iteration_statistics
from dopamine.discrete_domains import run_experiment
from dopamine.discrete_domains.run_experiment import create_agent
from dopamine.utils import agent_visualizer
from dopamine.utils import atari_plotter
from dopamine.utils import bar_plotter
from dopamine.utils import line_plotter

pygame 1.9.6
Hello from the pygame community. https://www.pygame.org/contribute.html


## Custom Runner Class

The ExamRunner class runs our experiment: for each requested game level it plays one episode and records the real (unwrapped) environment score.

In [2]:
class ExamRunner(run_experiment.Runner):
    """Evaluation-only runner: plays one episode per requested game level.

    For every level in `game_levels_eval` the environment is reset to that
    level, one episode is played with the agent in eval mode, and the value the
    environment exposes as `episode_true_return` is collected. Each result can
    optionally be published on a queue so another process can display it.
    """

    def __init__(self, base_dir, trained_agent_ckpt_path, create_agent_fn, game_levels_eval, name='', queue=None):
        """Creates the runner and restores the trained agent.

        Args:
          base_dir: forwarded to the base `Runner` constructor.
          trained_agent_ckpt_path: checkpoint path handed to
            `agent.reload_checkpoint`.
          create_agent_fn: forwarded to the base `Runner` constructor.
          game_levels_eval: iterable of game levels to evaluate, in order.
          name: label used in log lines and in the messages posted to `queue`.
          queue: optional object with a `put()` method that receives one dict
            per posted result; nothing is posted when it is None.
        """
        tf.logging.info('Creating ExamRunner({}) ...'.format(name))
        # Both attributes are read by _initialize_checkpointer_and_maybe_resume().
        # They are assigned before the base constructor runs because that hook is
        # presumably invoked from Runner.__init__ -- keep this ordering.
        self._trained_agent_ckpt_path = trained_agent_ckpt_path
        self._use_legacy_checkpoint = False
        super(ExamRunner, self).__init__(base_dir, create_agent_fn)
        self._agent.eval_mode = True
        self.game_levels_eval = game_levels_eval
        # Use a very large step limit so the env is free to continue until game over.
        self._max_steps_per_episode = 10000000
        self.name = name
        self.queue = queue

    def post_reward(self, level=0, reward=0):
        """Publishes one `{'name', 'level', 'reward'}` message if a queue is attached."""
        if self.queue is None:
            return
        self.queue.put({'name': self.name, 'level': level, 'reward': reward})

    def _initialize_checkpointer_and_maybe_resume(self, checkpoint_file_prefix):
        """Restores the trained agent instead of resuming a training run.

        `checkpoint_file_prefix` is accepted for signature compatibility and is
        not used here.
        """
        tf.logging.info('Initializing checkpointer and resume ExamRunner ...')
        self._agent.reload_checkpoint(self._trained_agent_ckpt_path,
                                      self._use_legacy_checkpoint)
        self._start_iteration = 0

    def _run_one_iteration(self, game_level):
        """Runs one iteration, consisting of a single episode in the environment.

        Args:
          game_level: level number; only used for logging here (the caller is
            responsible for resetting the environment to that level).

        Returns:
          The environment's `episode_true_return` after the episode, i.e. the
          return of the original environment rather than of the wrapper.
        """
        tf.logging.info('Starting iteration for game_level %d', game_level)

        # Initialize for the run.
        self._agent.eval_mode = True
        self._environment.episode_true_return = 0

        # Run one episode; the (length, return) pair it yields is not needed,
        # the score is read from the environment instead.
        self._run_one_episode()

        # Read the true return from the env, then clear it for the next episode.
        env_true_return = self._environment.episode_true_return
        self._environment.episode_true_return = 0

        return env_true_return

    def run_experiment(self):
        """Runs the full evaluation: one iteration per evaluation game level.

        Returns:
          List with one environment return per entry of `game_levels_eval`,
          in the same order.
        """
        print(self.game_levels_eval)
        episode_returns = []
        self.post_reward(0)  # post starting (level 0 marks "started")
        for game_level in self.game_levels_eval:
            print("========================================")
            tf.logging.info('Start iteration for game_level %d', game_level)
            # Choose the given game_level.
            self._environment.reset(game_level=game_level)

            # Run one iteration in the environment and publish its score.
            env_return = self._run_one_iteration(game_level)
            self.post_reward(game_level, env_return)

            # Log and save the return per episode.
            tf.logging.info('End iteration for game_level %d. Episode return: %d', game_level, env_return)
            episode_returns.append(env_return)

        print("========================================")
        return episode_returns


## Custom IQN Agent class

This IQN agent subclass only defines its own `reload_checkpoint` method. Everything else is inherited unchanged from the `ImplicitQuantileAgent` class.

In [3]:
class ExamIQNAgent(implicit_quantile_agent.ImplicitQuantileAgent):
    """ImplicitQuantileAgent with a helper to restore weights from a checkpoint."""

    def reload_checkpoint(self, checkpoint_path, use_legacy_checkpoint=False):
        """Restores agent variables from `checkpoint_path` into `self._sess`.

        Args:
          checkpoint_path: checkpoint prefix passed to `tf.train.list_variables`
            and `Saver.restore`.
          use_legacy_checkpoint: when True, all global variables are mapped
            through `atari_lib.maybe_transform_variable_names`; otherwise only
            variables present both in the graph and in the checkpoint are
            restored.
        """
        if use_legacy_checkpoint:
            # tf.all_variables() is a deprecated alias of tf.global_variables().
            variables_to_restore = atari_lib.maybe_transform_variable_names(
                tf.global_variables(), legacy_checkpoint_load=True)
        else:
            graph_var_names = {var.name for var in tf.global_variables()}
            # Checkpoint entries carry no output suffix; graph variable names
            # end in ':0', so add it before intersecting.
            ckpt_var_names = {
                '{}:0'.format(name)
                for name, _ in tf.train.list_variables(checkpoint_path)
            }
            include_vars = list(graph_var_names & ckpt_var_names)
            variables_to_restore = contrib_slim.get_variables_to_restore(
                include=include_vars)
        if variables_to_restore:
            reloader = tf.train.Saver(var_list=variables_to_restore)
            reloader.restore(self._sess, checkpoint_path)
            tf.logging.info('Done restoring from %s', checkpoint_path)
        else:
            tf.logging.info('Nothing to restore!')


## Import Gin configuration

The dopamine framework works with the support of .gin files as configuration files.

This configuration is taken from IQN9.

In [4]:
# Gin configuration, kept inline as a string so the notebook is self-contained.
# Everything between the triple quotes is gin syntax (including its own `#`
# comments) and is parsed at runtime by gin.parse_config() below.
gin_config = '''
# run train for bubble agent
# - NOTE: customized for bubble w/ IQN
# - origin from `dopamine/agents/implicit_quantile/configs/implicit_quantile.gin`
#
# [RUN TRAIN]
# $ python -um dopamine.discrete_domains.train --base_dir=/tmp/bubble_iqn1 --gin_files='bubble/iqn_bubble.gin' --gin_bindings='RainbowAgent.tf_device="/cpu:*"'

# Hyperparameters follow Dabney et al. (2018), but we modify as necessary to
# match those used in Rainbow (Hessel et al., 2018), to ensure apples-to-apples
# comparison.
import dopamine.agents.implicit_quantile.implicit_quantile_agent
import dopamine.agents.rainbow.rainbow_agent
import dopamine.discrete_domains.atari_lib
import dopamine.discrete_domains.run_experiment
import dopamine.replay_memory.prioritized_replay_buffer
import gin.tf.external_configurables

# agent for bubble
import bubble.retro_lib_exam
import bubble.bubble_agent
retro_lib_exam.create_retro_environment_exam.game_name = 'BubbleBobble'
retro_lib_exam.create_retro_environment_exam.level = 1
Runner.create_environment_fn = @retro_lib_exam.create_retro_environment_exam


create_agent.agent_name = 'implicit_quantile'
RetroPreprocessingExam.wall_offset = 0          # use 200 if activate
#RetroPreprocessingExam.step_penalty = -0.0001  # every step penalty
RetroPreprocessingExam.step_penalty = 0.0005    # every step penalty (survival is better since 200623/443)
RetroPreprocessingExam.reset_fire = 0
RetroPreprocessingExam.score_bonus = 0.02       # bonus reward if got new-score.

ImplicitQuantileAgent.kappa = 1.0
ImplicitQuantileAgent.num_tau_samples = 64
ImplicitQuantileAgent.num_tau_prime_samples = 64
ImplicitQuantileAgent.num_quantile_samples = 32
# ImplicitQuantileAgent.double_dqn = True   # NOTE - default is False
RainbowAgent.gamma = 0.99
RainbowAgent.update_horizon = 3
RainbowAgent.min_replay_history = 20000 # agent steps
RainbowAgent.update_period = 4
RainbowAgent.target_update_period = 8000 # agent steps

RainbowAgent.epsilon_train = 0.001
RainbowAgent.epsilon_eval = 0.001
RainbowAgent.epsilon_decay_period = 200000  # agent steps (1 at step=1 => 0.001 at step=200000)

# IQN currently does not support prioritized replay.
RainbowAgent.replay_scheme = 'uniform'
RainbowAgent.tf_device = '/gpu:0'  # '/cpu:*' use for non-GPU version
RainbowAgent.optimizer = @tf.train.AdamOptimizer()

tf.train.AdamOptimizer.learning_rate = 0.00005
tf.train.AdamOptimizer.epsilon = 0.0003125

Runner.num_iterations = 600
Runner.training_steps = 200000             # origin 250000
Runner.evaluation_steps = 0
Runner.max_steps_per_episode = 20000       # origin 27000

WrappedPrioritizedReplayBuffer.replay_capacity = 1000000
WrappedPrioritizedReplayBuffer.batch_size = 32
'''

# Apply the configuration above to the current process. skip_unknown=False is
# passed explicitly so unknown configurables are not silently skipped.
gin.parse_config(gin_config, skip_unknown=False)

# Extra gin files / bindings; startMyRunnerIQN() hands both lists to
# run_experiment.load_gin_configs(). They are empty here, so nothing beyond
# the inline config is loaded.
gin_files = []
gin_bindings = []

## Run experiment code

In this section we run the experiment and get the scores for each game level

- the agent is our ExamIQNAgent (the dopamine IQN agent plus a custom reload_checkpoint), created by the create_agent_fn() function defined below
- the runner is our custom ExamRunner class

In [5]:
def create_runner_fn(base_dir, trained_agent_ckpt_path, create_agent, game_levels_eval, name='', queue=None):
    """Builds the evaluation runner.

    Args:
      base_dir: str, base directory for hosting all subdirectories.
      trained_agent_ckpt_path: checkpoint from which the agent is restored.
      create_agent: function used by the runner to create the agent.
      game_levels_eval: game levels the runner evaluates.
      name: label forwarded to the runner.
      queue: optional queue forwarded to the runner for posting results.

    Returns:
      An `ExamRunner` instance.
    """
    runner = ExamRunner(
        base_dir=base_dir,
        trained_agent_ckpt_path=trained_agent_ckpt_path,
        create_agent_fn=create_agent,
        game_levels_eval=game_levels_eval,
        name=name,
        queue=queue,
    )
    return runner

def create_agent_fn(sess, environment, summary_writer=None):
    """Creates an ExamIQNAgent sized to the environment's discrete action space."""
    action_count = environment.action_space.n
    agent = ExamIQNAgent(sess,
                         num_actions=action_count,
                         summary_writer=summary_writer)
    return agent

#! eval with IQN7/IQN8/IQN9 Result
def startMyRunnerIQN(name='', Q=None, chkpt=100):
    """Evaluates one trained IQN checkpoint on every level in GAME_LEVELS_EVAL.

    Args:
      name: run name; selects the checkpoint directory `/tmp/bubble_<name>/`
        and labels the printed results.
      Q: optional queue handed to the runner for posting per-level results.
      chkpt: checkpoint number appended to the `tf_ckpt-` prefix.
    """
    from dopamine.discrete_domains import run_experiment

    # gin_files / gin_bindings / GAME_LEVELS_EVAL are notebook-level globals.
    run_experiment.load_gin_configs(gin_files, gin_bindings)

    # Create the runner for the requested checkpoint.
    checkpoint_path = '/tmp/bubble_{}/checkpoints/tf_ckpt-{}'.format(name, chkpt)
    runner = create_runner_fn(
        base_dir='./exam',
        trained_agent_ckpt_path=checkpoint_path,
        create_agent=create_agent_fn,
        game_levels_eval=GAME_LEVELS_EVAL,
        name=name,
        queue=Q,
    )
    returns_per_level = runner.run_experiment()

    # Print the results, one line per evaluated level.
    for level, level_return in zip(GAME_LEVELS_EVAL, returns_per_level):
        print("Game[{}] level: {}\tEpisode undiscounted env return: {}".format(name, level, level_return))

## Support Multi-Processing

In [6]:
# One IOLoop instance is looked up once and shared by everything below.
def get_ioloop():
    """Returns the zmq IOLoop instance when running under an IPython kernel.

    Returns None when there is no IPython shell or the shell has no `kernel`
    attribute.
    """
    import IPython
    import zmq

    shell = IPython.get_ipython()
    if not shell or not hasattr(shell, 'kernel'):
        return None
    return zmq.eventloop.ioloop.IOLoop.instance()


ioloop = get_ioloop()

# Thread for update canvas
import threading, time
from ipycanvas import Canvas
from multiprocessing import Process, Queue

# Main Thread
class MyThread(threading.Thread):
    """Background ticker that periodically schedules `display()` on the IOLoop.

    The thread starts itself at the end of `__init__`. Every `sleep` seconds it
    queues a callback on the module-level `ioloop`; the callback calls
    `display()` unless `quit()` has been requested in the meantime. Subclasses
    override `display()`.

    NOTE(review): `ioloop` is None outside an IPython kernel (see get_ioloop),
    in which case `run()` fails on `ioloop.add_callback`.
    """

    def __init__(self, sleep = 0.5, name = 'my'):
        """Creates and immediately starts the thread.

        Args:
          sleep: seconds to wait between two scheduled `display()` calls.
          name: thread name, also used in the quit message.
        """
        super().__init__()
        self._quit = threading.Event()
        # Honour the caller-supplied interval (it used to be hard-coded to 0.5).
        self.sleep = sleep
        self.name = name
        self.start()

    def run(self):
        """Schedules `display()` on the IOLoop until `quit()` is called."""
        def update_progress():
            # Re-check on the IOLoop side: quit() may have been called between
            # scheduling and execution of this callback.
            if self._quit.is_set():
                return
            self.display()

        while not self._quit.is_set():
            time.sleep(self.sleep)
            ioloop.add_callback(update_progress)
        print("! T[{}].Quit()".format(self.name))

    def quit(self):
        """Asks the loop in `run()` to stop after its current iteration."""
        self._quit.set()

    def display(self):
        """Hook called on every tick; does nothing in the base class."""
        pass

# draw to canvas
def drawPlot2Canvas(fig = None, x=0, y=0, canv = None):
    """Renders a matplotlib figure and blits it onto an ipycanvas Canvas.

    Args:
      fig: figure to render; defaults to the current pyplot figure.
      x, y: top-left position on the target canvas where the image is drawn.
      canv: target canvas; defaults to the notebook-level global `canvas`.
    """
    canv = canvas if canv is None else canv
    fig = plt.gcf() if fig is None else fig
    # Close this specific figure in pyplot so it is not also shown inline
    # (a bare plt.close() would close whatever figure happens to be current).
    plt.close(fig)
    fig.canvas.draw()    # rasterize the figure
    # Public accessor for the rendered pixel buffer, copied into an array.
    arr = np.array(fig.canvas.buffer_rgba())
    h, w = arr.shape[:2]
    # Stage the image on an off-screen canvas of exactly the image size and
    # frame it; coordinates here are local to that canvas, so the frame starts
    # at (0, 0) regardless of the target offset.
    cv = Canvas(width=w, height=h)
    cv.put_image_data(arr, 0, 0)
    cv.stroke_rect(0, 0, w-1, h-1)
    # clear_rect takes (x, y, width, height): clear exactly the area the image
    # will cover on the target, then draw it at the requested offset.
    canv.clear_rect(x, y, w, h)
    canv.draw_image(cv, x, y)
    
# train thread
def startProcessEval(target = None, name = 'T0', chkpt = 0):
    """Spawns one evaluation process and registers it in `proc_list`.

    The child is invoked as `target(name, proc_queue, chkpt)`.

    Returns:
      The started `Process`.
    """
    global proc_queue
    # Lazily (re)create the shared queue if it has been reset to None.
    if proc_queue is None:
        proc_queue = Queue()
    worker = Process(target=target, args=(name, proc_queue, chkpt))
    proc_list.append(worker)
    worker.start()
    return worker

# stop(or kill) processes
def stopProcessEval():
    """Terminates every registered evaluation process and empties `proc_list`."""
    global proc_list
    for proc in proc_list:
        proc.terminate()
        proc.join()
        # terminate() always returns None, so report the exit code that is
        # available once join() has returned.
        print('! terminated {} (exitcode = {})'.format(proc.name, proc.exitcode))
    proc_list = []

# MyThread for status display
class MyTrainStatus(MyThread):
    """Status thread: drains `proc_queue` and redraws the reward plot and table.

    Each message is expected to be a dict with 'name', 'level' and 'reward'
    keys (missing keys fall back to 'none' / 0 / 0). Messages with level <= 0
    are ignored.
    """

    # Not read anywhere in the visible code; kept for compatibility.
    draw_to_plot = True

    def __init__(self, canvas = None):
        """Creates the status thread.

        Args:
          canvas: target canvas passed on to drawPlot2Canvas() on every redraw.
        """
        # Set up all state BEFORE calling the base constructor: MyThread.__init__
        # starts the thread, and display() reads these attributes.
        self.rewards = {}      # run name -> list of rewards in arrival order
        self.canvas = canvas
        super().__init__(name='status')
        print('! MyTrainStatus({})'.format(self.name))

    def display(self):
        """Consumes pending results and, if any arrived, redraws the figure."""
        received = 0
        # Pop everything currently queued.
        while not proc_queue.empty():
            msg = proc_queue.get()
            run_name = msg.get('name', 'none')
            level = msg.get('level', 0)
            reward = msg.get('reward', 0)
            # Only if level is valid (level 0 is the "started" marker).
            if level > 0:
                self.rewards.setdefault(run_name, []).append(reward)
                received += 1

        # Nothing new: keep the previous drawing.
        if received == 0:
            return

        # One line per run.
        fig = plt.figure(1, figsize=(8, 6))
        max_len = 0
        for run_name, series in self.rewards.items():
            plt.plot(series, label=run_name)
            max_len = max(len(series), max_len)
        plt.legend(fancybox=True, framealpha=1, shadow=True, borderpad=1)
        plt.xticks([])  # hide x-ticks; the table below labels the columns

        # Table: one row per run, one column per evaluated level. Columns past
        # the end of GAME_LEVELS_EVAL are labelled with their index.
        cols = [GAME_LEVELS_EVAL[i] if i < len(GAME_LEVELS_EVAL) else i for i in range(max_len)]
        labels = []
        tables = []
        for run_name, series in self.rewards.items():
            labels.append(run_name)
            # Pad shorter runs with empty cells so every row has max_len entries.
            tables.append(['%2d' % (series[i]) if i < len(series) else '' for i in range(max_len)])
        plt.table(cellText=tables, rowLabels=labels, colLabels=cols, loc='bottom')

        # Draw to canvas.
        drawPlot2Canvas(fig, canv = self.canvas)
            
# Notebook-level state shared by startProcessEval(), stopProcessEval() and
# MyTrainStatus: the spawned worker processes and the queue they report on.
proc_list = list()
proc_queue = Queue()

## Evaluate Levels w/ multi-processing

In [7]:
# Game level numbers on which every checkpoint is evaluated.
# Alternative (disabled) level set:
#   [1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 20, 31, 41, 50, 60, 71, 81, 90]
GAME_LEVELS_EVAL = [
    2, 9, 15, 33, 46,
    59, 64, 73, 76, 81,
]

In [18]:
# Start one evaluation process per checkpoint to compare. Each process runs
# startMyRunnerIQN(name, proc_queue, chkpt) and reports through proc_queue.
# The iqn7 run (checkpoint 199) is disabled; re-enable the next line to add it.
#startProcessEval(target = startMyRunnerIQN, name = 'iqn7', chkpt = 199)
startProcessEval(target = startMyRunnerIQN, name = 'iqn8', chkpt = 654)
startProcessEval(target = startMyRunnerIQN, name = 'iqn9', chkpt = 304)

<Process(Process-9, started)>

INFO:tensorflow:Creating ExamRunner(iqn8) ...
! create_retro_environment in retro_lib_exam.py: BubbleBobble/1
INFO:tensorflow:Create RetroGame in RetroPreprocessingExam:BubbleBobble-Nes w/ stage:Level01
INFO:tensorflow:Creating ExamRunner(iqn9) ...
! create_retro_environment in retro_lib_exam.py: BubbleBobble/1
INFO:tensorflow:Create RetroGame in RetroPreprocessingExam:BubbleBobble-Nes w/ stage:Level01
! RetroPreprocessingExam: wall_offset=0, step_penalty=0.0005, game_level=1, reset_fire=0, score_bonus=0.02
! RetroPreprocessingExam: wall_offset=0, step_penalty=0.0005, game_level=1, reset_fire=0, score_bonus=0.02
INFO:tensorflow:Creating ExamIQNAgent agent with the following parameters:
INFO:tensorflow:	 gamma: 0.990000
INFO:tensorflow:	 update_horizon: 3.000000
INFO:tensorflow:	 min_replay_history: 20000
INFO:tensorflow:	 update_period: 4
INFO:tensorflow:	 target_update_period: 8000
INFO:tensorflow:	 epsilon_train: 0.001000
INFO:tensorflow:	 epsilon_eval: 0.001000
INFO:tensorflow:	 eps

INFO:tensorflow:	 num_quantile_samples: 32
INFO:tensorflow:	 quantile_embedding_dim: 64
INFO:tensorflow:	 double_dqn: False
INFO:tensorflow:Initializing checkpointer and resume ExamRunner ...
INFO:tensorflow:Initializing checkpointer and resume ExamRunner ...
INFO:tensorflow:Restoring parameters from /tmp/bubble_iqn8/checkpoints/tf_ckpt-654
INFO:tensorflow:Restoring parameters from /tmp/bubble_iqn9/checkpoints/tf_ckpt-304
INFO:tensorflow:Done restoring from /tmp/bubble_iqn8/checkpoints/tf_ckpt-654
[2, 9, 15, 33, 46, 59, 64, 73, 76, 81]
INFO:tensorflow:Start iteration for game_level 2
INFO:tensorflow:Create RetroGame in RetroPreprocessingExam:BubbleBobble-Nes w/ stage:Level02
INFO:tensorflow:Done restoring from /tmp/bubble_iqn9/checkpoints/tf_ckpt-304
[2, 9, 15, 33, 46, 59, 64, 73, 76, 81]
INFO:tensorflow:Start iteration for game_level 2
INFO:tensorflow:Create RetroGame in RetroPreprocessingExam:BubbleBobble-Nes w/ stage:Level02
INFO:tensorflow:Starting iteration for game_level 2
INFO:t

In [17]:
# Create the canvas the status thread draws on, start the status thread, and
# end the cell with `canvas` so the widget is displayed as the cell output.
canvas = Canvas(width=640, height=480)
tstatus = MyTrainStatus(canvas)
canvas
# Legend for the run names shown in the plot:
# iqn7 - checkpoint after 199 training iterations on stage level 1 only
# iqn8 - checkpoint after 654 training iterations with the mixed strategy (see presentation)
# iqn9 - checkpoint after 304 training iterations on each stage level 1~50 (continued from iqn7 after iteration 200)

! MyTrainStatus(status)


Canvas(height=480, width=640)

In [16]:
# Stop the status thread first so it no longer schedules redraws ...
if tstatus:
    tstatus.quit()
# ... then terminate every evaluation process.
stopProcessEval()

! terminated = None
! terminated = None
