In [1]:
%matplotlib inline

import gym
import itertools
import numpy as np
import os
import random
import sys
import tensorflow as tf

if "../" not in sys.path:
  sys.path.append("../")

from lib import plotting
from collections import deque, namedtuple

In [2]:
env = gym.envs.make("Breakout-v0")

[2017-08-17 11:47:50,506] Making new env: Breakout-v0


In [3]:
# Atari Actions: 0 (noop), 1 (fire), 2 (left) and 3 (right) are valid actions
VALID_ACTIONS = [0, 1, 2, 3]

In [None]:
class StateProcessor():
    """
    Processes a raw Atari iamges. Resizes it and converts it to grayscale.
    """
    def __init__(self):
        """Build the TF Graph for the StateProcessor.
           input_state[210x160x3] -> output[84x84x1]
        """
        with tf.variable_scope("state_processor"):
            self.input_state = tf.placeholder(shape=[210, 160, 3], dtype=tf.uint8)
            self.output = tf.image.rgb_to_grayscale(self.input_state)
            self.output = tf.image.crop_to_bounding_box(self.output, 34, 0, 160, 160)
            self.output = tf.image.resize_images(
                self.output, [84, 84], method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
            self.output = tf.squeeze(self.output)

    def process(self, sess, state):
        """
        Args:
            sess: A Tensorflow session object
            state: A [210, 160, 3] Atari RGB State

        Returns:
            A processed [84, 84, 1] state representing grayscale values.
        """
        return sess.run(self.output, { self.input_state: state })

In [5]:
class Estimator():
    """Q-Value Estimator neural network.

    This network is used for both the Q-Network and the Target Network.
    """

    def __init__(self, scope="estimator", summaries_dir=None):
        """Builds the TF Graph and sets up summaries in order to monitor
           our progress. 
           
        Keyword Arguments:
            scope: TF Scope name for the estimator graph
            summaries_dir: where to store TF summaries
        """
        self.scope = scope
        # Writes Tensorboard summaries to disk
        self.summary_writer = None
        with tf.variable_scope(scope):
            # Build the graph
            self._build_model()
            
            # If we passed a summary dir then attempt to create it
            # if it doesn't exist and create a new summary_writer
            if summaries_dir:
                summary_dir = os.path.join(summaries_dir, "summaries_{}".format(scope))
                if not os.path.exists(summary_dir):
                    os.makedirs(summary_dir)
                self.summary_writer = tf.summary.FileWriter(summary_dir)

    def _build_model(self):
        """
        Builds the Tensorflow graph.
        """

        # Placeholders for our input
        # Our input are 4 RGB frames of shape 160, 160 each
        # Four RGB frames are needed in order to get an estimate of the direction
        # that the ball is heading to as well.
        self.X_pl = tf.placeholder(shape=[None, 84, 84, 4], dtype=tf.uint8, name="X")
        # The TD target value
        self.y_pl = tf.placeholder(shape=[None], dtype=tf.float32, name="y")
        # Integer id of which action was selected
        self.actions_pl = tf.placeholder(shape=[None], dtype=tf.int32, name="actions")

        X = tf.to_float(self.X_pl) / 255.0
        batch_size = tf.shape(self.X_pl)[0]

        # Three convolutional layers
        # Arguments to tf.contrib.layers.conv2d: (X, output_filters, kernel_size, stride)
        # Conv[8x8/4][32] -> Conv[4x4/2][64] -> Conv[3x3/1][64] 
        conv1 = tf.contrib.layers.conv2d(
            X, 32, 8, 4, activation_fn=tf.nn.relu)
        conv2 = tf.contrib.layers.conv2d(
            conv1, 64, 4, 2, activation_fn=tf.nn.relu)
        conv3 = tf.contrib.layers.conv2d(
            conv2, 64, 3, 1, activation_fn=tf.nn.relu)

        # Fully connected layers
        # Flatten our convolutions to [batch_size, k]
        flattened = tf.contrib.layers.flatten(conv3)
        # FC1[k,512]
        fc1 = tf.contrib.layers.fully_connected(flattened, 512)
        # Predictions[512,VALID_ACTIONS]
        self.predictions = tf.contrib.layers.fully_connected(fc1, len(VALID_ACTIONS))

        # Now here comes the tricky part, when calculating the loss we must
        # take into account what actions were taken. Intuitively, the TD target
        # is just a number so our target should be a number as well.
        
        # Get the predictions for the chosen actions only
        # gather_indices: contains the indices of the FLATTENED predictions array
        #                 for the actions we took
        gather_indices = tf.range(batch_size) * tf.shape(self.predictions)[1] + self.actions_pl
        
        # Now we just flatten our predictions array and gather all the values
        # using TF.gather.
        self.action_predictions = tf.gather(tf.reshape(self.predictions, [-1]), gather_indices)

        # Calcualte the loss
        self.losses = tf.squared_difference(self.y_pl, self.action_predictions)
        self.loss = tf.reduce_mean(self.losses)

        # Optimizer Parameters from original paper
        # Parameters: (LR, decay, momentum, epsilon)
        self.optimizer = tf.train.RMSPropOptimizer(0.00025, 0.99, 0.0, 1e-6)
        self.train_op = self.optimizer.minimize(self.loss, global_step=tf.contrib.framework.get_global_step())

        # Summaries for Tensorboard
        self.summaries = tf.summary.merge([
            tf.summary.scalar("loss", self.loss),
            tf.summary.histogram("loss_hist", self.losses),
            tf.summary.histogram("q_values_hist", self.predictions),
            tf.summary.scalar("max_q_value", tf.reduce_max(self.predictions))
        ])


    def predict(self, sess, s):
        """
        Predicts action values.

        Args:
          sess: Tensorflow session
          s: State input of shape [batch_size, 4, 160, 160, 3]

        Returns:
          Tensor of shape [batch_size, NUM_VALID_ACTIONS] containing the estimated 
          action values.
        """
        return sess.run(self.predictions, { self.X_pl: s })

    def update(self, sess, s, a, y):
        """
        Updates the estimator towards the given targets.

        Args:
          sess: Tensorflow session object
          s: State input of shape [batch_size, 4, 160, 160, 3]
          a: Chosen actions of shape [batch_size]
          y: Targets of shape [batch_size]

        Returns:
          The calculated loss on the batch.
        """
        feed_dict = { self.X_pl: s, self.y_pl: y, self.actions_pl: a }
        summaries, global_step, _, loss = sess.run(
            [self.summaries, tf.contrib.framework.get_global_step(), self.train_op, self.loss],
            feed_dict)
        if self.summary_writer:
            self.summary_writer.add_summary(summaries, global_step)
        return loss

In [6]:
# For Testing....

tf.reset_default_graph()
global_step = tf.Variable(0, name="global_step", trainable=False)

e = Estimator(scope="test")
sp = StateProcessor()

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    # Example observation batch
    observation = env.reset()
    
    observation_p = sp.process(sess, observation)
    observation = np.stack([observation_p] * 4, axis=2)
    observations = np.array([observation] * 2)
    
    # Test Prediction
    print(e.predict(sess, observations))

    # Test training step
    y = np.array([10.0, 10.0])
    a = np.array([1, 3])
    print(e.update(sess, observations, a, y))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[[ 0.          0.02172653  0.04793787  0.        ]
 [ 0.          0.02172653  0.04793787  0.        ]]
99.783


In [7]:
def copy_model_parameters(sess, estimator1, estimator2):
    """
    Copies the model parameters of one estimator to another.

    Args:
      sess: Tensorflow session instance
      estimator1: Estimator to copy the paramters from
      estimator2: Estimator to copy the parameters to
    """
    e1_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator1.scope)]
    e1_params = sorted(e1_params, key=lambda v: v.name)
    e2_params = [t for t in tf.trainable_variables() if t.name.startswith(estimator2.scope)]
    e2_params = sorted(e2_params, key=lambda v: v.name)

    update_ops = []
    for e1_v, e2_v in zip(e1_params, e2_params):
        # A very TF-like thing where we create operations
        # before running them
        op = e2_v.assign(e1_v)
        update_ops.append(op)

    # This is where the real work is being done
    sess.run(update_ops)

In [8]:
def make_epsilon_greedy_policy(estimator, nA):
    """
    Creates an epsilon-greedy policy based on a given Q-function approximator and epsilon.

    Args:
        estimator: An estimator that returns q values for a given state
        nA: Number of actions in the environment.

    Returns:
        A function that takes the (sess, observation, epsilon) as an argument and returns
        the probabilities for each action in the form of a numpy array of length nA.

    """
    def policy_fn(sess, observation, epsilon):
        A = np.ones(nA, dtype=float) * epsilon / nA
        # Observation is [84,84,4] so we expand to be [1,84,84,4]
        # that way in can run through the prediction
        q_values = estimator.predict(sess, np.expand_dims(observation, 0))[0]
        best_action = np.argmax(q_values)
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

In [None]:
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for fff-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when initializing 
          the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the 
          target estimator every N steps
        discount_factor: Lambda time discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    
    # Very cute pythonic way of naming some tuple lol
    Transition = namedtuple("Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(
        episode_lengths=np.zeros(num_episodes),
        episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    # Create checkpoint directories and monitor directories
    # if they don't exist
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    # Automagically find checkpoint and load it
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)
    
    # Get the current time step
    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    # Create decay schedule using linspace which just gets us from start
    # to end in a certain number of steps
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(
        q_estimator,
        len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        # total_t: represents the total number of timesteps our network
        #          has ben run
        probs = policy(sess, state, epsilons[min(total_t, epsilon_decay_steps-1)])
        action = np.random.choice(np.arange(len(probs)), p=probs)
        next_state, reward, done, _ = env.step(action)
        
        # Convert state to grayscale and do some numpy magic to
        # append state as the last seen frame
        next_state = state_processor.process(sess, next_state)
        next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)

        # Transitions are a namedtuple defined above
        replay_memory.append(Transition(state, action, reward, next_state, done))
        
        if done:
            # We are done so reset the environment and stack the same
            # frame four times
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else: 
            state = next_state

    # Record videos
    # env.monitor.start(monitor_path,
                      # resume=True,
                      # video_callable=lambda count: count % record_video_every == 0)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # One step in the environment
        # itertools.count() here just makes sure we do t += 1
        for t in itertools.count():
            """In this loop we simply go through our episode as normal with
               a slight caveat. Meanwhile we're following our episode until
               termination, we also sample a minibatch from our replay memory.
               
               We remove something from the replay memory if it is full and append
               a new state. In this case, the state is the current one that we're on.
            """

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps-1)]

            # Some updates to our summaries
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # TODO: Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                # This is just as simple as updating the weights of our
                # target estimator and done!
                copy_model_parameters(sess, q_estimator, target_estimator)

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                    t, total_t, i_episode + 1, num_episodes, loss), end="")
            sys.stdout.flush()

            # Take a step in the environment
            probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(probs)), p=probs)
            next_state, reward, done, _ = env.step(action)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # TODO: Save transition to replay memory
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:,:,1:], np.expand_dims(next_state, 2), axis=2)
            replay_memory.append(Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # TODO: Sample a minibatch from the replay memory
            # RARS - Must sample the whole minibatch, random.sample works nicely for this
            transitions = random.sample(replay_memory, batch_size)
            # RARS - Map, applies np.array to the output of zip thus
            #        now we have numpy arrays all accross
            s_batch, a_batch, r_batch, ns_batch, d_batch = map(np.array, zip(*transitions))
            
            # TODO: Calculate q values and targets
            q_target_values = target_estimator.predict(sess, ns_batch)
            
            target = r_batch + \
                    np.invert(done).astype(np.float32) * \
                    discount_factor * \
                    np.amax(q_target_values, axis=1)
            
            # TODO Perform gradient descent update
            # RARS - Need to do this in order to convert from map
            s_batch = np.array(s_batch)
            loss = q_estimator.update(sess, s_batch, a_batch, target)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(simple_value=stats.episode_rewards[i_episode], node_name="episode_reward", tag="episode_reward")
        episode_summary.value.add(simple_value=stats.episode_lengths[i_episode], node_name="episode_length", tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode+1],
            episode_rewards=stats.episode_rewards[:i_episode+1])

    env.monitor.close()
    return stats

In [None]:
tf.reset_default_graph()

# Where we save our checkpoints and graphs
experiment_dir = os.path.abspath("./experiments/{}".format(env.spec.id))

# Create a glboal step variable
global_step = tf.Variable(0, name='global_step', trainable=False)
    
# Create estimators
q_estimator = Estimator(scope="q", summaries_dir=experiment_dir)
target_estimator = Estimator(scope="target_q")

# State processor
state_processor = StateProcessor()

# Run it!
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for t, stats in deep_q_learning(sess,
                                    env,
                                    q_estimator=q_estimator,
                                    target_estimator=target_estimator,
                                    state_processor=state_processor,
                                    experiment_dir=experiment_dir,
                                    num_episodes=10000,
                                    replay_memory_size=500000,
                                    replay_memory_init_size=50000,
                                    update_target_estimator_every=10000,
                                    epsilon_start=1.0,
                                    epsilon_end=0.1,
                                    epsilon_decay_steps=500000,
                                    discount_factor=0.99,
                                    batch_size=32):

        print("\nEpisode Reward: {}".format(stats.episode_rewards[-1]))

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Populating replay memory...
Step 205 (205) @ Episode 1/10000, loss: 0.00035014655441045763
Episode Reward: 1.0
Step 234 (439) @ Episode 2/10000, loss: 0.03314521163702011923
Episode Reward: 1.0
Step 371 (810) @ Episode 3/10000, loss: 0.00044526866986416285
Episode Reward: 4.0
Step 158 (968) @ Episode 4/10000, loss: 0.00038672258961014454
Episode Reward: 0.0
Step 256 (1224) @ Episode 5/10000, loss: 0.00033945223549380966
Episode Reward: 1.0
Step 346 (1570) @ Episode 6/10000, loss: 0.00049587246030569085
Episode Reward: 3.0
Step 206 (1776) @ Episode 7/10000, loss: 0.03152327984571457407
Episode Reward: 1.0
Step 272 (2048) @ Episode 8/10000, loss: 0.03009570948779583054
Episode Reward: 2.0
Step 226 (2274) @ Episode 9/10000, loss: 6.037607818143442e-055
Episode Reward: 1.0
Step 445 (2719) @ Episode 10/10000, loss: 4.8983056331053376e-05
Episode Reward: 5.0
Step 281 (3000) @ Episode 11/10000, loss: 2.314084667887073e-055
Episode Reward: 2.0
Step 209 (3209) @ Episode 12/10000, loss: 3.346690

Step 170 (24020) @ Episode 96/10000, loss: 4.845025614486076e-052
Episode Reward: 0.0
Step 425 (24445) @ Episode 97/10000, loss: 9.250587936548982e-065
Episode Reward: 5.0
Step 346 (24791) @ Episode 98/10000, loss: 4.404157880344428e-055
Episode Reward: 3.0
Step 214 (25005) @ Episode 99/10000, loss: 0.03012499772012233755
Episode Reward: 1.0
Step 172 (25177) @ Episode 100/10000, loss: 3.377646498847753e-055
Episode Reward: 0.0
Step 501 (25678) @ Episode 101/10000, loss: 5.274887371342629e-051
Episode Reward: 5.0
Step 180 (25858) @ Episode 102/10000, loss: 9.223233791999519e-055
Episode Reward: 0.0
Step 258 (26116) @ Episode 103/10000, loss: 3.5661538277054206e-05
Episode Reward: 2.0
Step 246 (26362) @ Episode 104/10000, loss: 8.048547897487879e-055
Episode Reward: 1.0
Step 161 (26523) @ Episode 105/10000, loss: 1.895881723612547e-055
Episode Reward: 0.0
Step 203 (26726) @ Episode 106/10000, loss: 4.1532679460942745e-05
Episode Reward: 0.0
Step 239 (26965) @ Episode 107/10000, loss: 2.4

Step 231 (47576) @ Episode 190/10000, loss: 7.434788858518004e-053
Episode Reward: 1.0
Step 273 (47849) @ Episode 191/10000, loss: 0.00029037136118859056
Episode Reward: 2.0
Step 166 (48015) @ Episode 192/10000, loss: 0.00107302283868193636
Episode Reward: 0.0
Step 340 (48355) @ Episode 193/10000, loss: 9.22090039239265e-0505
Episode Reward: 3.0
Step 176 (48531) @ Episode 194/10000, loss: 0.00122952146921306853
Episode Reward: 0.0
Step 271 (48802) @ Episode 195/10000, loss: 9.461704030400142e-054
Episode Reward: 2.0
Step 173 (48975) @ Episode 196/10000, loss: 0.00018338412337470853
Episode Reward: 0.0
Step 319 (49294) @ Episode 197/10000, loss: 0.00013743744057137528
Episode Reward: 3.0
Step 265 (49559) @ Episode 198/10000, loss: 0.00115204846952110531
Episode Reward: 2.0
Step 350 (49909) @ Episode 199/10000, loss: 0.00012475856055971235
Episode Reward: 3.0
Step 180 (50089) @ Episode 200/10000, loss: 0.00011957099195569754
Episode Reward: 0.0
Step 218 (50307) @ Episode 201/10000, loss:

Step 273 (71135) @ Episode 284/10000, loss: 0.00048117898404598236
Episode Reward: 2.0
Step 274 (71409) @ Episode 285/10000, loss: 6.667975685559213e-054
Episode Reward: 2.0
Step 191 (71600) @ Episode 286/10000, loss: 0.00419952347874641424
Episode Reward: 0.0
Step 241 (71841) @ Episode 287/10000, loss: 0.00015072307724040002
Episode Reward: 1.0
Step 217 (72058) @ Episode 288/10000, loss: 0.00019580312073230743
Episode Reward: 1.0
Step 310 (72368) @ Episode 289/10000, loss: 0.00018159260798711335
Episode Reward: 2.0
Step 346 (72714) @ Episode 290/10000, loss: 0.00522595364600422416
Episode Reward: 3.0
Step 228 (72942) @ Episode 291/10000, loss: 0.00146656366996467115
Episode Reward: 1.0
Step 269 (73211) @ Episode 292/10000, loss: 5.0282142183277756e-05
Episode Reward: 2.0
Step 221 (73432) @ Episode 293/10000, loss: 0.00105478346813470136
Episode Reward: 1.0
Step 172 (73604) @ Episode 294/10000, loss: 0.00036682264180853963
Episode Reward: 0.0
Step 168 (73772) @ Episode 295/10000, loss:

Step 199 (94310) @ Episode 378/10000, loss: 0.00204327423125505454
Episode Reward: 0.0
Step 177 (94487) @ Episode 379/10000, loss: 5.0852966523962095e-05
Episode Reward: 0.0
Step 389 (94876) @ Episode 380/10000, loss: 0.00057720579206943515
Episode Reward: 3.0
Step 275 (95151) @ Episode 381/10000, loss: 0.00026379665359854764
Episode Reward: 2.0
Step 304 (95455) @ Episode 382/10000, loss: 0.00066008180147036913
Episode Reward: 3.0
Step 171 (95626) @ Episode 383/10000, loss: 0.00028988771373406055
Episode Reward: 0.0
Step 256 (95882) @ Episode 384/10000, loss: 0.00014585866301786155
Episode Reward: 1.0
Step 271 (96153) @ Episode 385/10000, loss: 0.00327455624938011178
Episode Reward: 1.0
Step 279 (96432) @ Episode 386/10000, loss: 0.00174046377651393415
Episode Reward: 2.0
Step 321 (96753) @ Episode 387/10000, loss: 0.00062379892915487295
Episode Reward: 2.0
Step 178 (96931) @ Episode 388/10000, loss: 0.00190415198449045428
Episode Reward: 0.0
Step 186 (97117) @ Episode 389/10000, loss:

Step 228 (116682) @ Episode 471/10000, loss: 0.00042644474888220433
Episode Reward: 1.0
Step 195 (116877) @ Episode 472/10000, loss: 0.00114816846325993547
Episode Reward: 0.0
Step 201 (117078) @ Episode 473/10000, loss: 0.00081537704681977635
Episode Reward: 0.0
Step 187 (117265) @ Episode 474/10000, loss: 0.00021595100406557322
Episode Reward: 0.0
Step 322 (117587) @ Episode 475/10000, loss: 0.00022673298371955752
Episode Reward: 2.0
Step 296 (117883) @ Episode 476/10000, loss: 2.3967742890818045e-05
Episode Reward: 2.0
Step 199 (118082) @ Episode 477/10000, loss: 0.00035660300636664033
Episode Reward: 0.0
Step 241 (118323) @ Episode 478/10000, loss: 5.075911394669674e-055
Episode Reward: 1.0
Step 241 (118564) @ Episode 479/10000, loss: 0.00018020349671132863
Episode Reward: 1.0
Step 233 (118797) @ Episode 480/10000, loss: 0.00029268607613630593
Episode Reward: 1.0
Step 174 (118971) @ Episode 481/10000, loss: 0.00153561623301357031
Episode Reward: 0.0
Step 166 (119137) @ Episode 482/

Step 435 (140067) @ Episode 564/10000, loss: 0.01093116588890552584
Episode Reward: 5.0
Step 281 (140348) @ Episode 565/10000, loss: 0.00066695315763354372
Episode Reward: 2.0
Step 407 (140755) @ Episode 566/10000, loss: 0.00090006156824529173
Episode Reward: 4.0
Step 277 (141032) @ Episode 567/10000, loss: 0.00082562537863850593
Episode Reward: 2.0
Step 173 (141205) @ Episode 568/10000, loss: 0.00025212450418621336
Episode Reward: 0.0
Step 317 (141522) @ Episode 569/10000, loss: 0.00045542709995061165
Episode Reward: 3.0
Step 211 (141733) @ Episode 570/10000, loss: 4.642140993382782e-053
Episode Reward: 1.0
Step 172 (141905) @ Episode 571/10000, loss: 0.00014885253040120006
Episode Reward: 0.0
Step 185 (142090) @ Episode 572/10000, loss: 0.00010206873412244022
Episode Reward: 0.0
Step 308 (142398) @ Episode 573/10000, loss: 5.873216287000105e-052
Episode Reward: 2.0
Step 323 (142721) @ Episode 574/10000, loss: 0.00012650698772631586
Episode Reward: 3.0
Step 179 (142900) @ Episode 575/

Step 163 (162904) @ Episode 657/10000, loss: 0.00065735582029446965
Episode Reward: 0.0
Step 173 (163077) @ Episode 658/10000, loss: 0.00024130381643772125
Episode Reward: 0.0
Step 236 (163313) @ Episode 659/10000, loss: 0.00047097413334995511
Episode Reward: 1.0
Step 231 (163544) @ Episode 660/10000, loss: 0.00033195805735886097
Episode Reward: 1.0
Step 213 (163757) @ Episode 661/10000, loss: 0.00049826328177005055
Episode Reward: 1.0
Step 235 (163992) @ Episode 662/10000, loss: 0.00025834154803305864
Episode Reward: 1.0
Step 218 (164210) @ Episode 663/10000, loss: 0.00044466875260695815
Episode Reward: 1.0
Step 444 (164654) @ Episode 664/10000, loss: 0.00038593274075537922
Episode Reward: 5.0
Step 331 (164985) @ Episode 665/10000, loss: 0.00018194282893091442
Episode Reward: 3.0
Step 383 (165368) @ Episode 666/10000, loss: 0.00080100749619305132
Episode Reward: 3.0
Step 169 (165537) @ Episode 667/10000, loss: 0.00150854547973722223
Episode Reward: 0.0
Step 240 (165777) @ Episode 668/

Step 227 (184664) @ Episode 750/10000, loss: 0.00017433428729418665
Episode Reward: 1.0
Step 253 (184917) @ Episode 751/10000, loss: 0.00025478345924057066
Episode Reward: 2.0
Step 231 (185148) @ Episode 752/10000, loss: 8.797793998382986e-053
Episode Reward: 1.0
Step 253 (185401) @ Episode 753/10000, loss: 0.00216341810300946246
Episode Reward: 2.0
Step 383 (185784) @ Episode 754/10000, loss: 3.8204871088964865e-05
Episode Reward: 4.0
Step 302 (186086) @ Episode 755/10000, loss: 0.00122876686509698632
Episode Reward: 2.0
Step 172 (186258) @ Episode 756/10000, loss: 0.00310863275080919278
Episode Reward: 0.0
Step 235 (186493) @ Episode 757/10000, loss: 0.00081572355702519423
Episode Reward: 1.0
Step 235 (186728) @ Episode 758/10000, loss: 9.671870793681592e-053
Episode Reward: 1.0
Step 302 (187030) @ Episode 759/10000, loss: 0.00065891997655853633
Episode Reward: 3.0
Step 280 (187310) @ Episode 760/10000, loss: 0.00166765437461435874
Episode Reward: 2.0
Step 214 (187524) @ Episode 761/

Step 171 (207612) @ Episode 843/10000, loss: 0.00075507839210331443
Episode Reward: 0.0
Step 244 (207856) @ Episode 844/10000, loss: 6.871018558740616e-053
Episode Reward: 1.0
Step 180 (208036) @ Episode 845/10000, loss: 0.00046178349293768406
Episode Reward: 0.0
Step 355 (208391) @ Episode 846/10000, loss: 0.00012340172543190425
Episode Reward: 3.0
Step 163 (208554) @ Episode 847/10000, loss: 0.00028684452991001315
Episode Reward: 0.0
Step 176 (208730) @ Episode 848/10000, loss: 0.00015308283036574724
Episode Reward: 0.0
Step 173 (208903) @ Episode 849/10000, loss: 0.00049721123650670051
Episode Reward: 0.0
Step 183 (209086) @ Episode 850/10000, loss: 0.00036804398405365646
Episode Reward: 0.0
Step 178 (209264) @ Episode 851/10000, loss: 0.00011509993055369705
Episode Reward: 0.0
Step 506 (209770) @ Episode 852/10000, loss: 0.00090447824914008386
Episode Reward: 6.0
Step 171 (209941) @ Episode 853/10000, loss: 0.00036188319791108374
Episode Reward: 0.0
Step 171 (210112) @ Episode 854/

Step 257 (230399) @ Episode 936/10000, loss: 0.00037053128471598034
Episode Reward: 1.0
Step 197 (230596) @ Episode 937/10000, loss: 0.00021451456996146594
Episode Reward: 0.0
Step 177 (230773) @ Episode 938/10000, loss: 0.00010985518747474998
Episode Reward: 0.0
Step 241 (231014) @ Episode 939/10000, loss: 3.63254475814756e-0553
Episode Reward: 1.0
Step 183 (231197) @ Episode 940/10000, loss: 9.204013622365892e-053
Episode Reward: 0.0
Step 254 (231451) @ Episode 941/10000, loss: 0.00027056061662733555
Episode Reward: 2.0
Step 228 (231679) @ Episode 942/10000, loss: 0.00188072375021874927
Episode Reward: 1.0
Step 304 (231983) @ Episode 943/10000, loss: 0.00095068442169576886
Episode Reward: 3.0
Step 258 (232241) @ Episode 944/10000, loss: 0.00067226291866973045
Episode Reward: 2.0
Step 208 (232449) @ Episode 945/10000, loss: 0.00033915298990905285
Episode Reward: 0.0
Step 171 (232620) @ Episode 946/10000, loss: 0.00040230751619674265
Episode Reward: 0.0
Step 323 (232943) @ Episode 947/

Step 177 (253490) @ Episode 1029/10000, loss: 0.01025628484785556866
Episode Reward: 0.0
Step 191 (253681) @ Episode 1030/10000, loss: 0.00041415102896280587
Episode Reward: 0.0
Step 176 (253857) @ Episode 1031/10000, loss: 0.00010928004485322163
Episode Reward: 0.0
Step 174 (254031) @ Episode 1032/10000, loss: 8.591704681748524e-054
Episode Reward: 0.0
Step 207 (254238) @ Episode 1033/10000, loss: 5.893930938327685e-054
Episode Reward: 1.0
Step 227 (254465) @ Episode 1034/10000, loss: 8.561936556361616e-058
Episode Reward: 1.0
Step 351 (254816) @ Episode 1035/10000, loss: 0.00027399259852245452
Episode Reward: 3.0
Step 275 (255091) @ Episode 1036/10000, loss: 0.00220012129284441479
Episode Reward: 2.0
Step 182 (255273) @ Episode 1037/10000, loss: 0.00433938438072800627
Episode Reward: 0.0
Step 173 (255446) @ Episode 1038/10000, loss: 0.00014100116095505655
Episode Reward: 0.0
Step 233 (255679) @ Episode 1039/10000, loss: 0.00049739674432203174
Episode Reward: 1.0
Step 172 (255851) @ E

Step 258 (275397) @ Episode 1121/10000, loss: 0.00192533305380493418
Episode Reward: 1.0
Step 325 (275722) @ Episode 1122/10000, loss: 0.00080172874731943012
Episode Reward: 3.0
Step 231 (275953) @ Episode 1123/10000, loss: 0.00077546335523948075
Episode Reward: 1.0
Step 355 (276308) @ Episode 1124/10000, loss: 0.00082306086551398045
Episode Reward: 3.0
Step 303 (276611) @ Episode 1125/10000, loss: 0.00066300685284659273
Episode Reward: 2.0
Step 185 (276796) @ Episode 1126/10000, loss: 0.00133568793535232545
Episode Reward: 0.0
Step 176 (276972) @ Episode 1127/10000, loss: 6.8506138632074e-05742
Episode Reward: 0.0
Step 380 (277352) @ Episode 1128/10000, loss: 0.00051399943185970196
Episode Reward: 3.0
Step 252 (277604) @ Episode 1129/10000, loss: 0.00040181068470701575
Episode Reward: 2.0
Step 217 (277821) @ Episode 1130/10000, loss: 0.00013300516002345836
Episode Reward: 1.0
Step 297 (278118) @ Episode 1131/10000, loss: 0.00013078816118650138
Episode Reward: 2.0
Step 218 (278336) @ E

Step 379 (298551) @ Episode 1213/10000, loss: 0.00023338597384281456
Episode Reward: 3.0
Step 281 (298832) @ Episode 1214/10000, loss: 0.00100852514151483773
Episode Reward: 1.0
Step 252 (299084) @ Episode 1215/10000, loss: 0.00140284490771591664
Episode Reward: 1.0
Step 277 (299361) @ Episode 1216/10000, loss: 0.00202735443599522166
Episode Reward: 2.0
Step 226 (299587) @ Episode 1217/10000, loss: 0.00029955967329442583
Episode Reward: 1.0
Step 247 (299834) @ Episode 1218/10000, loss: 0.00687480717897415225
Episode Reward: 2.0
Step 206 (300040) @ Episode 1219/10000, loss: 0.00070758123183622963
Episode Reward: 0.0
Step 327 (300367) @ Episode 1220/10000, loss: 0.00206969701685011491
Episode Reward: 3.0
Step 185 (300552) @ Episode 1221/10000, loss: 0.00089639896759763364
Episode Reward: 0.0
Step 347 (300899) @ Episode 1222/10000, loss: 0.00029063550755381584
Episode Reward: 3.0
Step 368 (301267) @ Episode 1223/10000, loss: 0.00037265336140990257
Episode Reward: 3.0
Step 175 (301442) @ E

Step 246 (322086) @ Episode 1305/10000, loss: 0.00220681750215594138
Episode Reward: 1.0
Step 334 (322420) @ Episode 1306/10000, loss: 0.00024243965162895626
Episode Reward: 3.0
Step 185 (322605) @ Episode 1307/10000, loss: 0.00062177865765988836
Episode Reward: 0.0
Step 379 (322984) @ Episode 1308/10000, loss: 0.00812666770070791228
Episode Reward: 4.0
Step 186 (323170) @ Episode 1309/10000, loss: 0.00058397330576553942
Episode Reward: 0.0
Step 240 (323410) @ Episode 1310/10000, loss: 0.00853012874722480889
Episode Reward: 1.0
Step 330 (323740) @ Episode 1311/10000, loss: 0.00045692158164456487
Episode Reward: 3.0
Step 324 (324064) @ Episode 1312/10000, loss: 0.00028844605549238627
Episode Reward: 2.0
Step 274 (324338) @ Episode 1313/10000, loss: 0.00146028352901339535
Episode Reward: 2.0
Step 266 (324604) @ Episode 1314/10000, loss: 0.00019676394003909086
Episode Reward: 1.0
Step 251 (324855) @ Episode 1315/10000, loss: 0.00080308149335905915
Episode Reward: 1.0
Step 428 (325283) @ E

Step 247 (351580) @ Episode 1397/10000, loss: 0.00070993037661537536
Episode Reward: 1.0
Step 268 (351848) @ Episode 1398/10000, loss: 0.00062215246725827466
Episode Reward: 2.0
Step 220 (352068) @ Episode 1399/10000, loss: 0.00157511513680219654
Episode Reward: 1.0
Step 372 (352440) @ Episode 1400/10000, loss: 0.00054895348148420454
Episode Reward: 3.0
Step 268 (352708) @ Episode 1401/10000, loss: 0.00107964873313903855
Episode Reward: 1.0
Step 243 (352951) @ Episode 1402/10000, loss: 0.00216936809010803753
Episode Reward: 0.0
Step 285 (353236) @ Episode 1403/10000, loss: 0.00044655782403424386
Episode Reward: 2.0
Step 282 (353518) @ Episode 1404/10000, loss: 0.00083718157839030037
Episode Reward: 1.0
Step 309 (353827) @ Episode 1405/10000, loss: 0.00080464815255254518
Episode Reward: 3.0
Step 282 (354109) @ Episode 1406/10000, loss: 0.00154860434122383644
Episode Reward: 1.0
Step 259 (354368) @ Episode 1407/10000, loss: 0.00138324860017746697
Episode Reward: 1.0
Step 281 (354649) @ E

Step 253 (377920) @ Episode 1489/10000, loss: 0.00129112112335860734
Episode Reward: 2.0
Step 378 (378298) @ Episode 1490/10000, loss: 0.00043189863208681345
Episode Reward: 3.0
Step 294 (378592) @ Episode 1491/10000, loss: 0.00326689891517162323
Episode Reward: 2.0
Step 266 (378858) @ Episode 1492/10000, loss: 0.00842995755374431666
Episode Reward: 1.0
Step 310 (379168) @ Episode 1493/10000, loss: 0.00114357681013643746
Episode Reward: 2.0
Step 221 (379389) @ Episode 1494/10000, loss: 0.00110341014806181213
Episode Reward: 1.0
Step 278 (379667) @ Episode 1495/10000, loss: 0.00041237717960029843
Episode Reward: 1.0
Step 223 (379890) @ Episode 1496/10000, loss: 0.00139145459979772574
Episode Reward: 1.0
Step 439 (380329) @ Episode 1497/10000, loss: 0.00129516911692917356
Episode Reward: 4.0
Step 326 (380655) @ Episode 1498/10000, loss: 0.00816211197525262856
Episode Reward: 3.0
Step 176 (380831) @ Episode 1499/10000, loss: 0.0016005041543394327
Episode Reward: 0.0
Step 355 (381186) @ Ep

Step 637 (419928) @ Episode 1581/10000, loss: 0.00139140104874968536
Episode Reward: 9.0
Step 513 (420441) @ Episode 1582/10000, loss: 0.00086243881378322844
Episode Reward: 6.0
Step 721 (421162) @ Episode 1583/10000, loss: 0.00148081535007804636
Episode Reward: 14.0
Step 482 (421644) @ Episode 1584/10000, loss: 0.00189465528819710023
Episode Reward: 6.0
Step 544 (422188) @ Episode 1585/10000, loss: 0.00077609840082004674
Episode Reward: 7.0
Step 433 (422621) @ Episode 1586/10000, loss: 0.00347640179097652446
Episode Reward: 4.0
Step 588 (423209) @ Episode 1587/10000, loss: 0.00215789093635976385
Episode Reward: 7.0
Step 719 (423928) @ Episode 1588/10000, loss: 0.00121061294339597233
Episode Reward: 11.0
Step 607 (424535) @ Episode 1589/10000, loss: 0.00264433771371841436
Episode Reward: 8.0
Step 639 (425174) @ Episode 1590/10000, loss: 0.00075349991675466354
Episode Reward: 9.0
Step 711 (425885) @ Episode 1591/10000, loss: 0.01235209871083498244
Episode Reward: 9.0
Step 785 (426670) @