In [1]:
from keras.layers.convolutional import Conv2D
from keras.layers import Dense, Flatten
from keras.optimizers import RMSprop
from keras.models import Sequential
from skimage.transform import resize
from skimage.color import rgb2gray
from collections import deque
from keras import backend as K
import tensorflow as tf
import numpy as np
import random
import gym
import os

Using TensorFlow backend.


In [2]:
# Directory (under the current working directory) where model weights are saved.
model_path = os.path.join(os.getcwd(), 'save_model')

# exist_ok=True removes the check-then-create race of isdir()+mkdir() and
# makes re-running this cell a no-op when the directory already exists.
os.makedirs(model_path, exist_ok=True)

# Number of episodes the training loop iterates over.
EPISODES = 50000

In [3]:
class DQNAgent:
    """DQN agent with an online network, a target network and replay memory.

    The agent owns two identically shaped convolutional Q-networks
    (``model`` and ``target_model``), a bounded replay memory, a hand-built
    Huber-loss training function and TensorBoard summary plumbing.
    """

    def __init__(self, action_size):
        """Create networks, training function, TF session and summaries.

        Args:
            action_size: number of discrete actions, i.e. the width of the
                Q-network output layer.
        """
        self.render = False
        self.load_model = False
        # State and action dimensions.
        self.state_size = (84, 84, 4)
        self.action_size = action_size
        # DQN hyper-parameters: epsilon is annealed linearly from
        # epsilon_start to epsilon_end over exploration_steps training steps.
        self.epsilon = 1.
        self.epsilon_start, self.epsilon_end = 1.0, 0.1
        self.exploration_steps = 1000000.
        self.epsilon_decay_step = (self.epsilon_start - self.epsilon_end) \
                                  / self.exploration_steps
        self.batch_size = 32
        self.train_start = 50000
        self.update_target_rate = 10000
        self.discount_factor = 0.99
        # Replay memory holding at most 400000 transitions.
        self.memory = deque(maxlen=400000)
        self.no_op_steps = 30
        # Build the online network and the target network.
        self.model = self.build_model()
        self.target_model = self.build_model()

        # NOTE: this rebinds ``self.optimizer`` from the bound method to the
        # compiled training function it returns; train_model() calls it.
        self.optimizer = self.optimizer()

        # Session used for both Keras and the TensorBoard summaries.
        self.sess = tf.InteractiveSession()
        K.set_session(self.sess)

        self.avg_q_max, self.avg_loss = 0, 0
        self.summary_placeholders, self.update_ops, self.summary_op = \
            self.setup_summary()
        self.summary_writer = tf.summary.FileWriter(
            'summary/breakout_dqn', self.sess.graph)
        self.sess.run(tf.global_variables_initializer())

        if self.load_model:
            self.model.load_weights("./save_model/breakout_dqn_trained.h5")

        # Synchronise the target network only now. Running the global
        # initializer above assigns fresh, independent values to BOTH
        # networks, so a copy made before it (as the code used to do) is
        # discarded; the same applies to weights loaded from disk.
        self.update_target_model()

    def optimizer(self):
        """Build the training function that minimises a Huber loss.

        Returns:
            A Keras backend function taking ``[states, actions, targets]``
            and returning ``[loss]`` while applying one RMSprop update to
            the online network.
        """
        a = K.placeholder(shape=(None,), dtype='int32')
        y = K.placeholder(shape=(None,), dtype='float32')

        prediction = self.model.output

        # Select the Q-value of the action that was actually taken.
        a_one_hot = K.one_hot(a, self.action_size)
        q_value = K.sum(prediction * a_one_hot, axis=1)
        error = K.abs(y - q_value)

        # Huber loss: quadratic for |error| <= 1, linear beyond that.
        quadratic_part = K.clip(error, 0.0, 1.0)
        linear_part = error - quadratic_part
        loss = K.mean(0.5 * K.square(quadratic_part) + linear_part)

        optimizer = RMSprop(lr=0.00025, epsilon=0.01)
        updates = optimizer.get_updates(self.model.trainable_weights, [], loss)
        train = K.function([self.model.input, a, y], [loss], updates=updates)

        return train

    def build_model(self):
        """Create a Q-network: state stack in, one Q-value per action out."""
        model = Sequential()
        model.add(Conv2D(32, (8, 8), strides=(4, 4),
                         activation='relu',
                         input_shape=self.state_size))
        model.add(Conv2D(64, (4, 4), strides=(2, 2),
                         activation='relu'))
        model.add(Conv2D(64, (3, 3), strides=(1, 1),
                         activation='relu'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(self.action_size))
        model.summary()
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def get_action(self, history):
        """Choose an action with an epsilon-greedy policy.

        Args:
            history: batch-of-one stack of frames with raw pixel values;
                it is scaled by 1/255 before being fed to the network.

        Returns:
            A random action index with probability ``epsilon``, otherwise
            the index of the largest predicted Q-value.
        """
        history = np.float32(history / 255.0)
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_value = self.model.predict(history)
        return np.argmax(q_value[0])

    def append_sample(self, history, action, reward, next_history, dead):
        """Store one transition <s, a, r, s', dead> in the replay memory."""
        self.memory.append((history, action, reward, next_history, dead))

    def train_model(self):
        """Run one training step on a random mini-batch from replay memory.

        Decays ``epsilon`` by one step (until ``epsilon_end`` is reached),
        builds Q-learning targets from the target network and accumulates
        the resulting loss into ``avg_loss``.
        """
        if self.epsilon > self.epsilon_end:
            self.epsilon -= self.epsilon_decay_step

        mini_batch = random.sample(self.memory, self.batch_size)
        batch_shape = (self.batch_size,) + tuple(self.state_size)

        # Stack the stored frame stacks into (batch, H, W, C) arrays and
        # scale pixel values to [0, 1]; the reshape drops the per-sample
        # batch axis of size 1 that stored histories carry.
        history = (np.array([sample[0] for sample in mini_batch])
                   .reshape(batch_shape) / 255.).astype(np.float32)
        next_history = (np.array([sample[3] for sample in mini_batch])
                        .reshape(batch_shape) / 255.).astype(np.float32)
        action = [sample[1] for sample in mini_batch]
        reward = np.array([sample[2] for sample in mini_batch],
                          dtype=np.float64)
        dead = np.array([sample[4] for sample in mini_batch], dtype=bool)

        target_value = self.target_model.predict(next_history)
        max_next_q = np.amax(target_value, axis=1).astype(np.float64)

        # Transitions flagged dead do not bootstrap from the next state.
        target = np.where(dead, reward,
                          reward + self.discount_factor * max_next_q)

        loss = self.optimizer([history, action, target])
        self.avg_loss += loss[0]

    def setup_summary(self):
        """Create the per-episode TensorBoard scalars.

        Returns:
            A tuple ``(summary_placeholders, update_ops, summary_op)``:
            one placeholder and one assign op per tracked scalar (total
            reward, average max Q, duration, average loss), plus the
            merged summary op.
        """
        episode_total_reward = tf.Variable(0.)
        episode_avg_max_q = tf.Variable(0.)
        episode_duration = tf.Variable(0.)
        episode_avg_loss = tf.Variable(0.)

        tf.summary.scalar('Total Reward/Episode', episode_total_reward)
        tf.summary.scalar('Average Max Q/Episode', episode_avg_max_q)
        tf.summary.scalar('Duration/Episode', episode_duration)
        tf.summary.scalar('Average Loss/Episode', episode_avg_loss)

        summary_vars = [episode_total_reward, episode_avg_max_q,
                        episode_duration, episode_avg_loss]
        summary_placeholders = [tf.placeholder(tf.float32)
                                for _ in summary_vars]
        update_ops = [var.assign(placeholder) for var, placeholder
                      in zip(summary_vars, summary_placeholders)]
        summary_op = tf.summary.merge_all()
        return summary_placeholders, update_ops, summary_op

In [4]:
def pre_processing(observe):
    """Convert an RGB observation into an 84x84 grayscale uint8 frame.

    The frame is converted to grayscale, resized to 84x84 and multiplied by
    255 before the cast to ``uint8``.
    """
    gray_frame = rgb2gray(observe)
    small_frame = resize(gray_frame, (84, 84), mode='constant')
    return np.uint8(small_frame * 255)

In [None]:
if __name__ == "__main__":
    # Create the environment and the DQN agent.
    env = gym.make('BreakoutDeterministic-v4')
    agent = DQNAgent(action_size=3)

    scores, episodes, global_step = [], [], 0

    for e in range(EPISODES):
        done, dead = False, False
        step, score, start_life = 0, 0, 5
        observe = env.reset()

        # Begin each episode with a random number (1..no_op_steps) of
        # steps that all send action 1 to the environment.
        for _ in range(random.randint(1, agent.no_op_steps)):
            observe = env.step(1)[0]

        # The initial history is the first processed frame repeated 4 times.
        state = pre_processing(observe)
        history = np.stack([state] * 4, axis=2).reshape((1, 84, 84, 4))

        while not done:
            if agent.render:
                env.render()
            global_step += 1
            step += 1

            # Choose an action from the most recent 4-frame history.
            action = agent.get_action(history)
            # Agent indices 0/1/2 map to environment actions 1/2/3.
            # The original note labels these 1: stay, 2: left, 3: right —
            # TODO confirm against the environment's action meanings.
            real_action = {0: 1, 1: 2}.get(action, 3)

            # Advance the environment by one time step.
            observe, reward, done, info = env.step(real_action)
            # Pre-process the new frame and put it in front of the 3 most
            # recent frames of the current history.
            next_state = pre_processing(observe).reshape((1, 84, 84, 1))
            next_history = np.concatenate(
                (next_state, history[:, :, :, :3]), axis=3)

            # Track the max predicted Q-value of the current history.
            agent.avg_q_max += np.amax(
                agent.model.predict(np.float32(history / 255.))[0])

            # A drop in the remaining-lives counter marks this step as dead.
            lives_left = info['ale.lives']
            if lives_left < start_life:
                dead = True
                start_life = lives_left

            reward = np.clip(reward, -1., 1.)
            # Store the transition <s, a, r, s'> in replay memory, then train.
            agent.append_sample(history, action, reward, next_history, dead)

            if len(agent.memory) >= agent.train_start:
                agent.train_model()

            # Periodically copy the online weights into the target model.
            if global_step % agent.update_target_rate == 0:
                agent.update_target_model()

            score += reward

            # The history only advances on steps where no life was lost;
            # the dead flag is cleared either way.
            if not dead:
                history = next_history
            dead = False

            if done:
                avg_q = agent.avg_q_max / float(step)
                avg_loss = agent.avg_loss / float(step)

                # Write the per-episode statistics to TensorBoard.
                if global_step > agent.train_start:
                    stats = [score, avg_q, step, avg_loss]
                    for update_op, placeholder, value in zip(
                            agent.update_ops, agent.summary_placeholders,
                            stats):
                        agent.sess.run(
                            update_op, feed_dict={placeholder: float(value)})
                    summary_str = agent.sess.run(agent.summary_op)
                    agent.summary_writer.add_summary(summary_str, e + 1)

                print("episode:", e, "  score:", score, "  memory length:",
                      len(agent.memory), "  epsilon:", agent.epsilon,
                      "  global_step:", global_step, "  average_q:",
                      avg_q, "  average loss:", avg_loss)

                agent.avg_q_max, agent.avg_loss = 0, 0

        # Save the model weights every 1000 episodes.
        if e % 1000 == 0:
            agent.model.save_weights("./save_model/breakout_dqn.h5")

  result = entry_point.load(False)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 20, 20, 32)        8224      
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 9, 9, 64)          32832     
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 7, 7, 64)          36928     
_________________________________________________________________
flatten_1 (Flatten)          (None, 3136)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               1606144   
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 1539      
Total params: 1,685,667
Trainable params: 1,685,667
Non-trainable params: 0
_________________________________________________________________


  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


episode: 0   score: 3.0   memory length: 259   epsilon: 1.0   global_step: 259   average_q: 0.06500755830103366   average loss: 0.0
episode: 1   score: 0.0   memory length: 381   epsilon: 1.0   global_step: 381   average_q: 0.06435538385613043   average loss: 0.0
episode: 2   score: 0.0   memory length: 509   epsilon: 1.0   global_step: 509   average_q: 0.06385445722844452   average loss: 0.0
episode: 3   score: 0.0   memory length: 622   epsilon: 1.0   global_step: 622   average_q: 0.06405395285877506   average loss: 0.0
episode: 4   score: 0.0   memory length: 725   epsilon: 1.0   global_step: 725   average_q: 0.06468751961455761   average loss: 0.0
episode: 5   score: 1.0   memory length: 871   epsilon: 1.0   global_step: 871   average_q: 0.06422523322374854   average loss: 0.0
episode: 6   score: 1.0   memory length: 1038   epsilon: 1.0   global_step: 1038   average_q: 0.06264332666279313   average loss: 0.0
episode: 7   score: 3.0   memory length: 1312   epsilon: 1.0   global_step

episode: 61   score: 2.0   memory length: 10063   epsilon: 1.0   global_step: 10063   average_q: 0.06224365178599704   average loss: 0.0
episode: 62   score: 1.0   memory length: 10212   epsilon: 1.0   global_step: 10212   average_q: 0.06159651011928616   average loss: 0.0
episode: 63   score: 4.0   memory length: 10469   epsilon: 1.0   global_step: 10469   average_q: 0.06384091364792349   average loss: 0.0
episode: 64   score: 2.0   memory length: 10649   epsilon: 1.0   global_step: 10649   average_q: 0.06304176538768742   average loss: 0.0
episode: 65   score: 1.0   memory length: 10805   epsilon: 1.0   global_step: 10805   average_q: 0.061739394321846656   average loss: 0.0
episode: 66   score: 0.0   memory length: 10905   epsilon: 1.0   global_step: 10905   average_q: 0.06424182444810868   average loss: 0.0
episode: 67   score: 3.0   memory length: 11134   epsilon: 1.0   global_step: 11134   average_q: 0.06511348012558238   average loss: 0.0
episode: 68   score: 1.0   memory length

episode: 121   score: 1.0   memory length: 19948   epsilon: 1.0   global_step: 19948   average_q: 0.06411499601717179   average loss: 0.0
episode: 122   score: 0.0   memory length: 20078   epsilon: 1.0   global_step: 20078   average_q: 0.0641700386427916   average loss: 0.0
episode: 123   score: 1.0   memory length: 20236   epsilon: 1.0   global_step: 20236   average_q: 0.06289297627591633   average loss: 0.0
episode: 124   score: 0.0   memory length: 20350   epsilon: 1.0   global_step: 20350   average_q: 0.06403641517094352   average loss: 0.0
episode: 125   score: 1.0   memory length: 20484   epsilon: 1.0   global_step: 20484   average_q: 0.06315211670945829   average loss: 0.0
episode: 126   score: 2.0   memory length: 20697   epsilon: 1.0   global_step: 20697   average_q: 0.06175635006506119   average loss: 0.0
episode: 127   score: 0.0   memory length: 20803   epsilon: 1.0   global_step: 20803   average_q: 0.06412202295548511   average loss: 0.0
episode: 128   score: 3.0   memory 

episode: 181   score: 5.0   memory length: 29693   epsilon: 1.0   global_step: 29693   average_q: 0.06613320808269475   average loss: 0.0
episode: 182   score: 0.0   memory length: 29816   epsilon: 1.0   global_step: 29816   average_q: 0.06423224245266217   average loss: 0.0
episode: 183   score: 0.0   memory length: 29921   epsilon: 1.0   global_step: 29921   average_q: 0.06468566464526312   average loss: 0.0
episode: 184   score: 0.0   memory length: 30030   epsilon: 1.0   global_step: 30030   average_q: 0.06427826154806199   average loss: 0.0
episode: 185   score: 0.0   memory length: 30149   epsilon: 1.0   global_step: 30149   average_q: 0.06418617789735313   average loss: 0.0
episode: 186   score: 0.0   memory length: 30252   epsilon: 1.0   global_step: 30252   average_q: 0.06437162519658654   average loss: 0.0
episode: 187   score: 0.0   memory length: 30373   epsilon: 1.0   global_step: 30373   average_q: 0.06414400958571552   average loss: 0.0
episode: 188   score: 2.0   memory

episode: 241   score: 2.0   memory length: 38699   epsilon: 1.0   global_step: 38699   average_q: 0.061778295253004344   average loss: 0.0
episode: 242   score: 0.0   memory length: 38803   epsilon: 1.0   global_step: 38803   average_q: 0.06416657169421132   average loss: 0.0
episode: 243   score: 1.0   memory length: 38968   epsilon: 1.0   global_step: 38968   average_q: 0.06365740109573712   average loss: 0.0
episode: 244   score: 2.0   memory length: 39158   epsilon: 1.0   global_step: 39158   average_q: 0.06201928655960058   average loss: 0.0
episode: 245   score: 2.0   memory length: 39349   epsilon: 1.0   global_step: 39349   average_q: 0.06331630698673388   average loss: 0.0
episode: 246   score: 1.0   memory length: 39500   epsilon: 1.0   global_step: 39500   average_q: 0.06207840339533541   average loss: 0.0
episode: 247   score: 1.0   memory length: 39669   epsilon: 1.0   global_step: 39669   average_q: 0.06402825672157417   average loss: 0.0
episode: 248   score: 0.0   memor

episode: 301   score: 2.0   memory length: 48105   epsilon: 1.0   global_step: 48105   average_q: 0.06292428636067622   average loss: 0.0
episode: 302   score: 0.0   memory length: 48226   epsilon: 1.0   global_step: 48226   average_q: 0.06420792349852807   average loss: 0.0
episode: 303   score: 2.0   memory length: 48438   epsilon: 1.0   global_step: 48438   average_q: 0.06419243816426902   average loss: 0.0
episode: 304   score: 0.0   memory length: 48539   epsilon: 1.0   global_step: 48539   average_q: 0.06396750146799748   average loss: 0.0
episode: 305   score: 1.0   memory length: 48714   epsilon: 1.0   global_step: 48714   average_q: 0.06303303331136703   average loss: 0.0
episode: 306   score: 0.0   memory length: 48846   epsilon: 1.0   global_step: 48846   average_q: 0.06394430986520919   average loss: 0.0
episode: 307   score: 1.0   memory length: 48992   epsilon: 1.0   global_step: 48992   average_q: 0.06202059660157929   average loss: 0.0
episode: 308   score: 0.0   memory

episode: 352   score: 1.0   memory length: 55847   epsilon: 0.9947368000001733   global_step: 55847   average_q: 0.07161789441108704   average loss: 0.0036921618885317003
episode: 353   score: 5.0   memory length: 56163   epsilon: 0.9944524000001826   global_step: 56163   average_q: 0.06921232563643893   average loss: 0.0030205092607797604
episode: 354   score: 1.0   memory length: 56331   epsilon: 0.9943012000001876   global_step: 56331   average_q: 0.07485143137386158   average loss: 0.005053906747643093
episode: 355   score: 2.0   memory length: 56526   epsilon: 0.9941257000001934   global_step: 56526   average_q: 0.06839690107183578   average loss: 0.0030155032871894386
episode: 356   score: 0.0   memory length: 56641   epsilon: 0.9940222000001968   global_step: 56641   average_q: 0.06871505564321642   average loss: 0.0030586366135554746
episode: 357   score: 4.0   memory length: 56953   epsilon: 0.993741400000206   global_step: 56953   average_q: 0.07234441398236996   average loss

episode: 401   score: 0.0   memory length: 63571   epsilon: 0.9877852000004022   global_step: 63571   average_q: 0.08211404998220649   average loss: 0.003446333798454397
episode: 402   score: 0.0   memory length: 63676   epsilon: 0.9876907000004053   global_step: 63676   average_q: 0.0801735106678236   average loss: 0.003058890378436748
episode: 403   score: 0.0   memory length: 63789   epsilon: 0.9875890000004086   global_step: 63789   average_q: 0.08313057815606616   average loss: 0.004489077959036458
episode: 404   score: 2.0   memory length: 63993   epsilon: 0.9874054000004147   global_step: 63993   average_q: 0.0795517173394853   average loss: 0.0032131830515603342
episode: 405   score: 0.0   memory length: 64108   epsilon: 0.9873019000004181   global_step: 64108   average_q: 0.07769508484912956   average loss: 0.002668708037129403
episode: 406   score: 0.0   memory length: 64230   epsilon: 0.9871921000004217   global_step: 64230   average_q: 0.07970782607549527   average loss: 0.

episode: 450   score: 2.0   memory length: 70905   epsilon: 0.9811846000006195   global_step: 70905   average_q: 0.08775229735506905   average loss: 0.005261675215665997
episode: 451   score: 0.0   memory length: 71026   epsilon: 0.981075700000623   global_step: 71026   average_q: 0.08602059983517513   average loss: 0.004060016142925971
episode: 452   score: 0.0   memory length: 71152   epsilon: 0.9809623000006268   global_step: 71152   average_q: 0.0825893288212163   average loss: 0.003668709244049664
episode: 453   score: 1.0   memory length: 71311   epsilon: 0.9808192000006315   global_step: 71311   average_q: 0.08660500613773393   average loss: 0.00448788312159561
episode: 454   score: 1.0   memory length: 71456   epsilon: 0.9806887000006358   global_step: 71456   average_q: 0.08497308168945641   average loss: 0.003634516759360661
episode: 455   score: 5.0   memory length: 71774   epsilon: 0.9804025000006452   global_step: 71774   average_q: 0.08583188574737723   average loss: 0.00

episode: 499   score: 3.0   memory length: 78948   epsilon: 0.9739459000008578   global_step: 78948   average_q: 0.08051303118329993   average loss: 0.0031055780021470463
episode: 500   score: 0.0   memory length: 79071   epsilon: 0.9738352000008614   global_step: 79071   average_q: 0.08395062786776845   average loss: 0.003882521707585814
episode: 501   score: 0.0   memory length: 79187   epsilon: 0.9737308000008649   global_step: 79187   average_q: 0.08478900655333338   average loss: 0.0040767796703954475
episode: 502   score: 0.0   memory length: 79311   epsilon: 0.9736192000008685   global_step: 79311   average_q: 0.08447440533388045   average loss: 0.0038468803672472977
episode: 503   score: 0.0   memory length: 79410   epsilon: 0.9735301000008715   global_step: 79410   average_q: 0.08243834784235617   average loss: 0.0026031854670328365
episode: 504   score: 0.0   memory length: 79530   epsilon: 0.973422100000875   global_step: 79530   average_q: 0.0828230840464433   average loss:

episode: 548   score: 1.0   memory length: 86616   epsilon: 0.967044700001085   global_step: 86616   average_q: 0.09224154021899411   average loss: 0.0035788082858277687
episode: 549   score: 0.0   memory length: 86742   epsilon: 0.9669313000010887   global_step: 86742   average_q: 0.09347621132693594   average loss: 0.003924940034299392
episode: 550   score: 1.0   memory length: 86914   epsilon: 0.9667765000010938   global_step: 86914   average_q: 0.09525611104313718   average loss: 0.004433703659803414
episode: 551   score: 2.0   memory length: 87092   epsilon: 0.9666163000010991   global_step: 87092   average_q: 0.09341297294484095   average loss: 0.00325753563451977
episode: 552   score: 0.0   memory length: 87215   epsilon: 0.9665056000011027   global_step: 87215   average_q: 0.09480033421177204   average loss: 0.0041667670420460256
episode: 553   score: 4.0   memory length: 87477   epsilon: 0.9662698000011105   global_step: 87477   average_q: 0.0918401425405768   average loss: 0.

episode: 597   score: 2.0   memory length: 94376   epsilon: 0.9600607000013149   global_step: 94376   average_q: 0.10567053489348027   average loss: 0.0036731702278931244
episode: 598   score: 2.0   memory length: 94593   epsilon: 0.9598654000013214   global_step: 94593   average_q: 0.10566604899645951   average loss: 0.0029156580415443923
episode: 599   score: 3.0   memory length: 94807   epsilon: 0.9596728000013277   global_step: 94807   average_q: 0.10669278844355423   average loss: 0.0038613211602434483
episode: 600   score: 0.0   memory length: 94921   epsilon: 0.9595702000013311   global_step: 94921   average_q: 0.10439367239412509   average loss: 0.003050660607729234
episode: 601   score: 1.0   memory length: 95075   epsilon: 0.9594316000013356   global_step: 95075   average_q: 0.1058888372953062   average loss: 0.00340269573548067
episode: 602   score: 2.0   memory length: 95245   epsilon: 0.9592786000013407   global_step: 95245   average_q: 0.10858265617314507   average loss: 

episode: 646   score: 0.0   memory length: 102705   epsilon: 0.9525646000015617   global_step: 102705   average_q: 0.12099971396453453   average loss: 0.003415319221058973
episode: 647   score: 1.0   memory length: 102842   epsilon: 0.9524413000015658   global_step: 102842   average_q: 0.12331653108997066   average loss: 0.0032297523749867073
episode: 648   score: 4.0   memory length: 103145   epsilon: 0.9521686000015748   global_step: 103145   average_q: 0.12578547202517884   average loss: 0.0040573184832997395
episode: 649   score: 1.0   memory length: 103302   epsilon: 0.9520273000015794   global_step: 103302   average_q: 0.12334166476680974   average loss: 0.00439233658979971
episode: 650   score: 8.0   memory length: 103739   epsilon: 0.9516340000015924   global_step: 103739   average_q: 0.12108884410809052   average loss: 0.004018401502753061
episode: 651   score: 0.0   memory length: 103860   epsilon: 0.951525100001596   global_step: 103860   average_q: 0.12186878008290755   ave

episode: 694   score: 0.0   memory length: 110981   epsilon: 0.945116200001807   global_step: 110981   average_q: 0.12171552978731968   average loss: 0.003955594322544837
episode: 695   score: 2.0   memory length: 111183   epsilon: 0.944934400001813   global_step: 111183   average_q: 0.12596259701370013   average loss: 0.004028859059257464
episode: 696   score: 1.0   memory length: 111336   epsilon: 0.9447967000018175   global_step: 111336   average_q: 0.12731142965407152   average loss: 0.0049552806183046
episode: 697   score: 1.0   memory length: 111460   epsilon: 0.9446851000018212   global_step: 111460   average_q: 0.12287790196076516   average loss: 0.0029400453932618303
episode: 698   score: 2.0   memory length: 111677   epsilon: 0.9444898000018276   global_step: 111677   average_q: 0.12622604247886465   average loss: 0.004368077687005739
episode: 699   score: 0.0   memory length: 111778   epsilon: 0.9443989000018306   global_step: 111778   average_q: 0.12547868977088739   averag

episode: 742   score: 0.0   memory length: 118801   epsilon: 0.9380782000020387   global_step: 118801   average_q: 0.12491589813556486   average loss: 0.004852072126315532
episode: 743   score: 1.0   memory length: 118959   epsilon: 0.9379360000020434   global_step: 118959   average_q: 0.12445150023397011   average loss: 0.003786201537047441
episode: 744   score: 2.0   memory length: 119157   epsilon: 0.9377578000020492   global_step: 119157   average_q: 0.13114433110964419   average loss: 0.004501308730653483
episode: 745   score: 1.0   memory length: 119308   epsilon: 0.9376219000020537   global_step: 119308   average_q: 0.12538143015460462   average loss: 0.00410995336206598
episode: 746   score: 2.0   memory length: 119491   epsilon: 0.9374572000020591   global_step: 119491   average_q: 0.11664452831276127   average loss: 0.0028547028488238074
episode: 747   score: 0.0   memory length: 119603   epsilon: 0.9373564000020624   global_step: 119603   average_q: 0.1245421627536416   aver

episode: 790   score: 0.0   memory length: 126275   epsilon: 0.9313516000022601   global_step: 126275   average_q: 0.12702337658502222   average loss: 0.0052349544692488715
episode: 791   score: 2.0   memory length: 126463   epsilon: 0.9311824000022657   global_step: 126463   average_q: 0.12012323467655385   average loss: 0.003589606695573395
episode: 792   score: 1.0   memory length: 126647   epsilon: 0.9310168000022712   global_step: 126647   average_q: 0.1286360024596038   average loss: 0.0032866207229843617
episode: 793   score: 2.0   memory length: 126840   epsilon: 0.9308431000022769   global_step: 126840   average_q: 0.1257844227583297   average loss: 0.00445485836087595
episode: 794   score: 0.0   memory length: 126948   epsilon: 0.9307459000022801   global_step: 126948   average_q: 0.12278414614222667   average loss: 0.0035173586586804756
episode: 795   score: 2.0   memory length: 127167   epsilon: 0.9305488000022866   global_step: 127167   average_q: 0.1302334288619969   aver

episode: 838   score: 0.0   memory length: 134036   epsilon: 0.9243667000024901   global_step: 134036   average_q: 0.15426595357515044   average loss: 0.0038171910674248202
episode: 839   score: 0.0   memory length: 134144   epsilon: 0.9242695000024933   global_step: 134144   average_q: 0.15750953334349174   average loss: 0.004220022036452907
episode: 840   score: 3.0   memory length: 134372   epsilon: 0.9240643000025001   global_step: 134372   average_q: 0.1598211755616623   average loss: 0.004089904587600725
episode: 841   score: 3.0   memory length: 134592   epsilon: 0.9238663000025066   global_step: 134592   average_q: 0.1581541675058278   average loss: 0.003138724842649894
episode: 842   score: 0.0   memory length: 134711   epsilon: 0.9237592000025101   global_step: 134711   average_q: 0.1571554545344425   average loss: 0.004072989251959493
episode: 843   score: 0.0   memory length: 134822   epsilon: 0.9236593000025134   global_step: 134822   average_q: 0.15782632940524333   avera

episode: 886   score: 1.0   memory length: 141830   epsilon: 0.917352100002721   global_step: 141830   average_q: 0.16710185474413305   average loss: 0.003548413555253782
episode: 887   score: 0.0   memory length: 141953   epsilon: 0.9172414000027247   global_step: 141953   average_q: 0.1536025825312467   average loss: 0.003808523551059607
episode: 888   score: 1.0   memory length: 142100   epsilon: 0.917109100002729   global_step: 142100   average_q: 0.15669001344920827   average loss: 0.0036571372705925167
episode: 889   score: 0.0   memory length: 142223   epsilon: 0.9169984000027327   global_step: 142223   average_q: 0.15403275627915453   average loss: 0.0031249322067980565
episode: 890   score: 3.0   memory length: 142487   epsilon: 0.9167608000027405   global_step: 142487   average_q: 0.16045982774459955   average loss: 0.003496796188359798
episode: 891   score: 0.0   memory length: 142590   epsilon: 0.9166681000027436   global_step: 142590   average_q: 0.1600808132331348   avera

episode: 934   score: 1.0   memory length: 149996   epsilon: 0.910002700002963   global_step: 149996   average_q: 0.16136638308638956   average loss: 0.004241950761884729
episode: 935   score: 1.0   memory length: 150158   epsilon: 0.9098569000029678   global_step: 150158   average_q: 0.17308362472204514   average loss: 0.004282964013611243
episode: 936   score: 1.0   memory length: 150306   epsilon: 0.9097237000029722   global_step: 150306   average_q: 0.1684861901040013   average loss: 0.0034531016824100357
episode: 937   score: 2.0   memory length: 150486   epsilon: 0.9095617000029775   global_step: 150486   average_q: 0.17366720454560386   average loss: 0.004321310067040132
episode: 938   score: 0.0   memory length: 150592   epsilon: 0.9094663000029807   global_step: 150592   average_q: 0.16799788640917473   average loss: 0.0040557001857279215
episode: 939   score: 1.0   memory length: 150745   epsilon: 0.9093286000029852   global_step: 150745   average_q: 0.17273099038725584   ave

episode: 982   score: 3.0   memory length: 157538   epsilon: 0.9032149000031865   global_step: 157538   average_q: 0.1740366033129934   average loss: 0.003960935587565802
episode: 983   score: 0.0   memory length: 157658   epsilon: 0.90310690000319   global_step: 157658   average_q: 0.1674696858972311   average loss: 0.004362431583134215
episode: 984   score: 0.0   memory length: 157752   epsilon: 0.9030223000031928   global_step: 157752   average_q: 0.16729911242393739   average loss: 0.003657868556206751
episode: 985   score: 3.0   memory length: 157980   epsilon: 0.9028171000031996   global_step: 157980   average_q: 0.17038898124245175   average loss: 0.0033677233989656565
episode: 986   score: 0.0   memory length: 158103   epsilon: 0.9027064000032032   global_step: 158103   average_q: 0.16490360512966062   average loss: 0.0040290938464302714
episode: 987   score: 3.0   memory length: 158306   epsilon: 0.9025237000032093   global_step: 158306   average_q: 0.1738356227798415   averag

episode: 1030   score: 1.0   memory length: 165342   epsilon: 0.8961913000034177   global_step: 165342   average_q: 0.16701527022653156   average loss: 0.0041943306311225115
episode: 1031   score: 2.0   memory length: 165514   epsilon: 0.8960365000034228   global_step: 165514   average_q: 0.16812674268040545   average loss: 0.004018608466224599
episode: 1032   score: 4.0   memory length: 165781   epsilon: 0.8957962000034307   global_step: 165781   average_q: 0.16915342981895704   average loss: 0.0049784743649134875
episode: 1033   score: 1.0   memory length: 165912   epsilon: 0.8956783000034346   global_step: 165912   average_q: 0.17402095865202313   average loss: 0.004839116815667118
episode: 1034   score: 3.0   memory length: 166182   epsilon: 0.8954353000034426   global_step: 166182   average_q: 0.17432054071514694   average loss: 0.004030175397312834
episode: 1035   score: 1.0   memory length: 166322   epsilon: 0.8953093000034468   global_step: 166322   average_q: 0.170024244274411

episode: 1078   score: 2.0   memory length: 172844   epsilon: 0.88943950000364   global_step: 172844   average_q: 0.17600571495645187   average loss: 0.004325799779125565
episode: 1079   score: 1.0   memory length: 173028   epsilon: 0.8892739000036455   global_step: 173028   average_q: 0.17430010470359222   average loss: 0.003996293575722654
episode: 1080   score: 2.0   memory length: 173203   epsilon: 0.8891164000036507   global_step: 173203   average_q: 0.17117350816726684   average loss: 0.004411789253044845
episode: 1081   score: 2.0   memory length: 173396   epsilon: 0.8889427000036564   global_step: 173396   average_q: 0.17291141652690314   average loss: 0.005015419125625464
episode: 1082   score: 2.0   memory length: 173574   epsilon: 0.8887825000036617   global_step: 173574   average_q: 0.17194812133740844   average loss: 0.004652459135329766
episode: 1083   score: 3.0   memory length: 173778   epsilon: 0.8885989000036677   global_step: 173778   average_q: 0.16574629060193605  

episode: 1126   score: 2.0   memory length: 180709   epsilon: 0.8823610000038731   global_step: 180709   average_q: 0.17919207068543938   average loss: 0.004086339096185345
episode: 1127   score: 2.0   memory length: 180886   epsilon: 0.8822017000038783   global_step: 180886   average_q: 0.17976258686706845   average loss: 0.003807811310738451
episode: 1128   score: 2.0   memory length: 181051   epsilon: 0.8820532000038832   global_step: 181051   average_q: 0.16154317422346634   average loss: 0.0035910339697638296
episode: 1129   score: 0.0   memory length: 181163   epsilon: 0.8819524000038865   global_step: 181163   average_q: 0.16962486239416258   average loss: 0.004005796394795068
episode: 1130   score: 0.0   memory length: 181267   epsilon: 0.8818588000038896   global_step: 181267   average_q: 0.16760027723816726   average loss: 0.002991994986193556
episode: 1131   score: 0.0   memory length: 181389   epsilon: 0.8817490000038932   global_step: 181389   average_q: 0.1680965474883063

episode: 1174   score: 0.0   memory length: 188343   epsilon: 0.8754904000040993   global_step: 188343   average_q: 0.16781737627806487   average loss: 0.0031186866736132085
episode: 1175   score: 0.0   memory length: 188450   epsilon: 0.8753941000041024   global_step: 188450   average_q: 0.17184195145268308   average loss: 0.004110532184347342
episode: 1176   score: 3.0   memory length: 188656   epsilon: 0.8752087000041086   global_step: 188656   average_q: 0.17894052663474408   average loss: 0.004460878006593526
episode: 1177   score: 2.0   memory length: 188854   epsilon: 0.8750305000041144   global_step: 188854   average_q: 0.18497156389433927   average loss: 0.003909143095223393
episode: 1178   score: 0.0   memory length: 188973   epsilon: 0.874923400004118   global_step: 188973   average_q: 0.16904011544059305   average loss: 0.0036279776257196463
episode: 1179   score: 5.0   memory length: 189277   epsilon: 0.874649800004127   global_step: 189277   average_q: 0.18564472318087755

episode: 1222   score: 2.0   memory length: 196171   epsilon: 0.8684452000043312   global_step: 196171   average_q: 0.18261164979836375   average loss: 0.003548172199227394
episode: 1223   score: 1.0   memory length: 196311   epsilon: 0.8683192000043354   global_step: 196311   average_q: 0.17839557081460952   average loss: 0.0034463774137845704
episode: 1224   score: 0.0   memory length: 196415   epsilon: 0.8682256000043385   global_step: 196415   average_q: 0.17175472470430228   average loss: 0.004326440640135423
episode: 1225   score: 1.0   memory length: 196570   epsilon: 0.868086100004343   global_step: 196570   average_q: 0.1800093118221529   average loss: 0.004147134297618909
episode: 1226   score: 0.0   memory length: 196692   epsilon: 0.8679763000043467   global_step: 196692   average_q: 0.16938547838906773   average loss: 0.002876563744103194
episode: 1227   score: 2.0   memory length: 196870   epsilon: 0.8678161000043519   global_step: 196870   average_q: 0.16594812273979187 

episode: 1270   score: 2.0   memory length: 203851   epsilon: 0.8615332000045588   global_step: 203851   average_q: 0.1852679559162685   average loss: 0.003242008373481873
episode: 1271   score: 2.0   memory length: 204064   epsilon: 0.8613415000045651   global_step: 204064   average_q: 0.19949573488302633   average loss: 0.003704073773717685
episode: 1272   score: 2.0   memory length: 204252   epsilon: 0.8611723000045707   global_step: 204252   average_q: 0.18795242826355266   average loss: 0.004469231208834069
episode: 1273   score: 1.0   memory length: 204395   epsilon: 0.8610436000045749   global_step: 204395   average_q: 0.17893939230825517   average loss: 0.003422959856198485
episode: 1274   score: 1.0   memory length: 204573   epsilon: 0.8608834000045802   global_step: 204573   average_q: 0.19485775473412503   average loss: 0.0047500989144180165
episode: 1275   score: 2.0   memory length: 204734   epsilon: 0.860738500004585   global_step: 204734   average_q: 0.18247457431710284 

episode: 1318   score: 0.0   memory length: 211084   epsilon: 0.8550235000047731   global_step: 211084   average_q: 0.17496462695854753   average loss: 0.004178064492166902
episode: 1319   score: 2.0   memory length: 211285   epsilon: 0.8548426000047791   global_step: 211285   average_q: 0.21428239152799197   average loss: 0.0034921876948426156
episode: 1320   score: 0.0   memory length: 211406   epsilon: 0.8547337000047827   global_step: 211406   average_q: 0.17802902563544346   average loss: 0.00419822689992761
episode: 1321   score: 0.0   memory length: 211530   epsilon: 0.8546221000047863   global_step: 211530   average_q: 0.17551677121270087   average loss: 0.0038698377773236587
episode: 1322   score: 0.0   memory length: 211646   epsilon: 0.8545177000047898   global_step: 211646   average_q: 0.17199560152045612   average loss: 0.0032112788525692254
episode: 1323   score: 0.0   memory length: 211763   epsilon: 0.8544124000047932   global_step: 211763   average_q: 0.177049770059748

episode: 1366   score: 0.0   memory length: 218753   epsilon: 0.8481214000050004   global_step: 218753   average_q: 0.18013793402466893   average loss: 0.0044925501350774515
episode: 1367   score: 0.0   memory length: 218862   epsilon: 0.8480233000050036   global_step: 218862   average_q: 0.17325179538595567   average loss: 0.0031198371375091623
episode: 1368   score: 2.0   memory length: 219038   epsilon: 0.8478649000050088   global_step: 219038   average_q: 0.21559668146073818   average loss: 0.0030991310681697955
episode: 1369   score: 1.0   memory length: 219189   epsilon: 0.8477290000050133   global_step: 219189   average_q: 0.1831140745159806   average loss: 0.0030661645431647286
episode: 1370   score: 1.0   memory length: 219332   epsilon: 0.8476003000050175   global_step: 219332   average_q: 0.18656974218108438   average loss: 0.0037212878625367367
episode: 1371   score: 1.0   memory length: 219486   epsilon: 0.8474617000050221   global_step: 219486   average_q: 0.1789901012724

episode: 1414   score: 0.0   memory length: 226392   epsilon: 0.8412463000052267   global_step: 226392   average_q: 0.17967520762573588   average loss: 0.003322025337398703
episode: 1415   score: 0.0   memory length: 226513   epsilon: 0.8411374000052303   global_step: 226513   average_q: 0.1815542898887445   average loss: 0.003673675602990706
episode: 1416   score: 1.0   memory length: 226671   epsilon: 0.840995200005235   global_step: 226671   average_q: 0.1867214692167089   average loss: 0.0032557577054992378
episode: 1417   score: 2.0   memory length: 226888   epsilon: 0.8407999000052414   global_step: 226888   average_q: 0.21028067164706743   average loss: 0.0030456134703443593
episode: 1418   score: 1.0   memory length: 227061   epsilon: 0.8406442000052465   global_step: 227061   average_q: 0.19874335248346273   average loss: 0.0029584654957049987
episode: 1419   score: 3.0   memory length: 227288   epsilon: 0.8404399000052533   global_step: 227288   average_q: 0.20127406230582015

episode: 1462   score: 3.0   memory length: 234952   epsilon: 0.8335423000054804   global_step: 234952   average_q: 0.21433472660332245   average loss: 0.002632478713023037
episode: 1463   score: 1.0   memory length: 235104   epsilon: 0.8334055000054849   global_step: 235104   average_q: 0.20436623359197065   average loss: 0.0029272561771569216
episode: 1464   score: 2.0   memory length: 235303   epsilon: 0.8332264000054908   global_step: 235303   average_q: 0.22464100829320938   average loss: 0.0028722941024901974
episode: 1465   score: 2.0   memory length: 235486   epsilon: 0.8330617000054962   global_step: 235486   average_q: 0.18122854665980312   average loss: 0.003130318800646168
episode: 1466   score: 1.0   memory length: 235633   epsilon: 0.8329294000055005   global_step: 235633   average_q: 0.20627735220656104   average loss: 0.002393830588040319
episode: 1467   score: 1.0   memory length: 235779   epsilon: 0.8327980000055049   global_step: 235779   average_q: 0.180457911466899

episode: 1510   score: 2.0   memory length: 243091   epsilon: 0.8262172000057215   global_step: 243091   average_q: 0.20429605778894927   average loss: 0.003660444804690518
episode: 1511   score: 1.0   memory length: 243235   epsilon: 0.8260876000057258   global_step: 243235   average_q: 0.18039039501713383   average loss: 0.0032418988661599237
episode: 1512   score: 0.0   memory length: 243364   epsilon: 0.8259715000057296   global_step: 243364   average_q: 0.16766419544700503   average loss: 0.0027651041833845836
episode: 1513   score: 1.0   memory length: 243531   epsilon: 0.8258212000057346   global_step: 243531   average_q: 0.17920529521154072   average loss: 0.0028727196294895036
episode: 1514   score: 3.0   memory length: 243754   epsilon: 0.8256205000057412   global_step: 243754   average_q: 0.14325004401762925   average loss: 0.002710130783389145
episode: 1515   score: 2.0   memory length: 243951   epsilon: 0.825443200005747   global_step: 243951   average_q: 0.209193468850276

episode: 1558   score: 0.0   memory length: 250962   epsilon: 0.8191333000059547   global_step: 250962   average_q: 0.16121686724099246   average loss: 0.0022164751973667244
episode: 1559   score: 7.0   memory length: 251314   epsilon: 0.8188165000059652   global_step: 251314   average_q: 0.18303193956274877   average loss: 0.003147425593479883
episode: 1560   score: 1.0   memory length: 251444   epsilon: 0.818699500005969   global_step: 251444   average_q: 0.1818849524626365   average loss: 0.0026189050674614777
episode: 1561   score: 0.0   memory length: 251546   epsilon: 0.818607700005972   global_step: 251546   average_q: 0.16043743345082975   average loss: 0.0033831834041527196
episode: 1562   score: 1.0   memory length: 251704   epsilon: 0.8184655000059767   global_step: 251704   average_q: 0.2049426327023325   average loss: 0.0032762702892773343
episode: 1563   score: 1.0   memory length: 251857   epsilon: 0.8183278000059813   global_step: 251857   average_q: 0.1700494071237402 

episode: 1606   score: 3.0   memory length: 258430   epsilon: 0.812412100006176   global_step: 258430   average_q: 0.17610049424785199   average loss: 0.0024886557300963226
episode: 1607   score: 3.0   memory length: 258658   epsilon: 0.8122069000061828   global_step: 258658   average_q: 0.14824033384783225   average loss: 0.0022913745688767227
episode: 1608   score: 2.0   memory length: 258835   epsilon: 0.812047600006188   global_step: 258835   average_q: 0.1861889476830003   average loss: 0.002262974632171816
episode: 1609   score: 2.0   memory length: 259029   epsilon: 0.8118730000061938   global_step: 259029   average_q: 0.17973880515885107   average loss: 0.0025841088996157945
episode: 1610   score: 0.0   memory length: 259152   epsilon: 0.8117623000061974   global_step: 259152   average_q: 0.1628982066138973   average loss: 0.0023687057587599932
episode: 1611   score: 1.0   memory length: 259304   epsilon: 0.8116255000062019   global_step: 259304   average_q: 0.18086213912618787

episode: 1654   score: 0.0   memory length: 266572   epsilon: 0.8050843000064173   global_step: 266572   average_q: 0.16049389858195123   average loss: 0.003218840353586897
episode: 1655   score: 0.0   memory length: 266690   epsilon: 0.8049781000064208   global_step: 266690   average_q: 0.1517553084482581   average loss: 0.0023007934865067214
episode: 1656   score: 1.0   memory length: 266830   epsilon: 0.8048521000064249   global_step: 266830   average_q: 0.17778087598936898   average loss: 0.0031863225103734294
episode: 1657   score: 1.0   memory length: 266963   epsilon: 0.8047324000064289   global_step: 266963   average_q: 0.16863854419916197   average loss: 0.002227644636669928
episode: 1658   score: 1.0   memory length: 267136   epsilon: 0.804576700006434   global_step: 267136   average_q: 0.1982638777335944   average loss: 0.003065126751733922
episode: 1659   score: 1.0   memory length: 267278   epsilon: 0.8044489000064382   global_step: 267278   average_q: 0.17674514609323422 

episode: 1702   score: 1.0   memory length: 274178   epsilon: 0.7982389000066427   global_step: 274178   average_q: 0.19632037353995663   average loss: 0.003508207002132561
episode: 1703   score: 0.0   memory length: 274276   epsilon: 0.7981507000066456   global_step: 274276   average_q: 0.16156430937805955   average loss: 0.004036661565564549
episode: 1704   score: 5.0   memory length: 274551   epsilon: 0.7979032000066537   global_step: 274551   average_q: 0.19641898599537935   average loss: 0.0030769970287722326
episode: 1705   score: 1.0   memory length: 274719   epsilon: 0.7977520000066587   global_step: 274719   average_q: 0.24382336082912626   average loss: 0.0022217616638694642
episode: 1706   score: 1.0   memory length: 274863   epsilon: 0.797622400006663   global_step: 274863   average_q: 0.18845031017230618   average loss: 0.0027753342461033347
episode: 1707   score: 2.0   memory length: 275036   epsilon: 0.7974667000066681   global_step: 275036   average_q: 0.219565927982330

episode: 1750   score: 1.0   memory length: 281854   epsilon: 0.7913305000068701   global_step: 281854   average_q: 0.19566349696743396   average loss: 0.0027539347457893955
episode: 1751   score: 3.0   memory length: 282094   epsilon: 0.7911145000068772   global_step: 282094   average_q: 0.21892872912188371   average loss: 0.0026038448947550328
episode: 1752   score: 0.0   memory length: 282205   epsilon: 0.7910146000068805   global_step: 282205   average_q: 0.15873087231103364   average loss: 0.0026769799936504045
episode: 1753   score: 1.0   memory length: 282362   epsilon: 0.7908733000068852   global_step: 282362   average_q: 0.2479617844341667   average loss: 0.002592905712885646
episode: 1754   score: 2.0   memory length: 282534   epsilon: 0.7907185000068903   global_step: 282534   average_q: 0.22344857922127082   average loss: 0.002434517426843021
episode: 1755   score: 1.0   memory length: 282677   epsilon: 0.7905898000068945   global_step: 282677   average_q: 0.185541151912062

episode: 1798   score: 2.0   memory length: 290051   epsilon: 0.783953200007113   global_step: 290051   average_q: 0.17132680560206318   average loss: 0.0025471444039414943
episode: 1799   score: 3.0   memory length: 290268   epsilon: 0.7837579000071194   global_step: 290268   average_q: 0.2270322656301859   average loss: 0.0024956794244946467
episode: 1800   score: 2.0   memory length: 290449   epsilon: 0.7835950000071248   global_step: 290449   average_q: 0.21021750733997283   average loss: 0.002841728584193213
episode: 1801   score: 3.0   memory length: 290675   epsilon: 0.7833916000071315   global_step: 290675   average_q: 0.25549305355654356   average loss: 0.003769754529176012
episode: 1802   score: 2.0   memory length: 290894   epsilon: 0.783194500007138   global_step: 290894   average_q: 0.2617219834839372   average loss: 0.003526418274126311
episode: 1803   score: 3.0   memory length: 291143   epsilon: 0.7829704000071454   global_step: 291143   average_q: 0.28005072474479675  

episode: 1846   score: 5.0   memory length: 298540   epsilon: 0.7763131000073645   global_step: 298540   average_q: 0.2458942305521677   average loss: 0.002246055266271121
episode: 1847   score: 0.0   memory length: 298665   epsilon: 0.7762006000073682   global_step: 298665   average_q: 0.15342632269859313   average loss: 0.001469124409952201
episode: 1848   score: 2.0   memory length: 298850   epsilon: 0.7760341000073737   global_step: 298850   average_q: 0.20660009351936545   average loss: 0.002632902591486109
episode: 1849   score: 2.0   memory length: 299036   epsilon: 0.7758667000073792   global_step: 299036   average_q: 0.21712795156304554   average loss: 0.0019175365748480388
episode: 1850   score: 2.0   memory length: 299241   epsilon: 0.7756822000073853   global_step: 299241   average_q: 0.19245230206629124   average loss: 0.0026308053447126707
episode: 1851   score: 3.0   memory length: 299435   epsilon: 0.775507600007391   global_step: 299435   average_q: 0.23245576362019962

episode: 1894   score: 1.0   memory length: 306318   epsilon: 0.769312900007595   global_step: 306318   average_q: 0.16438332618009754   average loss: 0.0020424305921107767
episode: 1895   score: 1.0   memory length: 306451   epsilon: 0.7691932000075989   global_step: 306451   average_q: 0.1729528816570913   average loss: 0.002305626382875012
episode: 1896   score: 1.0   memory length: 306591   epsilon: 0.7690672000076031   global_step: 306591   average_q: 0.17226210023675645   average loss: 0.0018154317703842286
episode: 1897   score: 1.0   memory length: 306771   epsilon: 0.7689052000076084   global_step: 306771   average_q: 0.13796190255218083   average loss: 0.001983710155319487
episode: 1898   score: 1.0   memory length: 306906   epsilon: 0.7687837000076124   global_step: 306906   average_q: 0.18137772017055087   average loss: 0.002289629003846332
episode: 1899   score: 0.0   memory length: 307015   epsilon: 0.7686856000076157   global_step: 307015   average_q: 0.12070925656808626

episode: 1942   score: 0.0   memory length: 313723   epsilon: 0.7626484000078144   global_step: 313723   average_q: 0.12308272015391372   average loss: 0.0015624228082427674
episode: 1943   score: 2.0   memory length: 313901   epsilon: 0.7624882000078197   global_step: 313901   average_q: 0.20310319575031152   average loss: 0.0021956366009986046
episode: 1944   score: 1.0   memory length: 314048   epsilon: 0.762355900007824   global_step: 314048   average_q: 0.1645406375125963   average loss: 0.0020462998337422017
episode: 1945   score: 2.0   memory length: 314204   epsilon: 0.7622155000078287   global_step: 314204   average_q: 0.1482442869589879   average loss: 0.0020740384809939246
episode: 1946   score: 1.0   memory length: 314353   epsilon: 0.7620814000078331   global_step: 314353   average_q: 0.16619498177662792   average loss: 0.003044707426516536
episode: 1947   score: 2.0   memory length: 314551   epsilon: 0.761903200007839   global_step: 314551   average_q: 0.1753539363242159 

episode: 1990   score: 3.0   memory length: 322248   epsilon: 0.754975900008067   global_step: 322248   average_q: 0.19764777305649547   average loss: 0.002409317220274427
episode: 1991   score: 2.0   memory length: 322436   epsilon: 0.7548067000080726   global_step: 322436   average_q: 0.19132926378478396   average loss: 0.0021329259938129134
episode: 1992   score: 1.0   memory length: 322581   epsilon: 0.7546762000080769   global_step: 322581   average_q: 0.14208499213744855   average loss: 0.002716306830778445
episode: 1993   score: 2.0   memory length: 322778   epsilon: 0.7544989000080827   global_step: 322778   average_q: 0.28925765619665234   average loss: 0.0026782048272405074
episode: 1994   score: 1.0   memory length: 322936   epsilon: 0.7543567000080874   global_step: 322936   average_q: 0.14303911373585085   average loss: 0.0022973803363267415
episode: 1995   score: 1.0   memory length: 323084   epsilon: 0.7542235000080918   global_step: 323084   average_q: 0.217267580733105

episode: 2038   score: 2.0   memory length: 330662   epsilon: 0.7474033000083163   global_step: 330662   average_q: 0.20907101166117323   average loss: 0.0029101745035354774
episode: 2039   score: 1.0   memory length: 330846   epsilon: 0.7472377000083218   global_step: 330846   average_q: 0.22082543356910997   average loss: 0.0021660750945438454
episode: 2040   score: 1.0   memory length: 331013   epsilon: 0.7470874000083267   global_step: 331013   average_q: 0.23258504271507263   average loss: 0.0023613989159604605
episode: 2041   score: 2.0   memory length: 331196   epsilon: 0.7469227000083322   global_step: 331196   average_q: 0.19543091912087196   average loss: 0.002535854988697092
episode: 2042   score: 2.0   memory length: 331404   epsilon: 0.7467355000083383   global_step: 331404   average_q: 0.2656240440331973   average loss: 0.0026593009971433135
episode: 2043   score: 5.0   memory length: 331707   epsilon: 0.7464628000083473   global_step: 331707   average_q: 0.27889010133129

episode: 2086   score: 2.0   memory length: 339344   epsilon: 0.7395895000085736   global_step: 339344   average_q: 0.1789725908055538   average loss: 0.0019626751717424203
episode: 2087   score: 4.0   memory length: 339628   epsilon: 0.739333900008582   global_step: 339628   average_q: 0.194411162654279   average loss: 0.002218134840110048
episode: 2088   score: 0.0   memory length: 339747   epsilon: 0.7392268000085855   global_step: 339747   average_q: 0.11985313391485133   average loss: 0.002070677178403648
episode: 2089   score: 3.0   memory length: 340000   epsilon: 0.738999100008593   global_step: 340000   average_q: 0.193184293305921   average loss: 0.0024499369237134592
episode: 2090   score: 3.0   memory length: 340237   epsilon: 0.7387858000086001   global_step: 340237   average_q: 0.22197978720383302   average loss: 0.002600592723992341
episode: 2091   score: 3.0   memory length: 340483   epsilon: 0.7385644000086073   global_step: 340483   average_q: 0.2581570574907753   ave

episode: 2134   score: 0.0   memory length: 348873   epsilon: 0.731013400008856   global_step: 348873   average_q: 0.10009278102618892   average loss: 0.001979887060540902
episode: 2135   score: 2.0   memory length: 349064   epsilon: 0.7308415000088616   global_step: 349064   average_q: 0.22271144015626756   average loss: 0.002180276472459447
episode: 2136   score: 2.0   memory length: 349272   epsilon: 0.7306543000088678   global_step: 349272   average_q: 0.21243193072195238   average loss: 0.0024178100331762554
episode: 2137   score: 3.0   memory length: 349458   epsilon: 0.7304869000088733   global_step: 349458   average_q: 0.1464436076661592   average loss: 0.002238024885892012
episode: 2138   score: 0.0   memory length: 349570   epsilon: 0.7303861000088766   global_step: 349570   average_q: 0.09546245874038764   average loss: 0.0022667128154612976
episode: 2139   score: 1.0   memory length: 349738   epsilon: 0.7302349000088816   global_step: 349738   average_q: 0.2176242573630242 

episode: 2182   score: 1.0   memory length: 357281   epsilon: 0.7234462000091051   global_step: 357281   average_q: 0.18664890359824812   average loss: 0.0023525092665935723
episode: 2183   score: 0.0   memory length: 357394   epsilon: 0.7233445000091084   global_step: 357394   average_q: 0.11976376597860218   average loss: 0.003092631115191781
episode: 2184   score: 3.0   memory length: 357602   epsilon: 0.7231573000091146   global_step: 357602   average_q: 0.20327231961374098   average loss: 0.0020497308300946315
episode: 2185   score: 4.0   memory length: 357885   epsilon: 0.722902600009123   global_step: 357885   average_q: 0.2146831612915538   average loss: 0.0023224261694399775
episode: 2186   score: 1.0   memory length: 358037   epsilon: 0.7227658000091275   global_step: 358037   average_q: 0.1857948956128798   average loss: 0.0025341458167531528
episode: 2187   score: 5.0   memory length: 358347   epsilon: 0.7224868000091367   global_step: 358347   average_q: 0.2088982810897212

episode: 2230   score: 0.0   memory length: 366087   epsilon: 0.715520800009366   global_step: 366087   average_q: 0.11691157467532576   average loss: 0.0015576542577228362
episode: 2231   score: 0.0   memory length: 366193   epsilon: 0.7154254000093692   global_step: 366193   average_q: 0.12017211514823842   average loss: 0.0031784578151825183
episode: 2232   score: 4.0   memory length: 366464   epsilon: 0.7151815000093772   global_step: 366464   average_q: 0.1782559183910764   average loss: 0.0023720928852710996
episode: 2233   score: 3.0   memory length: 366678   epsilon: 0.7149889000093835   global_step: 366678   average_q: 0.22945062175532369   average loss: 0.0021524616392959283
episode: 2234   score: 1.0   memory length: 366820   epsilon: 0.7148611000093877   global_step: 366820   average_q: 0.1899863748063504   average loss: 0.0027723913115064343
episode: 2235   score: 1.0   memory length: 366977   epsilon: 0.7147198000093924   global_step: 366977   average_q: 0.222949998963410

episode: 2278   score: 2.0   memory length: 375134   epsilon: 0.7073785000096341   global_step: 375134   average_q: 0.31536486446857454   average loss: 0.002266260109718132
episode: 2279   score: 3.0   memory length: 375382   epsilon: 0.7071553000096414   global_step: 375382   average_q: 0.15225188818670088   average loss: 0.002705108828899205
episode: 2280   score: 1.0   memory length: 375556   epsilon: 0.7069987000096466   global_step: 375556   average_q: 0.2078754336669527   average loss: 0.0028012763248353757
episode: 2281   score: 5.0   memory length: 375852   epsilon: 0.7067323000096554   global_step: 375852   average_q: 0.2331493523072552   average loss: 0.0020922776606115607
episode: 2282   score: 0.0   memory length: 375969   epsilon: 0.7066270000096588   global_step: 375969   average_q: 0.1279837942530966   average loss: 0.003122865562934158
episode: 2283   score: 2.0   memory length: 376166   epsilon: 0.7064497000096647   global_step: 376166   average_q: 0.21907277367441788 