# Atari

### Import libraries

In [1]:
# Debug aid: display the interpreter's module search path for this kernel.
import sys
sys.path

['',
 '/home/ricardo/anaconda2_5.1/envs/rl/lib/python27.zip',
 '/home/ricardo/anaconda2_5.1/envs/rl/lib/python2.7',
 '/home/ricardo/anaconda2_5.1/envs/rl/lib/python2.7/plat-linux2',
 '/home/ricardo/anaconda2_5.1/envs/rl/lib/python2.7/lib-tk',
 '/home/ricardo/anaconda2_5.1/envs/rl/lib/python2.7/lib-old',
 '/home/ricardo/anaconda2_5.1/envs/rl/lib/python2.7/lib-dynload',
 '/home/ricardo/anaconda2_5.1/envs/rl/lib/python2.7/site-packages',
 '/home/ricardo/anaconda2_5.1/envs/rl/lib/python2.7/site-packages/IPython/extensions',
 '/home/ricardo/.ipython']

In [12]:
%matplotlib inline

import os
import h5py
import gym
import numpy as np
import pandas as pd
import random
from time import sleep
from numpy.random import randint
from collections import deque


from keras.initializers import normal, identity

from keras.models import load_model
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import SGD , Adam, Adagrad, RMSprop
from keras.layers.normalization import BatchNormalization
from keras.layers import Input, Dense, Merge
from keras.models import Model


from tqdm import tqdm

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

### Creating the game environment

In [2]:
# Gym environment id; every environment in this notebook is created from it.
GAME = 'Boxing-ram-v0'
# Helper environment instance, used below to read the observation/action space
# sizes and to sample the random evaluation states.
aux = gym.make(GAME)


### Default parameters

In [3]:
# can be configured
# (each value below is the default DeepQNetwork uses when the matching
#  keyword argument is not passed to its constructor)
# Number of transitions used in one mini-batch update.
BATCH_SIZE = 10
# An episode is cut off once it reaches this many steps.
MAX_ITERATIONS_PER_EPISODE = 200
# Learning rate handed to the optimizer in build_model().
LEARNING_RATE = 0.00025
# Number of update_network() calls between copies of the online weights
# into the frozen (target) model.
TARGET_UPDATE_LIMIT = 1
# Initial probability of taking a random action (e-greedy policy).
EPSILON = 1.0
# Amount subtracted from epsilon at the end of every train() call.
EPSILON_UPDATE = 0.1
# Lower bound for epsilon.
MIN_EPSILON = 0.1
# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.95
# Stored on the agent, but the code that would apply it is commented out.
TERMINAL_REWARD = 0
# Number of consecutive observations stacked into one network input.
NUMBER_OF_FRAMES = 1

# cannot be configured
# Length of a single observation vector of the environment.
INPUT_SIZE = aux.observation_space.shape[0]
# Number of discrete actions (one network output per action).
OUTPUT_SIZE = aux.action_space.n
# train() starts updating the network only once the replay buffer holds
# more than this many transitions.
MIN_EXPERIENCE_REPLAY_SIZE = 1000
# The oldest transition is dropped once the replay buffer exceeds this size.
MAX_EXPERIENCE_REPLAY_SIZE = 100000
# dtype used for stored observations (in memory and in the HDF5 file).
DATA_TYPE = np.uint8

In [4]:
# np.concatenate(dqn.last_states).shape

In [5]:
# len(dqn.replay)

### Random positions for average q-value

In [6]:
# Number of random states sampled for the average q-value metric.
SIZE = 10000

# The sampled states are cached on disk so that every run of the notebook
# evaluates the average q-value on exactly the same set of states.
random_states_filename = 'random_states_boxing.h5'
if not os.path.isfile(random_states_filename):
    print('Generating random states')
    with h5py.File(random_states_filename, 'w') as h5:
        stored_states = h5.create_dataset(
            'random_states', (SIZE, NUMBER_OF_FRAMES, INPUT_SIZE), dtype=DATA_TYPE)

        # Sliding window holding the last NUMBER_OF_FRAMES observations;
        # maxlen makes the deque discard the oldest frame automatically.
        frames = deque(maxlen=NUMBER_OF_FRAMES)
        frames.append(aux.reset())
        done = False
        stored = 0
        while stored < SIZE:
            if done:
                # Episode finished: start a new one with an empty window.
                frames.clear()
                state = aux.reset()
                done = False
            else:
                state, _, done, _ = aux.step(aux.action_space.sample())

            frames.append(state)
            if len(frames) == NUMBER_OF_FRAMES:
                # Store the whole frame window (not just the latest frame) so
                # the sample is also valid when NUMBER_OF_FRAMES > 1. Rows are
                # only consumed when a full window is available, so no row is
                # left as zeros after an environment reset.
                stored_states[stored] = np.array(frames, dtype=DATA_TYPE)
                stored += 1
else:
    print('Random states already exists')

with h5py.File(random_states_filename, 'r') as h5:
    raw_states = h5['random_states'][:]

# Flatten each (NUMBER_OF_FRAMES, INPUT_SIZE) sample into one input vector and
# scale it the same way the agent scales its network inputs (division by 256).
random_states = raw_states.reshape(SIZE, NUMBER_OF_FRAMES * INPUT_SIZE) / 256.

Random states already exists


In [7]:
random_states.shape

(10000, 128)

# Main source code

In [32]:
def build_model(learning_rate):
    """Build and compile the q-value network.

    The network is a fully connected stack: two 1024-unit ReLU layers, each
    followed by batch normalization, and a linear output layer with one unit
    per action. It is compiled with mean squared error loss and plain SGD.

    Keyword arguments:
    learning_rate -- learning rate given to the SGD optimizer

    Returns the compiled Keras model.
    """
    input_width = INPUT_SIZE * NUMBER_OF_FRAMES
    layers = [
        Dense(1024, kernel_initializer='he_normal', activation='relu', input_dim=input_width),
        BatchNormalization(),
        Dense(1024, kernel_initializer='he_normal', activation='relu'),
        BatchNormalization(),
        Dense(OUTPUT_SIZE, kernel_initializer='truncated_normal'),
    ]
    network = Sequential(layers)
    network.compile(loss='mse', optimizer=SGD(lr=learning_rate))
    return network

class DeepQNetwork:
    """Deep Q-learning agent with experience replay and a frozen target model.

    The agent owns its own gym environment, a sliding window with the last
    NUMBER_OF_FRAMES observations (the network input), a replay buffer of
    (previous_frames, action, reward, current_frames, done) transitions, an
    online model that is trained and a frozen model used for the targets.
    """

    def __init__(self, **kwargs):
        """Create the agent.

        Keyword arguments (all optional, defaults are the module constants):
        batch_size, min_experience_replay_size, learning_rate, epsilon,
        gamma, target_update_limit, max_iterations_per_episode, min_epsilon,
        terminal_reward, epsilon_update.

        Unknown keyword arguments are ignored.
        """
        self.batch_size = kwargs.get('batch_size', BATCH_SIZE)
        self.minimum_experience_replay_size = kwargs.get(
            'min_experience_replay_size', MIN_EXPERIENCE_REPLAY_SIZE)
        self.learning_rate = kwargs.get('learning_rate', LEARNING_RATE)
        self.epsilon = kwargs.get('epsilon', EPSILON)
        self.gamma = kwargs.get('gamma', GAMMA)
        self.target_update_limit = kwargs.get('target_update_limit', TARGET_UPDATE_LIMIT)
        self.max_iterations_per_episode = kwargs.get(
            'max_iterations_per_episode', MAX_ITERATIONS_PER_EPISODE)
        self.min_epsilon = kwargs.get('min_epsilon', MIN_EPSILON)
        self.terminal_reward = kwargs.get('terminal_reward', TERMINAL_REWARD)
        self.epsilon_update = kwargs.get('epsilon_update', EPSILON_UPDATE)

        self.last_action = None
        # Sliding window with the most recent observations.
        self.last_states = deque()
        self.total_last_states = 0

        # Experience replay buffer and its current size.
        self.replay = deque()
        self.total_replay = 0

        self.env = gym.make(GAME)

        self.reward_sum = 0
        self.total_episodes = 0

        self.episode_iterations = 0
        # Number of updates since the frozen model was last synchronized.
        self.target_update = 0

        self.model = build_model(self.learning_rate)
        self.frozen_model = build_model(self.learning_rate)

    def _flatten_frames(self, frames):
        """Return the frames of a window concatenated into one flat vector."""
        return np.array(list(frames)).reshape(NUMBER_OF_FRAMES * INPUT_SIZE)

    def choose_best_action(self):
        """Return the action a that maximizes q(self.last_states, a)

        Returns a tuple (action, q-value of that action).
        """
        s = self._flatten_frames(self.last_states).reshape(1, NUMBER_OF_FRAMES * INPUT_SIZE)
        q = self.model.predict(s / 256.)[0]
        action = np.argmax(q)
        return action, q[action]

    def choose_random_action(self):
        """Return an action sampled uniformly from the action space."""
        return self.env.action_space.sample()

    def choose_e_greedy_action(self):
        """Return an action chosen following the e-greedy policy

        A random action is also used while the frame window is not full yet,
        because the network cannot be evaluated on a partial input.
        """
        if random.random() <= self.epsilon or self.total_last_states < NUMBER_OF_FRAMES:
            return self.choose_random_action()
        action, _ = self.choose_best_action()
        return action

    def execute_action(self, action):
        """Return the reward for executing the action and a boolean
           indicating if the new state is terminal.

           The transition is stored in the replay buffer as a side effect.
           The returned boolean is also True when the episode was cut off
           because it reached max_iterations_per_episode; the stored
           transition keeps the flag reported by the environment.
        """
        state, original_reward, done, _ = self.env.step(action)
        reward = original_reward
        self.reward_sum += reward

        # NOTE: overriding the reward of terminal states with
        # self.terminal_reward is currently disabled.

        # add_replay() pushes `state` onto the frame window. It must not be
        # pushed again here, otherwise every observation would appear twice
        # in the window when NUMBER_OF_FRAMES > 1.
        self.add_replay(action, reward, state, done)

        self.episode_iterations += 1
        if done or self.episode_iterations >= self.max_iterations_per_episode:
            self.episode_iterations = 0
            done = True
            self.total_episodes += 1
            self.reset_states()

        return original_reward, done

    def add_replay(self, action, reward, state, done):
        """Push `state` onto the frame window and record the transition.

        A transition is only stored when the window was already full before
        the new state arrived, i.e. when a complete previous input exists.
        The oldest transition is dropped once the buffer exceeds
        MAX_EXPERIENCE_REPLAY_SIZE.
        """
        previous_state = None
        if self.total_last_states == NUMBER_OF_FRAMES:
            previous_state = self.copy_last_states()

        self.add_state(state)

        if previous_state is not None:
            current_state = self.copy_last_states()
            self.replay.append((previous_state, action, reward, current_state, done))
            self.total_replay += 1
            if self.total_replay > MAX_EXPERIENCE_REPLAY_SIZE:
                self.replay.popleft()
                self.total_replay -= 1

    def copy_last_states(self):
        """Return a list with a copy of every frame in the current window."""
        return [np.copy(s) for s in self.last_states]

    def add_state(self, state):
        """Append `state` to the frame window, dropping the oldest frame
        when the window would exceed NUMBER_OF_FRAMES."""
        self.last_states.append(np.array(state, dtype=DATA_TYPE))
        self.total_last_states += 1
        if self.total_last_states > NUMBER_OF_FRAMES:
            self.last_states.popleft()
            self.total_last_states -= 1

    def reset_states(self):
        """Reset the environment and restart the frame window with its
        initial observation."""
        self.last_states.clear()
        self.last_states.append(np.array(self.env.reset(), dtype=DATA_TYPE))
        self.total_last_states = 1

    def run_test_average_reward(self, total_episodes, render):
        """Run the environment without training.

           Keyword arguments:
           total_episodes -- number of times the environment
               will be run.
           render -- boolean indicating if the screen must be
               rendered

           Returns the reward averaged over all executed steps (not over
           episodes); 0.0 when no step was executed.

           Note: the steps go through execute_action(), so the visited
           transitions are also added to the replay buffer.
        """
        reward_sum = 0
        total_iterations = 0

        for _ in range(0, total_episodes):
            self.episode_iterations = 0
            self.reset_states()
            done = False
            reward = 0

            while not done:
                if render:
                    sleep(0.03)
                    self.env.render()

                if self.total_last_states < NUMBER_OF_FRAMES:
                    action = self.choose_random_action()
                else:
                    action, _ = self.choose_best_action()
                total_iterations += 1

                r, done = self.execute_action(action)
                reward += r
            reward_sum += reward
            if render:
                sleep(2.0)

        if total_iterations == 0:
            # No episode was requested: avoid a division by zero.
            return 0.0
        return reward_sum / float(total_iterations)

    def run_test_average_qvalue(self):
        """Calculate the average max q-value for the random_states"""
        y = self.model.predict(random_states)
        return np.average(np.amax(y, axis=1))

    def update_network(self):
        """Execute a mini-batch update

        The batch is made of batch_size - 1 transitions sampled from the
        replay buffer plus the most recent transition.
        """
        batch = random.sample(self.replay, self.batch_size - 1)
        batch.append(self.replay[-1])

        flat_size = NUMBER_OF_FRAMES * INPUT_SIZE
        X_last = np.zeros((self.batch_size, flat_size), dtype=DATA_TYPE)
        X_current = np.zeros((self.batch_size, flat_size), dtype=DATA_TYPE)
        for i, (previous, _, _, current, _) in enumerate(batch):
            X_last[i] = self._flatten_frames(previous)
            X_current[i] = self._flatten_frames(current)

        inputs = X_last / 256.
        # Start from the current predictions so that only the output of the
        # action actually taken contributes to the loss.
        y = self.model.predict(inputs)

        self.target_update += 1
        if self.target_update >= self.target_update_limit:
            self.target_update = 0
            self.frozen_model.set_weights(self.model.get_weights())

        q_theta = self.frozen_model.predict(X_current / 256.)

        for i, (_, action, reward, _, terminal) in enumerate(batch):
            if terminal:
                y[i][action] = reward
            else:
                y[i][action] = reward + self.gamma * np.max(q_theta[i])

        self.model.train_on_batch(inputs, y)

    def train(self, total_frames, render):
        """Run the neural network training

           Keyword arguments:
           total_frames -- number of times the training process will be executed
           render -- if the screen should be rendered

           Environment steps taken while the replay buffer is still being
           filled do not count towards total_frames. Epsilon is decreased
           once, at the end of the call.
        """
        self.reset_states()

        training_iterations = 0

        self.episode_iterations = 0
        while training_iterations < total_frames:
            if render:
                self.env.render()

            action = self.choose_e_greedy_action()
            self.execute_action(action)

            # Use the per-instance threshold so that the
            # `min_experience_replay_size` constructor argument is honored.
            if self.total_replay > self.minimum_experience_replay_size:
                training_iterations += 1
                self.update_network()

        # update epsilon
        self.epsilon = max(self.min_epsilon, self.epsilon - self.epsilon_update)

                
class Experiment:
    """Runs train/test epochs for an agent and keeps the per-epoch metrics.

    Attributes:
    data -- list of (epoch, average_reward, average_qvalue) tuples
    best_score, best_qvalue, best_filename -- metrics and weights file of
        the best epoch of the last execute() call (None before the first
        epoch finishes)
    """

    def __init__(self):
        self.data = []
        # Initialized here as well so they can be read before execute() runs.
        self.best_score = None
        self.best_qvalue = None
        self.best_filename = None

    def add_data(self, epoch, average_reward, average_qvalue):
        """Print the metrics of one epoch and append them to self.data."""
        t = (epoch, average_reward, average_qvalue)
        print('epoch: {} avg_reward: {} avg_qvalue: {}'.format(epoch, average_reward, average_qvalue))
        self.data.append(t)

    def get_dataframe(self):
        """Return the collected metrics as a DataFrame with the columns
        epoch, avg_reward and avg_qvalue."""
        df = pd.DataFrame(self.data, columns=['epoch', 'avg_reward', 'avg_qvalue'])
        return df

    def execute(self, epochs, dqn, model_name, train_frames=2000, test_episodes=100,
                output_dir='/tmp'):
        """Train and evaluate the agent for a number of epochs.

        Every epoch trains the agent, measures the average reward and the
        average q-value, records them and saves the model weights to
        '<output_dir>/<model_name>.<epoch, 3 digits>.h5'.

        The best epoch is the one with the highest average reward; ties are
        won by the later epoch when its average q-value is not lower.

        Keyword arguments:
        epochs -- number of train/test rounds
        dqn -- agent providing train(), run_test_average_reward(),
            run_test_average_qvalue() and a `model` with save_weights()
        model_name -- prefix of the saved weights files
        train_frames -- network updates per epoch (default 2000)
        test_episodes -- evaluation episodes per epoch (default 100)
        output_dir -- directory of the weights files (default '/tmp')
        """
        self.best_score = None
        self.best_qvalue = None
        self.best_filename = None

        for i in tqdm(range(epochs)):
            epoch = i + 1
            filename = '%s/%s.%03d.h5' % (output_dir, model_name, epoch)

            # train
            dqn.train(train_frames, False)

            # test
            avg_reward = dqn.run_test_average_reward(test_episodes, False)
            avg_qvalue = dqn.run_test_average_qvalue()
            if self.best_score is None or avg_reward > self.best_score or \
                    (avg_reward == self.best_score and avg_qvalue >= self.best_qvalue):
                self.best_score = avg_reward
                self.best_qvalue = avg_qvalue
                self.best_filename = filename

            self.add_data(epoch, avg_reward, avg_qvalue)

            # save model
            dqn.model.save_weights(filename)

            
def generate_graphic(filename, adjusted_qvalue, qvalue_range=(-100, 150), reward_range=(0, 250),
                     results=None):
    """Plot average reward and average q-value per epoch and save the figure.

    The reward uses the left y axis and the q-value (shifted by
    `adjusted_qvalue`) uses a second y axis on the right.

    Keyword arguments:
    filename -- path of the image file to write
    adjusted_qvalue -- constant added to every average q-value before plotting
    qvalue_range -- (min, max) limits of the q-value axis, None for automatic
    reward_range -- (min, max) limits of the reward axis, None for automatic
    results -- DataFrame with the columns epoch, avg_reward and avg_qvalue;
        when omitted, the data of the global `experiment` object is used
    """
    sns.set_context("paper")
    sns.set_style('dark')

    # Fall back to the notebook-level `experiment` to stay compatible with
    # callers that do not pass the data explicitly.
    df = experiment.get_dataframe() if results is None else results

    fig = plt.figure()
    ax1 = fig.add_subplot(111)
    # The labels are required for the legend() calls below to show anything.
    ax1.plot(df['epoch'], df['avg_reward'], label='Average reward')
    if reward_range is not None:
        ax1.set_ylim(reward_range)
    ax1.set_xlabel('Number of epochs')
    ax1.set_ylabel('Average reward')

    ax1.get_xaxis().set_minor_locator(mpl.ticker.AutoMinorLocator())
    ax1.get_yaxis().set_minor_locator(mpl.ticker.AutoMinorLocator())
    # The visibility flag is passed positionally because its keyword name
    # differs between matplotlib versions.
    ax1.grid(True, which='major', color='w', linewidth=1.5)
    ax1.grid(True, which='minor', color='w', linewidth=0.5)

    ax2 = ax1.twinx()
    ax2.plot(df['epoch'], df['avg_qvalue'] + adjusted_qvalue, color='r', label='Average q-value')
    ax2.set_ylabel('Average q-value')
    if qvalue_range is not None:
        ax2.set_ylim(qvalue_range)

    ax1.legend(loc=2)
    ax2.legend(loc=0)

    fig.savefig(filename)

# Experiments

The running time for most training runs is about 4 to 10 minutes, except when the maximum number of iterations per episode is greater than the default value (200).

### Best model - restricted
- target update 200
- gamma 0.99
- reward at end -100
- learning rate 0.0001

epoch, average_reward, average_qvalue

In [28]:
random_states.shape

(10000, 128)

In [29]:
label = 'best_restricted'

# Hyper-parameters of this experiment; anything not listed here falls back to
# the defaults of the "Default parameters" cell.
config = {
    'epsilon': 0.5,
    'gamma': 0.9,
    'target_update_limit': 100,
    'learning_rate': 0.001,
    'batch_size': 50,
    # 'max_iterations_per_episode': 500,
}

# Create the figure directory up front: discovering that it is missing only
# when the plot is saved would waste the whole (long) training run.
figure_path = 'report/images/{}.pdf'.format(label)
figure_dir = os.path.dirname(figure_path)
if not os.path.isdir(figure_dir):
    os.makedirs(figure_dir)

dqn = DeepQNetwork(**config)

experiment = Experiment()
experiment.execute(500, dqn, label)
print('Best iteration')
print('score: {} | qvalue: {} | model: {}'.format(
    experiment.best_score, experiment.best_qvalue, experiment.best_filename))
generate_graphic(figure_path, 0)






  0%|          | 0/500 [00:00<?, ?it/s][A[A[A[A[A




  0%|          | 1/500 [01:38<13:37:06, 98.25s/it][A[A[A[A[A

epoch: 1 avg_reward: -0.004 avg_qvalue: 1.5465708971







  0%|          | 2/500 [03:06<12:54:25, 93.30s/it][A[A[A[A[A

epoch: 2 avg_reward: 0.0042 avg_qvalue: 1.43246245384







  1%|          | 3/500 [04:33<12:36:16, 91.30s/it][A[A[A[A[A

epoch: 3 avg_reward: 0.00305 avg_qvalue: 1.36249279976







  1%|          | 4/500 [06:05<12:35:38, 91.41s/it][A[A[A[A[A

epoch: 4 avg_reward: 0.0019 avg_qvalue: 1.31984615326







  1%|          | 5/500 [07:33<12:28:03, 90.67s/it][A[A[A[A[A

epoch: 5 avg_reward: 0.00285 avg_qvalue: 1.28795802593







  1%|          | 6/500 [08:59<12:20:46, 89.97s/it][A[A[A[A[A

epoch: 6 avg_reward: -0.0018 avg_qvalue: 1.26082706451







  1%|▏         | 7/500 [10:27<12:16:04, 89.58s/it][A[A[A[A[A

epoch: 7 avg_reward: 0.0032 avg_qvalue: 1.22907483578







  2%|▏         | 8/500 [11:53<12:11:43, 89.23s/it][A[A[A[A[A

epoch: 8 avg_reward: 5e-05 avg_qvalue: 1.20551133156







  2%|▏         | 9/500 [13:20<12:07:39, 88.92s/it][A[A[A[A[A

epoch: 9 avg_reward: 0.0002 avg_qvalue: 1.16938686371







  2%|▏         | 10/500 [14:46<12:04:13, 88.68s/it][A[A[A[A[A

epoch: 10 avg_reward: -0.00245 avg_qvalue: 1.14169955254







  2%|▏         | 11/500 [16:13<12:01:14, 88.50s/it][A[A[A[A[A

epoch: 11 avg_reward: -0.00225 avg_qvalue: 1.11212658882







  2%|▏         | 12/500 [17:50<12:05:51, 89.24s/it][A[A[A[A[A

epoch: 12 avg_reward: 0.0016 avg_qvalue: 1.09531068802







  3%|▎         | 13/500 [19:20<12:04:51, 89.30s/it][A[A[A[A[A

epoch: 13 avg_reward: 0.0 avg_qvalue: 1.0727789402







  3%|▎         | 14/500 [20:49<12:03:01, 89.26s/it][A[A[A[A[A

epoch: 14 avg_reward: 0.0002 avg_qvalue: 1.05212605







  3%|▎         | 15/500 [22:17<12:00:59, 89.19s/it][A[A[A[A[A

epoch: 15 avg_reward: 0.00095 avg_qvalue: 1.02817797661







  3%|▎         | 16/500 [23:44<11:57:56, 89.00s/it][A[A[A[A[A

epoch: 16 avg_reward: 0.00215 avg_qvalue: 1.0209993124







  3%|▎         | 17/500 [25:09<11:54:43, 88.79s/it][A[A[A[A[A

epoch: 17 avg_reward: 0.0024 avg_qvalue: 0.995613873005







  4%|▎         | 18/500 [26:35<11:52:00, 88.63s/it][A[A[A[A[A

epoch: 18 avg_reward: 0.0012 avg_qvalue: 0.986033022404







  4%|▍         | 19/500 [28:02<11:49:52, 88.55s/it][A[A[A[A[A

epoch: 19 avg_reward: 0.0009 avg_qvalue: 0.968136012554







  4%|▍         | 20/500 [29:30<11:48:03, 88.51s/it][A[A[A[A[A

epoch: 20 avg_reward: 0.00085 avg_qvalue: 0.95838946104







  4%|▍         | 21/500 [30:55<11:45:27, 88.37s/it][A[A[A[A[A

epoch: 21 avg_reward: 0.0022 avg_qvalue: 0.947191774845







  4%|▍         | 22/500 [32:27<11:45:24, 88.54s/it][A[A[A[A[A

epoch: 22 avg_reward: 0.00285 avg_qvalue: 0.939319849014







  5%|▍         | 23/500 [33:54<11:43:12, 88.45s/it][A[A[A[A[A

epoch: 23 avg_reward: 0.0044 avg_qvalue: 0.931153893471







  5%|▍         | 24/500 [35:20<11:41:05, 88.37s/it][A[A[A[A[A

epoch: 24 avg_reward: 0.0027 avg_qvalue: 0.919090032578







  5%|▌         | 25/500 [36:47<11:38:58, 88.29s/it][A[A[A[A[A

epoch: 25 avg_reward: -0.00055 avg_qvalue: 0.906021952629







  5%|▌         | 26/500 [38:14<11:37:03, 88.24s/it][A[A[A[A[A

epoch: 26 avg_reward: 0.0014 avg_qvalue: 0.89236843586







  5%|▌         | 27/500 [39:40<11:35:05, 88.17s/it][A[A[A[A[A

epoch: 27 avg_reward: 0.00225 avg_qvalue: 0.887768268585







  6%|▌         | 28/500 [41:06<11:32:57, 88.09s/it][A[A[A[A[A

epoch: 28 avg_reward: 0.00155 avg_qvalue: 0.876259207726







  6%|▌         | 29/500 [42:32<11:30:56, 88.02s/it][A[A[A[A[A

epoch: 29 avg_reward: 0.00195 avg_qvalue: 0.865933597088







  6%|▌         | 30/500 [43:58<11:28:53, 87.94s/it][A[A[A[A[A

epoch: 30 avg_reward: 0.0022 avg_qvalue: 0.858985364437







  6%|▌         | 31/500 [45:24<11:27:05, 87.90s/it][A[A[A[A[A

epoch: 31 avg_reward: 0.0028 avg_qvalue: 0.853391289711







  6%|▋         | 32/500 [46:50<11:25:03, 87.83s/it][A[A[A[A[A

epoch: 32 avg_reward: 0.00155 avg_qvalue: 0.84537011385







  7%|▋         | 33/500 [48:16<11:23:06, 87.77s/it][A[A[A[A[A

epoch: 33 avg_reward: 0.0039 avg_qvalue: 0.837652146816







  7%|▋         | 34/500 [49:42<11:21:13, 87.71s/it][A[A[A[A[A

epoch: 34 avg_reward: -0.00075 avg_qvalue: 0.83429479599







  7%|▋         | 35/500 [51:08<11:19:21, 87.66s/it][A[A[A[A[A

epoch: 35 avg_reward: 0.0011 avg_qvalue: 0.827459096909







  7%|▋         | 36/500 [52:33<11:17:29, 87.61s/it][A[A[A[A[A

epoch: 36 avg_reward: 0.0044 avg_qvalue: 0.821823835373







  7%|▋         | 37/500 [53:59<11:15:42, 87.57s/it][A[A[A[A[A

epoch: 37 avg_reward: 0.00235 avg_qvalue: 0.812978744507







  8%|▊         | 38/500 [55:26<11:14:05, 87.54s/it][A[A[A[A[A

epoch: 38 avg_reward: 0.0028 avg_qvalue: 0.805828511715







  8%|▊         | 39/500 [57:00<11:13:51, 87.70s/it][A[A[A[A[A

epoch: 39 avg_reward: 0.00295 avg_qvalue: 0.802082121372







  8%|▊         | 40/500 [58:47<11:16:08, 88.19s/it][A[A[A[A[A

epoch: 40 avg_reward: 0.0021 avg_qvalue: 0.794429421425







  8%|▊         | 41/500 [1:00:24<11:16:17, 88.40s/it][A[A[A[A[A

epoch: 41 avg_reward: 0.0012 avg_qvalue: 0.792669057846







  8%|▊         | 42/500 [1:02:01<11:16:19, 88.60s/it][A[A[A[A[A

epoch: 42 avg_reward: 0.00355 avg_qvalue: 0.77863150835







  9%|▊         | 43/500 [1:03:31<11:15:08, 88.64s/it][A[A[A[A[A

epoch: 43 avg_reward: 0.0033 avg_qvalue: 0.782872617245







  9%|▉         | 44/500 [1:05:17<11:16:35, 89.02s/it][A[A[A[A[A

epoch: 44 avg_reward: 0.0057 avg_qvalue: 0.763475179672







  9%|▉         | 45/500 [1:06:55<11:16:37, 89.23s/it][A[A[A[A[A

epoch: 45 avg_reward: 0.00315 avg_qvalue: 0.758453845978







  9%|▉         | 46/500 [1:08:39<11:17:32, 89.54s/it][A[A[A[A[A

epoch: 46 avg_reward: 0.00185 avg_qvalue: 0.761116623878







  9%|▉         | 47/500 [1:10:15<11:17:09, 89.69s/it][A[A[A[A[A

epoch: 47 avg_reward: 0.00075 avg_qvalue: 0.751265883446







 10%|▉         | 48/500 [1:11:41<11:15:07, 89.62s/it][A[A[A[A[A

epoch: 48 avg_reward: 0.00395 avg_qvalue: 0.745303988457







 10%|▉         | 49/500 [1:13:07<11:13:03, 89.54s/it][A[A[A[A[A

epoch: 49 avg_reward: 0.00525 avg_qvalue: 0.744922637939







 10%|█         | 50/500 [1:14:33<11:11:05, 89.48s/it][A[A[A[A[A

epoch: 50 avg_reward: 0.00545 avg_qvalue: 0.734149515629







 10%|█         | 51/500 [1:16:00<11:09:08, 89.42s/it][A[A[A[A[A

epoch: 51 avg_reward: 0.0037 avg_qvalue: 0.727303922176







 10%|█         | 52/500 [1:17:26<11:07:12, 89.36s/it][A[A[A[A[A

epoch: 52 avg_reward: 0.00215 avg_qvalue: 0.721979796886







 11%|█         | 53/500 [1:18:51<11:05:07, 89.28s/it][A[A[A[A[A

epoch: 53 avg_reward: 0.00525 avg_qvalue: 0.730162620544







 11%|█         | 54/500 [1:20:17<11:03:07, 89.21s/it][A[A[A[A[A

epoch: 54 avg_reward: 0.00225 avg_qvalue: 0.721188187599







 11%|█         | 55/500 [1:21:42<11:01:06, 89.14s/it][A[A[A[A[A

epoch: 55 avg_reward: 0.00645 avg_qvalue: 0.716693282127







 11%|█         | 56/500 [1:23:07<10:59:00, 89.06s/it][A[A[A[A[A

epoch: 56 avg_reward: 0.0022 avg_qvalue: 0.710305273533







 11%|█▏        | 57/500 [1:24:32<10:56:59, 88.98s/it][A[A[A[A[A

epoch: 57 avg_reward: 0.00405 avg_qvalue: 0.70346981287







 12%|█▏        | 58/500 [1:25:56<10:54:59, 88.91s/it][A[A[A[A[A

epoch: 58 avg_reward: 0.00295 avg_qvalue: 0.699046075344







 12%|█▏        | 59/500 [1:27:21<10:52:59, 88.84s/it][A[A[A[A[A

epoch: 59 avg_reward: 0.0061 avg_qvalue: 0.70050573349







 12%|█▏        | 60/500 [1:28:45<10:50:55, 88.76s/it][A[A[A[A[A

epoch: 60 avg_reward: 0.0036 avg_qvalue: 0.692841231823







 12%|█▏        | 61/500 [1:30:10<10:48:57, 88.70s/it][A[A[A[A[A

epoch: 61 avg_reward: 0.00315 avg_qvalue: 0.692662477493







 12%|█▏        | 62/500 [1:31:35<10:47:02, 88.64s/it][A[A[A[A[A

epoch: 62 avg_reward: 0.0042 avg_qvalue: 0.688013851643







 13%|█▎        | 63/500 [1:33:00<10:45:07, 88.57s/it][A[A[A[A[A

epoch: 63 avg_reward: 0.0017 avg_qvalue: 0.682074964046







 13%|█▎        | 64/500 [1:34:24<10:43:11, 88.51s/it][A[A[A[A[A

epoch: 64 avg_reward: 0.00255 avg_qvalue: 0.681843936443







 13%|█▎        | 65/500 [1:35:49<10:41:17, 88.45s/it][A[A[A[A[A

epoch: 65 avg_reward: 0.00245 avg_qvalue: 0.677308320999







 13%|█▎        | 66/500 [1:37:13<10:39:22, 88.39s/it][A[A[A[A[A

epoch: 66 avg_reward: 0.0027 avg_qvalue: 0.673467457294







 13%|█▎        | 67/500 [1:38:38<10:37:29, 88.34s/it][A[A[A[A[A

epoch: 67 avg_reward: 0.0026 avg_qvalue: 0.669336259365







 14%|█▎        | 68/500 [1:40:03<10:35:38, 88.28s/it][A[A[A[A[A

epoch: 68 avg_reward: 0.0017 avg_qvalue: 0.669997930527







 14%|█▍        | 69/500 [1:41:28<10:33:50, 88.24s/it][A[A[A[A[A

epoch: 69 avg_reward: 0.00235 avg_qvalue: 0.662499785423







 14%|█▍        | 70/500 [1:42:52<10:31:58, 88.18s/it][A[A[A[A[A

epoch: 70 avg_reward: 0.004 avg_qvalue: 0.662498950958







 14%|█▍        | 71/500 [1:44:17<10:30:10, 88.14s/it][A[A[A[A[A

epoch: 71 avg_reward: -0.00075 avg_qvalue: 0.66288203001







 14%|█▍        | 72/500 [1:45:42<10:28:21, 88.09s/it][A[A[A[A[A

epoch: 72 avg_reward: 0.0021 avg_qvalue: 0.658409118652







 15%|█▍        | 73/500 [1:47:06<10:26:31, 88.04s/it][A[A[A[A[A

epoch: 73 avg_reward: 0.0021 avg_qvalue: 0.65212726593







 15%|█▍        | 74/500 [1:48:31<10:24:47, 88.00s/it][A[A[A[A[A

epoch: 74 avg_reward: 0.00095 avg_qvalue: 0.649167656898







 15%|█▌        | 75/500 [1:49:58<10:23:11, 87.98s/it][A[A[A[A[A

epoch: 75 avg_reward: 0.00065 avg_qvalue: 0.650261640549







 15%|█▌        | 76/500 [1:51:23<10:21:29, 87.95s/it][A[A[A[A[A

epoch: 76 avg_reward: 0.00295 avg_qvalue: 0.646754562855







 15%|█▌        | 77/500 [1:52:48<10:19:43, 87.90s/it][A[A[A[A[A

epoch: 77 avg_reward: 0.00045 avg_qvalue: 0.637589812279







 16%|█▌        | 78/500 [1:54:14<10:18:02, 87.87s/it][A[A[A[A[A

epoch: 78 avg_reward: 0.00375 avg_qvalue: 0.639348804951







 16%|█▌        | 79/500 [1:55:39<10:16:21, 87.84s/it][A[A[A[A[A

epoch: 79 avg_reward: 0.00235 avg_qvalue: 0.641101658344







 16%|█▌        | 80/500 [1:57:04<10:14:36, 87.80s/it][A[A[A[A[A

epoch: 80 avg_reward: 0.002 avg_qvalue: 0.636457920074







 16%|█▌        | 81/500 [1:58:28<10:12:53, 87.76s/it][A[A[A[A[A

epoch: 81 avg_reward: 0.00315 avg_qvalue: 0.632496654987







 16%|█▋        | 82/500 [1:59:53<10:11:10, 87.73s/it][A[A[A[A[A

epoch: 82 avg_reward: -0.0001 avg_qvalue: 0.628294527531







 17%|█▋        | 83/500 [2:01:19<10:09:31, 87.70s/it][A[A[A[A[A

epoch: 83 avg_reward: 0.00105 avg_qvalue: 0.623145639896







 17%|█▋        | 84/500 [2:02:45<10:07:58, 87.69s/it][A[A[A[A[A

epoch: 84 avg_reward: 0.00125 avg_qvalue: 0.618650972843







 17%|█▋        | 85/500 [2:04:12<10:06:24, 87.67s/it][A[A[A[A[A

epoch: 85 avg_reward: 0.00115 avg_qvalue: 0.615324020386







 17%|█▋        | 86/500 [2:05:39<10:04:54, 87.67s/it][A[A[A[A[A

epoch: 86 avg_reward: 0.00015 avg_qvalue: 0.614167869091







 17%|█▋        | 87/500 [2:07:06<10:03:22, 87.66s/it][A[A[A[A[A

epoch: 87 avg_reward: -0.00065 avg_qvalue: 0.609667241573







 18%|█▊        | 88/500 [2:08:33<10:01:55, 87.66s/it][A[A[A[A[A

epoch: 88 avg_reward: 0.00195 avg_qvalue: 0.609617471695







 18%|█▊        | 89/500 [2:10:02<10:00:31, 87.67s/it][A[A[A[A[A

epoch: 89 avg_reward: 0.001 avg_qvalue: 0.607652842999







 18%|█▊        | 90/500 [2:11:29<9:59:01, 87.66s/it] [A[A[A[A[A

epoch: 90 avg_reward: 0.0045 avg_qvalue: 0.605978012085







 18%|█▊        | 91/500 [2:12:56<9:57:30, 87.65s/it][A[A[A[A[A

epoch: 91 avg_reward: 0.00295 avg_qvalue: 0.599923849106







 18%|█▊        | 92/500 [2:14:23<9:56:00, 87.65s/it][A[A[A[A[A

epoch: 92 avg_reward: 0.00235 avg_qvalue: 0.599678397179







 19%|█▊        | 93/500 [2:15:50<9:54:27, 87.64s/it][A[A[A[A[A

epoch: 93 avg_reward: 0.00045 avg_qvalue: 0.596203863621







 19%|█▉        | 94/500 [2:17:17<9:52:57, 87.63s/it][A[A[A[A[A

epoch: 94 avg_reward: 0.00235 avg_qvalue: 0.592735290527







 19%|█▉        | 95/500 [2:18:41<9:51:16, 87.60s/it][A[A[A[A[A

epoch: 95 avg_reward: 0.00225 avg_qvalue: 0.588505983353







 19%|█▉        | 96/500 [2:20:06<9:49:37, 87.57s/it][A[A[A[A[A

epoch: 96 avg_reward: 0.0054 avg_qvalue: 0.586819648743







 19%|█▉        | 97/500 [2:21:31<9:47:59, 87.54s/it][A[A[A[A[A

epoch: 97 avg_reward: 0.0007 avg_qvalue: 0.590950906277







 20%|█▉        | 98/500 [2:22:56<9:46:19, 87.51s/it][A[A[A[A[A

epoch: 98 avg_reward: 0.0024 avg_qvalue: 0.587283611298







 20%|█▉        | 99/500 [2:24:21<9:44:41, 87.49s/it][A[A[A[A[A

epoch: 99 avg_reward: 0.0049 avg_qvalue: 0.580280840397







 20%|██        | 100/500 [2:25:46<9:43:04, 87.46s/it][A[A[A[A[A

epoch: 100 avg_reward: 0.0013 avg_qvalue: 0.583826303482







 20%|██        | 101/500 [2:27:10<9:41:24, 87.43s/it][A[A[A[A[A

epoch: 101 avg_reward: 0.0023 avg_qvalue: 0.574902534485







 20%|██        | 102/500 [2:28:35<9:39:47, 87.41s/it][A[A[A[A[A

epoch: 102 avg_reward: 0.0022 avg_qvalue: 0.574777126312







 21%|██        | 103/500 [2:30:00<9:38:10, 87.38s/it][A[A[A[A[A

epoch: 103 avg_reward: 0.0052 avg_qvalue: 0.572113990784







 21%|██        | 104/500 [2:31:25<9:36:33, 87.36s/it][A[A[A[A[A

epoch: 104 avg_reward: 0.00195 avg_qvalue: 0.567796885967







 21%|██        | 105/500 [2:32:50<9:34:59, 87.34s/it][A[A[A[A[A

epoch: 105 avg_reward: 0.00335 avg_qvalue: 0.564960420132







 21%|██        | 106/500 [2:34:15<9:33:22, 87.32s/it][A[A[A[A[A

epoch: 106 avg_reward: 0.0026 avg_qvalue: 0.567353308201







 21%|██▏       | 107/500 [2:35:40<9:31:46, 87.29s/it][A[A[A[A[A

epoch: 107 avg_reward: 0.00085 avg_qvalue: 0.567223608494







 22%|██▏       | 108/500 [2:37:04<9:30:08, 87.27s/it][A[A[A[A[A

epoch: 108 avg_reward: 0.00305 avg_qvalue: 0.564447879791







 22%|██▏       | 109/500 [2:38:29<9:28:33, 87.25s/it][A[A[A[A[A

epoch: 109 avg_reward: -0.001 avg_qvalue: 0.562723517418







 22%|██▏       | 110/500 [2:39:55<9:26:59, 87.23s/it][A[A[A[A[A

epoch: 110 avg_reward: 0.0042 avg_qvalue: 0.558654487133







 22%|██▏       | 111/500 [2:41:20<9:25:25, 87.21s/it][A[A[A[A[A

epoch: 111 avg_reward: 0.00235 avg_qvalue: 0.557373344898







 22%|██▏       | 112/500 [2:42:45<9:23:49, 87.19s/it][A[A[A[A[A

epoch: 112 avg_reward: 0.00335 avg_qvalue: 0.554072439671







 23%|██▎       | 113/500 [2:44:10<9:22:15, 87.17s/it][A[A[A[A[A

epoch: 113 avg_reward: 0.0032 avg_qvalue: 0.554578006268







 23%|██▎       | 114/500 [2:45:35<9:20:40, 87.15s/it][A[A[A[A[A

epoch: 114 avg_reward: 0.0018 avg_qvalue: 0.554960131645







 23%|██▎       | 115/500 [2:47:00<9:19:05, 87.13s/it][A[A[A[A[A

epoch: 115 avg_reward: 0.0058 avg_qvalue: 0.551482498646







 23%|██▎       | 116/500 [2:48:27<9:17:38, 87.13s/it][A[A[A[A[A

epoch: 116 avg_reward: 0.00225 avg_qvalue: 0.544745385647







 23%|██▎       | 117/500 [2:49:53<9:16:09, 87.13s/it][A[A[A[A[A

epoch: 117 avg_reward: 0.0027 avg_qvalue: 0.547619640827







 24%|██▎       | 118/500 [2:51:21<9:14:43, 87.13s/it][A[A[A[A[A

epoch: 118 avg_reward: 0.00455 avg_qvalue: 0.545391976833







 24%|██▍       | 119/500 [2:52:48<9:13:15, 87.13s/it][A[A[A[A[A

epoch: 119 avg_reward: 0.00405 avg_qvalue: 0.538779258728







 24%|██▍       | 120/500 [2:54:15<9:11:48, 87.13s/it][A[A[A[A[A

epoch: 120 avg_reward: 0.0053 avg_qvalue: 0.533820450306







 24%|██▍       | 121/500 [2:55:40<9:10:15, 87.11s/it][A[A[A[A[A

epoch: 121 avg_reward: 0.0062 avg_qvalue: 0.538390159607







 24%|██▍       | 122/500 [2:57:05<9:08:40, 87.09s/it][A[A[A[A[A

epoch: 122 avg_reward: 0.0025 avg_qvalue: 0.535636901855







 25%|██▍       | 123/500 [2:58:30<9:07:08, 87.08s/it][A[A[A[A[A

epoch: 123 avg_reward: 0.0035 avg_qvalue: 0.532891631126







 25%|██▍       | 124/500 [2:59:55<9:05:35, 87.06s/it][A[A[A[A[A

epoch: 124 avg_reward: 0.003 avg_qvalue: 0.532157957554







 25%|██▌       | 125/500 [3:01:21<9:04:03, 87.05s/it][A[A[A[A[A

epoch: 125 avg_reward: 0.0009 avg_qvalue: 0.53651458025







 25%|██▌       | 126/500 [3:02:46<9:02:30, 87.03s/it][A[A[A[A[A

epoch: 126 avg_reward: 0.004 avg_qvalue: 0.535149097443







 25%|██▌       | 127/500 [3:04:11<9:00:58, 87.02s/it][A[A[A[A[A

epoch: 127 avg_reward: 0.0024 avg_qvalue: 0.533363819122







 26%|██▌       | 128/500 [3:05:36<8:59:25, 87.00s/it][A[A[A[A[A

epoch: 128 avg_reward: 0.0022 avg_qvalue: 0.532265126705







 26%|██▌       | 129/500 [3:07:00<8:57:50, 86.98s/it][A[A[A[A[A

epoch: 129 avg_reward: 0.0033 avg_qvalue: 0.527582883835







 26%|██▌       | 130/500 [3:08:26<8:56:18, 86.97s/it][A[A[A[A[A

epoch: 130 avg_reward: 0.0028 avg_qvalue: 0.52811807394







 26%|██▌       | 131/500 [3:09:51<8:54:46, 86.96s/it][A[A[A[A[A

epoch: 131 avg_reward: 0.0037 avg_qvalue: 0.522363603115







 26%|██▋       | 132/500 [3:11:16<8:53:15, 86.95s/it][A[A[A[A[A

epoch: 132 avg_reward: 0.00455 avg_qvalue: 0.519703149796







 27%|██▋       | 133/500 [3:12:41<8:51:44, 86.93s/it][A[A[A[A[A

epoch: 133 avg_reward: 0.0044 avg_qvalue: 0.522704720497







 27%|██▋       | 134/500 [3:14:07<8:50:12, 86.92s/it][A[A[A[A[A

epoch: 134 avg_reward: 0.0089 avg_qvalue: 0.524323225021







 27%|██▋       | 135/500 [3:15:32<8:48:41, 86.91s/it][A[A[A[A[A

epoch: 135 avg_reward: 0.00595 avg_qvalue: 0.521416902542







 27%|██▋       | 136/500 [3:16:57<8:47:08, 86.89s/it][A[A[A[A[A

epoch: 136 avg_reward: 0.0021 avg_qvalue: 0.517883300781







 27%|██▋       | 137/500 [3:18:22<8:45:37, 86.88s/it][A[A[A[A[A

epoch: 137 avg_reward: 0.0041 avg_qvalue: 0.519525647163







 28%|██▊       | 138/500 [3:19:47<8:44:06, 86.87s/it][A[A[A[A[A

epoch: 138 avg_reward: 0.0062 avg_qvalue: 0.515288114548







 28%|██▊       | 139/500 [3:21:12<8:42:33, 86.85s/it][A[A[A[A[A

epoch: 139 avg_reward: 0.009 avg_qvalue: 0.514632225037







 28%|██▊       | 140/500 [3:22:37<8:41:02, 86.84s/it][A[A[A[A[A

epoch: 140 avg_reward: 0.00395 avg_qvalue: 0.515253543854







 28%|██▊       | 141/500 [3:24:03<8:39:31, 86.83s/it][A[A[A[A[A

epoch: 141 avg_reward: 0.0061 avg_qvalue: 0.516238868237







 28%|██▊       | 142/500 [3:25:27<8:38:00, 86.82s/it][A[A[A[A[A

epoch: 142 avg_reward: 0.0054 avg_qvalue: 0.511437177658







 29%|██▊       | 143/500 [3:26:52<8:36:28, 86.80s/it][A[A[A[A[A

epoch: 143 avg_reward: 0.0052 avg_qvalue: 0.514711499214







 29%|██▉       | 144/500 [3:28:19<8:35:01, 86.80s/it][A[A[A[A[A

epoch: 144 avg_reward: 0.00555 avg_qvalue: 0.511130928993







 29%|██▉       | 145/500 [3:29:46<8:33:34, 86.80s/it][A[A[A[A[A

epoch: 145 avg_reward: 0.00675 avg_qvalue: 0.506460547447







 29%|██▉       | 146/500 [3:31:13<8:32:07, 86.80s/it][A[A[A[A[A

epoch: 146 avg_reward: 0.0067 avg_qvalue: 0.509374678135







 29%|██▉       | 147/500 [3:32:40<8:30:41, 86.80s/it][A[A[A[A[A

epoch: 147 avg_reward: 0.0051 avg_qvalue: 0.510168969631







 30%|██▉       | 148/500 [3:34:05<8:29:11, 86.79s/it][A[A[A[A[A

epoch: 148 avg_reward: 0.00415 avg_qvalue: 0.501254320145







 30%|██▉       | 149/500 [3:35:30<8:27:40, 86.78s/it][A[A[A[A[A

epoch: 149 avg_reward: 0.0079 avg_qvalue: 0.503592789173







 30%|███       | 150/500 [3:36:55<8:26:09, 86.77s/it][A[A[A[A[A

epoch: 150 avg_reward: 0.00875 avg_qvalue: 0.507080197334







 30%|███       | 151/500 [3:38:21<8:24:40, 86.76s/it][A[A[A[A[A

epoch: 151 avg_reward: 0.00815 avg_qvalue: 0.504088103771







 30%|███       | 152/500 [3:39:47<8:23:12, 86.76s/it][A[A[A[A[A

epoch: 152 avg_reward: 0.0095 avg_qvalue: 0.504079937935







 31%|███       | 153/500 [3:41:12<8:21:40, 86.75s/it][A[A[A[A[A

epoch: 153 avg_reward: 0.00775 avg_qvalue: 0.503632247448







 31%|███       | 154/500 [3:42:37<8:20:09, 86.73s/it][A[A[A[A[A

epoch: 154 avg_reward: 0.0106 avg_qvalue: 0.507670342922







 31%|███       | 155/500 [3:44:02<8:18:39, 86.72s/it][A[A[A[A[A

epoch: 155 avg_reward: 0.0096 avg_qvalue: 0.502967655659







 31%|███       | 156/500 [3:45:28<8:17:11, 86.72s/it][A[A[A[A[A

epoch: 156 avg_reward: 0.0111 avg_qvalue: 0.503806591034







 31%|███▏      | 157/500 [3:46:52<8:15:40, 86.71s/it][A[A[A[A[A

epoch: 157 avg_reward: 0.00755 avg_qvalue: 0.507227659225







 32%|███▏      | 158/500 [3:48:18<8:14:10, 86.70s/it][A[A[A[A[A

epoch: 158 avg_reward: 0.01235 avg_qvalue: 0.501711130142







 32%|███▏      | 159/500 [3:49:43<8:12:40, 86.69s/it][A[A[A[A[A

epoch: 159 avg_reward: 0.01005 avg_qvalue: 0.503832638264







 32%|███▏      | 160/500 [3:51:08<8:11:11, 86.68s/it][A[A[A[A[A

epoch: 160 avg_reward: 0.00755 avg_qvalue: 0.502801537514







 32%|███▏      | 161/500 [3:52:33<8:09:41, 86.67s/it][A[A[A[A[A

epoch: 161 avg_reward: 0.00795 avg_qvalue: 0.498971194029







 32%|███▏      | 162/500 [3:53:58<8:08:11, 86.66s/it][A[A[A[A[A

epoch: 162 avg_reward: 0.00995 avg_qvalue: 0.498926997185







 33%|███▎      | 163/500 [3:55:24<8:06:41, 86.65s/it][A[A[A[A[A

epoch: 163 avg_reward: 0.00705 avg_qvalue: 0.498101770878







 33%|███▎      | 164/500 [3:56:49<8:05:11, 86.64s/it][A[A[A[A[A

epoch: 164 avg_reward: 0.0121 avg_qvalue: 0.498317480087







 33%|███▎      | 165/500 [3:58:17<8:03:47, 86.65s/it][A[A[A[A[A

epoch: 165 avg_reward: 0.0089 avg_qvalue: 0.492900967598







 33%|███▎      | 166/500 [3:59:45<8:02:23, 86.66s/it][A[A[A[A[A

epoch: 166 avg_reward: 0.00955 avg_qvalue: 0.493766099215







 33%|███▎      | 167/500 [4:01:12<8:00:58, 86.66s/it][A[A[A[A[A

epoch: 167 avg_reward: 0.0088 avg_qvalue: 0.496687650681







 34%|███▎      | 168/500 [4:02:38<7:59:30, 86.66s/it][A[A[A[A[A

epoch: 168 avg_reward: 0.00865 avg_qvalue: 0.496261715889







 34%|███▍      | 169/500 [4:04:04<7:58:02, 86.66s/it][A[A[A[A[A

epoch: 169 avg_reward: 0.0107 avg_qvalue: 0.490832239389







 34%|███▍      | 170/500 [4:05:30<7:56:33, 86.65s/it][A[A[A[A[A

epoch: 170 avg_reward: 0.01235 avg_qvalue: 0.493223100901







 34%|███▍      | 171/500 [4:06:54<7:55:03, 86.64s/it][A[A[A[A[A

epoch: 171 avg_reward: 0.0115 avg_qvalue: 0.492722570896







 34%|███▍      | 172/500 [4:08:20<7:53:34, 86.63s/it][A[A[A[A[A

epoch: 172 avg_reward: 0.01 avg_qvalue: 0.492037147284







 35%|███▍      | 173/500 [4:09:45<7:52:05, 86.62s/it][A[A[A[A[A

epoch: 173 avg_reward: 0.0104 avg_qvalue: 0.495036125183







 35%|███▍      | 174/500 [4:11:10<7:50:35, 86.61s/it][A[A[A[A[A

epoch: 174 avg_reward: 0.01125 avg_qvalue: 0.494070798159







 35%|███▌      | 175/500 [4:12:36<7:49:07, 86.61s/it][A[A[A[A[A

epoch: 175 avg_reward: 0.01315 avg_qvalue: 0.493119835854







 35%|███▌      | 176/500 [4:14:01<7:47:38, 86.60s/it][A[A[A[A[A

epoch: 176 avg_reward: 0.01225 avg_qvalue: 0.49579384923







 35%|███▌      | 177/500 [4:15:27<7:46:09, 86.59s/it][A[A[A[A[A

epoch: 177 avg_reward: 0.01135 avg_qvalue: 0.496001571417







 36%|███▌      | 178/500 [4:16:52<7:44:40, 86.59s/it][A[A[A[A[A

epoch: 178 avg_reward: 0.01105 avg_qvalue: 0.49448505044







 36%|███▌      | 179/500 [4:18:18<7:43:13, 86.58s/it][A[A[A[A[A

epoch: 179 avg_reward: 0.0119 avg_qvalue: 0.499150425196







 36%|███▌      | 180/500 [4:19:44<7:41:45, 86.58s/it][A[A[A[A[A

epoch: 180 avg_reward: 0.0073 avg_qvalue: 0.499920755625







 36%|███▌      | 181/500 [4:21:09<7:40:16, 86.57s/it][A[A[A[A[A

epoch: 181 avg_reward: 0.01625 avg_qvalue: 0.497454583645







 36%|███▋      | 182/500 [4:22:34<7:38:47, 86.57s/it][A[A[A[A[A

epoch: 182 avg_reward: 0.00975 avg_qvalue: 0.497833937407







 37%|███▋      | 183/500 [4:24:00<7:37:19, 86.56s/it][A[A[A[A[A

epoch: 183 avg_reward: 0.01245 avg_qvalue: 0.494569540024







 37%|███▋      | 184/500 [4:25:26<7:35:51, 86.55s/it][A[A[A[A[A

epoch: 184 avg_reward: 0.01545 avg_qvalue: 0.495280504227







 37%|███▋      | 185/500 [4:26:51<7:34:21, 86.55s/it][A[A[A[A[A

epoch: 185 avg_reward: 0.01535 avg_qvalue: 0.49286070466







 37%|███▋      | 186/500 [4:28:16<7:32:53, 86.54s/it][A[A[A[A[A

epoch: 186 avg_reward: 0.0143 avg_qvalue: 0.496061235666







 37%|███▋      | 187/500 [4:29:42<7:31:26, 86.54s/it][A[A[A[A[A

epoch: 187 avg_reward: 0.01215 avg_qvalue: 0.496983140707







 38%|███▊      | 188/500 [4:31:07<7:29:57, 86.53s/it][A[A[A[A[A

epoch: 188 avg_reward: 0.0099 avg_qvalue: 0.494606435299







 38%|███▊      | 189/500 [4:32:34<7:28:30, 86.53s/it][A[A[A[A[A

epoch: 189 avg_reward: 0.01565 avg_qvalue: 0.499397367239







 38%|███▊      | 190/500 [4:34:00<7:27:03, 86.53s/it][A[A[A[A[A

epoch: 190 avg_reward: 0.01465 avg_qvalue: 0.495425641537







 38%|███▊      | 191/500 [4:35:25<7:25:35, 86.52s/it][A[A[A[A[A

epoch: 191 avg_reward: 0.01515 avg_qvalue: 0.496374368668







 38%|███▊      | 192/500 [4:36:50<7:24:06, 86.51s/it][A[A[A[A[A

epoch: 192 avg_reward: 0.0134 avg_qvalue: 0.494639933109







 39%|███▊      | 193/500 [4:38:16<7:22:38, 86.51s/it][A[A[A[A[A

epoch: 193 avg_reward: 0.01195 avg_qvalue: 0.491357564926







 39%|███▉      | 194/500 [4:39:42<7:21:11, 86.51s/it][A[A[A[A[A

epoch: 194 avg_reward: 0.01295 avg_qvalue: 0.493628799915







 39%|███▉      | 195/500 [4:41:07<7:19:41, 86.50s/it][A[A[A[A[A

epoch: 195 avg_reward: 0.0148 avg_qvalue: 0.495267868042







 39%|███▉      | 196/500 [4:42:32<7:18:13, 86.49s/it][A[A[A[A[A

epoch: 196 avg_reward: 0.01605 avg_qvalue: 0.497658193111







 39%|███▉      | 197/500 [4:43:57<7:16:45, 86.49s/it][A[A[A[A[A

epoch: 197 avg_reward: 0.01305 avg_qvalue: 0.493308246136







 40%|███▉      | 198/500 [4:45:22<7:15:16, 86.48s/it][A[A[A[A[A

epoch: 198 avg_reward: 0.01765 avg_qvalue: 0.49203363061







 40%|███▉      | 199/500 [4:46:47<7:13:47, 86.47s/it][A[A[A[A[A

epoch: 199 avg_reward: 0.01095 avg_qvalue: 0.495808452368







 40%|████      | 200/500 [4:48:12<7:12:18, 86.46s/it][A[A[A[A[A

epoch: 200 avg_reward: 0.01305 avg_qvalue: 0.496782660484







 40%|████      | 201/500 [4:49:38<7:10:51, 86.46s/it][A[A[A[A[A

epoch: 201 avg_reward: 0.0168 avg_qvalue: 0.496847569942







 40%|████      | 202/500 [4:51:03<7:09:22, 86.45s/it][A[A[A[A[A

epoch: 202 avg_reward: 0.0157 avg_qvalue: 0.495952397585







 41%|████      | 203/500 [4:52:29<7:07:55, 86.45s/it][A[A[A[A[A

epoch: 203 avg_reward: 0.01835 avg_qvalue: 0.496541798115







 41%|████      | 204/500 [4:53:55<7:06:28, 86.45s/it][A[A[A[A[A

epoch: 204 avg_reward: 0.0182 avg_qvalue: 0.489227235317







 41%|████      | 205/500 [4:55:20<7:05:00, 86.44s/it][A[A[A[A[A

epoch: 205 avg_reward: 0.01345 avg_qvalue: 0.489045202732







 41%|████      | 206/500 [4:56:45<7:03:31, 86.44s/it][A[A[A[A[A

epoch: 206 avg_reward: 0.0177 avg_qvalue: 0.491830855608







 41%|████▏     | 207/500 [4:58:10<7:02:03, 86.43s/it][A[A[A[A[A

epoch: 207 avg_reward: 0.01655 avg_qvalue: 0.493036955595







 42%|████▏     | 208/500 [4:59:36<7:00:36, 86.42s/it][A[A[A[A[A

epoch: 208 avg_reward: 0.0194 avg_qvalue: 0.492178082466







 42%|████▏     | 209/500 [5:01:01<6:59:07, 86.42s/it][A[A[A[A[A

epoch: 209 avg_reward: 0.0145 avg_qvalue: 0.48983284831







 42%|████▏     | 210/500 [5:02:26<6:57:40, 86.41s/it][A[A[A[A[A

epoch: 210 avg_reward: 0.0154 avg_qvalue: 0.489265143871







 42%|████▏     | 211/500 [5:03:52<6:56:12, 86.41s/it][A[A[A[A[A

epoch: 211 avg_reward: 0.0203 avg_qvalue: 0.491482913494







 42%|████▏     | 212/500 [5:05:17<6:54:44, 86.40s/it][A[A[A[A[A

epoch: 212 avg_reward: 0.01485 avg_qvalue: 0.492659270763







 43%|████▎     | 213/500 [5:06:42<6:53:15, 86.40s/it][A[A[A[A[A

epoch: 213 avg_reward: 0.0126 avg_qvalue: 0.488357514143







 43%|████▎     | 214/500 [5:08:09<6:51:49, 86.40s/it][A[A[A[A[A

epoch: 214 avg_reward: 0.01765 avg_qvalue: 0.489816606045







 43%|████▎     | 215/500 [5:09:35<6:50:23, 86.40s/it][A[A[A[A[A

epoch: 215 avg_reward: 0.01175 avg_qvalue: 0.488071084023







 43%|████▎     | 216/500 [5:11:01<6:48:56, 86.40s/it][A[A[A[A[A

epoch: 216 avg_reward: 0.01865 avg_qvalue: 0.492896139622







 43%|████▎     | 217/500 [5:12:27<6:47:29, 86.39s/it][A[A[A[A[A

epoch: 217 avg_reward: 0.0188 avg_qvalue: 0.487184464931







 44%|████▎     | 218/500 [5:13:52<6:46:01, 86.39s/it][A[A[A[A[A

epoch: 218 avg_reward: 0.0142 avg_qvalue: 0.490067690611







 44%|████▍     | 219/500 [5:15:18<6:44:34, 86.38s/it][A[A[A[A[A

epoch: 219 avg_reward: 0.01365 avg_qvalue: 0.492270499468







 44%|████▍     | 220/500 [5:16:43<6:43:06, 86.38s/it][A[A[A[A[A

epoch: 220 avg_reward: 0.01895 avg_qvalue: 0.493631899357







 44%|████▍     | 221/500 [5:18:09<6:41:39, 86.38s/it][A[A[A[A[A

epoch: 221 avg_reward: 0.01215 avg_qvalue: 0.49609965086







 44%|████▍     | 222/500 [5:19:34<6:40:11, 86.37s/it][A[A[A[A[A

epoch: 222 avg_reward: 0.0187 avg_qvalue: 0.500215649605







 45%|████▍     | 223/500 [5:20:59<6:38:43, 86.37s/it][A[A[A[A[A

epoch: 223 avg_reward: 0.0118 avg_qvalue: 0.491635203362







 45%|████▍     | 224/500 [5:22:26<6:37:17, 86.37s/it][A[A[A[A[A

epoch: 224 avg_reward: 0.01835 avg_qvalue: 0.491341263056







 45%|████▌     | 225/500 [5:23:51<6:35:50, 86.36s/it][A[A[A[A[A

epoch: 225 avg_reward: 0.01745 avg_qvalue: 0.492885798216







 45%|████▌     | 226/500 [5:25:17<6:34:22, 86.36s/it][A[A[A[A[A

epoch: 226 avg_reward: 0.01635 avg_qvalue: 0.489986032248







 45%|████▌     | 227/500 [5:26:42<6:32:54, 86.36s/it][A[A[A[A[A

epoch: 227 avg_reward: 0.01755 avg_qvalue: 0.483712166548







 46%|████▌     | 228/500 [5:28:08<6:31:27, 86.35s/it][A[A[A[A[A

epoch: 228 avg_reward: 0.0174 avg_qvalue: 0.485989391804







 46%|████▌     | 229/500 [5:29:34<6:30:01, 86.35s/it][A[A[A[A[A

epoch: 229 avg_reward: 0.0151 avg_qvalue: 0.484527826309







 46%|████▌     | 230/500 [5:31:00<6:28:34, 86.35s/it][A[A[A[A[A

epoch: 230 avg_reward: 0.01375 avg_qvalue: 0.488374859095







 46%|████▌     | 231/500 [5:32:27<6:27:09, 86.35s/it][A[A[A[A[A

epoch: 231 avg_reward: 0.0116 avg_qvalue: 0.488138586283







 46%|████▋     | 232/500 [5:33:54<6:25:43, 86.35s/it][A[A[A[A[A

epoch: 232 avg_reward: 0.0176 avg_qvalue: 0.49348321557







 47%|████▋     | 233/500 [5:35:19<6:24:15, 86.35s/it][A[A[A[A[A

epoch: 233 avg_reward: 0.01355 avg_qvalue: 0.488739609718







 47%|████▋     | 234/500 [5:36:45<6:22:48, 86.35s/it][A[A[A[A[A

epoch: 234 avg_reward: 0.01865 avg_qvalue: 0.484816223383







 47%|████▋     | 235/500 [5:38:12<6:21:23, 86.35s/it][A[A[A[A[A

epoch: 235 avg_reward: 0.01145 avg_qvalue: 0.486029773951







 47%|████▋     | 236/500 [5:39:40<6:19:58, 86.36s/it][A[A[A[A[A

epoch: 236 avg_reward: 0.0161 avg_qvalue: 0.482065737247







 47%|████▋     | 237/500 [5:41:07<6:18:32, 86.36s/it][A[A[A[A[A

epoch: 237 avg_reward: 0.02355 avg_qvalue: 0.485864400864







 48%|████▊     | 238/500 [5:42:35<6:17:08, 86.37s/it][A[A[A[A[A

epoch: 238 avg_reward: 0.0175 avg_qvalue: 0.485726267099







 48%|████▊     | 239/500 [5:44:03<6:15:43, 86.38s/it][A[A[A[A[A

epoch: 239 avg_reward: 0.0144 avg_qvalue: 0.484064310789







 48%|████▊     | 240/500 [5:45:30<6:14:18, 86.38s/it][A[A[A[A[A

epoch: 240 avg_reward: 0.01385 avg_qvalue: 0.485761284828







 48%|████▊     | 241/500 [5:46:55<6:12:50, 86.37s/it][A[A[A[A[A

epoch: 241 avg_reward: 0.01265 avg_qvalue: 0.487142622471







 48%|████▊     | 242/500 [5:48:20<6:11:22, 86.37s/it][A[A[A[A[A

epoch: 242 avg_reward: 0.0167 avg_qvalue: 0.490436851978







 49%|████▊     | 243/500 [5:49:48<6:09:57, 86.37s/it][A[A[A[A[A

epoch: 243 avg_reward: 0.0198 avg_qvalue: 0.486161768436







 49%|████▉     | 244/500 [5:51:16<6:08:32, 86.38s/it][A[A[A[A[A

epoch: 244 avg_reward: 0.01535 avg_qvalue: 0.484239757061







 49%|████▉     | 245/500 [5:52:42<6:07:06, 86.38s/it][A[A[A[A[A

epoch: 245 avg_reward: 0.01935 avg_qvalue: 0.487110346556







 49%|████▉     | 246/500 [5:54:08<6:05:39, 86.38s/it][A[A[A[A[A

epoch: 246 avg_reward: 0.01685 avg_qvalue: 0.483596593142







 49%|████▉     | 247/500 [5:55:35<6:04:13, 86.38s/it][A[A[A[A[A

epoch: 247 avg_reward: 0.0192 avg_qvalue: 0.485982477665







 50%|████▉     | 248/500 [5:57:01<6:02:46, 86.38s/it][A[A[A[A[A

epoch: 248 avg_reward: 0.02 avg_qvalue: 0.483826607466







 50%|████▉     | 249/500 [5:58:26<6:01:19, 86.37s/it][A[A[A[A[A

epoch: 249 avg_reward: 0.0179 avg_qvalue: 0.478561967611







 50%|█████     | 250/500 [5:59:52<5:59:52, 86.37s/it][A[A[A[A[A

epoch: 250 avg_reward: 0.02255 avg_qvalue: 0.484474569559







 50%|█████     | 251/500 [6:01:17<5:58:24, 86.37s/it][A[A[A[A[A

epoch: 251 avg_reward: 0.01185 avg_qvalue: 0.484612554312







 50%|█████     | 252/500 [6:02:43<5:56:58, 86.36s/it][A[A[A[A[A

epoch: 252 avg_reward: 0.01285 avg_qvalue: 0.486506253481







 51%|█████     | 253/500 [6:04:09<5:55:30, 86.36s/it][A[A[A[A[A

epoch: 253 avg_reward: 0.01125 avg_qvalue: 0.487693160772







 51%|█████     | 254/500 [6:05:35<5:54:04, 86.36s/it][A[A[A[A[A

epoch: 254 avg_reward: 0.0204 avg_qvalue: 0.482626229525







 51%|█████     | 255/500 [6:07:01<5:52:37, 86.36s/it][A[A[A[A[A

epoch: 255 avg_reward: 0.02185 avg_qvalue: 0.483041346073







 51%|█████     | 256/500 [6:08:28<5:51:11, 86.36s/it][A[A[A[A[A

epoch: 256 avg_reward: 0.0171 avg_qvalue: 0.481293737888







 51%|█████▏    | 257/500 [6:09:56<5:49:47, 86.37s/it][A[A[A[A[A

epoch: 257 avg_reward: 0.0174 avg_qvalue: 0.484661132097







 52%|█████▏    | 258/500 [6:11:23<5:48:21, 86.37s/it][A[A[A[A[A

epoch: 258 avg_reward: 0.01805 avg_qvalue: 0.484002649784







 52%|█████▏    | 259/500 [6:12:48<5:46:54, 86.37s/it][A[A[A[A[A

epoch: 259 avg_reward: 0.01785 avg_qvalue: 0.484819859266







 52%|█████▏    | 260/500 [6:14:15<5:45:28, 86.37s/it][A[A[A[A[A

epoch: 260 avg_reward: 0.0144 avg_qvalue: 0.491002887487







 52%|█████▏    | 261/500 [6:15:41<5:44:01, 86.37s/it][A[A[A[A[A

epoch: 261 avg_reward: 0.01905 avg_qvalue: 0.489198923111







 52%|█████▏    | 262/500 [6:17:07<5:42:34, 86.36s/it][A[A[A[A[A

epoch: 262 avg_reward: 0.01595 avg_qvalue: 0.486838370562







 53%|█████▎    | 263/500 [6:18:32<5:41:07, 86.36s/it][A[A[A[A[A

epoch: 263 avg_reward: 0.0166 avg_qvalue: 0.486645013094







 53%|█████▎    | 264/500 [6:19:58<5:39:40, 86.36s/it][A[A[A[A[A

epoch: 264 avg_reward: 0.01925 avg_qvalue: 0.48629039526







 53%|█████▎    | 265/500 [6:21:24<5:38:13, 86.36s/it][A[A[A[A[A

epoch: 265 avg_reward: 0.0131 avg_qvalue: 0.486087292433







 53%|█████▎    | 266/500 [6:22:49<5:36:46, 86.35s/it][A[A[A[A[A

epoch: 266 avg_reward: 0.01695 avg_qvalue: 0.47807008028







 53%|█████▎    | 267/500 [6:24:15<5:35:19, 86.35s/it][A[A[A[A[A

epoch: 267 avg_reward: 0.0243 avg_qvalue: 0.484634190798







 54%|█████▎    | 268/500 [6:25:41<5:33:52, 86.35s/it][A[A[A[A[A

epoch: 268 avg_reward: 0.0158 avg_qvalue: 0.483199745417







 54%|█████▍    | 269/500 [6:27:06<5:32:25, 86.35s/it][A[A[A[A[A

epoch: 269 avg_reward: 0.021 avg_qvalue: 0.483062654734







 54%|█████▍    | 270/500 [6:28:33<5:30:59, 86.35s/it][A[A[A[A[A

epoch: 270 avg_reward: 0.0174 avg_qvalue: 0.478191643953







 54%|█████▍    | 271/500 [6:30:01<5:29:34, 86.35s/it][A[A[A[A[A

epoch: 271 avg_reward: 0.0169 avg_qvalue: 0.481942236423







 54%|█████▍    | 272/500 [6:31:28<5:28:08, 86.35s/it][A[A[A[A[A

epoch: 272 avg_reward: 0.0209 avg_qvalue: 0.479358792305







 55%|█████▍    | 273/500 [6:32:54<5:26:41, 86.35s/it][A[A[A[A[A

epoch: 273 avg_reward: 0.0185 avg_qvalue: 0.473792731762







 55%|█████▍    | 274/500 [6:34:21<5:25:16, 86.36s/it][A[A[A[A[A

epoch: 274 avg_reward: 0.01205 avg_qvalue: 0.478722959757







 55%|█████▌    | 275/500 [6:35:49<5:23:51, 86.36s/it][A[A[A[A[A

epoch: 275 avg_reward: 0.0163 avg_qvalue: 0.481179773808







 55%|█████▌    | 276/500 [6:37:18<5:22:27, 86.37s/it][A[A[A[A[A

epoch: 276 avg_reward: 0.02555 avg_qvalue: 0.480583161116







 55%|█████▌    | 277/500 [6:38:46<5:21:02, 86.38s/it][A[A[A[A[A

epoch: 277 avg_reward: 0.0234 avg_qvalue: 0.483014732599







 56%|█████▌    | 278/500 [6:40:13<5:19:36, 86.38s/it][A[A[A[A[A

epoch: 278 avg_reward: 0.0159 avg_qvalue: 0.479570120573







 56%|█████▌    | 279/500 [6:41:41<5:18:11, 86.39s/it][A[A[A[A[A

epoch: 279 avg_reward: 0.01855 avg_qvalue: 0.48284009099







 56%|█████▌    | 280/500 [6:43:09<5:16:46, 86.39s/it][A[A[A[A[A

epoch: 280 avg_reward: 0.0096 avg_qvalue: 0.483363807201







 56%|█████▌    | 281/500 [6:44:38<5:15:21, 86.40s/it][A[A[A[A[A

epoch: 281 avg_reward: 0.01635 avg_qvalue: 0.484290421009







 56%|█████▋    | 282/500 [6:46:05<5:13:56, 86.40s/it][A[A[A[A[A

epoch: 282 avg_reward: 0.018 avg_qvalue: 0.480698496103







 57%|█████▋    | 283/500 [6:47:33<5:12:30, 86.41s/it][A[A[A[A[A

epoch: 283 avg_reward: 0.0224 avg_qvalue: 0.478358507156







 57%|█████▋    | 284/500 [6:49:00<5:11:04, 86.41s/it][A[A[A[A[A

epoch: 284 avg_reward: 0.01655 avg_qvalue: 0.476283937693







 57%|█████▋    | 285/500 [6:50:26<5:09:38, 86.41s/it][A[A[A[A[A

epoch: 285 avg_reward: 0.0137 avg_qvalue: 0.479459524155







 57%|█████▋    | 286/500 [6:51:52<5:08:11, 86.41s/it][A[A[A[A[A

epoch: 286 avg_reward: 0.01825 avg_qvalue: 0.477358192205


KeyboardInterrupt: 

In [33]:
label = 'best_restricted'

config = {
    'epsilon': 0.5,
    'gamma' : 0.9,
    'target_update_limit' : 100,
    'learning_rate' : 0.0001
#     'max_iterations_per_episode' : 500
}

# dqn = DeepQNetwork(**config)
dqn.batch_size = 50

experiment = Experiment()
experiment.execute(500, dqn, label)
print 'Best iteration'
print 'score: {} | qvalue: {} | model: {}' \
    .format(experiment.best_score, experiment.best_qvalue, experiment.best_filename)
generate_graphic('report/images/{}.pdf'.format(label), 0)








  0%|          | 0/500 [00:00<?, ?it/s][A[A[A[A[A[A[A






  0%|          | 1/500 [02:13<18:32:18, 133.74s/it][A[A[A[A[A[A[A

epoch: 1 avg_reward: 0.01995 avg_qvalue: 0.480983704329









  0%|          | 2/500 [03:50<15:54:39, 115.02s/it][A[A[A[A[A[A[A

epoch: 2 avg_reward: 0.0172 avg_qvalue: 0.480925649405









  1%|          | 3/500 [05:19<14:42:47, 106.57s/it][A[A[A[A[A[A[A

epoch: 3 avg_reward: 0.0172 avg_qvalue: 0.483369678259









  1%|          | 4/500 [06:46<14:00:43, 101.70s/it][A[A[A[A[A[A[A

epoch: 4 avg_reward: 0.02145 avg_qvalue: 0.480020076036









  1%|          | 5/500 [08:19<13:43:37, 99.83s/it] [A[A[A[A[A[A[A

epoch: 5 avg_reward: 0.0164 avg_qvalue: 0.48027920723









  1%|          | 6/500 [09:47<13:25:32, 97.84s/it][A[A[A[A[A[A[A

epoch: 6 avg_reward: 0.02325 avg_qvalue: 0.484182953835









  1%|▏         | 7/500 [11:16<13:13:52, 96.62s/it][A[A[A[A[A[A[A

epoch: 7 avg_reward: 0.01845 avg_qvalue: 0.485024273396









  2%|▏         | 8/500 [12:47<13:06:35, 95.93s/it][A[A[A[A[A[A[A

epoch: 8 avg_reward: 0.0281 avg_qvalue: 0.479867905378









  2%|▏         | 9/500 [14:16<12:59:02, 95.20s/it][A[A[A[A[A[A[A

epoch: 9 avg_reward: 0.0229 avg_qvalue: 0.482556700706









  2%|▏         | 10/500 [15:44<12:51:25, 94.46s/it][A[A[A[A[A[A[A

epoch: 10 avg_reward: 0.0243 avg_qvalue: 0.482531875372









  2%|▏         | 11/500 [17:22<12:52:05, 94.73s/it][A[A[A[A[A[A[A

epoch: 11 avg_reward: 0.0229 avg_qvalue: 0.479199647903









  2%|▏         | 12/500 [18:50<12:46:29, 94.24s/it][A[A[A[A[A[A[A

epoch: 12 avg_reward: 0.03315 avg_qvalue: 0.48826533556









  3%|▎         | 13/500 [20:20<12:41:44, 93.85s/it][A[A[A[A[A[A[A

epoch: 13 avg_reward: 0.0255 avg_qvalue: 0.48701941967









  3%|▎         | 14/500 [21:48<12:36:56, 93.45s/it][A[A[A[A[A[A[A

epoch: 14 avg_reward: 0.02245 avg_qvalue: 0.483667731285









  3%|▎         | 15/500 [23:15<12:32:10, 93.05s/it][A[A[A[A[A[A[A

epoch: 15 avg_reward: 0.02685 avg_qvalue: 0.493878602982









  3%|▎         | 16/500 [24:43<12:28:01, 92.73s/it][A[A[A[A[A[A[A

epoch: 16 avg_reward: 0.02285 avg_qvalue: 0.49482730031









  3%|▎         | 17/500 [26:11<12:24:14, 92.45s/it][A[A[A[A[A[A[A

epoch: 17 avg_reward: 0.02535 avg_qvalue: 0.498107045889









  4%|▎         | 18/500 [27:40<12:21:17, 92.28s/it][A[A[A[A[A[A[A

epoch: 18 avg_reward: 0.03455 avg_qvalue: 0.49585801363









  4%|▍         | 19/500 [29:29<12:26:35, 93.13s/it][A[A[A[A[A[A[A

epoch: 19 avg_reward: 0.0356 avg_qvalue: 0.492697119713









  4%|▍         | 20/500 [31:09<12:27:59, 93.50s/it][A[A[A[A[A[A[A

epoch: 20 avg_reward: 0.0238 avg_qvalue: 0.497705698013









  4%|▍         | 21/500 [32:51<12:29:23, 93.87s/it][A[A[A[A[A[A[A

epoch: 21 avg_reward: 0.03305 avg_qvalue: 0.497995555401









  4%|▍         | 22/500 [34:19<12:25:50, 93.62s/it][A[A[A[A[A[A[A

epoch: 22 avg_reward: 0.03005 avg_qvalue: 0.493833065033









  5%|▍         | 23/500 [35:47<12:22:12, 93.36s/it][A[A[A[A[A[A[A

epoch: 23 avg_reward: 0.0298 avg_qvalue: 0.496817439795









  5%|▍         | 24/500 [37:14<12:18:43, 93.12s/it][A[A[A[A[A[A[A

epoch: 24 avg_reward: 0.024 avg_qvalue: 0.502975285053









  5%|▌         | 25/500 [38:47<12:17:04, 93.11s/it][A[A[A[A[A[A[A

epoch: 25 avg_reward: 0.0274 avg_qvalue: 0.495306938887









  5%|▌         | 26/500 [40:43<12:22:31, 93.99s/it][A[A[A[A[A[A[A

epoch: 26 avg_reward: 0.02825 avg_qvalue: 0.500692784786









  5%|▌         | 27/500 [42:31<12:24:50, 94.48s/it][A[A[A[A[A[A[A

epoch: 27 avg_reward: 0.02875 avg_qvalue: 0.497269541025









  6%|▌         | 28/500 [44:14<12:25:49, 94.81s/it][A[A[A[A[A[A[A

epoch: 28 avg_reward: 0.01805 avg_qvalue: 0.502735793591









  6%|▌         | 29/500 [45:50<12:24:30, 94.84s/it][A[A[A[A[A[A[A

epoch: 29 avg_reward: 0.02425 avg_qvalue: 0.503439188004









  6%|▌         | 30/500 [47:23<12:22:34, 94.80s/it][A[A[A[A[A[A[A

epoch: 30 avg_reward: 0.0238 avg_qvalue: 0.492898494005









  6%|▌         | 31/500 [49:12<12:24:25, 95.24s/it][A[A[A[A[A[A[A

epoch: 31 avg_reward: 0.02645 avg_qvalue: 0.496611267328









  6%|▋         | 32/500 [50:58<12:25:25, 95.57s/it][A[A[A[A[A[A[A

epoch: 32 avg_reward: 0.02355 avg_qvalue: 0.493688583374









  7%|▋         | 33/500 [52:40<12:25:20, 95.76s/it][A[A[A[A[A[A[A

epoch: 33 avg_reward: 0.02105 avg_qvalue: 0.495246082544









  7%|▋         | 34/500 [54:36<12:28:22, 96.36s/it][A[A[A[A[A[A[A

epoch: 34 avg_reward: 0.0277 avg_qvalue: 0.500700235367









  7%|▋         | 35/500 [56:06<12:25:29, 96.19s/it][A[A[A[A[A[A[A

epoch: 35 avg_reward: 0.02875 avg_qvalue: 0.498225063086









  7%|▋         | 36/500 [57:36<12:22:33, 96.02s/it][A[A[A[A[A[A[A

epoch: 36 avg_reward: 0.024 avg_qvalue: 0.504530489445









  7%|▋         | 37/500 [59:06<12:19:36, 95.84s/it][A[A[A[A[A[A[A

epoch: 37 avg_reward: 0.02365 avg_qvalue: 0.505912482738









  8%|▊         | 38/500 [1:00:35<12:16:42, 95.68s/it][A[A[A[A[A[A[A

epoch: 38 avg_reward: 0.0204 avg_qvalue: 0.502632498741









  8%|▊         | 39/500 [1:02:05<12:13:55, 95.52s/it][A[A[A[A[A[A[A

epoch: 39 avg_reward: 0.02425 avg_qvalue: 0.505295097828









  8%|▊         | 40/500 [1:03:35<12:11:15, 95.38s/it][A[A[A[A[A[A[A

epoch: 40 avg_reward: 0.02375 avg_qvalue: 0.506369411945









  8%|▊         | 41/500 [1:05:06<12:08:54, 95.28s/it][A[A[A[A[A[A[A

epoch: 41 avg_reward: 0.02005 avg_qvalue: 0.509540259838









  8%|▊         | 42/500 [1:06:33<12:05:49, 95.09s/it][A[A[A[A[A[A[A

epoch: 42 avg_reward: 0.0219 avg_qvalue: 0.505426764488









  9%|▊         | 43/500 [1:08:16<12:05:37, 95.27s/it][A[A[A[A[A[A[A

epoch: 43 avg_reward: 0.02095 avg_qvalue: 0.505201876163









  9%|▉         | 44/500 [1:09:57<12:05:01, 95.40s/it][A[A[A[A[A[A[A

epoch: 44 avg_reward: 0.0255 avg_qvalue: 0.509449124336









  9%|▉         | 45/500 [1:11:29<12:02:50, 95.32s/it][A[A[A[A[A[A[A

epoch: 45 avg_reward: 0.0255 avg_qvalue: 0.506108880043









  9%|▉         | 46/500 [1:13:08<12:01:56, 95.41s/it][A[A[A[A[A[A[A

epoch: 46 avg_reward: 0.02285 avg_qvalue: 0.51289665699









  9%|▉         | 47/500 [1:14:41<11:59:57, 95.36s/it][A[A[A[A[A[A[A

epoch: 47 avg_reward: 0.02905 avg_qvalue: 0.510941922665









 10%|▉         | 48/500 [1:16:15<11:58:07, 95.33s/it][A[A[A[A[A[A[A

epoch: 48 avg_reward: 0.03185 avg_qvalue: 0.518604576588


KeyboardInterrupt: 

In [37]:
label = 'best_restricted'

config = {
    'epsilon': 0.5,
    'gamma' : 0.9,
    'target_update_limit' : 100,
    'learning_rate' : 0.00001
#     'max_iterations_per_episode' : 500
}

# dqn = DeepQNetwork(**config)
dqn.batch_size = 50

experiment = Experiment()
experiment.execute(500, dqn, label)
print 'Best iteration'
print 'score: {} | qvalue: {} | model: {}' \
    .format(experiment.best_score, experiment.best_qvalue, experiment.best_filename)
generate_graphic('report/images/{}.pdf'.format(label), 0)










  0%|          | 0/500 [00:00<?, ?it/s][A[A[A[A[A[A[A[A[A








  0%|          | 1/500 [01:33<12:54:20, 93.11s/it][A[A[A[A[A[A[A[A[A

epoch: 1 avg_reward: 0.0141 avg_qvalue: 0.515337526798











  0%|          | 2/500 [03:02<12:38:09, 91.35s/it][A[A[A[A[A[A[A[A[A

epoch: 2 avg_reward: 0.0274 avg_qvalue: 0.513214886189


KeyboardInterrupt: 

In [20]:
label = 'best_restricted'

config = {
    'epsilon': 0.5,
    'gamma' : 0.9,
    'target_update_limit' : 100,
    'learning_rate' : 0.00001
#     'max_iterations_per_episode' : 500
}

# dqn = DeepQNetwork(**config)
dqn.batch_size = 50

experiment = Experiment()
experiment.execute(500, dqn, label)
print 'Best iteration'
print 'score: {} | qvalue: {} | model: {}' \
    .format(experiment.best_score, experiment.best_qvalue, experiment.best_filename)
generate_graphic('report/images/{}.pdf'.format(label), 0)


  0%|          | 0/500 [00:00<?, ?it/s][A
  0%|          | 1/500 [06:57<57:48:34, 417.06s/it][A

epoch: 1 avg_reward: 0.00393576494854 avg_qvalue: 0.488616853952



  0%|          | 2/500 [14:00<58:08:35, 420.31s/it][A

epoch: 2 avg_reward: 0.00134096154089 avg_qvalue: 0.541352272034



  1%|          | 3/500 [21:01<58:02:54, 420.47s/it][A

epoch: 3 avg_reward: 0.00241387431032 avg_qvalue: 0.544888556004



  1%|          | 4/500 [28:20<58:33:59, 425.08s/it][A

epoch: 4 avg_reward: 0.00251253109099 avg_qvalue: 0.536925077438



  1%|          | 5/500 [35:25<58:26:25, 425.02s/it][A

epoch: 5 avg_reward: 0.00243716935386 avg_qvalue: 0.53883266449



  1%|          | 6/500 [42:02<57:40:56, 420.36s/it][A

epoch: 6 avg_reward: 0.00268511087113 avg_qvalue: 0.538401544094



  1%|▏         | 7/500 [48:24<56:49:16, 414.92s/it][A

epoch: 7 avg_reward: 0.00178630721962 avg_qvalue: 0.540090858936



  2%|▏         | 8/500 [54:39<56:01:55, 409.99s/it][A

epoch: 8 avg_reward: 0.00184390519711 avg_qvalue: 0.542323946953



  2%|▏         | 9/500 [1:00:55<55:23:34, 406.14s/it][A

epoch: 9 avg_reward: 0.00318365527364 avg_qvalue: 0.540492534637



  2%|▏         | 10/500 [1:07:13<54:53:52, 403.33s/it][A

epoch: 10 avg_reward: 0.00257066169336 avg_qvalue: 0.543325662613



  2%|▏         | 11/500 [1:13:29<54:27:15, 400.89s/it][A

epoch: 11 avg_reward: 0.00313315049622 avg_qvalue: 0.538909316063



  2%|▏         | 12/500 [1:19:48<54:05:24, 399.03s/it][A

epoch: 12 avg_reward: 0.00270022844857 avg_qvalue: 0.541166126728



  3%|▎         | 13/500 [1:26:05<53:44:59, 397.33s/it][A

epoch: 13 avg_reward: 0.00239186178486 avg_qvalue: 0.539053440094



  3%|▎         | 14/500 [1:32:22<53:26:54, 395.92s/it][A

epoch: 14 avg_reward: 0.00223534107859 avg_qvalue: 0.539893984795



  3%|▎         | 15/500 [1:38:39<53:09:47, 394.61s/it][A

epoch: 15 avg_reward: 0.0025668817638 avg_qvalue: 0.537371993065



  3%|▎         | 16/500 [1:44:57<52:54:56, 393.59s/it][A

epoch: 16 avg_reward: 0.00343823162248 avg_qvalue: 0.535749733448



  3%|▎         | 17/500 [1:51:14<52:40:45, 392.64s/it][A

epoch: 17 avg_reward: 0.00300092045039 avg_qvalue: 0.532041728497



  4%|▎         | 18/500 [1:57:32<52:27:41, 391.83s/it][A

epoch: 18 avg_reward: 0.00309565012853 avg_qvalue: 0.536045968533



  4%|▍         | 19/500 [2:03:52<52:16:03, 391.19s/it][A

epoch: 19 avg_reward: 0.00370763823864 avg_qvalue: 0.529190123081



  4%|▍         | 20/500 [2:11:14<52:29:37, 393.70s/it][A

epoch: 20 avg_reward: 0.00274968619993 avg_qvalue: 0.534996330738



  4%|▍         | 21/500 [2:20:46<53:31:01, 402.22s/it][A

epoch: 21 avg_reward: 0.00269735432396 avg_qvalue: 0.534024775028



  4%|▍         | 22/500 [2:27:19<53:21:04, 401.81s/it][A

epoch: 22 avg_reward: 0.00315531635092 avg_qvalue: 0.532728612423



  5%|▍         | 23/500 [2:34:36<53:26:26, 403.33s/it][A

epoch: 23 avg_reward: 0.00254952033736 avg_qvalue: 0.534561157227



  5%|▍         | 24/500 [2:41:27<53:22:24, 403.67s/it][A

epoch: 24 avg_reward: 0.0033205437128 avg_qvalue: 0.532861828804



  5%|▌         | 25/500 [2:48:11<53:15:44, 403.67s/it][A

epoch: 25 avg_reward: 0.00304119900531 avg_qvalue: 0.537607908249



  5%|▌         | 26/500 [2:54:36<53:03:17, 402.95s/it][A

epoch: 26 avg_reward: 0.00384406942011 avg_qvalue: 0.532375574112



  5%|▌         | 27/500 [3:01:11<52:54:13, 402.65s/it][A

epoch: 27 avg_reward: 0.00314266173799 avg_qvalue: 0.532306373119



  6%|▌         | 28/500 [3:08:01<52:49:42, 402.93s/it][A

epoch: 28 avg_reward: 0.00360511901542 avg_qvalue: 0.534779906273



  6%|▌         | 29/500 [3:14:28<52:38:31, 402.36s/it][A

epoch: 29 avg_reward: 0.00275062151448 avg_qvalue: 0.532559096813



  6%|▌         | 30/500 [3:21:03<52:29:53, 402.11s/it][A

epoch: 30 avg_reward: 0.00365340724522 avg_qvalue: 0.528724968433



  6%|▌         | 31/500 [3:28:03<52:27:36, 402.68s/it][A

epoch: 31 avg_reward: 0.00337372537623 avg_qvalue: 0.529139637947



  6%|▋         | 32/500 [3:35:10<52:26:55, 403.45s/it][A

epoch: 32 avg_reward: 0.00339385996967 avg_qvalue: 0.530553877354



  7%|▋         | 33/500 [3:42:02<52:22:18, 403.72s/it][A

epoch: 33 avg_reward: 0.00312595848021 avg_qvalue: 0.531176447868



  7%|▋         | 34/500 [3:48:52<52:16:52, 403.89s/it][A

epoch: 34 avg_reward: 0.00289701733172 avg_qvalue: 0.532961249352



  7%|▋         | 35/500 [3:55:59<52:15:18, 404.56s/it][A

epoch: 35 avg_reward: 0.00359817949751 avg_qvalue: 0.530385136604



  7%|▋         | 36/500 [4:03:13<52:14:51, 405.37s/it][A

epoch: 36 avg_reward: 0.00333407517206 avg_qvalue: 0.5275426507



  7%|▋         | 37/500 [4:10:23<52:13:13, 406.03s/it][A

epoch: 37 avg_reward: 0.0041682582725 avg_qvalue: 0.530570328236



  8%|▊         | 38/500 [4:17:32<52:11:04, 406.63s/it][A

epoch: 38 avg_reward: 0.00407123400362 avg_qvalue: 0.528670668602



  8%|▊         | 39/500 [4:24:04<52:01:27, 406.26s/it][A

epoch: 39 avg_reward: 0.00419897049338 avg_qvalue: 0.527448177338



  8%|▊         | 40/500 [4:30:22<51:49:13, 405.55s/it][A

epoch: 40 avg_reward: 0.00394996155091 avg_qvalue: 0.526184797287



  8%|▊         | 41/500 [4:37:02<51:41:29, 405.42s/it][A

epoch: 41 avg_reward: 0.00363692868512 avg_qvalue: 0.528124690056



  8%|▊         | 42/500 [4:44:05<51:37:58, 405.85s/it][A

epoch: 42 avg_reward: 0.00393565188172 avg_qvalue: 0.526035964489



  9%|▊         | 43/500 [4:51:39<51:39:44, 406.97s/it][A

epoch: 43 avg_reward: 0.0044503651905 avg_qvalue: 0.528705954552



  9%|▉         | 44/500 [4:58:27<51:33:11, 407.00s/it][A

epoch: 44 avg_reward: 0.00419186827957 avg_qvalue: 0.529260396957



  9%|▉         | 45/500 [5:04:48<51:21:53, 406.40s/it][A

epoch: 45 avg_reward: 0.0039932312421 avg_qvalue: 0.522171139717



  9%|▉         | 46/500 [5:11:09<51:11:04, 405.87s/it][A

epoch: 46 avg_reward: 0.00385513429978 avg_qvalue: 0.523531556129



  9%|▉         | 47/500 [5:18:15<51:07:23, 406.28s/it][A

epoch: 47 avg_reward: 0.00391077879526 avg_qvalue: 0.524926364422



 10%|▉         | 48/500 [5:25:10<51:02:02, 406.47s/it][A

epoch: 48 avg_reward: 0.00406231977403 avg_qvalue: 0.520102620125



 10%|▉         | 49/500 [5:31:55<50:55:05, 406.44s/it][A

epoch: 49 avg_reward: 0.00396604860304 avg_qvalue: 0.520879209042



 10%|█         | 50/500 [5:38:48<50:49:17, 406.57s/it][A

epoch: 50 avg_reward: 0.00365944860386 avg_qvalue: 0.524547219276



 10%|█         | 51/500 [5:45:51<50:44:55, 406.90s/it][A

epoch: 51 avg_reward: 0.00496215412904 avg_qvalue: 0.520591676235



 10%|█         | 52/500 [5:52:45<50:39:10, 407.03s/it][A

epoch: 52 avg_reward: 0.00379228396622 avg_qvalue: 0.520338118076


KeyboardInterrupt: 

In [29]:
# Label used as the third argument to Experiment.execute and as the PDF name.
label = 'best_restricted'

# Hyper-parameters for this run.
# NOTE(review): the constructor call further down is commented out, so this
# dict is not applied by this cell; training continues on whatever `dqn`
# object already exists in the kernel. Confirm this is intended.
config = {
    'epsilon': 0.5,
    'gamma': 0.9,
    'target_update_limit': 100,
    'learning_rate': 0.00001,
    # 'max_iterations_per_episode': 500   (left disabled)
}

# dqn = DeepQNetwork(**config)
dqn.batch_size = 50

# Run the experiment; the 500 matches the progress-bar total in the recorded
# output (presumably the number of epochs -- confirm against Experiment.execute).
experiment = Experiment()
experiment.execute(500, dqn, label)

# Report the best result tracked by the experiment object.
print('Best iteration')
print('score: {} | qvalue: {} | model: {}'.format(
    experiment.best_score, experiment.best_qvalue, experiment.best_filename))

# Produce the PDF figure named after the experiment label.
generate_graphic('report/images/{}.pdf'.format(label), 0)



  0%|          | 0/500 [00:00<?, ?it/s][A[A

  0%|          | 1/500 [07:04<58:49:14, 424.36s/it][A[A

epoch: 1 avg_reward: 0.00362959594428 avg_qvalue: 0.496249377728




  0%|          | 2/500 [13:56<57:51:35, 418.26s/it][A[A

epoch: 2 avg_reward: 0.00453375502582 avg_qvalue: 0.520751833916




  1%|          | 3/500 [21:11<58:31:53, 423.97s/it][A[A

epoch: 3 avg_reward: 0.00478702166558 avg_qvalue: 0.516398072243




  1%|          | 4/500 [27:49<57:29:33, 417.28s/it][A[A

epoch: 4 avg_reward: 0.00518843767578 avg_qvalue: 0.518562734127




  1%|          | 5/500 [34:24<56:45:58, 412.85s/it][A[A

epoch: 5 avg_reward: 0.00447191511339 avg_qvalue: 0.518863618374




  1%|          | 6/500 [40:57<56:12:17, 409.59s/it][A[A

epoch: 6 avg_reward: 0.00419538507642 avg_qvalue: 0.522697389126




  1%|▏         | 7/500 [47:40<55:57:20, 408.60s/it][A[A

epoch: 7 avg_reward: 0.00421348904619 avg_qvalue: 0.522579908371




  2%|▏         | 8/500 [54:55<56:17:25, 411.88s/it][A[A

epoch: 8 avg_reward: 0.00408276301931 avg_qvalue: 0.520943522453




  2%|▏         | 9/500 [1:02:44<57:03:02, 418.29s/it][A[A

epoch: 9 avg_reward: 0.00480396779952 avg_qvalue: 0.519364953041




  2%|▏         | 10/500 [1:10:28<57:33:40, 422.90s/it][A[A

epoch: 10 avg_reward: 0.00434819140444 avg_qvalue: 0.520337879658




  2%|▏         | 11/500 [1:18:14<57:58:32, 426.81s/it][A[A

epoch: 11 avg_reward: 0.00479995968538 avg_qvalue: 0.521554470062




  2%|▏         | 12/500 [1:25:14<57:46:46, 426.24s/it][A[A

epoch: 12 avg_reward: 0.00420311114287 avg_qvalue: 0.518301069736




  3%|▎         | 13/500 [1:32:27<57:43:31, 426.72s/it][A[A

epoch: 13 avg_reward: 0.00451321186042 avg_qvalue: 0.516060233116




  3%|▎         | 14/500 [1:40:53<58:22:36, 432.42s/it][A[A

epoch: 14 avg_reward: 0.00483394353113 avg_qvalue: 0.516905248165




  3%|▎         | 15/500 [1:47:29<57:55:38, 429.98s/it][A[A

epoch: 15 avg_reward: 0.00427257160125 avg_qvalue: 0.517282009125




  3%|▎         | 16/500 [1:54:19<57:38:30, 428.74s/it][A[A

epoch: 16 avg_reward: 0.00492475829472 avg_qvalue: 0.513588726521




  3%|▎         | 17/500 [2:01:38<57:36:01, 429.32s/it][A[A

epoch: 17 avg_reward: 0.00409380051645 avg_qvalue: 0.517664432526




  4%|▎         | 18/500 [2:10:09<58:05:31, 433.88s/it][A[A

epoch: 18 avg_reward: 0.00405777397318 avg_qvalue: 0.520166039467




  4%|▍         | 19/500 [2:18:39<58:30:04, 437.85s/it][A[A

epoch: 19 avg_reward: 0.0040519694505 avg_qvalue: 0.520617842674




  4%|▍         | 20/500 [2:25:20<58:08:19, 436.04s/it][A[A

epoch: 20 avg_reward: 0.00418000487317 avg_qvalue: 0.51770067215




  4%|▍         | 21/500 [2:34:26<58:42:46, 441.27s/it][A[A

epoch: 21 avg_reward: 0.00437033212005 avg_qvalue: 0.518659889698




  4%|▍         | 22/500 [2:43:06<59:04:04, 444.86s/it][A[A

epoch: 22 avg_reward: 0.00474687528855 avg_qvalue: 0.516717255116


KeyboardInterrupt: 

### Best model - unrestricted
- target update 200
- gamma 0.99
- reward at end -100
- max iterations per episode 1000
- learning rate 0.0001

In [None]:
# Label used as the third argument to Experiment.execute and as the PDF name.
label = 'best_unrestricted'

# Hyper-parameters for the unrestricted run.
# NOTE(review): the markdown summary preceding this cell lists gamma 0.99 and
# learning rate 0.0001, while this dict holds 0.9 and 0.001 -- confirm which
# values are the intended ones.
# NOTE(review): the constructor call further down is commented out, so this
# dict is not applied by this cell; the run uses whatever `dqn` object already
# exists in the kernel. Confirm this is intended.
config = {
    'terminal_reward': -100,
    'gamma': 0.9,
    'target_update_limit': 200,
    'max_iterations_per_episode': 1000,
    'learning_rate': 0.001,
}

# dqn = DeepQNetwork(**config)

# Run the experiment with the existing network (500 is presumably the number
# of epochs -- confirm against Experiment.execute).
experiment = Experiment()
experiment.execute(500, dqn, label)

# Report the best result tracked by the experiment object.
print('Best iteration')
print('score: {} | qvalue: {} | model: {}'.format(
    experiment.best_score, experiment.best_qvalue, experiment.best_filename))

# Produce the PDF figure; the two trailing None arguments are passed through
# unchanged (their meaning is defined by generate_graphic, not visible here).
generate_graphic('report/images/{}.pdf'.format(label), 0, None, None)

### Generating the graphic for the previous model, setting the y_lim

### Render an episode using the best model found - iteration 090 from the previous model

In [33]:
# Build a fresh network for evaluation, overriding only the per-episode
# iteration cap with a very large value.
config = {'max_iterations_per_episode': 5000000}
dqn_test = DeepQNetwork(**config)

# Restore previously saved weights into the network's model.
# NOTE(review): the heading above mentions "iteration 090", but the file
# loaded here is 'boxing_3.h5' -- confirm this is the intended checkpoint.
dqn_test.model.load_weights('boxing_3.h5')

# Evaluate with arguments (1, False) and print the returned score.
score = dqn_test.run_test_average_reward(1, False)
print('Score: {}'.format(score))

Score: 0.00125680770842


In [34]:
# Rebind the notebook-level name `dqn` to the same object as `dqn_test`, so
# later cells that refer to `dqn` operate on the network loaded above.
dqn = dqn_test

In [35]:
# Call run_test_average_reward with (1000, True); as the cell's last
# expression, its return value would be displayed by the notebook.
# NOTE(review): the second argument presumably switches rendering on (the
# earlier scoring call passed False) -- confirm against the method definition.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, i.e.
# the run was stopped before it finished.
dqn.run_test_average_reward(1000, True)

KeyboardInterrupt: 