# Simple q-learning agent with experience replay

We re-write the Q-learning algorithm using _agentnet_, a helper library for lasagne that implements some RL techniques.

In [2]:
# ! sudo pip3 install --upgrade https://github.com/yandexdataschool/AgentNet/archive/master.zip

In [7]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
%env THEANO_FLAGS='floatX=float32'

#XVFB will be launched if you run on a server
import os
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY"))==0:
    !bash ../xvfb start
    %env DISPLAY=:1

env: THEANO_FLAGS='floatX=float32'


### Experiment setup
* Here we simply load the game and check that it works

In [8]:
import gym
def make_env():
    """Create a fresh LunarLander-v2 environment."""
    return gym.make("LunarLander-v2")


#create one environment to check that the game loads and to read its dimensions
env = make_env()
env.reset()

#shape of a single observation and the number of discrete actions
state_shape = env.observation_space.shape
n_actions = env.action_space.n

In [9]:
%matplotlib inline

In [10]:
# plt.imshow(env.render("rgb_array"))
# del env

# Neural Network body

In [11]:
import lasagne
from lasagne.layers import *
from lasagne.nonlinearities import elu


#observation at current tick goes here, shape = (batch,) + state_shape
observation_layer = InputLayer(shape=(None,) + state_shape)


#two dense hidden layers of 200 units with ELU nonlinearity
nn = DenseLayer(observation_layer, num_units=200, nonlinearity=elu)
nn = DenseLayer(nn, num_units=200, nonlinearity=elu)

#a linear output layer that predicts one Q-value per action
qvalues_layer = DenseLayer(
    nn,
    num_units=n_actions,
    nonlinearity=None,
    name="q-values",
)

Picking actions is done by yet another layer that implements an $\epsilon$-greedy policy

In [12]:
from agentnet.resolver import EpsilonGreedyResolver
#wrap the Q-values in a resolver layer that picks actions epsilon-greedily
action_layer = EpsilonGreedyResolver(qvalues_layer)

#set starting epsilon (the training loop overwrites this value on every iteration)
action_layer.epsilon.set_value(np.float32(0.05))


### Agent

We define an agent entirely composed of a lasagne network:
* Observations as InputLayer(s)
* Actions as intermediate Layer(s)
* `policy_estimators` is "whatever else you want to keep track of"

Each parameter can be either one layer or a list of layers

In [13]:
from agentnet.agent import Agent
#assemble the agent from the layers defined above: observations in, actions out,
#with the Q-values tracked as an additional policy estimator
agent = Agent(observation_layers=observation_layer,
              action_layers=action_layer,
              policy_estimators=qvalues_layer,)

In [14]:
#Since it's a single lasagne network, one can get its weights, output, etc
#collect all trainable parameters reachable from the action layer
weights = lasagne.layers.get_all_params(action_layer,trainable=True)
weights

[W, b, W, b, q-values.W, q-values.b]

# Create and manage a pool of game sessions to play with

* To make training more stable, we shall have an entire batch of game sessions, each happening independently of the others
* Why several parallel agents help training: http://arxiv.org/pdf/1602.01783v1.pdf
* Alternative approach: store more sessions: https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf

In [15]:
from agentnet.experiments.openai_gym.pool import EnvPool
#a pool that plays a single game (n_games=1) with the agent on environments built by make_env
#max_size=10000 presumably caps how many sessions the pool keeps -- confirm against the EnvPool docs
pool = EnvPool(agent,make_env,n_games=1,max_size=10000)

In [16]:
%%time
#interact for 5 ticks; keep the observation, action and reward logs and discard the other three outputs
obs_log,action_log,reward_log,_,_,_  = pool.interact(5)


print('actions:',action_log)
print('rewards:',reward_log)

actions: [[2 2 3 2 0]]
rewards: [[ 2.08304957  2.10279218 -3.0008554   0.89977308  0.        ]]
CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 7.59 ms


In [17]:
#we'll train on rollouts of 10 steps (required by n-step algorithms and rnns later)
SEQ_LENGTH=10

#load first sessions (this function calls interact and stores sessions in the pool)
#NOTE(review): update() is called here without append=True, while the training loop passes append=True.
#Verify whether these 100 warm-up rollouts accumulate in the pool or each call replaces the previous one.

for _ in range(100):
    pool.update(SEQ_LENGTH)

# q-learning

We shall now define a function that replays recent game sessions and updates network weights

In [18]:
#get agent's Qvalues obtained via experience replay
#sample a batch of 100 sessions from the pool's experience replay
replay = pool.experience_replay.sample_session_batch(100)
#run the agent over the sampled sessions and keep the last returned element
#(presumably the policy_estimators output, i.e. the Q-values -- confirm against Agent.get_sessions)
qvalues_seq = agent.get_sessions(
    replay,
    session_length=SEQ_LENGTH,
    experience_replay=True,
)[-1]



In [19]:
#loss for Qlearning = (Q(s,a) - (r+gamma*Q(s',a_max)))^2, like you implemented before in lasagne.

from agentnet.learning import qlearning
#1-step Q-learning objective with discount gamma=0.99, computed elementwise over the replayed sessions
#replay.actions[0] takes the first entry of the replayed actions
#(presumably one entry per action layer -- confirm against agentnet)
elwise_mse_loss = qlearning.get_elementwise_objective(qvalues_seq,
                                                      replay.actions[0],
                                                      replay.rewards,
                                                      replay.is_alive,
                                                      gamma_or_gammas=0.99,
                                                      n_steps=1,)

#compute mean loss over "alive" fragments
#(normalise by the number of alive ticks rather than by the full batch size)
loss = elwise_mse_loss.sum() / replay.is_alive.sum()

In [20]:
#get weight updates
#Adam updates for the trainable network weights with learning rate 1e-4
updates = lasagne.updates.adam(loss,weights,learning_rate=1e-4)

#compile train function
#train_step takes no arguments: it returns the loss and applies the updates on each call
import theano
train_step = theano.function([],loss,updates=updates)

# Demo run

Play full session with an untrained agent

In [21]:
#play an evaluation session with the untrained agent on the LunarLander-v2 environment
#and record a video into ./records (video recording needs a working display / xvfb)
untrained_reward = pool.evaluate(save_path="./records",record_video=True)

NoSuchDisplayException: Cannot connect to "None"

In [None]:
#show the most recently written video
from IPython.display import HTML
import os

#os.listdir returns names in arbitrary order, so sort them for a reproducible listing
video_names = sorted(filter(lambda s:s.endswith(".mp4"),os.listdir("./records/")))
if not video_names:
    raise FileNotFoundError("no .mp4 recordings found in ./records/ - run an evaluation with record_video=True first")

#pick the latest recording by modification time instead of relying on its position in the list
latest_video = max(video_names,key=lambda s:os.path.getmtime("./records/"+s))

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./records/"+latest_video))

# Training loop

In [22]:
epoch_counter = 1 #starting epoch; lives outside the loop cell, so re-running that cell continues the count
rewards = {} #full game rewards: epoch number -> mean evaluation score
target_score = -90 #training stops early once the mean evaluation score reaches this value

In [23]:
from tqdm import trange

for i in trange(10000):

    #play: collect 5 new rollouts of SEQ_LENGTH steps and append them to the pool
    for _ in range(5):
        pool.update(SEQ_LENGTH, append=True)

    #train: one call of the compiled update function
    train_step()

    #update epsilon: decays exponentially with the epoch number towards a floor of 0.05
    epsilon = 0.05 + 0.95 * np.exp(-epoch_counter / 1000.)
    action_layer.epsilon.set_value(np.float32(epsilon))

    #every 100 epochs play a few games for evaluation
    if epoch_counter % 100 == 0:
        mean_score = np.mean(pool.evaluate(n_games=3, record_video=False))
        rewards[epoch_counter] = mean_score
        print("iter=%i\tepsilon=%.3f" % (epoch_counter, action_layer.epsilon.get_value()))
        print("Current score(mean over %i) = %.3f" % (3, mean_score))

        #stop early once the evaluation score reaches the target
        if mean_score >= target_score:
            print("You win!")
            break

    epoch_counter += 1

    
# Time to drink some coffee!

  1%|          | 100/10000 [00:06<11:10, 14.77it/s]

Episode finished after 79 timesteps with reward=-441.95969930023665
Episode finished after 146 timesteps with reward=-71.83262219712016
Episode finished after 101 timesteps with reward=-128.38087044294406
iter=100	epsilon=0.910
Current score(mean over 3) = -214.058


  2%|▏         | 200/10000 [00:13<10:41, 15.28it/s]

Episode finished after 112 timesteps with reward=-298.76424061521357
Episode finished after 103 timesteps with reward=-136.14124145150865
Episode finished after 125 timesteps with reward=-217.09230631684966
iter=200	epsilon=0.828
Current score(mean over 3) = -217.333


  3%|▎         | 300/10000 [00:19<10:26, 15.48it/s]

Episode finished after 137 timesteps with reward=-377.5301666211878
Episode finished after 100 timesteps with reward=-259.19924500955756
Episode finished after 155 timesteps with reward=-107.68837388728367
iter=300	epsilon=0.754
Current score(mean over 3) = -248.139


  4%|▍         | 398/10000 [00:25<10:21, 15.45it/s]

Episode finished after 120 timesteps with reward=-195.54949491250431
Episode finished after 183 timesteps with reward=-388.2470758567677


  4%|▍         | 402/10000 [00:26<10:31, 15.21it/s]

Episode finished after 222 timesteps with reward=-35.7822438152124
iter=400	epsilon=0.687
Current score(mean over 3) = -206.526


  5%|▌         | 500/10000 [00:33<10:33, 15.00it/s]

Episode finished after 91 timesteps with reward=-55.6156014798249
Episode finished after 140 timesteps with reward=-419.43475808352235
Episode finished after 183 timesteps with reward=-421.72163809068667
iter=500	epsilon=0.626
Current score(mean over 3) = -298.924


  6%|▌         | 600/10000 [00:40<10:28, 14.95it/s]

Episode finished after 212 timesteps with reward=-275.06556552357256
Episode finished after 115 timesteps with reward=-239.39913840916105
Episode finished after 135 timesteps with reward=-303.9075102495626
iter=600	epsilon=0.571
Current score(mean over 3) = -272.791


  7%|▋         | 699/10000 [00:49<10:59, 14.10it/s]

Episode finished after 79 timesteps with reward=-163.290676002915
Episode finished after 125 timesteps with reward=-425.4291776207567
Episode finished after 142 timesteps with reward=-313.26225423707103
iter=700	epsilon=0.522
Current score(mean over 3) = -300.661


  8%|▊         | 799/10000 [00:57<11:00, 13.93it/s]

Episode finished after 130 timesteps with reward=-559.6404466527156
Episode finished after 104 timesteps with reward=-451.83255122949464
Episode finished after 137 timesteps with reward=-409.3149972982682
iter=800	epsilon=0.477
Current score(mean over 3) = -473.596


  9%|▉         | 899/10000 [01:04<10:57, 13.84it/s]

Episode finished after 193 timesteps with reward=-766.9796664528284
Episode finished after 128 timesteps with reward=-241.3339114945503
Episode finished after 174 timesteps with reward=-348.31660855965885
iter=900	epsilon=0.436
Current score(mean over 3) = -452.210


 10%|▉         | 999/10000 [01:12<10:52, 13.80it/s]

Episode finished after 235 timesteps with reward=-360.87148486911104
Episode finished after 133 timesteps with reward=-382.620981445247


 10%|█         | 1001/10000 [01:13<10:57, 13.69it/s]

Episode finished after 337 timesteps with reward=-752.8922267820135
iter=1000	epsilon=0.399
Current score(mean over 3) = -498.795


 11%|█         | 1100/10000 [01:21<10:57, 13.54it/s]

Episode finished after 86 timesteps with reward=-260.67099932813517
Episode finished after 203 timesteps with reward=-838.6586310261279
Episode finished after 125 timesteps with reward=-258.2166842110545
iter=1100	epsilon=0.366
Current score(mean over 3) = -452.515


 12%|█▏        | 1199/10000 [01:29<10:55, 13.43it/s]

Episode finished after 259 timesteps with reward=-429.05695962318435
Episode finished after 239 timesteps with reward=-493.88008610299556


 12%|█▏        | 1201/10000 [01:30<11:00, 13.32it/s]

Episode finished after 442 timesteps with reward=-639.9591937247586
iter=1200	epsilon=0.336
Current score(mean over 3) = -520.965


 13%|█▎        | 1299/10000 [01:37<10:54, 13.29it/s]

Episode finished after 298 timesteps with reward=-384.6185143671015
Episode finished after 284 timesteps with reward=-338.54666568985755
Episode finished after 209 timesteps with reward=-288.904743686449
iter=1300	epsilon=0.309
Current score(mean over 3) = -337.357


 14%|█▍        | 1399/10000 [01:46<10:56, 13.10it/s]

Episode finished after 295 timesteps with reward=-388.194877375453
Episode finished after 454 timesteps with reward=-156.03969222137366
Episode finished after 217 timesteps with reward=-370.36952709219145
iter=1400	epsilon=0.284
Current score(mean over 3) = -304.868


 15%|█▍        | 1499/10000 [01:55<10:55, 12.97it/s]

Episode finished after 360 timesteps with reward=-377.26501691645507
Episode finished after 125 timesteps with reward=-301.741483701593


 15%|█▌        | 1501/10000 [01:56<10:58, 12.91it/s]

Episode finished after 224 timesteps with reward=-139.08573199819475
iter=1500	epsilon=0.262
Current score(mean over 3) = -272.697


 16%|█▌        | 1598/10000 [02:04<10:52, 12.87it/s]

Episode finished after 190 timesteps with reward=-497.18963770526676
Episode finished after 183 timesteps with reward=-513.658107444661


 16%|█▌        | 1602/10000 [02:04<10:55, 12.82it/s]

Episode finished after 215 timesteps with reward=-572.5500309385845
iter=1600	epsilon=0.242
Current score(mean over 3) = -527.799


 17%|█▋        | 1698/10000 [02:12<10:48, 12.81it/s]

Episode finished after 219 timesteps with reward=-348.21639441622
Episode finished after 218 timesteps with reward=-157.473172476024


 17%|█▋        | 1700/10000 [02:13<10:50, 12.76it/s]

Episode finished after 229 timesteps with reward=-339.23780156216117
iter=1700	epsilon=0.224
Current score(mean over 3) = -281.642


 18%|█▊        | 1799/10000 [02:21<10:44, 12.72it/s]

Episode finished after 229 timesteps with reward=-309.56875747121455


 18%|█▊        | 1801/10000 [02:22<10:46, 12.68it/s]

Episode finished after 323 timesteps with reward=-145.7889067727263
Episode finished after 108 timesteps with reward=-193.8341601510197
iter=1800	epsilon=0.207
Current score(mean over 3) = -216.397


 19%|█▉        | 1899/10000 [02:30<10:43, 12.60it/s]

Episode finished after 89 timesteps with reward=-256.8669391141574
Episode finished after 259 timesteps with reward=-219.0456209802516

 19%|█▉        | 1901/10000 [02:31<10:46, 12.52it/s]


Episode finished after 530 timesteps with reward=-369.55971541967
iter=1900	epsilon=0.192
Current score(mean over 3) = -281.824


 20%|█▉        | 1999/10000 [02:39<10:39, 12.50it/s]

Episode finished after 380 timesteps with reward=-173.95075943209827
Episode finished after 239 timesteps with reward=-294.76917995564804


 20%|██        | 2001/10000 [02:40<10:42, 12.45it/s]

Episode finished after 310 timesteps with reward=-368.6143106199293
iter=2000	epsilon=0.179
Current score(mean over 3) = -279.111


 21%|██        | 2099/10000 [02:50<10:42, 12.30it/s]

Episode finished after 280 timesteps with reward=-129.47505931926406
Episode finished after 263 timesteps with reward=-161.53105410009127

 21%|██        | 2101/10000 [02:51<10:45, 12.23it/s]


Episode finished after 459 timesteps with reward=-477.0281671571164
iter=2100	epsilon=0.166
Current score(mean over 3) = -256.011


 22%|██▏       | 2199/10000 [02:59<10:37, 12.23it/s]

Episode finished after 140 timesteps with reward=-250.3466026963432


 22%|██▏       | 2201/10000 [03:00<10:40, 12.17it/s]

Episode finished after 589 timesteps with reward=-394.0781825910061
Episode finished after 73 timesteps with reward=-565.4531230207841
iter=2200	epsilon=0.155
Current score(mean over 3) = -403.293


 23%|██▎       | 2299/10000 [03:09<10:33, 12.15it/s]

Episode finished after 76 timesteps with reward=-506.9152110730589
Episode finished after 175 timesteps with reward=-297.06476609744567
Episode finished after 75 timesteps with reward=-375.00278860067743
iter=2300	epsilon=0.145
Current score(mean over 3) = -392.994


 24%|██▍       | 2401/10000 [03:17<10:25, 12.15it/s]

Episode finished after 275 timesteps with reward=-281.30560544090093
Episode finished after 86 timesteps with reward=-614.7039301921369
Episode finished after 161 timesteps with reward=-287.288680961099
iter=2400	epsilon=0.136
Current score(mean over 3) = -394.433


 25%|██▍       | 2499/10000 [03:26<10:18, 12.12it/s]

Episode finished after 130 timesteps with reward=-215.39393249600417
Episode finished after 188 timesteps with reward=-380.9764725964161


 25%|██▌       | 2501/10000 [03:26<10:19, 12.10it/s]

Episode finished after 210 timesteps with reward=-357.65517416922427
iter=2500	epsilon=0.128
Current score(mean over 3) = -318.009


 26%|██▌       | 2599/10000 [03:35<10:14, 12.04it/s]

Episode finished after 543 timesteps with reward=-439.85352262585


 26%|██▌       | 2600/10000 [03:37<10:18, 11.97it/s]

Episode finished after 303 timesteps with reward=-295.6840594094733
Episode finished after 87 timesteps with reward=-435.77665098814543
iter=2600	epsilon=0.121
Current score(mean over 3) = -390.438


 27%|██▋       | 2698/10000 [03:47<10:15, 11.86it/s]

Episode finished after 180 timesteps with reward=-122.89573950641228
Episode finished after 276 timesteps with reward=-159.47100003599576


 27%|██▋       | 2700/10000 [03:48<10:18, 11.80it/s]

Episode finished after 609 timesteps with reward=-402.71587488941543
iter=2700	epsilon=0.114
Current score(mean over 3) = -228.361


 28%|██▊       | 2799/10000 [03:58<10:13, 11.73it/s]

Episode finished after 447 timesteps with reward=-307.26210257457603
Episode finished after 351 timesteps with reward=-344.63265452094265


 28%|██▊       | 2801/10000 [03:59<10:16, 11.67it/s]

Episode finished after 378 timesteps with reward=-186.92772284008305
iter=2800	epsilon=0.108
Current score(mean over 3) = -279.607


 29%|██▉       | 2899/10000 [04:10<10:12, 11.58it/s]

Episode finished after 222 timesteps with reward=-320.91433237603553


 29%|██▉       | 2900/10000 [04:11<10:14, 11.55it/s]

Episode finished after 333 timesteps with reward=-266.1636924642653
Episode finished after 279 timesteps with reward=-369.7403344353286
iter=2900	epsilon=0.102
Current score(mean over 3) = -318.939


 30%|██▉       | 2999/10000 [04:20<10:09, 11.49it/s]

Episode finished after 1000 timesteps with reward=-211.69967904628868
Episode finished after 289 timesteps with reward=-152.8934446268927


 30%|███       | 3001/10000 [04:23<10:14, 11.39it/s]

Episode finished after 262 timesteps with reward=-113.29642804942245
iter=3000	epsilon=0.097
Current score(mean over 3) = -159.297


 31%|███       | 3099/10000 [04:33<10:09, 11.32it/s]

Episode finished after 229 timesteps with reward=-231.41927443177678
Episode finished after 236 timesteps with reward=-199.07134678341143


 31%|███       | 3101/10000 [04:34<10:10, 11.29it/s]

Episode finished after 308 timesteps with reward=-267.34054617965757
iter=3100	epsilon=0.093
Current score(mean over 3) = -232.610


 32%|███▏      | 3198/10000 [04:44<10:04, 11.25it/s]

Episode finished after 191 timesteps with reward=-183.10507980858222
Episode finished after 314 timesteps with reward=-280.3408303794349


 32%|███▏      | 3202/10000 [04:45<10:06, 11.21it/s]

Episode finished after 353 timesteps with reward=-156.47724566199196
iter=3200	epsilon=0.089
Current score(mean over 3) = -206.641


 33%|███▎      | 3299/10000 [04:54<09:58, 11.19it/s]

Episode finished after 103 timesteps with reward=-612.2737265081734
Episode finished after 158 timesteps with reward=-282.98161519005504


 33%|███▎      | 3301/10000 [04:55<09:59, 11.18it/s]

Episode finished after 235 timesteps with reward=-659.1347825837418
iter=3300	epsilon=0.085
Current score(mean over 3) = -518.130


 34%|███▍      | 3399/10000 [05:04<09:52, 11.15it/s]

Episode finished after 210 timesteps with reward=-381.06128746026695
Episode finished after 291 timesteps with reward=-402.6488101272856


 34%|███▍      | 3401/10000 [05:05<09:53, 11.11it/s]

Episode finished after 406 timesteps with reward=-416.24050153827136
iter=3400	epsilon=0.082
Current score(mean over 3) = -399.984


 35%|███▍      | 3498/10000 [05:14<09:45, 11.11it/s]

Episode finished after 407 timesteps with reward=-359.43014104615173
Episode finished after 218 timesteps with reward=-418.06246302769796


 35%|███▌      | 3502/10000 [05:15<09:46, 11.09it/s]

Episode finished after 182 timesteps with reward=-189.43049922838514
iter=3500	epsilon=0.079
Current score(mean over 3) = -322.308


 36%|███▌      | 3600/10000 [05:25<09:38, 11.06it/s]

Episode finished after 90 timesteps with reward=-358.8200778178852
Episode finished after 104 timesteps with reward=-271.4342826594208
Episode finished after 104 timesteps with reward=-336.3758817930734
iter=3600	epsilon=0.076
Current score(mean over 3) = -322.210


 37%|███▋      | 3698/10000 [05:34<09:29, 11.07it/s]

Episode finished after 243 timesteps with reward=-318.233822260948
Episode finished after 173 timesteps with reward=-225.12555854991473


 37%|███▋      | 3701/10000 [05:34<09:30, 11.05it/s]

Episode finished after 231 timesteps with reward=-673.0015483467237
iter=3700	epsilon=0.073
Current score(mean over 3) = -405.454


 38%|███▊      | 3799/10000 [05:44<09:21, 11.04it/s]

Episode finished after 124 timesteps with reward=-209.17139695440414
Episode finished after 158 timesteps with reward=-386.98669283720994


 38%|███▊      | 3801/10000 [05:44<09:22, 11.03it/s]

Episode finished after 236 timesteps with reward=-323.4167184746407
iter=3800	epsilon=0.071
Current score(mean over 3) = -306.525


 39%|███▉      | 3899/10000 [05:53<09:13, 11.03it/s]

Episode finished after 230 timesteps with reward=-277.8305137231166


 39%|███▉      | 3901/10000 [05:54<09:13, 11.01it/s]

Episode finished after 300 timesteps with reward=-372.00380955121904
Episode finished after 156 timesteps with reward=-377.0090426084344
iter=3900	epsilon=0.069
Current score(mean over 3) = -342.281


 40%|███▉      | 3999/10000 [06:02<09:03, 11.03it/s]

Episode finished after 193 timesteps with reward=-295.42607704459857
Episode finished after 232 timesteps with reward=-617.7313113468892


 40%|████      | 4001/10000 [06:03<09:04, 11.02it/s]

Episode finished after 132 timesteps with reward=-295.77347165550395
iter=4000	epsilon=0.067
Current score(mean over 3) = -402.977


 41%|████      | 4098/10000 [06:11<08:55, 11.02it/s]

Episode finished after 288 timesteps with reward=-407.4796930060911
Episode finished after 206 timesteps with reward=-287.20069440888653


 41%|████      | 4102/10000 [06:12<08:55, 11.01it/s]

Episode finished after 228 timesteps with reward=-282.5328091229035
iter=4100	epsilon=0.066
Current score(mean over 3) = -325.738


 42%|████▏     | 4199/10000 [06:20<08:46, 11.03it/s]

Episode finished after 231 timesteps with reward=-359.252122183015
Episode finished after 261 timesteps with reward=-338.1436822439739

 42%|████▏     | 4201/10000 [06:21<08:46, 11.01it/s]


Episode finished after 245 timesteps with reward=-390.10594896538777
iter=4200	epsilon=0.064
Current score(mean over 3) = -362.501


 43%|████▎     | 4299/10000 [06:30<08:37, 11.02it/s]

Episode finished after 171 timesteps with reward=-219.47993362669553
Episode finished after 305 timesteps with reward=-256.66385988815813
Episode finished after 246 timesteps with reward=-395.6513169291004
iter=4300	epsilon=0.063
Current score(mean over 3) = -290.598


 44%|████▍     | 4399/10000 [06:39<08:28, 11.02it/s]

Episode finished after 179 timesteps with reward=-329.84342964198873
Episode finished after 201 timesteps with reward=-367.2267412128736


 44%|████▍     | 4401/10000 [06:39<08:28, 11.01it/s]

Episode finished after 223 timesteps with reward=-298.9196155000319
iter=4400	epsilon=0.062
Current score(mean over 3) = -331.997


 45%|████▍     | 4499/10000 [06:48<08:19, 11.01it/s]

Episode finished after 174 timesteps with reward=-201.8920995908204
Episode finished after 233 timesteps with reward=-347.3401344114794


 45%|████▌     | 4501/10000 [06:49<08:20, 11.00it/s]

Episode finished after 209 timesteps with reward=-439.6341414051181
iter=4500	epsilon=0.061
Current score(mean over 3) = -329.622


 46%|████▌     | 4599/10000 [06:58<08:11, 10.99it/s]

Episode finished after 140 timesteps with reward=-263.05211778728676
Episode finished after 201 timesteps with reward=-209.247466827802


 46%|████▌     | 4601/10000 [06:58<08:11, 10.98it/s]

Episode finished after 186 timesteps with reward=-179.7833971078879
iter=4600	epsilon=0.060
Current score(mean over 3) = -217.361


 47%|████▋     | 4699/10000 [07:07<08:02, 11.00it/s]

Episode finished after 101 timesteps with reward=-644.1553687551852
Episode finished after 100 timesteps with reward=-585.2660739124266
Episode finished after 165 timesteps with reward=-314.45623654265626
iter=4700	epsilon=0.059
Current score(mean over 3) = -514.626


 48%|████▊     | 4799/10000 [07:15<07:52, 11.02it/s]

Episode finished after 146 timesteps with reward=-462.87355035418705
Episode finished after 157 timesteps with reward=-475.96426572510205
Episode finished after 127 timesteps with reward=-592.930194457377
iter=4800	epsilon=0.058
Current score(mean over 3) = -510.589


 49%|████▉     | 4899/10000 [07:23<07:42, 11.04it/s]

Episode finished after 134 timesteps with reward=-205.38456348674617
Episode finished after 165 timesteps with reward=-310.9774101534495


 49%|████▉     | 4901/10000 [07:24<07:42, 11.03it/s]

Episode finished after 169 timesteps with reward=-320.02279591650836
iter=4900	epsilon=0.057
Current score(mean over 3) = -278.795


 50%|████▉     | 4999/10000 [07:32<07:32, 11.06it/s]

Episode finished after 124 timesteps with reward=-317.64418881229693
Episode finished after 102 timesteps with reward=-295.73916185926817
Episode finished after 164 timesteps with reward=-264.4119316660196
iter=5000	epsilon=0.056
Current score(mean over 3) = -292.598


 51%|█████     | 5099/10000 [07:40<07:22, 11.08it/s]

Episode finished after 271 timesteps with reward=-342.2031062092173
Episode finished after 224 timesteps with reward=-291.4975983612311
Episode finished after 106 timesteps with reward=-213.15733782031344
iter=5100	epsilon=0.056
Current score(mean over 3) = -282.286


 52%|█████▏    | 5199/10000 [07:49<07:13, 11.06it/s]

Episode finished after 191 timesteps with reward=-102.49448413045457
Episode finished after 244 timesteps with reward=-285.707045727882


 52%|█████▏    | 5201/10000 [07:50<07:14, 11.05it/s]

Episode finished after 199 timesteps with reward=-346.06729934918013
iter=5200	epsilon=0.055
Current score(mean over 3) = -244.756


 53%|█████▎    | 5299/10000 [07:59<07:05, 11.05it/s]

Episode finished after 214 timesteps with reward=-270.5000032261609
Episode finished after 253 timesteps with reward=-307.10601122295435


 53%|█████▎    | 5301/10000 [08:01<07:06, 11.01it/s]

Episode finished after 812 timesteps with reward=-338.5008595075662
iter=5300	epsilon=0.055
Current score(mean over 3) = -305.369


 54%|█████▍    | 5398/10000 [08:10<06:58, 11.00it/s]

Episode finished after 270 timesteps with reward=-310.52016784345983


 54%|█████▍    | 5400/10000 [08:11<06:58, 10.98it/s]

Episode finished after 328 timesteps with reward=-213.21933271808723
Episode finished after 243 timesteps with reward=-301.41197153921894
iter=5400	epsilon=0.054
Current score(mean over 3) = -275.050


 55%|█████▍    | 5499/10000 [08:21<06:50, 10.97it/s]

Episode finished after 207 timesteps with reward=-316.1527364135561
Episode finished after 197 timesteps with reward=-350.93280639630564


 55%|█████▌    | 5501/10000 [08:21<06:50, 10.96it/s]

Episode finished after 336 timesteps with reward=-283.70968481110253
iter=5500	epsilon=0.054
Current score(mean over 3) = -316.932


 56%|█████▌    | 5598/10000 [08:31<06:41, 10.95it/s]

Episode finished after 501 timesteps with reward=-332.1554119260449


 56%|█████▌    | 5602/10000 [08:32<06:42, 10.92it/s]

Episode finished after 250 timesteps with reward=-318.140797271921
Episode finished after 393 timesteps with reward=-288.88996782382753
iter=5600	epsilon=0.054
Current score(mean over 3) = -313.062


 57%|█████▋    | 5699/10000 [08:42<06:34, 10.92it/s]

Episode finished after 330 timesteps with reward=-312.9219011574643
Episode finished after 234 timesteps with reward=-271.1268695077351


 57%|█████▋    | 5701/10000 [08:42<06:34, 10.90it/s]

Episode finished after 234 timesteps with reward=-253.2830811732149
iter=5700	epsilon=0.053
Current score(mean over 3) = -279.111


 58%|█████▊    | 5799/10000 [08:52<06:25, 10.90it/s]

Episode finished after 182 timesteps with reward=-273.12989828365306
Episode finished after 257 timesteps with reward=-318.6179651656111


 58%|█████▊    | 5802/10000 [08:52<06:25, 10.89it/s]

Episode finished after 132 timesteps with reward=-195.13432955270088
iter=5800	epsilon=0.053
Current score(mean over 3) = -262.294


 59%|█████▉    | 5899/10000 [09:01<06:16, 10.89it/s]

Episode finished after 416 timesteps with reward=-305.9962855936242
Episode finished after 735 timesteps with reward=-339.10789949795355


 59%|█████▉    | 5901/10000 [09:03<06:17, 10.86it/s]

Episode finished after 342 timesteps with reward=-208.06513307650587
iter=5900	epsilon=0.053
Current score(mean over 3) = -284.390


 60%|█████▉    | 5999/10000 [09:14<06:09, 10.82it/s]

Episode finished after 300 timesteps with reward=-309.3152890771852
Episode finished after 211 timesteps with reward=-172.75475730184203


 60%|██████    | 6001/10000 [09:15<06:09, 10.81it/s]

Episode finished after 214 timesteps with reward=-301.17907579262874
iter=6000	epsilon=0.052
Current score(mean over 3) = -261.083


 61%|██████    | 6098/10000 [09:24<06:01, 10.81it/s]

Episode finished after 308 timesteps with reward=-296.7206941704852
Episode finished after 265 timesteps with reward=-284.53333011412064


 61%|██████    | 6102/10000 [09:25<06:01, 10.79it/s]

Episode finished after 402 timesteps with reward=-264.6480848865682
iter=6100	epsilon=0.052
Current score(mean over 3) = -281.967


 62%|██████▏   | 6198/10000 [09:33<05:51, 10.81it/s]

Episode finished after 217 timesteps with reward=-215.46693923929365
Episode finished after 236 timesteps with reward=-244.16622638471745


 62%|██████▏   | 6202/10000 [09:34<05:51, 10.80it/s]

Episode finished after 170 timesteps with reward=-271.69536892047813
iter=6200	epsilon=0.052
Current score(mean over 3) = -243.776


 63%|██████▎   | 6298/10000 [09:42<05:42, 10.82it/s]

Episode finished after 166 timesteps with reward=-251.54399493161625
Episode finished after 157 timesteps with reward=-224.90681206259285


 63%|██████▎   | 6301/10000 [09:43<05:42, 10.81it/s]

Episode finished after 444 timesteps with reward=-236.80369363684497
iter=6300	epsilon=0.052
Current score(mean over 3) = -237.752


 64%|██████▍   | 6399/10000 [09:52<05:33, 10.80it/s]

Episode finished after 322 timesteps with reward=-252.4399785232927
Episode finished after 279 timesteps with reward=-211.9275868660285
Episode finished after 295 timesteps with reward=-281.5981082354432
iter=6400	epsilon=0.052
Current score(mean over 3) = -248.655


 65%|██████▍   | 6499/10000 [10:01<05:24, 10.80it/s]

Episode finished after 262 timesteps with reward=-260.39381929434444
Episode finished after 257 timesteps with reward=-91.07658277151432


 65%|██████▌   | 6501/10000 [10:03<05:24, 10.78it/s]

Episode finished after 543 timesteps with reward=-162.6275399390429
iter=6500	epsilon=0.051
Current score(mean over 3) = -171.366


 66%|██████▌   | 6599/10000 [10:12<05:15, 10.78it/s]

Episode finished after 365 timesteps with reward=-254.9198131622042
Episode finished after 248 timesteps with reward=-262.2857555523427


 66%|██████▌   | 6601/10000 [10:12<05:15, 10.77it/s]

Episode finished after 207 timesteps with reward=-237.9502472938801
iter=6600	epsilon=0.051
Current score(mean over 3) = -251.719


 67%|██████▋   | 6700/10000 [10:22<05:06, 10.76it/s]

Episode finished after 150 timesteps with reward=-205.65778171023538
Episode finished after 119 timesteps with reward=-203.56142356757687
Episode finished after 178 timesteps with reward=-239.46301238394548
iter=6700	epsilon=0.051
Current score(mean over 3) = -216.227


 68%|██████▊   | 6799/10000 [10:31<04:57, 10.77it/s]

Episode finished after 248 timesteps with reward=-245.69516405955173
Episode finished after 211 timesteps with reward=-286.31993550515756


 68%|██████▊   | 6801/10000 [10:32<04:57, 10.76it/s]

Episode finished after 186 timesteps with reward=-169.5191135945633
iter=6800	epsilon=0.051
Current score(mean over 3) = -233.845


 69%|██████▉   | 6900/10000 [10:41<04:48, 10.75it/s]

Episode finished after 368 timesteps with reward=-192.8561670759682
Episode finished after 171 timesteps with reward=-187.17665563208095
Episode finished after 170 timesteps with reward=-188.95035882289648
iter=6900	epsilon=0.051
Current score(mean over 3) = -189.661


 70%|██████▉   | 6999/10000 [10:50<04:38, 10.76it/s]

Episode finished after 258 timesteps with reward=-105.87463860969663
Episode finished after 389 timesteps with reward=-263.5028954774158
Episode finished after 272 timesteps with reward=-233.64738422927908
iter=7000	epsilon=0.051
Current score(mean over 3) = -201.008


 71%|███████   | 7098/10000 [11:02<04:30, 10.72it/s]

Episode finished after 482 timesteps with reward=-101.84419460861025
Episode finished after 465 timesteps with reward=-286.5816394684949


 71%|███████   | 7101/10000 [11:04<04:31, 10.69it/s]

Episode finished after 522 timesteps with reward=-270.574875981312
iter=7100	epsilon=0.051
Current score(mean over 3) = -219.667


 72%|███████▏  | 7198/10000 [11:13<04:22, 10.68it/s]

Episode finished after 250 timesteps with reward=-154.6346885599561
Episode finished after 202 timesteps with reward=-151.24727943222067


 72%|███████▏  | 7201/10000 [11:14<04:22, 10.67it/s]

Episode finished after 363 timesteps with reward=-190.1458377191833
iter=7200	epsilon=0.051
Current score(mean over 3) = -165.343


 73%|███████▎  | 7298/10000 [11:24<04:13, 10.67it/s]

Episode finished after 938 timesteps with reward=-257.33109908534806
Episode finished after 420 timesteps with reward=-223.61311912130355


 73%|███████▎  | 7301/10000 [11:27<04:14, 10.62it/s]

Episode finished after 839 timesteps with reward=-395.53022643675337
iter=7300	epsilon=0.051
Current score(mean over 3) = -292.158


 74%|███████▍  | 7399/10000 [11:36<04:04, 10.62it/s]

Episode finished after 223 timesteps with reward=-175.83647337965758
Episode finished after 276 timesteps with reward=-160.7334003328645


 74%|███████▍  | 7402/10000 [11:37<04:04, 10.60it/s]

Episode finished after 685 timesteps with reward=56.062213750430274
iter=7400	epsilon=0.051
Current score(mean over 3) = -93.503


 75%|███████▍  | 7498/10000 [11:48<03:56, 10.58it/s]

Episode finished after 168 timesteps with reward=-225.9125925541647
Episode finished after 512 timesteps with reward=-203.74935171968144


 75%|███████▌  | 7501/10000 [11:50<03:56, 10.56it/s]

Episode finished after 684 timesteps with reward=-222.9689760026824
iter=7500	epsilon=0.051
Current score(mean over 3) = -217.544


 76%|███████▌  | 7599/10000 [12:03<03:48, 10.51it/s]

Episode finished after 280 timesteps with reward=-238.38173523841252
Episode finished after 150 timesteps with reward=-93.89764346182956


 76%|███████▌  | 7601/10000 [12:03<03:48, 10.50it/s]

Episode finished after 183 timesteps with reward=15.998160811028328
iter=7600	epsilon=0.050
Current score(mean over 3) = -105.427


 77%|███████▋  | 7698/10000 [12:13<03:39, 10.49it/s]

Episode finished after 173 timesteps with reward=-201.42317433688106
Episode finished after 381 timesteps with reward=84.65304620239735
Episode finished after 344 timesteps with reward=-151.55329146464157
iter=7700	epsilon=0.050
Current score(mean over 3) = -89.441
You win!


 77%|███████▋  | 7698/10000 [12:31<03:44, 10.25it/s]

In [None]:
import pandas as pd

#order the evaluation scores by iteration number
#(tuple-unpacking lambdas such as `lambda (k,v): k` are a SyntaxError in Python 3)
iters,session_rewards=zip(*sorted(rewards.items(),key=lambda item:item[0]))

#exponentially weighted moving average of the scores;
#pandas.ewma was removed from pandas, Series.ewm(span=...).mean() is its replacement
smoothed_rewards = pd.Series(session_rewards).ewm(span=10).mean()

plt.plot(iters,smoothed_rewards)
plt.xlabel("iteration")
plt.ylabel("mean evaluation reward (EWMA, span=10)");

In [None]:
import os
from IPython.display import HTML, display

#play 10 evaluation games with the trained agent and record videos into ./records
final_reward = pool.evaluate(n_games=10,save_path="./records",record_video=True)

#report the mean over the evaluation games (averaged the same way as in the training loop)
print("average reward:",np.mean(final_reward))

#os.listdir returns names in arbitrary order, so sort them for a reproducible listing
video_names = sorted(filter(lambda s:s.endswith(".mp4"),os.listdir("./records/")))

#a bare HTML(...) expression inside a loop is never rendered, so each video is passed to display() explicitly
for video_name in video_names:
    display(HTML("""
    <video width="640" height="480" controls>
      <source src="{}" type="video/mp4">
    </video>
    """.format("./records/"+video_name)))