In [1]:
import gym
import numpy as np


In [2]:
%matplotlib inline
from matplotlib import animation
import matplotlib.pyplot as plt
from IPython.display import display, HTML

def display_frames_as_gif(frames):
    """Show a sequence of image frames as an inline JavaScript animation.

    Args:
        frames: sequence of image arrays accepted by ``plt.imshow``. The first
            frame's ``shape[0]`` / ``shape[1]`` determine the figure size;
            the remaining frames are presumably the same shape (not checked).

    Returns:
        None. The animation is emitted through ``IPython.display.display``.
        Nothing is displayed when ``frames`` is empty.
    """
    if len(frames) == 0:
        return  # nothing to animate; frames[0] below would raise IndexError

    first = frames[0]
    # Keep an explicit handle on the figure instead of relying on plt.gcf()
    # and on the animation object's private ``_fig`` attribute.
    fig = plt.figure(figsize=(first.shape[1] / 72.0, first.shape[0] / 72.0), dpi=144)
    patch = plt.imshow(first)
    plt.axis('off')

    def animate(i):
        # Reuse the single image artist and only swap its pixel data.
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(fig, animate, frames=len(frames), interval=50)
    # Close the figure before rendering the HTML; presumably this keeps the
    # inline backend from also showing a static copy of the first frame.
    plt.close(fig)
    display(HTML(anim.to_jshtml()))

from gym.wrappers import AtariPreprocessing
# NOTE(review): this only sets an attribute on the imported gym module. The
# loops below still unpack four values from env.step(), so it presumably has
# no effect on the step API actually in use -- confirm against the installed
# gym version.
gym.new_step_api=True
# Pong environment; handed to train_model / play_game by the calling cells.
env = gym.make('Pong-v0')

# Policy network parameters: W1 maps the D-element input to H hidden units,
# W2 maps the hidden units to a single output (see policy_forward).
H = 400 # number of hidden layer neurons
D = 80 * 80 # input dimensionality: 80x80 grid
model = {}
model['W1'] = np.random.randn(H,D) / np.sqrt(D) # "Xavier" initialization
model['W2'] = np.random.randn(H) / np.sqrt(H)
# hyperparameters
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
# Both dicts mirror the keys and shapes of `model` and start at zero; they are
# module-level state that train_model reads and updates in place.
grad_buffer = { k : np.zeros_like(v) for k,v in model.items() } # update buffers that add up gradients over a batch
rmsprop_cache = { k : np.zeros_like(v) for k,v in model.items() } # rmsprop memory

def sigmoid(x):
  """Logistic "squashing" function mapping real values into the interval [0, 1].

  Args:
    x: a float or NumPy array.

  Returns:
    ``1 / (1 + exp(-x))`` element-wise, as a NumPy scalar or array.
  """
  # 1 / (1 + exp(-x)) == exp(-log(1 + exp(-x))) == exp(-logaddexp(0, -x)).
  # logaddexp never forms exp(-x) directly, so very negative x does not
  # overflow (and emit a RuntimeWarning) the way the naive formula does.
  return np.exp(-np.logaddexp(0.0, -x))

def prepro(I):
  """Turn a raw frame into a flat binary float vector without modifying it.

  Rows 35..194 are kept, every second row and column is taken, and only the
  first colour channel is used. Pixels equal to 144 or 109 (the two
  background values) or 0 become 0.0; every other pixel (paddles, ball)
  becomes 1.0.

  Args:
    I: image array indexable as ``[row, col, channel]``; presumably the
       210x160x3 Pong frame, which yields the 80x80 grid the network expects.

  Returns:
    1-D float array of the cropped, downsampled, binarised frame.
  """
  frame = np.asarray(I)
  # Crop and downsample in one basic slice. This is a *view* into the caller's
  # array, so the result is built from comparisons rather than by writing
  # zeros/ones into the view (which would corrupt the caller's observation).
  plane = frame[35:195:2, ::2, 0]
  foreground = (plane != 144) & (plane != 109) & (plane != 0)
  return foreground.astype(float).ravel()

def discount_rewards(r, discount=None):
  """Compute discounted returns, walking backwards through the rewards.

  The running sum is reset whenever a non-zero reward is met, because in Pong
  a non-zero reward marks the end of a point (game boundary).

  Args:
    r: array of per-step rewards, 1-D or a column of shape (T, 1).
    discount: discount factor; defaults to the module-level ``gamma``.

  Returns:
    Float array with the same shape as ``r`` holding the discounted returns.
  """
  if discount is None:
    discount = gamma
  # Force a float result: zeros_like on an integer reward array would
  # silently truncate every fractional discounted value.
  r = np.asarray(r, dtype=float)
  discounted_r = np.zeros_like(r)
  running_add = 0.0
  for t in reversed(range(0, r.size)):
    if r[t] != 0: running_add = 0.0 # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * discount + r[t]
    discounted_r[t] = running_add
  return discounted_r

def policy_forward(x):
  """Forward pass of the two-layer policy network stored in the global ``model``.

  Args:
    x: input vector multiplied by ``model['W1']``.

  Returns:
    (p, h): the probability of taking action 2 and the hidden activations
    after the ReLU.
  """
  pre_activation = np.dot(model['W1'], x)
  # ReLU nonlinearity: negative pre-activations are clamped to zero.
  hidden = np.where(pre_activation < 0, 0, pre_activation)
  prob_action_2 = sigmoid(np.dot(model['W2'], hidden))
  return prob_action_2, hidden

def policy_backward(epx, eph, epdlogp):
  """Backward pass over one episode; returns the gradients for W1 and W2.

  Args:
    epx: stacked inputs, one row per step.
    eph: stacked hidden activations (as returned by policy_forward), one row
         per step.
    epdlogp: stacked per-step output gradients, already scaled by the caller.

  Returns:
    Dict with keys 'W1' and 'W2' holding the gradient for each weight array
    of the global ``model``.
  """
  output_weights = model['W2']
  grad_w2 = eph.T.dot(epdlogp).ravel()
  grad_hidden = np.outer(epdlogp, output_weights)
  # Backprop through the ReLU: no gradient flows where the unit was inactive.
  grad_hidden[eph <= 0] = 0
  grad_w1 = grad_hidden.T.dot(epx)
  return {'W1': grad_w1, 'W2': grad_w2}

def model_step(model, observation, prev_x):
  """Choose the greedy action for one frame using the weights in ``model``.

  Args:
    model: dict with weight arrays under 'W1' and 'W2'.
    observation: raw frame, passed through ``prepro``.
    prev_x: preprocessed previous frame, or None on the first step.

  Returns:
    (action, cur_x): action 2 when the network's output probability is at
    least 0.5, otherwise 3, plus the preprocessed current frame to pass back
    in as ``prev_x`` on the next call.
  """
  cur_x = prepro(observation)
  # The network input is the difference to the previous frame; on the first
  # step there is no previous frame, so an all-zero input is used.
  x = cur_x - prev_x if prev_x is not None else np.zeros_like(cur_x)

  # Run the forward pass on the weights that were passed in, so that the
  # `model` argument is honoured rather than a module-level dict.
  h = np.dot(model['W1'], x)
  h[h < 0] = 0 # ReLU nonlinearity
  aprob = sigmoid(np.dot(model['W2'], h))

  # Deterministic (greedy) choice: no sampling happens here.
  action = 2 if aprob >= 0.5 else 3

  return action, cur_x

def play_game(env, model, max_steps = 1000):
  """Play one episode with ``model``, then display the rendered frames.

  Args:
    env: environment whose ``reset()`` returns an observation and whose
         ``step(action)`` returns ``(observation, reward, done, info)``, as
         unpacked below.
    model: policy weights forwarded to ``model_step``.
    max_steps: upper bound on the number of steps played (default 1000).

  Side effects:
    Prints a one-line summary, closes ``env`` and displays the animation.
  """
  observation = env.reset()

  frames = []
  cumulated_reward = 0

  prev_x = None # used in computing the difference frame

  for t in range(max_steps):
      frames.append(env.render(mode = 'rgb_array'))
      action, prev_x = model_step(model, observation, prev_x)
      observation, reward, done, info = env.step(action)
      cumulated_reward += reward
      if done:
          print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
          break
  else:
      # Only reached when the loop ran out of steps without `done`; without
      # the `else` this message was printed after a finished episode as well.
      print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  env.close()
  display_frames_as_gif(frames)

def train_model(env, model, total_episodes = 100):
  """Train the policy with policy gradients and RMSProp for a number of episodes.

  Args:
    env: environment whose ``reset()`` returns an observation and whose
         ``step(action)`` returns ``(observation, reward, done, info)``.
    model: dict of weight arrays updated in place.
           NOTE(review): policy_forward / policy_backward read the
           module-level ``model``, so training is only consistent when this
           argument is that same dict.
    total_episodes: number of episodes to play; values <= 0 return an empty
           history immediately.

  Returns:
    List of ``(episode_number, reward_sum, running_reward)`` tuples, one per
    finished episode.

  Side effects:
    Updates the module-level ``grad_buffer`` and ``rmsprop_cache`` and prints
    one line per episode.
  """
  hist = []
  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[]
  running_reward = None
  reward_sum = 0
  episode_number = 0

  # A bounded loop (instead of `while True` with an equality test) also
  # terminates when total_episodes is zero or negative.
  while episode_number < total_episodes:

    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if not done:
      continue

    # ---- an episode finished ----
    episode_number += 1

    # stack together all inputs, hidden states, action gradients, and rewards for this episode
    epx = np.vstack(xs)
    eph = np.vstack(hs)
    epdlogp = np.vstack(dlogps)
    epr = np.vstack(drs)
    xs,hs,dlogps,drs = [],[],[],[] # reset array memory

    # compute the discounted reward backwards through time
    discounted_epr = discount_rewards(epr)
    # standardize the rewards to be unit normal (helps control the gradient estimator variance)
    discounted_epr -= np.mean(discounted_epr)
    reward_std = np.std(discounted_epr)
    if reward_std > 0: # skip the division for constant returns instead of producing NaN/inf
      discounted_epr /= reward_std

    epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
    grad = policy_backward(epx, eph, epdlogp)
    for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

    # perform rmsprop parameter update every batch_size episodes
    if episode_number % batch_size == 0:
      for k,v in model.items():
        g = grad_buffer[k] # gradient
        rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
        model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5)
        grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

    # boring book-keeping
    running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
    hist.append((episode_number, reward_sum, running_reward))
    # %d for the episode counter: it is an integer, %f printed "episode 1.000000"
    print ('episode %d, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
    reward_sum = 0
    observation = env.reset() # reset env
    prev_x = None

  return hist

   
    

  logger.warn(
  deprecation(
  deprecation(


In [3]:
# Train the module-level `model` on `env` for 7000 episodes. hist1 receives the
# list of (episode, reward, running mean) tuples returned by train_model, and
# the %time magic reports how long the call took.
%time hist1 = train_model(env, model, total_episodes=7000)

  logger.deprecation(


episode 1.000000, reward total was -19.000000. running mean: -19.000000
episode 2.000000, reward total was -21.000000. running mean: -19.020000
episode 3.000000, reward total was -21.000000. running mean: -19.039800
episode 4.000000, reward total was -21.000000. running mean: -19.059402
episode 5.000000, reward total was -21.000000. running mean: -19.078808
episode 6.000000, reward total was -20.000000. running mean: -19.088020
episode 7.000000, reward total was -21.000000. running mean: -19.107140
episode 8.000000, reward total was -21.000000. running mean: -19.126068
episode 9.000000, reward total was -21.000000. running mean: -19.144808
episode 10.000000, reward total was -21.000000. running mean: -19.163360
episode 11.000000, reward total was -21.000000. running mean: -19.181726
episode 12.000000, reward total was -19.000000. running mean: -19.179909
episode 13.000000, reward total was -21.000000. running mean: -19.198110
episode 14.000000, reward total was -21.000000. running mean

episode 114.000000, reward total was -21.000000. running mean: -20.033184
episode 115.000000, reward total was -16.000000. running mean: -19.992853
episode 116.000000, reward total was -21.000000. running mean: -20.002924
episode 117.000000, reward total was -21.000000. running mean: -20.012895
episode 118.000000, reward total was -20.000000. running mean: -20.012766
episode 119.000000, reward total was -21.000000. running mean: -20.022638
episode 120.000000, reward total was -21.000000. running mean: -20.032412
episode 121.000000, reward total was -21.000000. running mean: -20.042088
episode 122.000000, reward total was -21.000000. running mean: -20.051667
episode 123.000000, reward total was -20.000000. running mean: -20.051150
episode 124.000000, reward total was -20.000000. running mean: -20.050639
episode 125.000000, reward total was -21.000000. running mean: -20.060132
episode 126.000000, reward total was -20.000000. running mean: -20.059531
episode 127.000000, reward total was -

episode 225.000000, reward total was -19.000000. running mean: -20.244470
episode 226.000000, reward total was -21.000000. running mean: -20.252025
episode 227.000000, reward total was -20.000000. running mean: -20.249505
episode 228.000000, reward total was -21.000000. running mean: -20.257010
episode 229.000000, reward total was -21.000000. running mean: -20.264440
episode 230.000000, reward total was -19.000000. running mean: -20.251796
episode 231.000000, reward total was -21.000000. running mean: -20.259278
episode 232.000000, reward total was -21.000000. running mean: -20.266685
episode 233.000000, reward total was -21.000000. running mean: -20.274018
episode 234.000000, reward total was -20.000000. running mean: -20.271278
episode 235.000000, reward total was -20.000000. running mean: -20.268565
episode 236.000000, reward total was -20.000000. running mean: -20.265879
episode 237.000000, reward total was -21.000000. running mean: -20.273221
episode 238.000000, reward total was -

episode 336.000000, reward total was -19.000000. running mean: -20.290883
episode 337.000000, reward total was -20.000000. running mean: -20.287974
episode 338.000000, reward total was -20.000000. running mean: -20.285094
episode 339.000000, reward total was -21.000000. running mean: -20.292243
episode 340.000000, reward total was -18.000000. running mean: -20.269321
episode 341.000000, reward total was -21.000000. running mean: -20.276627
episode 342.000000, reward total was -20.000000. running mean: -20.273861
episode 343.000000, reward total was -21.000000. running mean: -20.281123
episode 344.000000, reward total was -21.000000. running mean: -20.288311
episode 345.000000, reward total was -18.000000. running mean: -20.265428
episode 346.000000, reward total was -21.000000. running mean: -20.272774
episode 347.000000, reward total was -21.000000. running mean: -20.280046
episode 348.000000, reward total was -19.000000. running mean: -20.267246
episode 349.000000, reward total was -

episode 447.000000, reward total was -20.000000. running mean: -20.347002
episode 448.000000, reward total was -21.000000. running mean: -20.353532
episode 449.000000, reward total was -21.000000. running mean: -20.359997
episode 450.000000, reward total was -21.000000. running mean: -20.366397
episode 451.000000, reward total was -21.000000. running mean: -20.372733
episode 452.000000, reward total was -20.000000. running mean: -20.369006
episode 453.000000, reward total was -21.000000. running mean: -20.375316
episode 454.000000, reward total was -19.000000. running mean: -20.361562
episode 455.000000, reward total was -21.000000. running mean: -20.367947
episode 456.000000, reward total was -21.000000. running mean: -20.374267
episode 457.000000, reward total was -21.000000. running mean: -20.380525
episode 458.000000, reward total was -21.000000. running mean: -20.386719
episode 459.000000, reward total was -19.000000. running mean: -20.372852
episode 460.000000, reward total was -

episode 558.000000, reward total was -21.000000. running mean: -20.330123
episode 559.000000, reward total was -21.000000. running mean: -20.336822
episode 560.000000, reward total was -21.000000. running mean: -20.343454
episode 561.000000, reward total was -19.000000. running mean: -20.330019
episode 562.000000, reward total was -21.000000. running mean: -20.336719
episode 563.000000, reward total was -21.000000. running mean: -20.343352
episode 564.000000, reward total was -21.000000. running mean: -20.349918
episode 565.000000, reward total was -18.000000. running mean: -20.326419
episode 566.000000, reward total was -20.000000. running mean: -20.323155
episode 567.000000, reward total was -20.000000. running mean: -20.319923
episode 568.000000, reward total was -20.000000. running mean: -20.316724
episode 569.000000, reward total was -21.000000. running mean: -20.323557
episode 570.000000, reward total was -18.000000. running mean: -20.300321
episode 571.000000, reward total was -

episode 669.000000, reward total was -20.000000. running mean: -20.346553
episode 670.000000, reward total was -21.000000. running mean: -20.353087
episode 671.000000, reward total was -21.000000. running mean: -20.359557
episode 672.000000, reward total was -20.000000. running mean: -20.355961
episode 673.000000, reward total was -21.000000. running mean: -20.362401
episode 674.000000, reward total was -21.000000. running mean: -20.368777
episode 675.000000, reward total was -21.000000. running mean: -20.375090
episode 676.000000, reward total was -21.000000. running mean: -20.381339
episode 677.000000, reward total was -20.000000. running mean: -20.377525
episode 678.000000, reward total was -21.000000. running mean: -20.383750
episode 679.000000, reward total was -20.000000. running mean: -20.379913
episode 680.000000, reward total was -21.000000. running mean: -20.386113
episode 681.000000, reward total was -20.000000. running mean: -20.382252
episode 682.000000, reward total was -

episode 780.000000, reward total was -19.000000. running mean: -20.390895
episode 781.000000, reward total was -19.000000. running mean: -20.376986
episode 782.000000, reward total was -21.000000. running mean: -20.383216
episode 783.000000, reward total was -17.000000. running mean: -20.349384
episode 784.000000, reward total was -20.000000. running mean: -20.345890
episode 785.000000, reward total was -21.000000. running mean: -20.352431
episode 786.000000, reward total was -20.000000. running mean: -20.348907
episode 787.000000, reward total was -20.000000. running mean: -20.345418
episode 788.000000, reward total was -21.000000. running mean: -20.351964
episode 789.000000, reward total was -20.000000. running mean: -20.348444
episode 790.000000, reward total was -20.000000. running mean: -20.344960
episode 791.000000, reward total was -21.000000. running mean: -20.351510
episode 792.000000, reward total was -21.000000. running mean: -20.357995
episode 793.000000, reward total was -

episode 891.000000, reward total was -21.000000. running mean: -20.325846
episode 892.000000, reward total was -19.000000. running mean: -20.312587
episode 893.000000, reward total was -19.000000. running mean: -20.299461
episode 894.000000, reward total was -21.000000. running mean: -20.306467
episode 895.000000, reward total was -20.000000. running mean: -20.303402
episode 896.000000, reward total was -20.000000. running mean: -20.300368
episode 897.000000, reward total was -21.000000. running mean: -20.307364
episode 898.000000, reward total was -21.000000. running mean: -20.314291
episode 899.000000, reward total was -20.000000. running mean: -20.311148
episode 900.000000, reward total was -21.000000. running mean: -20.318036
episode 901.000000, reward total was -19.000000. running mean: -20.304856
episode 902.000000, reward total was -21.000000. running mean: -20.311807
episode 903.000000, reward total was -21.000000. running mean: -20.318689
episode 904.000000, reward total was -

episode 1002.000000, reward total was -21.000000. running mean: -20.366540
episode 1003.000000, reward total was -21.000000. running mean: -20.372875
episode 1004.000000, reward total was -21.000000. running mean: -20.379146
episode 1005.000000, reward total was -20.000000. running mean: -20.375354
episode 1006.000000, reward total was -20.000000. running mean: -20.371601
episode 1007.000000, reward total was -20.000000. running mean: -20.367885
episode 1008.000000, reward total was -20.000000. running mean: -20.364206
episode 1009.000000, reward total was -19.000000. running mean: -20.350564
episode 1010.000000, reward total was -20.000000. running mean: -20.347058
episode 1011.000000, reward total was -20.000000. running mean: -20.343588
episode 1012.000000, reward total was -21.000000. running mean: -20.350152
episode 1013.000000, reward total was -21.000000. running mean: -20.356650
episode 1014.000000, reward total was -19.000000. running mean: -20.343084
episode 1015.000000, rewa

episode 1112.000000, reward total was -20.000000. running mean: -20.346817
episode 1113.000000, reward total was -21.000000. running mean: -20.353349
episode 1114.000000, reward total was -19.000000. running mean: -20.339816
episode 1115.000000, reward total was -19.000000. running mean: -20.326418
episode 1116.000000, reward total was -21.000000. running mean: -20.333153
episode 1117.000000, reward total was -20.000000. running mean: -20.329822
episode 1118.000000, reward total was -21.000000. running mean: -20.336524
episode 1119.000000, reward total was -21.000000. running mean: -20.343158
episode 1120.000000, reward total was -21.000000. running mean: -20.349727
episode 1121.000000, reward total was -21.000000. running mean: -20.356230
episode 1122.000000, reward total was -21.000000. running mean: -20.362667
episode 1123.000000, reward total was -20.000000. running mean: -20.359041
episode 1124.000000, reward total was -19.000000. running mean: -20.345450
episode 1125.000000, rewa

episode 1222.000000, reward total was -20.000000. running mean: -20.225516
episode 1223.000000, reward total was -21.000000. running mean: -20.233260
episode 1224.000000, reward total was -21.000000. running mean: -20.240928
episode 1225.000000, reward total was -21.000000. running mean: -20.248519
episode 1226.000000, reward total was -20.000000. running mean: -20.246033
episode 1227.000000, reward total was -20.000000. running mean: -20.243573
episode 1228.000000, reward total was -21.000000. running mean: -20.251137
episode 1229.000000, reward total was -21.000000. running mean: -20.258626
episode 1230.000000, reward total was -20.000000. running mean: -20.256040
episode 1231.000000, reward total was -21.000000. running mean: -20.263479
episode 1232.000000, reward total was -20.000000. running mean: -20.260844
episode 1233.000000, reward total was -21.000000. running mean: -20.268236
episode 1234.000000, reward total was -20.000000. running mean: -20.265554
episode 1235.000000, rewa

episode 1332.000000, reward total was -19.000000. running mean: -20.238562
episode 1333.000000, reward total was -21.000000. running mean: -20.246176
episode 1334.000000, reward total was -21.000000. running mean: -20.253714
episode 1335.000000, reward total was -21.000000. running mean: -20.261177
episode 1336.000000, reward total was -19.000000. running mean: -20.248565
episode 1337.000000, reward total was -21.000000. running mean: -20.256080
episode 1338.000000, reward total was -21.000000. running mean: -20.263519
episode 1339.000000, reward total was -19.000000. running mean: -20.250884
episode 1340.000000, reward total was -21.000000. running mean: -20.258375
episode 1341.000000, reward total was -20.000000. running mean: -20.255791
episode 1342.000000, reward total was -20.000000. running mean: -20.253233
episode 1343.000000, reward total was -20.000000. running mean: -20.250701
episode 1344.000000, reward total was -21.000000. running mean: -20.258194
episode 1345.000000, rewa

episode 1442.000000, reward total was -21.000000. running mean: -20.257003
episode 1443.000000, reward total was -20.000000. running mean: -20.254433
episode 1444.000000, reward total was -21.000000. running mean: -20.261889
episode 1445.000000, reward total was -19.000000. running mean: -20.249270
episode 1446.000000, reward total was -20.000000. running mean: -20.246777
episode 1447.000000, reward total was -20.000000. running mean: -20.244309
episode 1448.000000, reward total was -21.000000. running mean: -20.251866
episode 1449.000000, reward total was -18.000000. running mean: -20.229348
episode 1450.000000, reward total was -21.000000. running mean: -20.237054
episode 1451.000000, reward total was -20.000000. running mean: -20.234684
episode 1452.000000, reward total was -21.000000. running mean: -20.242337
episode 1453.000000, reward total was -21.000000. running mean: -20.249913
episode 1454.000000, reward total was -18.000000. running mean: -20.227414
episode 1455.000000, rewa

episode 1552.000000, reward total was -20.000000. running mean: -20.234845
episode 1553.000000, reward total was -20.000000. running mean: -20.232497
episode 1554.000000, reward total was -21.000000. running mean: -20.240172
episode 1555.000000, reward total was -21.000000. running mean: -20.247770
episode 1556.000000, reward total was -18.000000. running mean: -20.225292
episode 1557.000000, reward total was -20.000000. running mean: -20.223039
episode 1558.000000, reward total was -19.000000. running mean: -20.210809
episode 1559.000000, reward total was -21.000000. running mean: -20.218701
episode 1560.000000, reward total was -21.000000. running mean: -20.226514
episode 1561.000000, reward total was -17.000000. running mean: -20.194249
episode 1562.000000, reward total was -21.000000. running mean: -20.202306
episode 1563.000000, reward total was -19.000000. running mean: -20.190283
episode 1564.000000, reward total was -21.000000. running mean: -20.198380
episode 1565.000000, rewa

episode 1662.000000, reward total was -21.000000. running mean: -20.158400
episode 1663.000000, reward total was -20.000000. running mean: -20.156816
episode 1664.000000, reward total was -20.000000. running mean: -20.155248
episode 1665.000000, reward total was -21.000000. running mean: -20.163695
episode 1666.000000, reward total was -20.000000. running mean: -20.162058
episode 1667.000000, reward total was -21.000000. running mean: -20.170438
episode 1668.000000, reward total was -21.000000. running mean: -20.178733
episode 1669.000000, reward total was -21.000000. running mean: -20.186946
episode 1670.000000, reward total was -21.000000. running mean: -20.195076
episode 1671.000000, reward total was -21.000000. running mean: -20.203126
episode 1672.000000, reward total was -21.000000. running mean: -20.211094
episode 1673.000000, reward total was -20.000000. running mean: -20.208984
episode 1674.000000, reward total was -21.000000. running mean: -20.216894
episode 1675.000000, rewa

episode 1772.000000, reward total was -20.000000. running mean: -19.994667
episode 1773.000000, reward total was -20.000000. running mean: -19.994720
episode 1774.000000, reward total was -19.000000. running mean: -19.984773
episode 1775.000000, reward total was -21.000000. running mean: -19.994925
episode 1776.000000, reward total was -20.000000. running mean: -19.994976
episode 1777.000000, reward total was -21.000000. running mean: -20.005026
episode 1778.000000, reward total was -20.000000. running mean: -20.004976
episode 1779.000000, reward total was -18.000000. running mean: -19.984926
episode 1780.000000, reward total was -21.000000. running mean: -19.995077
episode 1781.000000, reward total was -21.000000. running mean: -20.005126
episode 1782.000000, reward total was -21.000000. running mean: -20.015075
episode 1783.000000, reward total was -19.000000. running mean: -20.004924
episode 1784.000000, reward total was -20.000000. running mean: -20.004875
episode 1785.000000, rewa

episode 1882.000000, reward total was -21.000000. running mean: -20.121353
episode 1883.000000, reward total was -21.000000. running mean: -20.130139
episode 1884.000000, reward total was -21.000000. running mean: -20.138838
episode 1885.000000, reward total was -20.000000. running mean: -20.137449
episode 1886.000000, reward total was -20.000000. running mean: -20.136075
episode 1887.000000, reward total was -21.000000. running mean: -20.144714
episode 1888.000000, reward total was -21.000000. running mean: -20.153267
episode 1889.000000, reward total was -21.000000. running mean: -20.161734
episode 1890.000000, reward total was -21.000000. running mean: -20.170117
episode 1891.000000, reward total was -21.000000. running mean: -20.178416
episode 1892.000000, reward total was -21.000000. running mean: -20.186632
episode 1893.000000, reward total was -20.000000. running mean: -20.184765
episode 1894.000000, reward total was -21.000000. running mean: -20.192918
episode 1895.000000, rewa

episode 1992.000000, reward total was -21.000000. running mean: -20.195888
episode 1993.000000, reward total was -20.000000. running mean: -20.193929
episode 1994.000000, reward total was -18.000000. running mean: -20.171990
episode 1995.000000, reward total was -19.000000. running mean: -20.160270
episode 1996.000000, reward total was -21.000000. running mean: -20.168668
episode 1997.000000, reward total was -20.000000. running mean: -20.166981
episode 1998.000000, reward total was -20.000000. running mean: -20.165311
episode 1999.000000, reward total was -21.000000. running mean: -20.173658
episode 2000.000000, reward total was -20.000000. running mean: -20.171921
episode 2001.000000, reward total was -20.000000. running mean: -20.170202
episode 2002.000000, reward total was -19.000000. running mean: -20.158500
episode 2003.000000, reward total was -21.000000. running mean: -20.166915
episode 2004.000000, reward total was -21.000000. running mean: -20.175246
episode 2005.000000, rewa

episode 2102.000000, reward total was -21.000000. running mean: -20.162547
episode 2103.000000, reward total was -18.000000. running mean: -20.140921
episode 2104.000000, reward total was -21.000000. running mean: -20.149512
episode 2105.000000, reward total was -21.000000. running mean: -20.158017
episode 2106.000000, reward total was -19.000000. running mean: -20.146437
episode 2107.000000, reward total was -19.000000. running mean: -20.134973
episode 2108.000000, reward total was -19.000000. running mean: -20.123623
episode 2109.000000, reward total was -21.000000. running mean: -20.132387
episode 2110.000000, reward total was -21.000000. running mean: -20.141063
episode 2111.000000, reward total was -20.000000. running mean: -20.139652
episode 2112.000000, reward total was -21.000000. running mean: -20.148256
episode 2113.000000, reward total was -20.000000. running mean: -20.146773
episode 2114.000000, reward total was -21.000000. running mean: -20.155305
episode 2115.000000, rewa

episode 2212.000000, reward total was -21.000000. running mean: -19.982267
episode 2213.000000, reward total was -21.000000. running mean: -19.992445
episode 2214.000000, reward total was -20.000000. running mean: -19.992520
episode 2215.000000, reward total was -19.000000. running mean: -19.982595
episode 2216.000000, reward total was -21.000000. running mean: -19.992769
episode 2217.000000, reward total was -18.000000. running mean: -19.972841
episode 2218.000000, reward total was -19.000000. running mean: -19.963113
episode 2219.000000, reward total was -21.000000. running mean: -19.973482
episode 2220.000000, reward total was -21.000000. running mean: -19.983747
episode 2221.000000, reward total was -20.000000. running mean: -19.983910
episode 2222.000000, reward total was -19.000000. running mean: -19.974070
episode 2223.000000, reward total was -19.000000. running mean: -19.964330
episode 2224.000000, reward total was -21.000000. running mean: -19.974686
episode 2225.000000, rewa

episode 2322.000000, reward total was -20.000000. running mean: -19.885406
episode 2323.000000, reward total was -20.000000. running mean: -19.886552
episode 2324.000000, reward total was -19.000000. running mean: -19.877686
episode 2325.000000, reward total was -21.000000. running mean: -19.888909
episode 2326.000000, reward total was -20.000000. running mean: -19.890020
episode 2327.000000, reward total was -20.000000. running mean: -19.891120
episode 2328.000000, reward total was -21.000000. running mean: -19.902209
episode 2329.000000, reward total was -20.000000. running mean: -19.903187
episode 2330.000000, reward total was -21.000000. running mean: -19.914155
episode 2331.000000, reward total was -18.000000. running mean: -19.895013
episode 2332.000000, reward total was -20.000000. running mean: -19.896063
episode 2333.000000, reward total was -21.000000. running mean: -19.907102
episode 2334.000000, reward total was -16.000000. running mean: -19.868031
episode 2335.000000, rewa

episode 2432.000000, reward total was -21.000000. running mean: -19.893073
episode 2433.000000, reward total was -20.000000. running mean: -19.894143
episode 2434.000000, reward total was -21.000000. running mean: -19.905201
episode 2435.000000, reward total was -19.000000. running mean: -19.896149
episode 2436.000000, reward total was -20.000000. running mean: -19.897188
episode 2437.000000, reward total was -20.000000. running mean: -19.898216
episode 2438.000000, reward total was -21.000000. running mean: -19.909234
episode 2439.000000, reward total was -18.000000. running mean: -19.890141
episode 2440.000000, reward total was -21.000000. running mean: -19.901240
episode 2441.000000, reward total was -20.000000. running mean: -19.902227
episode 2442.000000, reward total was -20.000000. running mean: -19.903205
episode 2443.000000, reward total was -19.000000. running mean: -19.894173
episode 2444.000000, reward total was -21.000000. running mean: -19.905231
episode 2445.000000, rewa

episode 2542.000000, reward total was -20.000000. running mean: -19.850405
episode 2543.000000, reward total was -20.000000. running mean: -19.851901
episode 2544.000000, reward total was -21.000000. running mean: -19.863382
episode 2545.000000, reward total was -19.000000. running mean: -19.854748
episode 2546.000000, reward total was -20.000000. running mean: -19.856201
episode 2547.000000, reward total was -21.000000. running mean: -19.867639
episode 2548.000000, reward total was -20.000000. running mean: -19.868962
episode 2549.000000, reward total was -20.000000. running mean: -19.870273
episode 2550.000000, reward total was -21.000000. running mean: -19.881570
episode 2551.000000, reward total was -21.000000. running mean: -19.892754
episode 2552.000000, reward total was -21.000000. running mean: -19.903827
episode 2553.000000, reward total was -20.000000. running mean: -19.904788
episode 2554.000000, reward total was -16.000000. running mean: -19.865740
episode 2555.000000, rewa

episode 2652.000000, reward total was -20.000000. running mean: -19.888092
episode 2653.000000, reward total was -20.000000. running mean: -19.889211
episode 2654.000000, reward total was -21.000000. running mean: -19.900319
episode 2655.000000, reward total was -20.000000. running mean: -19.901316
episode 2656.000000, reward total was -18.000000. running mean: -19.882303
episode 2657.000000, reward total was -21.000000. running mean: -19.893480
episode 2658.000000, reward total was -18.000000. running mean: -19.874545
episode 2659.000000, reward total was -20.000000. running mean: -19.875799
episode 2660.000000, reward total was -20.000000. running mean: -19.877041
episode 2661.000000, reward total was -21.000000. running mean: -19.888271
episode 2662.000000, reward total was -21.000000. running mean: -19.899388
episode 2663.000000, reward total was -21.000000. running mean: -19.910394
episode 2664.000000, reward total was -20.000000. running mean: -19.911290
episode 2665.000000, rewa

episode 2762.000000, reward total was -21.000000. running mean: -19.903450
episode 2763.000000, reward total was -21.000000. running mean: -19.914415
episode 2764.000000, reward total was -21.000000. running mean: -19.925271
episode 2765.000000, reward total was -20.000000. running mean: -19.926018
episode 2766.000000, reward total was -19.000000. running mean: -19.916758
episode 2767.000000, reward total was -19.000000. running mean: -19.907591
episode 2768.000000, reward total was -18.000000. running mean: -19.888515
episode 2769.000000, reward total was -20.000000. running mean: -19.889630
episode 2770.000000, reward total was -19.000000. running mean: -19.880733
episode 2771.000000, reward total was -21.000000. running mean: -19.891926
episode 2772.000000, reward total was -20.000000. running mean: -19.893007
episode 2773.000000, reward total was -20.000000. running mean: -19.894077
episode 2774.000000, reward total was -21.000000. running mean: -19.905136
episode 2775.000000, rewa

episode 2872.000000, reward total was -21.000000. running mean: -19.921238
episode 2873.000000, reward total was -19.000000. running mean: -19.912026
episode 2874.000000, reward total was -21.000000. running mean: -19.922906
episode 2875.000000, reward total was -20.000000. running mean: -19.923677
episode 2876.000000, reward total was -20.000000. running mean: -19.924440
episode 2877.000000, reward total was -21.000000. running mean: -19.935195
episode 2878.000000, reward total was -20.000000. running mean: -19.935843
episode 2879.000000, reward total was -20.000000. running mean: -19.936485
episode 2880.000000, reward total was -20.000000. running mean: -19.937120
episode 2881.000000, reward total was -20.000000. running mean: -19.937749
episode 2882.000000, reward total was -21.000000. running mean: -19.948371
episode 2883.000000, reward total was -19.000000. running mean: -19.938888
episode 2884.000000, reward total was -19.000000. running mean: -19.929499
episode 2885.000000, rewa

episode 2982.000000, reward total was -17.000000. running mean: -19.836615
episode 2983.000000, reward total was -19.000000. running mean: -19.828249
episode 2984.000000, reward total was -18.000000. running mean: -19.809967
episode 2985.000000, reward total was -19.000000. running mean: -19.801867
episode 2986.000000, reward total was -21.000000. running mean: -19.813848
episode 2987.000000, reward total was -21.000000. running mean: -19.825710
episode 2988.000000, reward total was -20.000000. running mean: -19.827453
episode 2989.000000, reward total was -20.000000. running mean: -19.829178
episode 2990.000000, reward total was -20.000000. running mean: -19.830886
episode 2991.000000, reward total was -21.000000. running mean: -19.842577
episode 2992.000000, reward total was -17.000000. running mean: -19.814152
episode 2993.000000, reward total was -20.000000. running mean: -19.816010
episode 2994.000000, reward total was -21.000000. running mean: -19.827850
episode 2995.000000, rewa

episode 3092.000000, reward total was -18.000000. running mean: -19.771934
episode 3093.000000, reward total was -19.000000. running mean: -19.764215
episode 3094.000000, reward total was -20.000000. running mean: -19.766572
episode 3095.000000, reward total was -19.000000. running mean: -19.758907
episode 3096.000000, reward total was -21.000000. running mean: -19.771318
episode 3097.000000, reward total was -21.000000. running mean: -19.783604
episode 3098.000000, reward total was -21.000000. running mean: -19.795768
episode 3099.000000, reward total was -21.000000. running mean: -19.807811
episode 3100.000000, reward total was -18.000000. running mean: -19.789733
episode 3101.000000, reward total was -20.000000. running mean: -19.791835
episode 3102.000000, reward total was -20.000000. running mean: -19.793917
episode 3103.000000, reward total was -20.000000. running mean: -19.795978
episode 3104.000000, reward total was -17.000000. running mean: -19.768018
episode 3105.000000, rewa

episode 3202.000000, reward total was -20.000000. running mean: -19.652800
episode 3203.000000, reward total was -19.000000. running mean: -19.646272
episode 3204.000000, reward total was -20.000000. running mean: -19.649810
episode 3205.000000, reward total was -21.000000. running mean: -19.663312
episode 3206.000000, reward total was -20.000000. running mean: -19.666678
episode 3207.000000, reward total was -21.000000. running mean: -19.680012
episode 3208.000000, reward total was -20.000000. running mean: -19.683212
episode 3209.000000, reward total was -19.000000. running mean: -19.676379
episode 3210.000000, reward total was -20.000000. running mean: -19.679616
episode 3211.000000, reward total was -20.000000. running mean: -19.682819
episode 3212.000000, reward total was -21.000000. running mean: -19.695991
episode 3213.000000, reward total was -18.000000. running mean: -19.679031
episode 3214.000000, reward total was -19.000000. running mean: -19.672241
episode 3215.000000, rewa

episode 3312.000000, reward total was -21.000000. running mean: -19.590141
episode 3313.000000, reward total was -20.000000. running mean: -19.594240
episode 3314.000000, reward total was -21.000000. running mean: -19.608298
episode 3315.000000, reward total was -21.000000. running mean: -19.622215
episode 3316.000000, reward total was -19.000000. running mean: -19.615992
episode 3317.000000, reward total was -20.000000. running mean: -19.619833
episode 3318.000000, reward total was -21.000000. running mean: -19.633634
episode 3319.000000, reward total was -20.000000. running mean: -19.637298
episode 3320.000000, reward total was -21.000000. running mean: -19.650925
episode 3321.000000, reward total was -20.000000. running mean: -19.654416
episode 3322.000000, reward total was -19.000000. running mean: -19.647871
episode 3323.000000, reward total was -20.000000. running mean: -19.651393
episode 3324.000000, reward total was -20.000000. running mean: -19.654879
episode 3325.000000, rewa

episode 3422.000000, reward total was -20.000000. running mean: -19.713281
episode 3423.000000, reward total was -19.000000. running mean: -19.706148
episode 3424.000000, reward total was -21.000000. running mean: -19.719086
episode 3425.000000, reward total was -16.000000. running mean: -19.681895
episode 3426.000000, reward total was -21.000000. running mean: -19.695076
episode 3427.000000, reward total was -21.000000. running mean: -19.708126
episode 3428.000000, reward total was -18.000000. running mean: -19.691044
episode 3429.000000, reward total was -20.000000. running mean: -19.694134
episode 3430.000000, reward total was -20.000000. running mean: -19.697193
episode 3431.000000, reward total was -19.000000. running mean: -19.690221
episode 3432.000000, reward total was -20.000000. running mean: -19.693319
episode 3433.000000, reward total was -20.000000. running mean: -19.696385
episode 3434.000000, reward total was -20.000000. running mean: -19.699421
episode 3435.000000, rewa

episode 3532.000000, reward total was -21.000000. running mean: -19.735921
episode 3533.000000, reward total was -21.000000. running mean: -19.748562
episode 3534.000000, reward total was -19.000000. running mean: -19.741076
episode 3535.000000, reward total was -17.000000. running mean: -19.713666
episode 3536.000000, reward total was -19.000000. running mean: -19.706529
episode 3537.000000, reward total was -20.000000. running mean: -19.709464
episode 3538.000000, reward total was -21.000000. running mean: -19.722369
episode 3539.000000, reward total was -21.000000. running mean: -19.735145
episode 3540.000000, reward total was -18.000000. running mean: -19.717794
episode 3541.000000, reward total was -17.000000. running mean: -19.690616
episode 3542.000000, reward total was -21.000000. running mean: -19.703710
episode 3543.000000, reward total was -20.000000. running mean: -19.706673
episode 3544.000000, reward total was -19.000000. running mean: -19.699606
episode 3545.000000, rewa

episode 3642.000000, reward total was -15.000000. running mean: -19.583513
episode 3643.000000, reward total was -21.000000. running mean: -19.597678
episode 3644.000000, reward total was -19.000000. running mean: -19.591701
episode 3645.000000, reward total was -19.000000. running mean: -19.585784
episode 3646.000000, reward total was -20.000000. running mean: -19.589926
episode 3647.000000, reward total was -20.000000. running mean: -19.594027
episode 3648.000000, reward total was -18.000000. running mean: -19.578086
episode 3649.000000, reward total was -21.000000. running mean: -19.592306
episode 3650.000000, reward total was -18.000000. running mean: -19.576382
episode 3651.000000, reward total was -20.000000. running mean: -19.580619
episode 3652.000000, reward total was -18.000000. running mean: -19.564812
episode 3653.000000, reward total was -18.000000. running mean: -19.549164
episode 3654.000000, reward total was -19.000000. running mean: -19.543673
episode 3655.000000, rewa

episode 3752.000000, reward total was -20.000000. running mean: -19.648672
episode 3753.000000, reward total was -19.000000. running mean: -19.642185
episode 3754.000000, reward total was -18.000000. running mean: -19.625763
episode 3755.000000, reward total was -18.000000. running mean: -19.609506
episode 3756.000000, reward total was -20.000000. running mean: -19.613411
episode 3757.000000, reward total was -21.000000. running mean: -19.627277
episode 3758.000000, reward total was -21.000000. running mean: -19.641004
episode 3759.000000, reward total was -19.000000. running mean: -19.634594
episode 3760.000000, reward total was -19.000000. running mean: -19.628248
episode 3761.000000, reward total was -20.000000. running mean: -19.631965
episode 3762.000000, reward total was -18.000000. running mean: -19.615646
episode 3763.000000, reward total was -21.000000. running mean: -19.629489
episode 3764.000000, reward total was -17.000000. running mean: -19.603194
episode 3765.000000, rewa

episode 3862.000000, reward total was -21.000000. running mean: -19.506770
episode 3863.000000, reward total was -20.000000. running mean: -19.511703
episode 3864.000000, reward total was -20.000000. running mean: -19.516586
episode 3865.000000, reward total was -21.000000. running mean: -19.531420
episode 3866.000000, reward total was -19.000000. running mean: -19.526106
episode 3867.000000, reward total was -19.000000. running mean: -19.520845
episode 3868.000000, reward total was -20.000000. running mean: -19.525636
episode 3869.000000, reward total was -18.000000. running mean: -19.510380
episode 3870.000000, reward total was -21.000000. running mean: -19.525276
episode 3871.000000, reward total was -20.000000. running mean: -19.530023
episode 3872.000000, reward total was -18.000000. running mean: -19.514723
episode 3873.000000, reward total was -21.000000. running mean: -19.529576
episode 3874.000000, reward total was -20.000000. running mean: -19.534280
episode 3875.000000, rewa

episode 3972.000000, reward total was -18.000000. running mean: -19.425256
episode 3973.000000, reward total was -18.000000. running mean: -19.411003
episode 3974.000000, reward total was -21.000000. running mean: -19.426893
episode 3975.000000, reward total was -19.000000. running mean: -19.422624
episode 3976.000000, reward total was -20.000000. running mean: -19.428398
episode 3977.000000, reward total was -21.000000. running mean: -19.444114
episode 3978.000000, reward total was -20.000000. running mean: -19.449673
episode 3979.000000, reward total was -17.000000. running mean: -19.425176
episode 3980.000000, reward total was -19.000000. running mean: -19.420924
episode 3981.000000, reward total was -21.000000. running mean: -19.436715
episode 3982.000000, reward total was -19.000000. running mean: -19.432348
episode 3983.000000, reward total was -18.000000. running mean: -19.418024
episode 3984.000000, reward total was -19.000000. running mean: -19.413844
episode 3985.000000, rewa

episode 4082.000000, reward total was -19.000000. running mean: -19.388973
episode 4083.000000, reward total was -18.000000. running mean: -19.375084
episode 4084.000000, reward total was -19.000000. running mean: -19.371333
episode 4085.000000, reward total was -20.000000. running mean: -19.377620
episode 4086.000000, reward total was -21.000000. running mean: -19.393843
episode 4087.000000, reward total was -20.000000. running mean: -19.399905
episode 4088.000000, reward total was -20.000000. running mean: -19.405906
episode 4089.000000, reward total was -19.000000. running mean: -19.401847
episode 4090.000000, reward total was -21.000000. running mean: -19.417828
episode 4091.000000, reward total was -19.000000. running mean: -19.413650
episode 4092.000000, reward total was -21.000000. running mean: -19.429514
episode 4093.000000, reward total was -18.000000. running mean: -19.415218
episode 4094.000000, reward total was -20.000000. running mean: -19.421066
episode 4095.000000, rewa

episode 4192.000000, reward total was -17.000000. running mean: -19.503824
episode 4193.000000, reward total was -18.000000. running mean: -19.488786
episode 4194.000000, reward total was -20.000000. running mean: -19.493898
episode 4195.000000, reward total was -20.000000. running mean: -19.498959
episode 4196.000000, reward total was -17.000000. running mean: -19.473970
episode 4197.000000, reward total was -19.000000. running mean: -19.469230
episode 4198.000000, reward total was -18.000000. running mean: -19.454538
episode 4199.000000, reward total was -20.000000. running mean: -19.459992
episode 4200.000000, reward total was -21.000000. running mean: -19.475392
episode 4201.000000, reward total was -21.000000. running mean: -19.490638
episode 4202.000000, reward total was -20.000000. running mean: -19.495732
episode 4203.000000, reward total was -20.000000. running mean: -19.500775
episode 4204.000000, reward total was -17.000000. running mean: -19.475767
episode 4205.000000, rewa

episode 4302.000000, reward total was -19.000000. running mean: -19.450984
episode 4303.000000, reward total was -20.000000. running mean: -19.456474
episode 4304.000000, reward total was -18.000000. running mean: -19.441909
episode 4305.000000, reward total was -18.000000. running mean: -19.427490
episode 4306.000000, reward total was -20.000000. running mean: -19.433215
episode 4307.000000, reward total was -20.000000. running mean: -19.438883
episode 4308.000000, reward total was -19.000000. running mean: -19.434494
episode 4309.000000, reward total was -21.000000. running mean: -19.450150
episode 4310.000000, reward total was -17.000000. running mean: -19.425648
episode 4311.000000, reward total was -20.000000. running mean: -19.431392
episode 4312.000000, reward total was -18.000000. running mean: -19.417078
episode 4313.000000, reward total was -20.000000. running mean: -19.422907
episode 4314.000000, reward total was -21.000000. running mean: -19.438678
episode 4315.000000, rewa

episode 4412.000000, reward total was -20.000000. running mean: -19.275427
episode 4413.000000, reward total was -21.000000. running mean: -19.292673
episode 4414.000000, reward total was -21.000000. running mean: -19.309746
episode 4415.000000, reward total was -20.000000. running mean: -19.316648
episode 4416.000000, reward total was -17.000000. running mean: -19.293482
episode 4417.000000, reward total was -21.000000. running mean: -19.310547
episode 4418.000000, reward total was -20.000000. running mean: -19.317442
episode 4419.000000, reward total was -21.000000. running mean: -19.334267
episode 4420.000000, reward total was -19.000000. running mean: -19.330925
episode 4421.000000, reward total was -19.000000. running mean: -19.327615
episode 4422.000000, reward total was -19.000000. running mean: -19.324339
episode 4423.000000, reward total was -21.000000. running mean: -19.341096
episode 4424.000000, reward total was -18.000000. running mean: -19.327685
episode 4425.000000, rewa

episode 4522.000000, reward total was -21.000000. running mean: -19.126979
episode 4523.000000, reward total was -21.000000. running mean: -19.145710
episode 4524.000000, reward total was -19.000000. running mean: -19.144252
episode 4525.000000, reward total was -18.000000. running mean: -19.132810
episode 4526.000000, reward total was -18.000000. running mean: -19.121482
episode 4527.000000, reward total was -21.000000. running mean: -19.140267
episode 4528.000000, reward total was -19.000000. running mean: -19.138864
episode 4529.000000, reward total was -21.000000. running mean: -19.157476
episode 4530.000000, reward total was -18.000000. running mean: -19.145901
episode 4531.000000, reward total was -20.000000. running mean: -19.154442
episode 4532.000000, reward total was -20.000000. running mean: -19.162898
episode 4533.000000, reward total was -20.000000. running mean: -19.171269
episode 4534.000000, reward total was -19.000000. running mean: -19.169556
episode 4535.000000, rewa

episode 4632.000000, reward total was -19.000000. running mean: -19.301343
episode 4633.000000, reward total was -19.000000. running mean: -19.298330
episode 4634.000000, reward total was -20.000000. running mean: -19.305347
episode 4635.000000, reward total was -19.000000. running mean: -19.302293
episode 4636.000000, reward total was -20.000000. running mean: -19.309270
episode 4637.000000, reward total was -18.000000. running mean: -19.296178
episode 4638.000000, reward total was -19.000000. running mean: -19.293216
episode 4639.000000, reward total was -16.000000. running mean: -19.260284
episode 4640.000000, reward total was -19.000000. running mean: -19.257681
episode 4641.000000, reward total was -21.000000. running mean: -19.275104
episode 4642.000000, reward total was -21.000000. running mean: -19.292353
episode 4643.000000, reward total was -19.000000. running mean: -19.289429
episode 4644.000000, reward total was -19.000000. running mean: -19.286535
episode 4645.000000, rewa

episode 4742.000000, reward total was -21.000000. running mean: -19.124948
episode 4743.000000, reward total was -19.000000. running mean: -19.123698
episode 4744.000000, reward total was -16.000000. running mean: -19.092461
episode 4745.000000, reward total was -20.000000. running mean: -19.101537
episode 4746.000000, reward total was -21.000000. running mean: -19.120521
episode 4747.000000, reward total was -19.000000. running mean: -19.119316
episode 4748.000000, reward total was -20.000000. running mean: -19.128123
episode 4749.000000, reward total was -20.000000. running mean: -19.136842
episode 4750.000000, reward total was -21.000000. running mean: -19.155473
episode 4751.000000, reward total was -20.000000. running mean: -19.163918
episode 4752.000000, reward total was -17.000000. running mean: -19.142279
episode 4753.000000, reward total was -21.000000. running mean: -19.160856
episode 4754.000000, reward total was -21.000000. running mean: -19.179248
episode 4755.000000, rewa

episode 4852.000000, reward total was -16.000000. running mean: -19.341959
episode 4853.000000, reward total was -19.000000. running mean: -19.338539
episode 4854.000000, reward total was -17.000000. running mean: -19.315154
episode 4855.000000, reward total was -19.000000. running mean: -19.312002
episode 4856.000000, reward total was -15.000000. running mean: -19.268882
episode 4857.000000, reward total was -19.000000. running mean: -19.266194
episode 4858.000000, reward total was -16.000000. running mean: -19.233532
episode 4859.000000, reward total was -18.000000. running mean: -19.221196
episode 4860.000000, reward total was -18.000000. running mean: -19.208984
episode 4861.000000, reward total was -20.000000. running mean: -19.216895
episode 4862.000000, reward total was -17.000000. running mean: -19.194726
episode 4863.000000, reward total was -21.000000. running mean: -19.212778
episode 4864.000000, reward total was -17.000000. running mean: -19.190651
episode 4865.000000, rewa

episode 4962.000000, reward total was -19.000000. running mean: -19.031892
episode 4963.000000, reward total was -19.000000. running mean: -19.031573
episode 4964.000000, reward total was -20.000000. running mean: -19.041258
episode 4965.000000, reward total was -18.000000. running mean: -19.030845
episode 4966.000000, reward total was -21.000000. running mean: -19.050537
episode 4967.000000, reward total was -19.000000. running mean: -19.050031
episode 4968.000000, reward total was -14.000000. running mean: -18.999531
episode 4969.000000, reward total was -19.000000. running mean: -18.999536
episode 4970.000000, reward total was -17.000000. running mean: -18.979540
episode 4971.000000, reward total was -18.000000. running mean: -18.969745
episode 4972.000000, reward total was -16.000000. running mean: -18.940047
episode 4973.000000, reward total was -19.000000. running mean: -18.940647
episode 4974.000000, reward total was -19.000000. running mean: -18.941241
episode 4975.000000, rewa

episode 5072.000000, reward total was -18.000000. running mean: -19.031229
episode 5073.000000, reward total was -18.000000. running mean: -19.020916
episode 5074.000000, reward total was -19.000000. running mean: -19.020707
episode 5075.000000, reward total was -18.000000. running mean: -19.010500
episode 5076.000000, reward total was -16.000000. running mean: -18.980395
episode 5077.000000, reward total was -15.000000. running mean: -18.940591
episode 5078.000000, reward total was -21.000000. running mean: -18.961185
episode 5079.000000, reward total was -21.000000. running mean: -18.981573
episode 5080.000000, reward total was -18.000000. running mean: -18.971758
episode 5081.000000, reward total was -20.000000. running mean: -18.982040
episode 5082.000000, reward total was -21.000000. running mean: -19.002220
episode 5083.000000, reward total was -16.000000. running mean: -18.972198
episode 5084.000000, reward total was -20.000000. running mean: -18.982476
episode 5085.000000, rewa

episode 5182.000000, reward total was -18.000000. running mean: -18.976775
episode 5183.000000, reward total was -18.000000. running mean: -18.967007
episode 5184.000000, reward total was -19.000000. running mean: -18.967337
episode 5185.000000, reward total was -21.000000. running mean: -18.987664
episode 5186.000000, reward total was -16.000000. running mean: -18.957787
episode 5187.000000, reward total was -18.000000. running mean: -18.948209
episode 5188.000000, reward total was -19.000000. running mean: -18.948727
episode 5189.000000, reward total was -21.000000. running mean: -18.969240
episode 5190.000000, reward total was -16.000000. running mean: -18.939547
episode 5191.000000, reward total was -18.000000. running mean: -18.930152
episode 5192.000000, reward total was -15.000000. running mean: -18.890850
episode 5193.000000, reward total was -18.000000. running mean: -18.881942
episode 5194.000000, reward total was -18.000000. running mean: -18.873122
episode 5195.000000, rewa

episode 5292.000000, reward total was -21.000000. running mean: -18.908195
episode 5293.000000, reward total was -19.000000. running mean: -18.909113
episode 5294.000000, reward total was -21.000000. running mean: -18.930022
episode 5295.000000, reward total was -17.000000. running mean: -18.910722
episode 5296.000000, reward total was -19.000000. running mean: -18.911614
episode 5297.000000, reward total was -20.000000. running mean: -18.922498
episode 5298.000000, reward total was -21.000000. running mean: -18.943273
episode 5299.000000, reward total was -21.000000. running mean: -18.963840
episode 5300.000000, reward total was -20.000000. running mean: -18.974202
episode 5301.000000, reward total was -20.000000. running mean: -18.984460
episode 5302.000000, reward total was -16.000000. running mean: -18.954615
episode 5303.000000, reward total was -19.000000. running mean: -18.955069
episode 5304.000000, reward total was -20.000000. running mean: -18.965519
episode 5305.000000, rewa

episode 5402.000000, reward total was -19.000000. running mean: -18.651243
episode 5403.000000, reward total was -19.000000. running mean: -18.654730
episode 5404.000000, reward total was -19.000000. running mean: -18.658183
episode 5405.000000, reward total was -19.000000. running mean: -18.661601
episode 5406.000000, reward total was -19.000000. running mean: -18.664985
episode 5407.000000, reward total was -18.000000. running mean: -18.658335
episode 5408.000000, reward total was -17.000000. running mean: -18.641752
episode 5409.000000, reward total was -19.000000. running mean: -18.645335
episode 5410.000000, reward total was -19.000000. running mean: -18.648881
episode 5411.000000, reward total was -18.000000. running mean: -18.642392
episode 5412.000000, reward total was -20.000000. running mean: -18.655969
episode 5413.000000, reward total was -21.000000. running mean: -18.679409
episode 5414.000000, reward total was -21.000000. running mean: -18.702615
episode 5415.000000, rewa

episode 5512.000000, reward total was -20.000000. running mean: -18.922006
episode 5513.000000, reward total was -14.000000. running mean: -18.872786
episode 5514.000000, reward total was -18.000000. running mean: -18.864058
episode 5515.000000, reward total was -20.000000. running mean: -18.875418
episode 5516.000000, reward total was -16.000000. running mean: -18.846664
episode 5517.000000, reward total was -21.000000. running mean: -18.868197
episode 5518.000000, reward total was -20.000000. running mean: -18.879515
episode 5519.000000, reward total was -19.000000. running mean: -18.880720
episode 5520.000000, reward total was -20.000000. running mean: -18.891913
episode 5521.000000, reward total was -17.000000. running mean: -18.872993
episode 5522.000000, reward total was -14.000000. running mean: -18.824264
episode 5523.000000, reward total was -20.000000. running mean: -18.836021
episode 5524.000000, reward total was -21.000000. running mean: -18.857661
episode 5525.000000, rewa

episode 5622.000000, reward total was -21.000000. running mean: -19.061963
episode 5623.000000, reward total was -19.000000. running mean: -19.061344
episode 5624.000000, reward total was -21.000000. running mean: -19.080730
episode 5625.000000, reward total was -19.000000. running mean: -19.079923
episode 5626.000000, reward total was -18.000000. running mean: -19.069124
episode 5627.000000, reward total was -19.000000. running mean: -19.068433
episode 5628.000000, reward total was -19.000000. running mean: -19.067748
episode 5629.000000, reward total was -18.000000. running mean: -19.057071
episode 5630.000000, reward total was -21.000000. running mean: -19.076500
episode 5631.000000, reward total was -16.000000. running mean: -19.045735
episode 5632.000000, reward total was -20.000000. running mean: -19.055278
episode 5633.000000, reward total was -20.000000. running mean: -19.064725
episode 5634.000000, reward total was -19.000000. running mean: -19.064078
episode 5635.000000, rewa

episode 5732.000000, reward total was -18.000000. running mean: -18.674259
episode 5733.000000, reward total was -17.000000. running mean: -18.657516
episode 5734.000000, reward total was -19.000000. running mean: -18.660941
episode 5735.000000, reward total was -16.000000. running mean: -18.634332
episode 5736.000000, reward total was -20.000000. running mean: -18.647988
episode 5737.000000, reward total was -19.000000. running mean: -18.651509
episode 5738.000000, reward total was -17.000000. running mean: -18.634994
episode 5739.000000, reward total was -19.000000. running mean: -18.638644
episode 5740.000000, reward total was -21.000000. running mean: -18.662257
episode 5741.000000, reward total was -19.000000. running mean: -18.665635
episode 5742.000000, reward total was -19.000000. running mean: -18.668978
episode 5743.000000, reward total was -19.000000. running mean: -18.672288
episode 5744.000000, reward total was -18.000000. running mean: -18.665566
episode 5745.000000, rewa

episode 5842.000000, reward total was -20.000000. running mean: -18.600802
episode 5843.000000, reward total was -20.000000. running mean: -18.614794
episode 5844.000000, reward total was -21.000000. running mean: -18.638646
episode 5845.000000, reward total was -19.000000. running mean: -18.642260
episode 5846.000000, reward total was -18.000000. running mean: -18.635837
episode 5847.000000, reward total was -20.000000. running mean: -18.649479
episode 5848.000000, reward total was -20.000000. running mean: -18.662984
episode 5849.000000, reward total was -19.000000. running mean: -18.666354
episode 5850.000000, reward total was -21.000000. running mean: -18.689691
episode 5851.000000, reward total was -20.000000. running mean: -18.702794
episode 5852.000000, reward total was -17.000000. running mean: -18.685766
episode 5853.000000, reward total was -18.000000. running mean: -18.678908
episode 5854.000000, reward total was -18.000000. running mean: -18.672119
episode 5855.000000, rewa

episode 5952.000000, reward total was -17.000000. running mean: -18.362519
episode 5953.000000, reward total was -19.000000. running mean: -18.368894
episode 5954.000000, reward total was -18.000000. running mean: -18.365205
episode 5955.000000, reward total was -20.000000. running mean: -18.381553
episode 5956.000000, reward total was -19.000000. running mean: -18.387737
episode 5957.000000, reward total was -16.000000. running mean: -18.363860
episode 5958.000000, reward total was -20.000000. running mean: -18.380221
episode 5959.000000, reward total was -18.000000. running mean: -18.376419
episode 5960.000000, reward total was -21.000000. running mean: -18.402655
episode 5961.000000, reward total was -19.000000. running mean: -18.408628
episode 5962.000000, reward total was -18.000000. running mean: -18.404542
episode 5963.000000, reward total was -17.000000. running mean: -18.390497
episode 5964.000000, reward total was -19.000000. running mean: -18.396592
episode 5965.000000, rewa

episode 6062.000000, reward total was -17.000000. running mean: -18.459400
episode 6063.000000, reward total was -20.000000. running mean: -18.474806
episode 6064.000000, reward total was -18.000000. running mean: -18.470058
episode 6065.000000, reward total was -18.000000. running mean: -18.465357
episode 6066.000000, reward total was -21.000000. running mean: -18.490703
episode 6067.000000, reward total was -18.000000. running mean: -18.485796
episode 6068.000000, reward total was -19.000000. running mean: -18.490938
episode 6069.000000, reward total was -14.000000. running mean: -18.446029
episode 6070.000000, reward total was -21.000000. running mean: -18.471569
episode 6071.000000, reward total was -19.000000. running mean: -18.476853
episode 6072.000000, reward total was -19.000000. running mean: -18.482085
episode 6073.000000, reward total was -20.000000. running mean: -18.497264
episode 6074.000000, reward total was -19.000000. running mean: -18.502291
episode 6075.000000, rewa

episode 6172.000000, reward total was -17.000000. running mean: -18.366466
episode 6173.000000, reward total was -15.000000. running mean: -18.332802
episode 6174.000000, reward total was -17.000000. running mean: -18.319474
episode 6175.000000, reward total was -20.000000. running mean: -18.336279
episode 6176.000000, reward total was -19.000000. running mean: -18.342916
episode 6177.000000, reward total was -20.000000. running mean: -18.359487
episode 6178.000000, reward total was -15.000000. running mean: -18.325892
episode 6179.000000, reward total was -17.000000. running mean: -18.312633
episode 6180.000000, reward total was -16.000000. running mean: -18.289507
episode 6181.000000, reward total was -17.000000. running mean: -18.276612
episode 6182.000000, reward total was -20.000000. running mean: -18.293846
episode 6183.000000, reward total was -19.000000. running mean: -18.300907
episode 6184.000000, reward total was -17.000000. running mean: -18.287898
episode 6185.000000, rewa

episode 6282.000000, reward total was -19.000000. running mean: -18.321804
episode 6283.000000, reward total was -19.000000. running mean: -18.328586
episode 6284.000000, reward total was -17.000000. running mean: -18.315300
episode 6285.000000, reward total was -20.000000. running mean: -18.332147
episode 6286.000000, reward total was -16.000000. running mean: -18.308825
episode 6287.000000, reward total was -17.000000. running mean: -18.295737
episode 6288.000000, reward total was -17.000000. running mean: -18.282780
episode 6289.000000, reward total was -19.000000. running mean: -18.289952
episode 6290.000000, reward total was -18.000000. running mean: -18.287052
episode 6291.000000, reward total was -19.000000. running mean: -18.294182
episode 6292.000000, reward total was -20.000000. running mean: -18.311240
episode 6293.000000, reward total was -20.000000. running mean: -18.328128
episode 6294.000000, reward total was -15.000000. running mean: -18.294846
episode 6295.000000, rewa

episode 6392.000000, reward total was -17.000000. running mean: -18.277085
episode 6393.000000, reward total was -19.000000. running mean: -18.284314
episode 6394.000000, reward total was -19.000000. running mean: -18.291471
episode 6395.000000, reward total was -19.000000. running mean: -18.298556
episode 6396.000000, reward total was -21.000000. running mean: -18.325571
episode 6397.000000, reward total was -17.000000. running mean: -18.312315
episode 6398.000000, reward total was -19.000000. running mean: -18.319192
episode 6399.000000, reward total was -19.000000. running mean: -18.326000
episode 6400.000000, reward total was -19.000000. running mean: -18.332740
episode 6401.000000, reward total was -21.000000. running mean: -18.359412
episode 6402.000000, reward total was -16.000000. running mean: -18.335818
episode 6403.000000, reward total was -18.000000. running mean: -18.332460
episode 6404.000000, reward total was -17.000000. running mean: -18.319136
episode 6405.000000, rewa

episode 6502.000000, reward total was -17.000000. running mean: -18.368599
episode 6503.000000, reward total was -19.000000. running mean: -18.374913
episode 6504.000000, reward total was -17.000000. running mean: -18.361164
episode 6505.000000, reward total was -17.000000. running mean: -18.347552
episode 6506.000000, reward total was -20.000000. running mean: -18.364077
episode 6507.000000, reward total was -16.000000. running mean: -18.340436
episode 6508.000000, reward total was -18.000000. running mean: -18.337032
episode 6509.000000, reward total was -17.000000. running mean: -18.323661
episode 6510.000000, reward total was -21.000000. running mean: -18.350425
episode 6511.000000, reward total was -20.000000. running mean: -18.366920
episode 6512.000000, reward total was -21.000000. running mean: -18.393251
episode 6513.000000, reward total was -21.000000. running mean: -18.419319
episode 6514.000000, reward total was -19.000000. running mean: -18.425126
episode 6515.000000, rewa

episode 6612.000000, reward total was -19.000000. running mean: -18.555444
episode 6613.000000, reward total was -19.000000. running mean: -18.559889
episode 6614.000000, reward total was -20.000000. running mean: -18.574290
episode 6615.000000, reward total was -19.000000. running mean: -18.578548
episode 6616.000000, reward total was -18.000000. running mean: -18.572762
episode 6617.000000, reward total was -20.000000. running mean: -18.587034
episode 6618.000000, reward total was -19.000000. running mean: -18.591164
episode 6619.000000, reward total was -17.000000. running mean: -18.575252
episode 6620.000000, reward total was -18.000000. running mean: -18.569500
episode 6621.000000, reward total was -21.000000. running mean: -18.593805
episode 6622.000000, reward total was -16.000000. running mean: -18.567867
episode 6623.000000, reward total was -21.000000. running mean: -18.592188
episode 6624.000000, reward total was -20.000000. running mean: -18.606266
episode 6625.000000, rewa

episode 6722.000000, reward total was -17.000000. running mean: -18.349419
episode 6723.000000, reward total was -20.000000. running mean: -18.365925
episode 6724.000000, reward total was -18.000000. running mean: -18.362266
episode 6725.000000, reward total was -18.000000. running mean: -18.358643
episode 6726.000000, reward total was -19.000000. running mean: -18.365057
episode 6727.000000, reward total was -21.000000. running mean: -18.391406
episode 6728.000000, reward total was -21.000000. running mean: -18.417492
episode 6729.000000, reward total was -18.000000. running mean: -18.413317
episode 6730.000000, reward total was -14.000000. running mean: -18.369184
episode 6731.000000, reward total was -18.000000. running mean: -18.365492
episode 6732.000000, reward total was -19.000000. running mean: -18.371837
episode 6733.000000, reward total was -19.000000. running mean: -18.378119
episode 6734.000000, reward total was -19.000000. running mean: -18.384338
episode 6735.000000, rewa

episode 6832.000000, reward total was -17.000000. running mean: -18.303377
episode 6833.000000, reward total was -19.000000. running mean: -18.310343
episode 6834.000000, reward total was -19.000000. running mean: -18.317240
episode 6835.000000, reward total was -18.000000. running mean: -18.314067
episode 6836.000000, reward total was -20.000000. running mean: -18.330927
episode 6837.000000, reward total was -16.000000. running mean: -18.307617
episode 6838.000000, reward total was -19.000000. running mean: -18.314541
episode 6839.000000, reward total was -12.000000. running mean: -18.251396
episode 6840.000000, reward total was -21.000000. running mean: -18.278882
episode 6841.000000, reward total was -19.000000. running mean: -18.286093
episode 6842.000000, reward total was -20.000000. running mean: -18.303232
episode 6843.000000, reward total was -13.000000. running mean: -18.250200
episode 6844.000000, reward total was -18.000000. running mean: -18.247698
episode 6845.000000, rewa

episode 6942.000000, reward total was -15.000000. running mean: -18.153102
episode 6943.000000, reward total was -19.000000. running mean: -18.161571
episode 6944.000000, reward total was -14.000000. running mean: -18.119955
episode 6945.000000, reward total was -19.000000. running mean: -18.128756
episode 6946.000000, reward total was -17.000000. running mean: -18.117468
episode 6947.000000, reward total was -20.000000. running mean: -18.136294
episode 6948.000000, reward total was -20.000000. running mean: -18.154931
episode 6949.000000, reward total was -17.000000. running mean: -18.143381
episode 6950.000000, reward total was -20.000000. running mean: -18.161947
episode 6951.000000, reward total was -21.000000. running mean: -18.190328
episode 6952.000000, reward total was -21.000000. running mean: -18.218425
episode 6953.000000, reward total was -17.000000. running mean: -18.206240
episode 6954.000000, reward total was -17.000000. running mean: -18.194178
episode 6955.000000, rewa