In [1]:
import gym
import numpy as np

from gym.wrappers import AtariPreprocessing
# NOTE(review): this only assigns an attribute on the imported gym module; nothing
# visible in this notebook reads it, and the loops below still unpack four values
# from env.step(). Confirm whether it has any effect on the installed gym version.
gym.new_step_api=True
# Shared Pong environment; it is the one passed to train_model in the training cell.
env = gym.make('Pong-v0')

# ---- network dimensions ----
D = 80 * 80 # input dimensionality: 80x80 grid
H = 200 # number of hidden layer neurons

# ---- policy network weights ----
# Gaussian draws scaled by 1/sqrt(fan-in) ("Xavier" initialization).
# W1 is drawn before W2, so the random stream is consumed in that order.
model = {
  'W1': np.random.randn(H,D) / np.sqrt(D),
  'W2': np.random.randn(H) / np.sqrt(H),
}

# ---- hyperparameters ----
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
learning_rate = 1e-3
batch_size = 10 # every how many episodes to do a param update?

# ---- optimizer state: one zero array per weight matrix, same shape and dtype ----
# grad_buffer adds up gradients over a batch; rmsprop_cache is the rmsprop memory.
grad_buffer = { name : np.zeros_like(weights) for name, weights in model.items() }
rmsprop_cache = { name : np.zeros_like(weights) for name, weights in model.items() }

def sigmoid(x):
  """Logistic "squashing" function mapping any real input into the interval [0,1].

  Evaluated in a numerically stable form: only exp(-|x|) is ever computed, so
  large-magnitude inputs saturate to 0 or 1 instead of overflowing inside
  np.exp (the naive 1 / (1 + exp(-x)) overflows for very negative x).

  Args:
    x: scalar or array-like of real numbers.

  Returns:
    A NumPy float scalar for scalar input, otherwise an array shaped like x.
  """
  x = np.asarray(x, dtype=float)
  decay = np.exp(-np.abs(x)) # always within [0, 1], so it cannot overflow
  out = np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
  return out[()] # unwrap 0-d results so scalar callers still get a scalar

def prepro(I):
  """Turn a raw game frame into a flat binary float vector.

  Rows 35..194 are kept, every second row and column is sampled, and only
  colour channel 0 is used. Pixels equal to 144 or 109 (the two background
  values) or already 0 become 0.0; every other pixel (paddles, ball) becomes 1.0.

  The input frame is never modified: the slices below are views into the
  caller's array, so the mask is computed out-of-place rather than assigned
  back into the view.

  Args:
    I: frame indexable as [row, col, channel].
       Assumes a 210x160x3 frame, which is what yields the 80*80 = 6400
       outputs the network expects - TODO confirm against the environment.

  Returns:
    1-D float array of 0.0 / 1.0 values.
  """
  cropped = np.asarray(I)[35:195:2, ::2, 0] # crop + downsample by factor of 2
  foreground = (cropped != 0) & (cropped != 144) & (cropped != 109)
  return foreground.astype(float).ravel()

def discount_rewards(r, discount=None):
  """Compute discounted returns, walking backwards through time.

  The running sum is reset whenever a non-zero reward is met, since in Pong a
  non-zero reward marks a game boundary (pong specific!).

  Args:
    r: array of per-step rewards, either 1-D or a single column of shape (T, 1).
       Integer input is promoted to float so discounted values are not
       truncated when stored.
    discount: per-step discount factor; when omitted the module-level
       `gamma` is used, which keeps existing one-argument calls unchanged.

  Returns:
    Floating-point array with the same shape as r.
  """
  if discount is None:
    discount = gamma
  rewards = np.asarray(r)
  if not np.issubdtype(rewards.dtype, np.floating):
    # zeros_like would otherwise inherit an integer dtype and truncate the returns
    rewards = rewards.astype(float)
  discounted_r = np.zeros_like(rewards)
  running_add = 0
  for t in reversed(range(0, rewards.size)):
    if rewards[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
    running_add = running_add * discount + rewards[t]
    discounted_r[t] = running_add
  return discounted_r

def policy_forward(x):
  """Forward pass of the two-layer policy network held in the module-level `model`.

  Args:
    x: input vector fed to the first layer (multiplied by model['W1']).

  Returns:
    (p, h): p is the sigmoid output, used by the callers as the probability of
    taking action 2; h is the hidden activation after the ReLU.
  """
  pre_activation = model['W1'].dot(x)
  # ReLU nonlinearity: negative entries are replaced by zero, the rest kept as-is
  hidden = np.where(pre_activation < 0, 0.0, pre_activation)
  up_prob = sigmoid(model['W2'].dot(hidden))
  return up_prob, hidden

def policy_backward(epx, eph, epdlogp):
  """Backward pass over one episode, using the module-level `model` weights.

  Args:
    epx: stacked inputs, one row per step.
    eph: stacked hidden states (post-ReLU), one row per step.
    epdlogp: per-step output gradients, already modulated by the caller.

  Returns:
    Dict with keys 'W1' and 'W2' holding the gradient for each weight array.
  """
  grad_w2 = eph.T.dot(epdlogp).ravel()
  # push the output gradient back through W2, one row per step
  grad_hidden = np.outer(epdlogp, model['W2'])
  # backprop through the ReLU: units that were not active pass no gradient
  grad_hidden = np.where(eph <= 0, 0.0, grad_hidden)
  grad_w1 = grad_hidden.T.dot(epx)
  return {'W1': grad_w1, 'W2': grad_w2}

def model_step(model, observation, prev_x):
  """Choose the action for one frame when playing (no sampling).

  The network input is the difference between the current and the previous
  preprocessed frame, or an all-zero vector of length D on the first frame.
  The action is picked by thresholding the policy output at 0.5, so the
  choice is deterministic.

  NOTE(review): `model` is accepted but not forwarded - policy_forward reads
  the module-level `model`.

  Args:
    model: weight dict (see note above).
    observation: raw frame, passed to prepro.
    prev_x: preprocessed previous frame, or None at the start of an episode.

  Returns:
    (action, cur_x): action is 2 or 3; cur_x is this frame's preprocessed
    vector, to be passed back in as prev_x on the next call.
  """
  cur_x = prepro(observation)
  if prev_x is None:
    x = np.zeros(D)
  else:
    x = cur_x - prev_x

  # forward the policy network and take the more likely action
  aprob, _ = policy_forward(x)
  if aprob >= 0.5:
    action = 2
  else:
    action = 3

  return action, cur_x

def play_game(env, model, max_steps=1000):
  """Play one episode with the current policy, recording a frame per step.

  Args:
    env: environment providing reset(), render(mode='rgb_array'), step() and
      close(); step() is expected to return four values.
    model: weight dict handed to model_step.
    max_steps: upper bound on the number of steps played (default 1000, the
      previously hard-coded limit).

  Side effects:
    Prints exactly one summary line, closes `env`, and passes the recorded
    frames to display_frames_as_gif.
    NOTE(review): display_frames_as_gif is not defined in this cell; it has
    to be provided elsewhere in the notebook.
  """
  observation = env.reset()

  frames = []
  cumulated_reward = 0

  prev_x = None # used in computing the difference frame

  for t in range(max_steps):
    frames.append(env.render(mode = 'rgb_array'))
    action, prev_x = model_step(model, observation, prev_x)
    observation, reward, done, info = env.step(action)
    cumulated_reward += reward
    if done:
      print("Episode finished after {} timesteps, accumulated reward = {}".format(t+1, cumulated_reward))
      break
  else:
    # only reached when the step budget ran out without the episode ending;
    # previously this line was printed even after a successful finish
    print("Episode finished without success, accumulated reward = {}".format(cumulated_reward))
  env.close()
  display_frames_as_gif(frames)

def train_model(env, model, total_episodes = 100):
  """Train the policy with batched policy gradients and RMSProp updates.

  Each step samples an action (2 or 3) from the policy output, records the
  input, hidden state, log-probability gradient and reward, and at the end of
  every episode turns them into a gradient that is accumulated in the
  module-level `grad_buffer`. Every `batch_size` episodes the accumulated
  gradient is applied to `model` using the module-level `rmsprop_cache`.

  NOTE(review): the forward and backward passes (policy_forward /
  policy_backward) read the module-level `model`, while the parameter update
  below writes to the `model` argument; the two are only consistent when the
  argument is that same dict. `grad_buffer` and `rmsprop_cache` are module
  state and therefore persist across calls.

  Args:
    env: environment providing reset() and a step() returning four values.
    model: dict of weight arrays, updated in place.
    total_episodes: number of episodes to play; a value <= 0 returns
      immediately without touching the environment.

  Returns:
    List of (episode_number, reward_sum, running_reward) tuples, one per episode.
  """
  hist = []
  if total_episodes <= 0:
    # the loop below only exits after finishing an episode, so bail out up front
    return hist
  observation = env.reset()

  prev_x = None # used in computing the difference frame
  xs,hs,dlogps,drs = [],[],[],[]
  running_reward = None
  reward_sum = 0
  episode_number = 0

  while True:

    cur_x = prepro(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(D)
    prev_x = cur_x

    # forward the policy network and sample an action from the returned probability
    aprob, h = policy_forward(x)
    action = 2 if np.random.uniform() < aprob else 3 # roll the dice!

    # record various intermediates (needed later for backprop)
    xs.append(x) # observation
    hs.append(h) # hidden state
    y = 1 if action == 2 else 0 # a "fake label"
    dlogps.append(y - aprob) # grad that encourages the action that was taken to be taken (see http://cs231n.github.io/neural-networks-2/#losses if confused)

    # step the environment and get new measurements
    observation, reward, done, info = env.step(action)
    reward_sum += reward

    drs.append(reward) # record reward (has to be done after we call step() to get reward for previous action)

    if done: # an episode finished
      episode_number += 1

      # stack together all inputs, hidden states, action gradients, and rewards for this episode
      epx = np.vstack(xs)
      eph = np.vstack(hs)
      epdlogp = np.vstack(dlogps)
      epr = np.vstack(drs)
      xs,hs,dlogps,drs = [],[],[],[] # reset array memory

      # compute the discounted reward backwards through time
      discounted_epr = discount_rewards(epr)
      # standardize the rewards to be unit normal (helps control the gradient estimator variance)
      discounted_epr -= np.mean(discounted_epr)
      reward_std = np.std(discounted_epr)
      if reward_std > 0: # all-equal returns would otherwise divide by zero and fill the gradient with NaN
        discounted_epr /= reward_std

      epdlogp *= discounted_epr # modulate the gradient with advantage (PG magic happens right here.)
      grad = policy_backward(epx, eph, epdlogp)
      for k in model: grad_buffer[k] += grad[k] # accumulate grad over batch

      # perform rmsprop parameter update every batch_size episodes
      if episode_number % batch_size == 0:
        for k,v in model.items():
          g = grad_buffer[k] # gradient
          rmsprop_cache[k] = decay_rate * rmsprop_cache[k] + (1 - decay_rate) * g**2
          model[k] += learning_rate * g / (np.sqrt(rmsprop_cache[k]) + 1e-5) # in-place step in the gradient direction
          grad_buffer[k] = np.zeros_like(v) # reset batch gradient buffer

      # boring book-keeping
      running_reward = reward_sum if running_reward is None else running_reward * 0.99 + reward_sum * 0.01
      hist.append((episode_number, reward_sum, running_reward))
      print ('episode %f, reward total was %f. running mean: %f' % (episode_number, reward_sum, running_reward))
      reward_sum = 0
      observation = env.reset() # reset env
      prev_x = None
      if episode_number >= total_episodes: # >= so a fractional or overshot target still terminates
        return hist

   
    

  logger.warn(
  deprecation(
  deprecation(


In [2]:
# Train for 7000 episodes on the shared env/model and time the whole run;
# hist1 receives the per-episode history list returned by train_model.
%time hist1 = train_model(env, model, total_episodes=7000)

  logger.deprecation(


episode 1.000000, reward total was -21.000000. running mean: -21.000000
episode 2.000000, reward total was -19.000000. running mean: -20.980000
episode 3.000000, reward total was -21.000000. running mean: -20.980200
episode 4.000000, reward total was -21.000000. running mean: -20.980398
episode 5.000000, reward total was -20.000000. running mean: -20.970594
episode 6.000000, reward total was -21.000000. running mean: -20.970888
episode 7.000000, reward total was -21.000000. running mean: -20.971179
episode 8.000000, reward total was -21.000000. running mean: -20.971467
episode 9.000000, reward total was -20.000000. running mean: -20.961753
episode 10.000000, reward total was -21.000000. running mean: -20.962135
episode 11.000000, reward total was -21.000000. running mean: -20.962514
episode 12.000000, reward total was -20.000000. running mean: -20.952889
episode 13.000000, reward total was -21.000000. running mean: -20.953360
episode 14.000000, reward total was -21.000000. running mean

episode 114.000000, reward total was -20.000000. running mean: -20.704704
episode 115.000000, reward total was -20.000000. running mean: -20.697657
episode 116.000000, reward total was -20.000000. running mean: -20.690680
episode 117.000000, reward total was -21.000000. running mean: -20.693773
episode 118.000000, reward total was -20.000000. running mean: -20.686836
episode 119.000000, reward total was -20.000000. running mean: -20.679967
episode 120.000000, reward total was -21.000000. running mean: -20.683168
episode 121.000000, reward total was -21.000000. running mean: -20.686336
episode 122.000000, reward total was -20.000000. running mean: -20.679473
episode 123.000000, reward total was -21.000000. running mean: -20.682678
episode 124.000000, reward total was -19.000000. running mean: -20.665851
episode 125.000000, reward total was -19.000000. running mean: -20.649193
episode 126.000000, reward total was -20.000000. running mean: -20.642701
episode 127.000000, reward total was -

episode 225.000000, reward total was -20.000000. running mean: -20.590564
episode 226.000000, reward total was -21.000000. running mean: -20.594658
episode 227.000000, reward total was -21.000000. running mean: -20.598711
episode 228.000000, reward total was -19.000000. running mean: -20.582724
episode 229.000000, reward total was -20.000000. running mean: -20.576897
episode 230.000000, reward total was -21.000000. running mean: -20.581128
episode 231.000000, reward total was -20.000000. running mean: -20.575317
episode 232.000000, reward total was -20.000000. running mean: -20.569564
episode 233.000000, reward total was -20.000000. running mean: -20.563868
episode 234.000000, reward total was -21.000000. running mean: -20.568229
episode 235.000000, reward total was -19.000000. running mean: -20.552547
episode 236.000000, reward total was -21.000000. running mean: -20.557022
episode 237.000000, reward total was -20.000000. running mean: -20.551451
episode 238.000000, reward total was -

episode 336.000000, reward total was -18.000000. running mean: -20.184322
episode 337.000000, reward total was -21.000000. running mean: -20.192479
episode 338.000000, reward total was -20.000000. running mean: -20.190554
episode 339.000000, reward total was -20.000000. running mean: -20.188649
episode 340.000000, reward total was -20.000000. running mean: -20.186762
episode 341.000000, reward total was -21.000000. running mean: -20.194895
episode 342.000000, reward total was -21.000000. running mean: -20.202946
episode 343.000000, reward total was -21.000000. running mean: -20.210916
episode 344.000000, reward total was -20.000000. running mean: -20.208807
episode 345.000000, reward total was -19.000000. running mean: -20.196719
episode 346.000000, reward total was -19.000000. running mean: -20.184752
episode 347.000000, reward total was -19.000000. running mean: -20.172904
episode 348.000000, reward total was -19.000000. running mean: -20.161175
episode 349.000000, reward total was -

episode 447.000000, reward total was -20.000000. running mean: -19.899648
episode 448.000000, reward total was -21.000000. running mean: -19.910652
episode 449.000000, reward total was -19.000000. running mean: -19.901545
episode 450.000000, reward total was -17.000000. running mean: -19.872530
episode 451.000000, reward total was -19.000000. running mean: -19.863804
episode 452.000000, reward total was -20.000000. running mean: -19.865166
episode 453.000000, reward total was -21.000000. running mean: -19.876515
episode 454.000000, reward total was -15.000000. running mean: -19.827750
episode 455.000000, reward total was -19.000000. running mean: -19.819472
episode 456.000000, reward total was -19.000000. running mean: -19.811277
episode 457.000000, reward total was -16.000000. running mean: -19.773165
episode 458.000000, reward total was -20.000000. running mean: -19.775433
episode 459.000000, reward total was -18.000000. running mean: -19.757679
episode 460.000000, reward total was -

episode 558.000000, reward total was -19.000000. running mean: -19.494017
episode 559.000000, reward total was -20.000000. running mean: -19.499077
episode 560.000000, reward total was -18.000000. running mean: -19.484086
episode 561.000000, reward total was -16.000000. running mean: -19.449245
episode 562.000000, reward total was -18.000000. running mean: -19.434753
episode 563.000000, reward total was -20.000000. running mean: -19.440405
episode 564.000000, reward total was -19.000000. running mean: -19.436001
episode 565.000000, reward total was -20.000000. running mean: -19.441641
episode 566.000000, reward total was -19.000000. running mean: -19.437225
episode 567.000000, reward total was -20.000000. running mean: -19.442852
episode 568.000000, reward total was -19.000000. running mean: -19.438424
episode 569.000000, reward total was -19.000000. running mean: -19.434040
episode 570.000000, reward total was -21.000000. running mean: -19.449699
episode 571.000000, reward total was -

episode 669.000000, reward total was -17.000000. running mean: -18.538623
episode 670.000000, reward total was -18.000000. running mean: -18.533237
episode 671.000000, reward total was -19.000000. running mean: -18.537904
episode 672.000000, reward total was -17.000000. running mean: -18.522525
episode 673.000000, reward total was -18.000000. running mean: -18.517300
episode 674.000000, reward total was -20.000000. running mean: -18.532127
episode 675.000000, reward total was -19.000000. running mean: -18.536806
episode 676.000000, reward total was -14.000000. running mean: -18.491438
episode 677.000000, reward total was -21.000000. running mean: -18.516523
episode 678.000000, reward total was -17.000000. running mean: -18.501358
episode 679.000000, reward total was -21.000000. running mean: -18.526345
episode 680.000000, reward total was -18.000000. running mean: -18.521081
episode 681.000000, reward total was -13.000000. running mean: -18.465870
episode 682.000000, reward total was -

episode 780.000000, reward total was -15.000000. running mean: -17.818262
episode 781.000000, reward total was -17.000000. running mean: -17.810079
episode 782.000000, reward total was -19.000000. running mean: -17.821978
episode 783.000000, reward total was -20.000000. running mean: -17.843759
episode 784.000000, reward total was -18.000000. running mean: -17.845321
episode 785.000000, reward total was -17.000000. running mean: -17.836868
episode 786.000000, reward total was -19.000000. running mean: -17.848499
episode 787.000000, reward total was -15.000000. running mean: -17.820014
episode 788.000000, reward total was -18.000000. running mean: -17.821814
episode 789.000000, reward total was -13.000000. running mean: -17.773596
episode 790.000000, reward total was -18.000000. running mean: -17.775860
episode 791.000000, reward total was -19.000000. running mean: -17.788101
episode 792.000000, reward total was -16.000000. running mean: -17.770220
episode 793.000000, reward total was -

episode 891.000000, reward total was -18.000000. running mean: -17.255975
episode 892.000000, reward total was -15.000000. running mean: -17.233415
episode 893.000000, reward total was -19.000000. running mean: -17.251081
episode 894.000000, reward total was -18.000000. running mean: -17.258570
episode 895.000000, reward total was -17.000000. running mean: -17.255984
episode 896.000000, reward total was -16.000000. running mean: -17.243424
episode 897.000000, reward total was -16.000000. running mean: -17.230990
episode 898.000000, reward total was -20.000000. running mean: -17.258680
episode 899.000000, reward total was -16.000000. running mean: -17.246093
episode 900.000000, reward total was -19.000000. running mean: -17.263633
episode 901.000000, reward total was -17.000000. running mean: -17.260996
episode 902.000000, reward total was -13.000000. running mean: -17.218386
episode 903.000000, reward total was -19.000000. running mean: -17.236202
episode 904.000000, reward total was -

episode 1002.000000, reward total was -19.000000. running mean: -16.752698
episode 1003.000000, reward total was -12.000000. running mean: -16.705171
episode 1004.000000, reward total was -16.000000. running mean: -16.698120
episode 1005.000000, reward total was -14.000000. running mean: -16.671139
episode 1006.000000, reward total was -18.000000. running mean: -16.684427
episode 1007.000000, reward total was -17.000000. running mean: -16.687583
episode 1008.000000, reward total was -15.000000. running mean: -16.670707
episode 1009.000000, reward total was -18.000000. running mean: -16.684000
episode 1010.000000, reward total was -19.000000. running mean: -16.707160
episode 1011.000000, reward total was -14.000000. running mean: -16.680088
episode 1012.000000, reward total was -10.000000. running mean: -16.613287
episode 1013.000000, reward total was -14.000000. running mean: -16.587155
episode 1014.000000, reward total was -15.000000. running mean: -16.571283
episode 1015.000000, rewa

episode 1112.000000, reward total was -18.000000. running mean: -15.857578
episode 1113.000000, reward total was -9.000000. running mean: -15.789002
episode 1114.000000, reward total was -18.000000. running mean: -15.811112
episode 1115.000000, reward total was -11.000000. running mean: -15.763001
episode 1116.000000, reward total was -17.000000. running mean: -15.775371
episode 1117.000000, reward total was -17.000000. running mean: -15.787617
episode 1118.000000, reward total was -16.000000. running mean: -15.789741
episode 1119.000000, reward total was -15.000000. running mean: -15.781843
episode 1120.000000, reward total was -13.000000. running mean: -15.754025
episode 1121.000000, reward total was -13.000000. running mean: -15.726485
episode 1122.000000, reward total was -13.000000. running mean: -15.699220
episode 1123.000000, reward total was -17.000000. running mean: -15.712228
episode 1124.000000, reward total was -13.000000. running mean: -15.685105
episode 1125.000000, rewar

episode 1222.000000, reward total was -16.000000. running mean: -15.314280
episode 1223.000000, reward total was -13.000000. running mean: -15.291138
episode 1224.000000, reward total was -17.000000. running mean: -15.308226
episode 1225.000000, reward total was -18.000000. running mean: -15.335144
episode 1226.000000, reward total was -13.000000. running mean: -15.311793
episode 1227.000000, reward total was -14.000000. running mean: -15.298675
episode 1228.000000, reward total was -15.000000. running mean: -15.295688
episode 1229.000000, reward total was -13.000000. running mean: -15.272731
episode 1230.000000, reward total was -15.000000. running mean: -15.270004
episode 1231.000000, reward total was -13.000000. running mean: -15.247304
episode 1232.000000, reward total was -19.000000. running mean: -15.284831
episode 1233.000000, reward total was -15.000000. running mean: -15.281982
episode 1234.000000, reward total was -17.000000. running mean: -15.299162
episode 1235.000000, rewa

episode 1332.000000, reward total was -13.000000. running mean: -15.017681
episode 1333.000000, reward total was -15.000000. running mean: -15.017504
episode 1334.000000, reward total was -12.000000. running mean: -14.987329
episode 1335.000000, reward total was -17.000000. running mean: -15.007456
episode 1336.000000, reward total was -17.000000. running mean: -15.027381
episode 1337.000000, reward total was -19.000000. running mean: -15.067108
episode 1338.000000, reward total was -12.000000. running mean: -15.036436
episode 1339.000000, reward total was -14.000000. running mean: -15.026072
episode 1340.000000, reward total was -18.000000. running mean: -15.055811
episode 1341.000000, reward total was -17.000000. running mean: -15.075253
episode 1342.000000, reward total was -14.000000. running mean: -15.064501
episode 1343.000000, reward total was -12.000000. running mean: -15.033856
episode 1344.000000, reward total was -11.000000. running mean: -14.993517
episode 1345.000000, rewa

episode 1442.000000, reward total was -18.000000. running mean: -13.883147
episode 1443.000000, reward total was -15.000000. running mean: -13.894315
episode 1444.000000, reward total was -6.000000. running mean: -13.815372
episode 1445.000000, reward total was -15.000000. running mean: -13.827219
episode 1446.000000, reward total was -8.000000. running mean: -13.768946
episode 1447.000000, reward total was -13.000000. running mean: -13.761257
episode 1448.000000, reward total was -15.000000. running mean: -13.773644
episode 1449.000000, reward total was -12.000000. running mean: -13.755908
episode 1450.000000, reward total was -11.000000. running mean: -13.728349
episode 1451.000000, reward total was -8.000000. running mean: -13.671065
episode 1452.000000, reward total was -16.000000. running mean: -13.694355
episode 1453.000000, reward total was -14.000000. running mean: -13.697411
episode 1454.000000, reward total was -16.000000. running mean: -13.720437
episode 1455.000000, reward 

episode 1552.000000, reward total was -8.000000. running mean: -13.174287
episode 1553.000000, reward total was -13.000000. running mean: -13.172544
episode 1554.000000, reward total was -9.000000. running mean: -13.130819
episode 1555.000000, reward total was -15.000000. running mean: -13.149510
episode 1556.000000, reward total was -15.000000. running mean: -13.168015
episode 1557.000000, reward total was -14.000000. running mean: -13.176335
episode 1558.000000, reward total was -9.000000. running mean: -13.134572
episode 1559.000000, reward total was -5.000000. running mean: -13.053226
episode 1560.000000, reward total was -17.000000. running mean: -13.092694
episode 1561.000000, reward total was -12.000000. running mean: -13.081767
episode 1562.000000, reward total was -10.000000. running mean: -13.050949
episode 1563.000000, reward total was -17.000000. running mean: -13.090440
episode 1564.000000, reward total was -17.000000. running mean: -13.129535
episode 1565.000000, reward t

episode 1662.000000, reward total was -12.000000. running mean: -12.600396
episode 1663.000000, reward total was -14.000000. running mean: -12.614392
episode 1664.000000, reward total was -9.000000. running mean: -12.578248
episode 1665.000000, reward total was -19.000000. running mean: -12.642465
episode 1666.000000, reward total was -14.000000. running mean: -12.656041
episode 1667.000000, reward total was -17.000000. running mean: -12.699480
episode 1668.000000, reward total was -10.000000. running mean: -12.672486
episode 1669.000000, reward total was -12.000000. running mean: -12.665761
episode 1670.000000, reward total was -13.000000. running mean: -12.669103
episode 1671.000000, reward total was -14.000000. running mean: -12.682412
episode 1672.000000, reward total was -13.000000. running mean: -12.685588
episode 1673.000000, reward total was -12.000000. running mean: -12.678732
episode 1674.000000, reward total was -8.000000. running mean: -12.631945
episode 1675.000000, reward

episode 1772.000000, reward total was -13.000000. running mean: -12.195155
episode 1773.000000, reward total was -17.000000. running mean: -12.243203
episode 1774.000000, reward total was -1.000000. running mean: -12.130771
episode 1775.000000, reward total was -9.000000. running mean: -12.099463
episode 1776.000000, reward total was -11.000000. running mean: -12.088469
episode 1777.000000, reward total was -13.000000. running mean: -12.097584
episode 1778.000000, reward total was -11.000000. running mean: -12.086608
episode 1779.000000, reward total was -8.000000. running mean: -12.045742
episode 1780.000000, reward total was -12.000000. running mean: -12.045285
episode 1781.000000, reward total was -11.000000. running mean: -12.034832
episode 1782.000000, reward total was -17.000000. running mean: -12.084484
episode 1783.000000, reward total was -12.000000. running mean: -12.083639
episode 1784.000000, reward total was -12.000000. running mean: -12.082802
episode 1785.000000, reward 

episode 1882.000000, reward total was -8.000000. running mean: -12.228445
episode 1883.000000, reward total was -10.000000. running mean: -12.206161
episode 1884.000000, reward total was -6.000000. running mean: -12.144099
episode 1885.000000, reward total was -6.000000. running mean: -12.082658
episode 1886.000000, reward total was -12.000000. running mean: -12.081832
episode 1887.000000, reward total was -14.000000. running mean: -12.101013
episode 1888.000000, reward total was -2.000000. running mean: -12.000003
episode 1889.000000, reward total was -14.000000. running mean: -12.020003
episode 1890.000000, reward total was -9.000000. running mean: -11.989803
episode 1891.000000, reward total was -13.000000. running mean: -11.999905
episode 1892.000000, reward total was -15.000000. running mean: -12.029906
episode 1893.000000, reward total was -13.000000. running mean: -12.039607
episode 1894.000000, reward total was -9.000000. running mean: -12.009211
episode 1895.000000, reward tot

episode 1992.000000, reward total was -4.000000. running mean: -11.267072
episode 1993.000000, reward total was -19.000000. running mean: -11.344402
episode 1994.000000, reward total was -5.000000. running mean: -11.280958
episode 1995.000000, reward total was -12.000000. running mean: -11.288148
episode 1996.000000, reward total was -5.000000. running mean: -11.225266
episode 1997.000000, reward total was -13.000000. running mean: -11.243014
episode 1998.000000, reward total was -12.000000. running mean: -11.250584
episode 1999.000000, reward total was -13.000000. running mean: -11.268078
episode 2000.000000, reward total was -13.000000. running mean: -11.285397
episode 2001.000000, reward total was -11.000000. running mean: -11.282543
episode 2002.000000, reward total was -10.000000. running mean: -11.269718
episode 2003.000000, reward total was -12.000000. running mean: -11.277020
episode 2004.000000, reward total was -12.000000. running mean: -11.284250
episode 2005.000000, reward 

episode 2102.000000, reward total was -11.000000. running mean: -11.267313
episode 2103.000000, reward total was -17.000000. running mean: -11.324640
episode 2104.000000, reward total was -13.000000. running mean: -11.341393
episode 2105.000000, reward total was -11.000000. running mean: -11.337979
episode 2106.000000, reward total was -16.000000. running mean: -11.384599
episode 2107.000000, reward total was -10.000000. running mean: -11.370753
episode 2108.000000, reward total was -8.000000. running mean: -11.337046
episode 2109.000000, reward total was -10.000000. running mean: -11.323675
episode 2110.000000, reward total was -13.000000. running mean: -11.340439
episode 2111.000000, reward total was -15.000000. running mean: -11.377034
episode 2112.000000, reward total was -9.000000. running mean: -11.353264
episode 2113.000000, reward total was -16.000000. running mean: -11.399731
episode 2114.000000, reward total was -1.000000. running mean: -11.295734
episode 2115.000000, reward 

episode 2212.000000, reward total was -8.000000. running mean: -10.761051
episode 2213.000000, reward total was -12.000000. running mean: -10.773440
episode 2214.000000, reward total was -10.000000. running mean: -10.765706
episode 2215.000000, reward total was -18.000000. running mean: -10.838049
episode 2216.000000, reward total was -17.000000. running mean: -10.899668
episode 2217.000000, reward total was -11.000000. running mean: -10.900671
episode 2218.000000, reward total was -10.000000. running mean: -10.891665
episode 2219.000000, reward total was -11.000000. running mean: -10.892748
episode 2220.000000, reward total was -9.000000. running mean: -10.873821
episode 2221.000000, reward total was -11.000000. running mean: -10.875082
episode 2222.000000, reward total was -9.000000. running mean: -10.856332
episode 2223.000000, reward total was -17.000000. running mean: -10.917768
episode 2224.000000, reward total was -20.000000. running mean: -11.008591
episode 2225.000000, reward 

episode 2322.000000, reward total was -13.000000. running mean: -10.538957
episode 2323.000000, reward total was -11.000000. running mean: -10.543568
episode 2324.000000, reward total was -9.000000. running mean: -10.528132
episode 2325.000000, reward total was -14.000000. running mean: -10.562851
episode 2326.000000, reward total was -9.000000. running mean: -10.547222
episode 2327.000000, reward total was -16.000000. running mean: -10.601750
episode 2328.000000, reward total was -12.000000. running mean: -10.615733
episode 2329.000000, reward total was -9.000000. running mean: -10.599575
episode 2330.000000, reward total was -15.000000. running mean: -10.643580
episode 2331.000000, reward total was -11.000000. running mean: -10.647144
episode 2332.000000, reward total was -3.000000. running mean: -10.570672
episode 2333.000000, reward total was -7.000000. running mean: -10.534966
episode 2334.000000, reward total was -12.000000. running mean: -10.549616
episode 2335.000000, reward to

episode 2432.000000, reward total was -3.000000. running mean: -10.995075
episode 2433.000000, reward total was -15.000000. running mean: -11.035125
episode 2434.000000, reward total was -7.000000. running mean: -10.994773
episode 2435.000000, reward total was -12.000000. running mean: -11.004826
episode 2436.000000, reward total was -11.000000. running mean: -11.004777
episode 2437.000000, reward total was -11.000000. running mean: -11.004730
episode 2438.000000, reward total was -11.000000. running mean: -11.004682
episode 2439.000000, reward total was -13.000000. running mean: -11.024635
episode 2440.000000, reward total was -10.000000. running mean: -11.014389
episode 2441.000000, reward total was -9.000000. running mean: -10.994245
episode 2442.000000, reward total was -17.000000. running mean: -11.054303
episode 2443.000000, reward total was -7.000000. running mean: -11.013760
episode 2444.000000, reward total was -11.000000. running mean: -11.013622
episode 2445.000000, reward t

episode 2542.000000, reward total was -9.000000. running mean: -10.509809
episode 2543.000000, reward total was -15.000000. running mean: -10.554711
episode 2544.000000, reward total was -10.000000. running mean: -10.549164
episode 2545.000000, reward total was -13.000000. running mean: -10.573672
episode 2546.000000, reward total was -5.000000. running mean: -10.517935
episode 2547.000000, reward total was -6.000000. running mean: -10.472756
episode 2548.000000, reward total was -7.000000. running mean: -10.438029
episode 2549.000000, reward total was -7.000000. running mean: -10.403648
episode 2550.000000, reward total was -11.000000. running mean: -10.409612
episode 2551.000000, reward total was -11.000000. running mean: -10.415516
episode 2552.000000, reward total was -5.000000. running mean: -10.361360
episode 2553.000000, reward total was -15.000000. running mean: -10.407747
episode 2554.000000, reward total was -1.000000. running mean: -10.313669
episode 2555.000000, reward tota

episode 2652.000000, reward total was -17.000000. running mean: -10.311255
episode 2653.000000, reward total was -6.000000. running mean: -10.268142
episode 2654.000000, reward total was -15.000000. running mean: -10.315461
episode 2655.000000, reward total was -13.000000. running mean: -10.342306
episode 2656.000000, reward total was -9.000000. running mean: -10.328883
episode 2657.000000, reward total was -13.000000. running mean: -10.355594
episode 2658.000000, reward total was -17.000000. running mean: -10.422038
episode 2659.000000, reward total was -9.000000. running mean: -10.407818
episode 2660.000000, reward total was -6.000000. running mean: -10.363740
episode 2661.000000, reward total was 3.000000. running mean: -10.230102
episode 2662.000000, reward total was -5.000000. running mean: -10.177801
episode 2663.000000, reward total was -6.000000. running mean: -10.136023
episode 2664.000000, reward total was -7.000000. running mean: -10.104663
episode 2665.000000, reward total 

episode 2764.000000, reward total was -6.000000. running mean: -9.518380
episode 2765.000000, reward total was -15.000000. running mean: -9.573196
episode 2766.000000, reward total was -13.000000. running mean: -9.607464
episode 2767.000000, reward total was -5.000000. running mean: -9.561389
episode 2768.000000, reward total was -14.000000. running mean: -9.605776
episode 2769.000000, reward total was -8.000000. running mean: -9.589718
episode 2770.000000, reward total was -3.000000. running mean: -9.523821
episode 2771.000000, reward total was -9.000000. running mean: -9.518582
episode 2772.000000, reward total was -13.000000. running mean: -9.553397
episode 2773.000000, reward total was -4.000000. running mean: -9.497863
episode 2774.000000, reward total was -17.000000. running mean: -9.572884
episode 2775.000000, reward total was -12.000000. running mean: -9.597155
episode 2776.000000, reward total was -15.000000. running mean: -9.651184
episode 2777.000000, reward total was -14.00

episode 2876.000000, reward total was -11.000000. running mean: -8.866783
episode 2877.000000, reward total was -12.000000. running mean: -8.898115
episode 2878.000000, reward total was -11.000000. running mean: -8.919134
episode 2879.000000, reward total was -3.000000. running mean: -8.859942
episode 2880.000000, reward total was -1.000000. running mean: -8.781343
episode 2881.000000, reward total was -15.000000. running mean: -8.843530
episode 2882.000000, reward total was -10.000000. running mean: -8.855094
episode 2883.000000, reward total was -12.000000. running mean: -8.886543
episode 2884.000000, reward total was -9.000000. running mean: -8.887678
episode 2885.000000, reward total was -6.000000. running mean: -8.858801
episode 2886.000000, reward total was -11.000000. running mean: -8.880213
episode 2887.000000, reward total was -5.000000. running mean: -8.841411
episode 2888.000000, reward total was -6.000000. running mean: -8.812997
episode 2889.000000, reward total was -15.00

episode 2988.000000, reward total was -15.000000. running mean: -8.688666
episode 2989.000000, reward total was -16.000000. running mean: -8.761779
episode 2990.000000, reward total was -17.000000. running mean: -8.844161
episode 2991.000000, reward total was -13.000000. running mean: -8.885719
episode 2992.000000, reward total was -10.000000. running mean: -8.896862
episode 2993.000000, reward total was -16.000000. running mean: -8.967894
episode 2994.000000, reward total was -9.000000. running mean: -8.968215
episode 2995.000000, reward total was -4.000000. running mean: -8.918533
episode 2996.000000, reward total was -10.000000. running mean: -8.929347
episode 2997.000000, reward total was -8.000000. running mean: -8.920054
episode 2998.000000, reward total was -4.000000. running mean: -8.870853
episode 2999.000000, reward total was -9.000000. running mean: -8.872145
episode 3000.000000, reward total was -12.000000. running mean: -8.903423
episode 3001.000000, reward total was 6.000

episode 3100.000000, reward total was -1.000000. running mean: -8.205517
episode 3101.000000, reward total was -5.000000. running mean: -8.173462
episode 3102.000000, reward total was -7.000000. running mean: -8.161727
episode 3103.000000, reward total was -9.000000. running mean: -8.170110
episode 3104.000000, reward total was -7.000000. running mean: -8.158409
episode 3105.000000, reward total was -5.000000. running mean: -8.126825
episode 3106.000000, reward total was -13.000000. running mean: -8.175556
episode 3107.000000, reward total was -14.000000. running mean: -8.233801
episode 3108.000000, reward total was -9.000000. running mean: -8.241463
episode 3109.000000, reward total was -15.000000. running mean: -8.309048
episode 3110.000000, reward total was -13.000000. running mean: -8.355958
episode 3111.000000, reward total was -10.000000. running mean: -8.372398
episode 3112.000000, reward total was 3.000000. running mean: -8.258674
episode 3113.000000, reward total was -8.000000

episode 3212.000000, reward total was -7.000000. running mean: -8.141572
episode 3213.000000, reward total was -3.000000. running mean: -8.090156
episode 3214.000000, reward total was -9.000000. running mean: -8.099255
episode 3215.000000, reward total was -3.000000. running mean: -8.048262
episode 3216.000000, reward total was -11.000000. running mean: -8.077779
episode 3217.000000, reward total was -11.000000. running mean: -8.107002
episode 3218.000000, reward total was -7.000000. running mean: -8.095932
episode 3219.000000, reward total was -1.000000. running mean: -8.024972
episode 3220.000000, reward total was -11.000000. running mean: -8.054723
episode 3221.000000, reward total was -5.000000. running mean: -8.024175
episode 3222.000000, reward total was -14.000000. running mean: -8.083934
episode 3223.000000, reward total was -7.000000. running mean: -8.073094
episode 3224.000000, reward total was -10.000000. running mean: -8.092363
episode 3225.000000, reward total was -1.00000

episode 3324.000000, reward total was -9.000000. running mean: -7.741637
episode 3325.000000, reward total was -9.000000. running mean: -7.754221
episode 3326.000000, reward total was -6.000000. running mean: -7.736679
episode 3327.000000, reward total was -13.000000. running mean: -7.789312
episode 3328.000000, reward total was 5.000000. running mean: -7.661419
episode 3329.000000, reward total was 4.000000. running mean: -7.544805
episode 3330.000000, reward total was -1.000000. running mean: -7.479357
episode 3331.000000, reward total was -13.000000. running mean: -7.534563
episode 3332.000000, reward total was -5.000000. running mean: -7.509217
episode 3333.000000, reward total was -12.000000. running mean: -7.554125
episode 3334.000000, reward total was -11.000000. running mean: -7.588584
episode 3335.000000, reward total was -4.000000. running mean: -7.552698
episode 3336.000000, reward total was -7.000000. running mean: -7.547171
episode 3337.000000, reward total was -11.000000.

episode 3436.000000, reward total was -14.000000. running mean: -7.163907
episode 3437.000000, reward total was -9.000000. running mean: -7.182268
episode 3438.000000, reward total was -6.000000. running mean: -7.170446
episode 3439.000000, reward total was -8.000000. running mean: -7.178741
episode 3440.000000, reward total was -12.000000. running mean: -7.226954
episode 3441.000000, reward total was -6.000000. running mean: -7.214684
episode 3442.000000, reward total was -11.000000. running mean: -7.252537
episode 3443.000000, reward total was -15.000000. running mean: -7.330012
episode 3444.000000, reward total was -6.000000. running mean: -7.316712
episode 3445.000000, reward total was -12.000000. running mean: -7.363545
episode 3446.000000, reward total was 6.000000. running mean: -7.229909
episode 3447.000000, reward total was -6.000000. running mean: -7.217610
episode 3448.000000, reward total was -5.000000. running mean: -7.195434
episode 3449.000000, reward total was -11.00000

episode 3549.000000, reward total was 1.000000. running mean: -5.898014
episode 3550.000000, reward total was -10.000000. running mean: -5.939034
episode 3551.000000, reward total was -4.000000. running mean: -5.919643
episode 3552.000000, reward total was 1.000000. running mean: -5.850447
episode 3553.000000, reward total was 9.000000. running mean: -5.701943
episode 3554.000000, reward total was -11.000000. running mean: -5.754923
episode 3555.000000, reward total was -7.000000. running mean: -5.767374
episode 3556.000000, reward total was 12.000000. running mean: -5.589700
episode 3557.000000, reward total was -4.000000. running mean: -5.573803
episode 3558.000000, reward total was 4.000000. running mean: -5.478065
episode 3559.000000, reward total was -11.000000. running mean: -5.533284
episode 3560.000000, reward total was 12.000000. running mean: -5.357952
episode 3561.000000, reward total was -7.000000. running mean: -5.374372
episode 3562.000000, reward total was -5.000000. run

episode 3662.000000, reward total was 2.000000. running mean: -4.980306
episode 3663.000000, reward total was -1.000000. running mean: -4.940503
episode 3664.000000, reward total was -7.000000. running mean: -4.961098
episode 3665.000000, reward total was -13.000000. running mean: -5.041487
episode 3666.000000, reward total was -11.000000. running mean: -5.101072
episode 3667.000000, reward total was -8.000000. running mean: -5.130062
episode 3668.000000, reward total was -8.000000. running mean: -5.158761
episode 3669.000000, reward total was -5.000000. running mean: -5.157174
episode 3670.000000, reward total was -8.000000. running mean: -5.185602
episode 3671.000000, reward total was 6.000000. running mean: -5.073746
episode 3672.000000, reward total was -8.000000. running mean: -5.103008
episode 3673.000000, reward total was 7.000000. running mean: -4.981978
episode 3674.000000, reward total was -5.000000. running mean: -4.982158
episode 3675.000000, reward total was -7.000000. run

episode 3775.000000, reward total was 5.000000. running mean: -4.252241
episode 3776.000000, reward total was -5.000000. running mean: -4.259718
episode 3777.000000, reward total was -8.000000. running mean: -4.297121
episode 3778.000000, reward total was 4.000000. running mean: -4.214150
episode 3779.000000, reward total was -12.000000. running mean: -4.292008
episode 3780.000000, reward total was -2.000000. running mean: -4.269088
episode 3781.000000, reward total was 7.000000. running mean: -4.156397
episode 3782.000000, reward total was 3.000000. running mean: -4.084833
episode 3783.000000, reward total was -8.000000. running mean: -4.123985
episode 3784.000000, reward total was -5.000000. running mean: -4.132745
episode 3785.000000, reward total was 4.000000. running mean: -4.051418
episode 3786.000000, reward total was -12.000000. running mean: -4.130903
episode 3787.000000, reward total was -11.000000. running mean: -4.199594
episode 3788.000000, reward total was -1.000000. runn

episode 3888.000000, reward total was 4.000000. running mean: -3.239654
episode 3889.000000, reward total was -3.000000. running mean: -3.237257
episode 3890.000000, reward total was 1.000000. running mean: -3.194885
episode 3891.000000, reward total was -2.000000. running mean: -3.182936
episode 3892.000000, reward total was -8.000000. running mean: -3.231106
episode 3893.000000, reward total was 8.000000. running mean: -3.118795
episode 3894.000000, reward total was 1.000000. running mean: -3.077607
episode 3895.000000, reward total was 4.000000. running mean: -3.006831
episode 3896.000000, reward total was -1.000000. running mean: -2.986763
episode 3897.000000, reward total was -3.000000. running mean: -2.986895
episode 3898.000000, reward total was -5.000000. running mean: -3.007026
episode 3899.000000, reward total was -4.000000. running mean: -3.016956
episode 3900.000000, reward total was -2.000000. running mean: -3.006787
episode 3901.000000, reward total was -11.000000. runnin

episode 4001.000000, reward total was -2.000000. running mean: -3.021390
episode 4002.000000, reward total was -7.000000. running mean: -3.061176
episode 4003.000000, reward total was -5.000000. running mean: -3.080565
episode 4004.000000, reward total was 11.000000. running mean: -2.939759
episode 4005.000000, reward total was 1.000000. running mean: -2.900361
episode 4006.000000, reward total was 6.000000. running mean: -2.811358
episode 4007.000000, reward total was 3.000000. running mean: -2.753244
episode 4008.000000, reward total was -1.000000. running mean: -2.735712
episode 4009.000000, reward total was -5.000000. running mean: -2.758355
episode 4010.000000, reward total was 12.000000. running mean: -2.610771
episode 4011.000000, reward total was -10.000000. running mean: -2.684663
episode 4012.000000, reward total was -2.000000. running mean: -2.677817
episode 4013.000000, reward total was -6.000000. running mean: -2.711039
episode 4014.000000, reward total was -3.000000. runn

episode 4114.000000, reward total was 1.000000. running mean: -3.160406
episode 4115.000000, reward total was -7.000000. running mean: -3.198802
episode 4116.000000, reward total was 6.000000. running mean: -3.106814
episode 4117.000000, reward total was 9.000000. running mean: -2.985746
episode 4118.000000, reward total was -8.000000. running mean: -3.035888
episode 4119.000000, reward total was 4.000000. running mean: -2.965529
episode 4120.000000, reward total was -1.000000. running mean: -2.945874
episode 4121.000000, reward total was -3.000000. running mean: -2.946415
episode 4122.000000, reward total was -3.000000. running mean: -2.946951
episode 4123.000000, reward total was -12.000000. running mean: -3.037482
episode 4124.000000, reward total was -9.000000. running mean: -3.097107
episode 4125.000000, reward total was -3.000000. running mean: -3.096136
episode 4126.000000, reward total was -5.000000. running mean: -3.115174
episode 4127.000000, reward total was -1.000000. runni

episode 4227.000000, reward total was 2.000000. running mean: -2.489798
episode 4228.000000, reward total was -7.000000. running mean: -2.534900
episode 4229.000000, reward total was -1.000000. running mean: -2.519551
episode 4230.000000, reward total was -9.000000. running mean: -2.584356
episode 4231.000000, reward total was -13.000000. running mean: -2.688512
episode 4232.000000, reward total was -2.000000. running mean: -2.681627
episode 4233.000000, reward total was 2.000000. running mean: -2.634811
episode 4234.000000, reward total was 11.000000. running mean: -2.498463
episode 4235.000000, reward total was -1.000000. running mean: -2.483478
episode 4236.000000, reward total was -7.000000. running mean: -2.528643
episode 4237.000000, reward total was 5.000000. running mean: -2.453357
episode 4238.000000, reward total was -5.000000. running mean: -2.478823
episode 4239.000000, reward total was 4.000000. running mean: -2.414035
episode 4240.000000, reward total was 4.000000. runnin

episode 4340.000000, reward total was 3.000000. running mean: -2.084403
episode 4341.000000, reward total was -4.000000. running mean: -2.103559
episode 4342.000000, reward total was -4.000000. running mean: -2.122523
episode 4343.000000, reward total was 8.000000. running mean: -2.021298
episode 4344.000000, reward total was -9.000000. running mean: -2.091085
episode 4345.000000, reward total was -7.000000. running mean: -2.140174
episode 4346.000000, reward total was 3.000000. running mean: -2.088772
episode 4347.000000, reward total was -10.000000. running mean: -2.167884
episode 4348.000000, reward total was 1.000000. running mean: -2.136206
episode 4349.000000, reward total was -7.000000. running mean: -2.184844
episode 4350.000000, reward total was -5.000000. running mean: -2.212995
episode 4351.000000, reward total was 5.000000. running mean: -2.140865
episode 4352.000000, reward total was -5.000000. running mean: -2.169456
episode 4353.000000, reward total was -11.000000. runni

episode 4453.000000, reward total was -4.000000. running mean: -1.967481
episode 4454.000000, reward total was -11.000000. running mean: -2.057806
episode 4455.000000, reward total was -13.000000. running mean: -2.167228
episode 4456.000000, reward total was -5.000000. running mean: -2.195556
episode 4457.000000, reward total was 1.000000. running mean: -2.163600
episode 4458.000000, reward total was 3.000000. running mean: -2.111964
episode 4459.000000, reward total was 5.000000. running mean: -2.040845
episode 4460.000000, reward total was 3.000000. running mean: -1.990436
episode 4461.000000, reward total was -9.000000. running mean: -2.060532
episode 4462.000000, reward total was 6.000000. running mean: -1.979927
episode 4463.000000, reward total was -6.000000. running mean: -2.020127
episode 4464.000000, reward total was -5.000000. running mean: -2.049926
episode 4465.000000, reward total was -4.000000. running mean: -2.069427
episode 4466.000000, reward total was -6.000000. runni

episode 4566.000000, reward total was 4.000000. running mean: -2.441824
episode 4567.000000, reward total was -11.000000. running mean: -2.527406
episode 4568.000000, reward total was 7.000000. running mean: -2.432132
episode 4569.000000, reward total was 4.000000. running mean: -2.367811
episode 4570.000000, reward total was -5.000000. running mean: -2.394133
episode 4571.000000, reward total was -9.000000. running mean: -2.460191
episode 4572.000000, reward total was -1.000000. running mean: -2.445589
episode 4573.000000, reward total was -11.000000. running mean: -2.531133
episode 4574.000000, reward total was 2.000000. running mean: -2.485822
episode 4575.000000, reward total was 1.000000. running mean: -2.450964
episode 4576.000000, reward total was -9.000000. running mean: -2.516454
episode 4577.000000, reward total was -1.000000. running mean: -2.501290
episode 4578.000000, reward total was -7.000000. running mean: -2.546277
episode 4579.000000, reward total was -5.000000. runni

episode 4679.000000, reward total was 1.000000. running mean: -3.010576
episode 4680.000000, reward total was -2.000000. running mean: -3.000470
episode 4681.000000, reward total was 11.000000. running mean: -2.860465
episode 4682.000000, reward total was 7.000000. running mean: -2.761860
episode 4683.000000, reward total was -6.000000. running mean: -2.794242
episode 4684.000000, reward total was -11.000000. running mean: -2.876299
episode 4685.000000, reward total was -4.000000. running mean: -2.887536
episode 4686.000000, reward total was -2.000000. running mean: -2.878661
episode 4687.000000, reward total was 8.000000. running mean: -2.769874
episode 4688.000000, reward total was -2.000000. running mean: -2.762176
episode 4689.000000, reward total was -5.000000. running mean: -2.784554
episode 4690.000000, reward total was -3.000000. running mean: -2.786708
episode 4691.000000, reward total was -1.000000. running mean: -2.768841
episode 4692.000000, reward total was -9.000000. runn

episode 4792.000000, reward total was 3.000000. running mean: -1.664898
episode 4793.000000, reward total was 4.000000. running mean: -1.608249
episode 4794.000000, reward total was -8.000000. running mean: -1.672166
episode 4795.000000, reward total was -12.000000. running mean: -1.775445
episode 4796.000000, reward total was 1.000000. running mean: -1.747690
episode 4797.000000, reward total was 5.000000. running mean: -1.680213
episode 4798.000000, reward total was -8.000000. running mean: -1.743411
episode 4799.000000, reward total was 10.000000. running mean: -1.625977
episode 4800.000000, reward total was 4.000000. running mean: -1.569717
episode 4801.000000, reward total was -10.000000. running mean: -1.654020
episode 4802.000000, reward total was 8.000000. running mean: -1.557480
episode 4803.000000, reward total was -1.000000. running mean: -1.551905
episode 4804.000000, reward total was -3.000000. running mean: -1.566386
episode 4805.000000, reward total was 7.000000. running

episode 4905.000000, reward total was 3.000000. running mean: -1.827816
episode 4906.000000, reward total was -6.000000. running mean: -1.869538
episode 4907.000000, reward total was -1.000000. running mean: -1.860843
episode 4908.000000, reward total was 7.000000. running mean: -1.772234
episode 4909.000000, reward total was 6.000000. running mean: -1.694512
episode 4910.000000, reward total was -1.000000. running mean: -1.687567
episode 4911.000000, reward total was -6.000000. running mean: -1.730691
episode 4912.000000, reward total was -3.000000. running mean: -1.743384
episode 4913.000000, reward total was 4.000000. running mean: -1.685951
episode 4914.000000, reward total was -5.000000. running mean: -1.719091
episode 4915.000000, reward total was 4.000000. running mean: -1.661900
episode 4916.000000, reward total was 2.000000. running mean: -1.625281
episode 4917.000000, reward total was -5.000000. running mean: -1.659028
episode 4918.000000, reward total was 4.000000. running m

episode 5018.000000, reward total was 5.000000. running mean: -1.304990
episode 5019.000000, reward total was -2.000000. running mean: -1.311941
episode 5020.000000, reward total was 4.000000. running mean: -1.258821
episode 5021.000000, reward total was -5.000000. running mean: -1.296233
episode 5022.000000, reward total was -8.000000. running mean: -1.363271
episode 5023.000000, reward total was -1.000000. running mean: -1.359638
episode 5024.000000, reward total was 11.000000. running mean: -1.236042
episode 5025.000000, reward total was -4.000000. running mean: -1.263681
episode 5026.000000, reward total was 4.000000. running mean: -1.211044
episode 5027.000000, reward total was -7.000000. running mean: -1.268934
episode 5028.000000, reward total was 7.000000. running mean: -1.186245
episode 5029.000000, reward total was 3.000000. running mean: -1.144382
episode 5030.000000, reward total was -1.000000. running mean: -1.142938
episode 5031.000000, reward total was -13.000000. runnin

episode 5131.000000, reward total was 7.000000. running mean: -0.853050
episode 5132.000000, reward total was 11.000000. running mean: -0.734520
episode 5133.000000, reward total was -10.000000. running mean: -0.827174
episode 5134.000000, reward total was 7.000000. running mean: -0.748903
episode 5135.000000, reward total was 2.000000. running mean: -0.721414
episode 5136.000000, reward total was 7.000000. running mean: -0.644199
episode 5137.000000, reward total was 6.000000. running mean: -0.577757
episode 5138.000000, reward total was 4.000000. running mean: -0.531980
episode 5139.000000, reward total was -10.000000. running mean: -0.626660
episode 5140.000000, reward total was 3.000000. running mean: -0.590393
episode 5141.000000, reward total was -4.000000. running mean: -0.624490
episode 5142.000000, reward total was 5.000000. running mean: -0.568245
episode 5143.000000, reward total was 6.000000. running mean: -0.502562
episode 5144.000000, reward total was -3.000000. running m

episode 5244.000000, reward total was 1.000000. running mean: -0.769562
episode 5245.000000, reward total was -9.000000. running mean: -0.851866
episode 5246.000000, reward total was -3.000000. running mean: -0.873347
episode 5247.000000, reward total was -5.000000. running mean: -0.914614
episode 5248.000000, reward total was 3.000000. running mean: -0.875468
episode 5249.000000, reward total was -8.000000. running mean: -0.946713
episode 5250.000000, reward total was -10.000000. running mean: -1.037246
episode 5251.000000, reward total was -8.000000. running mean: -1.106873
episode 5252.000000, reward total was 10.000000. running mean: -0.995805
episode 5253.000000, reward total was -5.000000. running mean: -1.035847
episode 5254.000000, reward total was 2.000000. running mean: -1.005488
episode 5255.000000, reward total was -11.000000. running mean: -1.105433
episode 5256.000000, reward total was -5.000000. running mean: -1.144379
episode 5257.000000, reward total was 3.000000. runn

episode 5357.000000, reward total was -10.000000. running mean: -0.993136
episode 5358.000000, reward total was -1.000000. running mean: -0.993205
episode 5359.000000, reward total was 3.000000. running mean: -0.953273
episode 5360.000000, reward total was -3.000000. running mean: -0.973740
episode 5361.000000, reward total was -1.000000. running mean: -0.974003
episode 5362.000000, reward total was -15.000000. running mean: -1.114263
episode 5363.000000, reward total was 7.000000. running mean: -1.033120
episode 5364.000000, reward total was -3.000000. running mean: -1.052789
episode 5365.000000, reward total was 14.000000. running mean: -0.902261
episode 5366.000000, reward total was -1.000000. running mean: -0.903238
episode 5367.000000, reward total was 6.000000. running mean: -0.834206
episode 5368.000000, reward total was 7.000000. running mean: -0.755864
episode 5369.000000, reward total was -5.000000. running mean: -0.798305
episode 5370.000000, reward total was 2.000000. runni

episode 5470.000000, reward total was -11.000000. running mean: -1.335149
episode 5471.000000, reward total was -9.000000. running mean: -1.411797
episode 5472.000000, reward total was 4.000000. running mean: -1.357679
episode 5473.000000, reward total was 9.000000. running mean: -1.254102
episode 5474.000000, reward total was -3.000000. running mean: -1.271561
episode 5475.000000, reward total was -1.000000. running mean: -1.268846
episode 5476.000000, reward total was -7.000000. running mean: -1.326157
episode 5477.000000, reward total was -7.000000. running mean: -1.382896
episode 5478.000000, reward total was -12.000000. running mean: -1.489067
episode 5479.000000, reward total was 2.000000. running mean: -1.454176
episode 5480.000000, reward total was 2.000000. running mean: -1.419634
episode 5481.000000, reward total was 2.000000. running mean: -1.385438
episode 5482.000000, reward total was -1.000000. running mean: -1.381584
episode 5483.000000, reward total was 5.000000. runnin

episode 5583.000000, reward total was -1.000000. running mean: -0.676674
episode 5584.000000, reward total was -11.000000. running mean: -0.779907
episode 5585.000000, reward total was -3.000000. running mean: -0.802108
episode 5586.000000, reward total was 7.000000. running mean: -0.724087
episode 5587.000000, reward total was 1.000000. running mean: -0.706846
episode 5588.000000, reward total was 3.000000. running mean: -0.669777
episode 5589.000000, reward total was -5.000000. running mean: -0.713080
episode 5590.000000, reward total was -4.000000. running mean: -0.745949
episode 5591.000000, reward total was 10.000000. running mean: -0.638489
episode 5592.000000, reward total was -5.000000. running mean: -0.682104
episode 5593.000000, reward total was 4.000000. running mean: -0.635283
episode 5594.000000, reward total was 6.000000. running mean: -0.568931
episode 5595.000000, reward total was -5.000000. running mean: -0.613241
episode 5596.000000, reward total was 1.000000. running

episode 5696.000000, reward total was 7.000000. running mean: -0.861847
episode 5697.000000, reward total was 13.000000. running mean: -0.723228
episode 5698.000000, reward total was -1.000000. running mean: -0.725996
episode 5699.000000, reward total was -6.000000. running mean: -0.778736
episode 5700.000000, reward total was 18.000000. running mean: -0.590949
episode 5701.000000, reward total was -4.000000. running mean: -0.625039
episode 5702.000000, reward total was 7.000000. running mean: -0.548789
episode 5703.000000, reward total was 9.000000. running mean: -0.453301
episode 5704.000000, reward total was 6.000000. running mean: -0.388768
episode 5705.000000, reward total was -2.000000. running mean: -0.404880
episode 5706.000000, reward total was 2.000000. running mean: -0.380831
episode 5707.000000, reward total was -5.000000. running mean: -0.427023
episode 5708.000000, reward total was 8.000000. running mean: -0.342753
episode 5709.000000, reward total was -4.000000. running 

episode 5810.000000, reward total was -1.000000. running mean: -0.172470
episode 5811.000000, reward total was 2.000000. running mean: -0.150745
episode 5812.000000, reward total was 1.000000. running mean: -0.139238
episode 5813.000000, reward total was 11.000000. running mean: -0.027845
episode 5814.000000, reward total was 6.000000. running mean: 0.032433
episode 5815.000000, reward total was -3.000000. running mean: 0.002109
episode 5816.000000, reward total was 5.000000. running mean: 0.052088
episode 5817.000000, reward total was -4.000000. running mean: 0.011567
episode 5818.000000, reward total was -10.000000. running mean: -0.088549
episode 5819.000000, reward total was -5.000000. running mean: -0.137663
episode 5820.000000, reward total was -1.000000. running mean: -0.146287
episode 5821.000000, reward total was 4.000000. running mean: -0.104824
episode 5822.000000, reward total was -7.000000. running mean: -0.173776
episode 5823.000000, reward total was 6.000000. running mea

episode 5923.000000, reward total was 1.000000. running mean: -1.596487
episode 5924.000000, reward total was -6.000000. running mean: -1.640522
episode 5925.000000, reward total was -9.000000. running mean: -1.714117
episode 5926.000000, reward total was 6.000000. running mean: -1.636976
episode 5927.000000, reward total was -5.000000. running mean: -1.670606
episode 5928.000000, reward total was 2.000000. running mean: -1.633900
episode 5929.000000, reward total was 1.000000. running mean: -1.607561
episode 5930.000000, reward total was -3.000000. running mean: -1.621485
episode 5931.000000, reward total was -2.000000. running mean: -1.625270
episode 5932.000000, reward total was -4.000000. running mean: -1.649018
episode 5933.000000, reward total was 9.000000. running mean: -1.542528
episode 5934.000000, reward total was 12.000000. running mean: -1.407102
episode 5935.000000, reward total was 6.000000. running mean: -1.333031
episode 5936.000000, reward total was -8.000000. running 

episode 6036.000000, reward total was -7.000000. running mean: -0.435357
episode 6037.000000, reward total was -3.000000. running mean: -0.461004
episode 6038.000000, reward total was -3.000000. running mean: -0.486394
episode 6039.000000, reward total was 3.000000. running mean: -0.451530
episode 6040.000000, reward total was 3.000000. running mean: -0.417014
episode 6041.000000, reward total was -8.000000. running mean: -0.492844
episode 6042.000000, reward total was -12.000000. running mean: -0.607916
episode 6043.000000, reward total was -6.000000. running mean: -0.661837
episode 6044.000000, reward total was 1.000000. running mean: -0.645218
episode 6045.000000, reward total was -6.000000. running mean: -0.698766
episode 6046.000000, reward total was 9.000000. running mean: -0.601779
episode 6047.000000, reward total was -5.000000. running mean: -0.645761
episode 6048.000000, reward total was 1.000000. running mean: -0.629303
episode 6049.000000, reward total was -6.000000. runnin

episode 6149.000000, reward total was 3.000000. running mean: -0.613320
episode 6150.000000, reward total was -1.000000. running mean: -0.617186
episode 6151.000000, reward total was 4.000000. running mean: -0.571014
episode 6152.000000, reward total was -3.000000. running mean: -0.595304
episode 6153.000000, reward total was -7.000000. running mean: -0.659351
episode 6154.000000, reward total was -9.000000. running mean: -0.742758
episode 6155.000000, reward total was -4.000000. running mean: -0.775330
episode 6156.000000, reward total was -3.000000. running mean: -0.797577
episode 6157.000000, reward total was 1.000000. running mean: -0.779601
episode 6158.000000, reward total was -1.000000. running mean: -0.781805
episode 6159.000000, reward total was 15.000000. running mean: -0.623987
episode 6160.000000, reward total was -5.000000. running mean: -0.667747
episode 6161.000000, reward total was 10.000000. running mean: -0.561070
episode 6162.000000, reward total was -1.000000. runni

episode 6263.000000, reward total was -1.000000. running mean: 0.146704
episode 6264.000000, reward total was 1.000000. running mean: 0.155237
episode 6265.000000, reward total was -7.000000. running mean: 0.083685
episode 6266.000000, reward total was 5.000000. running mean: 0.132848
episode 6267.000000, reward total was -4.000000. running mean: 0.091519
episode 6268.000000, reward total was -2.000000. running mean: 0.070604
episode 6269.000000, reward total was 5.000000. running mean: 0.119898
episode 6270.000000, reward total was 6.000000. running mean: 0.178699
episode 6271.000000, reward total was 7.000000. running mean: 0.246912
episode 6272.000000, reward total was 9.000000. running mean: 0.334443
episode 6273.000000, reward total was 4.000000. running mean: 0.371098
episode 6274.000000, reward total was 3.000000. running mean: 0.397387
episode 6275.000000, reward total was 7.000000. running mean: 0.463414
episode 6276.000000, reward total was 6.000000. running mean: 0.518779
ep

episode 6378.000000, reward total was 13.000000. running mean: 1.301064
episode 6379.000000, reward total was 8.000000. running mean: 1.368054
episode 6380.000000, reward total was 5.000000. running mean: 1.404373
episode 6381.000000, reward total was 5.000000. running mean: 1.440329
episode 6382.000000, reward total was 6.000000. running mean: 1.485926
episode 6383.000000, reward total was -2.000000. running mean: 1.451067
episode 6384.000000, reward total was 2.000000. running mean: 1.456556
episode 6385.000000, reward total was -5.000000. running mean: 1.391991
episode 6386.000000, reward total was -5.000000. running mean: 1.328071
episode 6387.000000, reward total was -2.000000. running mean: 1.294790
episode 6388.000000, reward total was 3.000000. running mean: 1.311842
episode 6389.000000, reward total was -1.000000. running mean: 1.288724
episode 6390.000000, reward total was 2.000000. running mean: 1.295836
episode 6391.000000, reward total was -9.000000. running mean: 1.192878

episode 6493.000000, reward total was -1.000000. running mean: 0.124295
episode 6494.000000, reward total was 1.000000. running mean: 0.133052
episode 6495.000000, reward total was -11.000000. running mean: 0.021721
episode 6496.000000, reward total was -1.000000. running mean: 0.011504
episode 6497.000000, reward total was 6.000000. running mean: 0.071389
episode 6498.000000, reward total was 3.000000. running mean: 0.100675
episode 6499.000000, reward total was -3.000000. running mean: 0.069668
episode 6500.000000, reward total was 7.000000. running mean: 0.138972
episode 6501.000000, reward total was -7.000000. running mean: 0.067582
episode 6502.000000, reward total was 4.000000. running mean: 0.106906
episode 6503.000000, reward total was 3.000000. running mean: 0.135837
episode 6504.000000, reward total was -1.000000. running mean: 0.124479
episode 6505.000000, reward total was 2.000000. running mean: 0.143234
episode 6506.000000, reward total was 3.000000. running mean: 0.171801

episode 6608.000000, reward total was 6.000000. running mean: 0.281177
episode 6609.000000, reward total was 7.000000. running mean: 0.348366
episode 6610.000000, reward total was 2.000000. running mean: 0.364882
episode 6611.000000, reward total was 8.000000. running mean: 0.441233
episode 6612.000000, reward total was -3.000000. running mean: 0.406821
episode 6613.000000, reward total was 7.000000. running mean: 0.472753
episode 6614.000000, reward total was -3.000000. running mean: 0.438025
episode 6615.000000, reward total was 2.000000. running mean: 0.453645
episode 6616.000000, reward total was 1.000000. running mean: 0.459108
episode 6617.000000, reward total was 10.000000. running mean: 0.554517
episode 6618.000000, reward total was -3.000000. running mean: 0.518972
episode 6619.000000, reward total was 4.000000. running mean: 0.553782
episode 6620.000000, reward total was -5.000000. running mean: 0.498245
episode 6621.000000, reward total was 5.000000. running mean: 0.543262
e

episode 6723.000000, reward total was 6.000000. running mean: 0.867928
episode 6724.000000, reward total was 6.000000. running mean: 0.919249
episode 6725.000000, reward total was 2.000000. running mean: 0.930057
episode 6726.000000, reward total was -7.000000. running mean: 0.850756
episode 6727.000000, reward total was -4.000000. running mean: 0.802248
episode 6728.000000, reward total was 8.000000. running mean: 0.874226
episode 6729.000000, reward total was 6.000000. running mean: 0.925484
episode 6730.000000, reward total was 7.000000. running mean: 0.986229
episode 6731.000000, reward total was -1.000000. running mean: 0.966367
episode 6732.000000, reward total was 7.000000. running mean: 1.026703
episode 6733.000000, reward total was -5.000000. running mean: 0.966436
episode 6734.000000, reward total was 3.000000. running mean: 0.986772
episode 6735.000000, reward total was -2.000000. running mean: 0.956904
episode 6736.000000, reward total was 4.000000. running mean: 0.987335
e

episode 6838.000000, reward total was -1.000000. running mean: 0.908612
episode 6839.000000, reward total was -4.000000. running mean: 0.859526
episode 6840.000000, reward total was -3.000000. running mean: 0.820931
episode 6841.000000, reward total was 9.000000. running mean: 0.902722
episode 6842.000000, reward total was 1.000000. running mean: 0.903694
episode 6843.000000, reward total was 2.000000. running mean: 0.914657
episode 6844.000000, reward total was 6.000000. running mean: 0.965511
episode 6845.000000, reward total was 10.000000. running mean: 1.055856
episode 6846.000000, reward total was -1.000000. running mean: 1.035297
episode 6847.000000, reward total was -2.000000. running mean: 1.004944
episode 6848.000000, reward total was -3.000000. running mean: 0.964895
episode 6849.000000, reward total was 1.000000. running mean: 0.965246
episode 6850.000000, reward total was 8.000000. running mean: 1.035593
episode 6851.000000, reward total was -2.000000. running mean: 1.00523

episode 6953.000000, reward total was -2.000000. running mean: 1.100375
episode 6954.000000, reward total was -7.000000. running mean: 1.019371
episode 6955.000000, reward total was -9.000000. running mean: 0.919178
episode 6956.000000, reward total was 11.000000. running mean: 1.019986
episode 6957.000000, reward total was 9.000000. running mean: 1.099786
episode 6958.000000, reward total was -7.000000. running mean: 1.018788
episode 6959.000000, reward total was -2.000000. running mean: 0.988600
episode 6960.000000, reward total was 9.000000. running mean: 1.068714
episode 6961.000000, reward total was -11.000000. running mean: 0.948027
episode 6962.000000, reward total was -1.000000. running mean: 0.928547
episode 6963.000000, reward total was -2.000000. running mean: 0.899261
episode 6964.000000, reward total was -2.000000. running mean: 0.870269
episode 6965.000000, reward total was -3.000000. running mean: 0.831566
episode 6966.000000, reward total was 5.000000. running mean: 0.8