In [5]:
import copy
import glob
import os
import time

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from envs import make_env 

In [6]:
class args:
    def __init__(self):
        self.env_name='PongNoFrameskip-v4'
        self.seed=1
        self.log_dir=''
        self.save_dir='saved_models'
        self.cuda=False
        self.num_stack=4
        self.num_steps=5
        self.num_processes=16
        self.lr=7e-4
        self.eps=1e-5
        self.alpha=.99
        self.max_grad_norm=.5
        self.value_loss_coef=.5
        self.entropy_coef=.1
        self.num_frames=8e6
        self.use_gae=False
        self.gamma=.99
        self.tau=.95
        self.save_interval=1000
        self.log_interval=100
        self.vis_interval=100
        self.load_model=False
        self.save_model=True
        
args = args()

SAVE_PATH = "saved_models/a2c_121717.pt"
LOAD_PATH = "saved_models/a2c_121717.pt"

In [7]:
num_updates = int(args.num_frames) // args.num_steps // args.num_processes

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

In [12]:
def main():
    os.environ['OMP_NUM_THREADS'] = '1'

    envs = [make_env(args.env_name, args.seed, i, args.log_dir)
                for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    obs_shape = envs.observation_space.shape
    obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

    global actor_critic
    
    actor_critic = CNNPolicy(obs_shape[0], envs.action_space)
  
    if args.load_model:
        actor_critic.load_state_dict(torch.load(LOAD_PATH))

    action_shape = 1

    if args.cuda:
        actor_critic.cuda()

    optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)

    rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space,\
                              actor_critic.state_size)
    
    current_obs = torch.zeros(args.num_processes, *obs_shape)

    def update_current_obs(obs):
        shape_dim0 = envs.observation_space.shape[0]
        obs = torch.from_numpy(obs).float()
        if args.num_stack > 1:
            current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
        current_obs[:, -shape_dim0:] = obs

    obs = envs.reset()
    update_current_obs(obs)

    rollouts.observations[0].copy_(current_obs)

    # These variables are used to compute average rewards for all processes.
    episode_rewards = torch.zeros([args.num_processes, 1])
    final_rewards = torch.zeros([args.num_processes, 1])

    if args.cuda:
        current_obs = current_obs.cuda()
        rollouts.cuda()

    start = time.time()
    for j in range(num_updates):
        for step in range(args.num_steps):
            # Sample actions
            value, action, action_log_prob, states = actor_critic.act(Variable(rollouts.observations[step], volatile=True),
                                                                      Variable(rollouts.states[step], volatile=True),
                                                                      Variable(rollouts.masks[step], volatile=True))
            cpu_actions = action.data.squeeze(1).cpu().numpy()

            # Obser reward and next obs
            obs, reward, done, info = envs.step(cpu_actions)
            reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
            episode_rewards += reward

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            if args.cuda:
                masks = masks.cuda()

            if current_obs.dim() == 4:
                current_obs *= masks.unsqueeze(2).unsqueeze(2)
            else:
                current_obs *= masks

            update_current_obs(obs)
            rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data, value.data, reward, masks)

        next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                                  Variable(rollouts.states[-1], volatile=True),
                                  Variable(rollouts.masks[-1], volatile=True))[0].data

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                                                                                       Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                                                                                       Variable(rollouts.masks[:-1].view(-1, 1)),
                                                                                       Variable(rollouts.actions.view(-1, action_shape)))

        values = values.view(args.num_steps, args.num_processes, 1)
        action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

        advantages = Variable(rollouts.returns[:-1]) - values
        value_loss = advantages.pow(2).mean()

        action_loss = -(Variable(advantages.data) * action_log_probs).mean()

        optimizer.zero_grad()
        (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

        nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

        optimizer.step()
 
        rollouts.after_update()

        if j % args.save_interval == 0 and args.save_model:
            # A really ugly way to save a model to CPU
            save_model = actor_critic
            if args.cuda:
                save_model = copy.deepcopy(actor_critic).cpu()
                
            torch.save(actor_critic.state_dict(), SAVE_PATH)

        if j % args.log_interval == 0:
            end = time.time()
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                format(j, total_num_steps,
                       int(total_num_steps / (end - start)),
                       final_rewards.mean(),
                       final_rewards.median(),
                       final_rewards.min(),
                       final_rewards.max(), dist_entropy.data[0],
                       value_loss.data[0], action_loss.data[0]))
            print("\n rollout returns", rollouts.returns, "\nvalues", values)
            
main()

Updates 0, num timesteps 80, FPS 2, mean/median reward 0.0/0.0, min/max reward 0.0/0.0, entropy 1.58978, value loss 0.04211, policy loss 0.22921

 rollout returns 
(0 ,.,.) = 
  0.3817
  0.3835
  0.3883
  0.4000
  0.3911
  0.3947
  0.3945
  0.3849
  0.3966
  0.3997
  0.3972
  0.3838
  0.3983
  0.3861
  0.3827
  0.3892

(1 ,.,.) = 
  0.3856
  0.3874
  0.3923
  0.4041
  0.3951
  0.3987
  0.3985
  0.3888
  0.4007
  0.4037
  0.4012
  0.3877
  0.4024
  0.3900
  0.3866
  0.3931

(2 ,.,.) = 
  0.3895
  0.3913
  0.3962
  0.4082
  0.3991
  0.4027
  0.4025
  0.3927
  0.4047
  0.4078
  0.4052
  0.3916
  0.4064
  0.3939
  0.3905
  0.3971

(3 ,.,.) = 
  0.3934
  0.3953
  0.4002
  0.4123
  0.4031
  0.4068
  0.4066
  0.3967
  0.4088
  0.4119
  0.4093
  0.3956
  0.4105
  0.3979
  0.3945
  0.4011

(4 ,.,.) = 
  0.3974
  0.3992
  0.4043
  0.4164
  0.4072
  0.4109
  0.4107
  0.4007
  0.4129
  0.4161
  0.4135
  0.3996
  0.4147
  0.4019
  0.3984
  0.4051

(5 ,.,.) = 
  0.4014
  0.4033
  0.4084
  0.4206
  0

Updates 500, num timesteps 40080, FPS 127, mean/median reward -20.2/-21.0, min/max reward -21.0/-17.0, entropy 1.78284, value loss 0.06730, policy loss -0.02376

 rollout returns 
(0 ,.,.) = 
 -1.8078
 -1.8041
 -1.8135
 -1.8125
 -2.8046
 -1.8128
 -1.8113
 -1.8026
 -2.7641
 -1.7981
 -1.8140
 -1.8122
 -1.8079
 -1.8060
 -1.8159
 -1.8135

(1 ,.,.) = 
 -1.8261
 -1.8223
 -1.8319
 -1.8308
 -1.8228
 -1.8311
 -1.8296
 -1.8208
 -2.7921
 -1.8163
 -1.8323
 -1.8305
 -1.8262
 -1.8242
 -1.8342
 -1.8318

(2 ,.,.) = 
 -1.8445
 -1.8408
 -1.8504
 -1.8493
 -1.8413
 -1.8496
 -1.8480
 -1.8392
 -2.8203
 -1.8346
 -1.8508
 -1.8490
 -1.8446
 -1.8426
 -1.8527
 -1.8503

(3 ,.,.) = 
 -1.8632
 -1.8593
 -1.8691
 -1.8680
 -1.8599
 -1.8683
 -1.8667
 -1.8578
 -2.8488
 -1.8531
 -1.8695
 -1.8677
 -1.8632
 -1.8612
 -1.8714
 -1.8690

(4 ,.,.) = 
 -1.8820
 -1.8781
 -1.8879
 -1.8868
 -1.8786
 -1.8871
 -1.8856
 -1.8766
 -2.8775
 -1.8719
 -1.8884
 -1.8866
 -1.8821
 -1.8800
 -1.8903
 -1.8879

(5 ,.,.) = 
 -1.9010
 -1.8971
 -1.9

Updates 1000, num timesteps 80080, FPS 149, mean/median reward -20.2/-20.0, min/max reward -21.0/-19.0, entropy 1.78848, value loss 0.04575, policy loss -0.00901

 rollout returns 
(0 ,.,.) = 
 -2.3868
 -1.4124
 -1.4154
 -1.4187
 -1.4170
 -1.4113
 -1.4111
 -1.4110
 -1.4162
 -1.4153
 -1.4094
 -1.4150
 -1.4127
 -1.4087
 -1.4168
 -1.4180

(1 ,.,.) = 
 -2.4110
 -1.4267
 -1.4297
 -1.4330
 -1.4313
 -1.4256
 -1.4253
 -1.4253
 -1.4305
 -1.4296
 -1.4236
 -1.4293
 -1.4270
 -1.4229
 -1.4311
 -1.4323

(2 ,.,.) = 
 -2.4353
 -1.4411
 -1.4441
 -1.4475
 -1.4457
 -1.4400
 -1.4397
 -1.4397
 -1.4450
 -1.4441
 -1.4380
 -1.4437
 -1.4414
 -1.4373
 -1.4456
 -1.4468

(3 ,.,.) = 
 -2.4599
 -1.4556
 -1.4587
 -1.4621
 -1.4603
 -1.4545
 -1.4543
 -1.4542
 -1.4596
 -1.4586
 -1.4525
 -1.4583
 -1.4560
 -1.4518
 -1.4602
 -1.4614

(4 ,.,.) = 
 -1.4747
 -1.4703
 -1.4735
 -1.4769
 -1.4751
 -1.4692
 -1.4690
 -1.4689
 -1.4743
 -1.4734
 -1.4672
 -1.4730
 -1.4707
 -1.4665
 -1.4749
 -1.4762

(5 ,.,.) = 
 -1.4896
 -1.4852
 -1.

Updates 1500, num timesteps 120080, FPS 142, mean/median reward -20.4/-21.0, min/max reward -21.0/-19.0, entropy 1.79118, value loss 0.04904, policy loss -0.02508

 rollout returns 
(0 ,.,.) = 
 -1.3451
 -1.3215
 -1.4823
 -1.5386
 -2.5408
 -1.3240
 -1.5751
 -1.3233
 -1.4458
 -1.5543
 -2.2874
 -1.6266
 -1.5870
 -1.2959
 -1.6165
 -1.2742

(1 ,.,.) = 
 -1.3587
 -1.3349
 -1.4973
 -1.5541
 -1.5563
 -1.3374
 -1.5910
 -1.3367
 -1.4604
 -1.5700
 -2.3105
 -1.6430
 -1.6030
 -1.3090
 -1.6328
 -1.2871

(2 ,.,.) = 
 -1.3724
 -1.3484
 -1.5124
 -1.5698
 -1.5721
 -1.3509
 -1.6071
 -1.3502
 -1.4752
 -1.5859
 -2.3338
 -1.6596
 -1.6192
 -1.3222
 -1.6493
 -1.3001

(3 ,.,.) = 
 -1.3863
 -1.3620
 -1.5277
 -1.5857
 -1.5879
 -1.3646
 -1.6233
 -1.3639
 -1.4901
 -1.6019
 -1.3473
 -1.6764
 -1.6356
 -1.3356
 -1.6660
 -1.3132

(4 ,.,.) = 
 -1.4003
 -1.3757
 -1.5431
 -1.6017
 -1.6040
 -1.3784
 -1.6397
 -1.3776
 -1.5051
 -1.6181
 -1.3609
 -1.6933
 -1.6521
 -1.3491
 -1.6828
 -1.3265

(5 ,.,.) = 
 -1.4144
 -1.3896
 -1

Updates 2000, num timesteps 160080, FPS 143, mean/median reward -20.4/-21.0, min/max reward -21.0/-19.0, entropy 1.79144, value loss 0.03326, policy loss -0.00303

 rollout returns 
(0 ,.,.) = 
 -1.6549
 -1.6591
 -2.3052
 -1.4295
 -1.3554
 -1.4636
 -1.7154
 -1.3191
 -1.3484
 -1.4270
 -1.3530
 -1.3425
 -1.3361
 -1.3900
 -1.4474
 -1.2709

(1 ,.,.) = 
 -1.6716
 -1.6758
 -2.3284
 -1.4440
 -1.3691
 -1.4784
 -1.7327
 -1.3324
 -1.3620
 -1.4414
 -1.3666
 -1.3560
 -1.3496
 -1.4040
 -1.4620
 -1.2838

(2 ,.,.) = 
 -1.6885
 -1.6928
 -2.3520
 -1.4586
 -1.3829
 -1.4933
 -1.7502
 -1.3458
 -1.3757
 -1.4560
 -1.3804
 -1.3697
 -1.3632
 -1.4182
 -1.4767
 -1.2967

(3 ,.,.) = 
 -1.7055
 -1.7099
 -1.3656
 -1.4733
 -1.3969
 -1.5084
 -1.7679
 -1.3594
 -1.3896
 -1.4707
 -1.3944
 -1.3836
 -1.3770
 -1.4325
 -1.4917
 -1.3098

(4 ,.,.) = 
 -1.7228
 -1.7271
 -1.3794
 -1.4882
 -1.4110
 -1.5236
 -1.7858
 -1.3732
 -1.4037
 -1.4855
 -1.4085
 -1.3976
 -1.3909
 -1.4470
 -1.5067
 -1.3231

(5 ,.,.) = 
 -1.7402
 -1.7446
 -1

Updates 2500, num timesteps 200080, FPS 151, mean/median reward -19.8/-20.0, min/max reward -21.0/-18.0, entropy 1.79066, value loss 0.05491, policy loss 0.06146

 rollout returns 
(0 ,.,.) = 
  0.0386
 -0.9191
 -1.0465
 -1.0438
 -0.9318
 -1.0132
 -1.1995
 -1.1529
 -0.9529
 -1.1839
 -0.9670
 -1.9145
 -1.1248
 -1.1122
 -1.0075
 -1.1736

(1 ,.,.) = 
  0.0390
 -0.9284
 -1.0571
 -1.0543
 -0.9412
 -1.0235
 -1.2117
 -1.1645
 -0.9625
 -1.1958
 -0.9767
 -1.9339
 -1.1361
 -1.1234
 -1.0177
 -1.1854

(2 ,.,.) = 
  0.0394
 -0.9378
 -1.0677
 -1.0650
 -0.9507
 -1.0338
 -1.2239
 -1.1763
 -0.9722
 -1.2079
 -0.9866
 -0.9433
 -1.1476
 -1.1348
 -1.0279
 -1.1974

(3 ,.,.) = 
 -0.9704
 -0.9473
 -1.0785
 -1.0757
 -0.9603
 -1.0443
 -1.2363
 -1.1882
 -0.9821
 -1.2201
 -0.9966
 -0.9529
 -1.1592
 -1.1462
 -1.0383
 -1.2095

(4 ,.,.) = 
 -0.9802
 -0.9569
 -1.0894
 -1.0866
 -0.9700
 -1.0548
 -1.2488
 -1.2002
 -0.9920
 -1.2324
 -1.0066
 -0.9625
 -1.1709
 -1.1578
 -1.0488
 -1.2217

(5 ,.,.) = 
 -0.9901
 -0.9665
 -1.

Updates 3000, num timesteps 240080, FPS 157, mean/median reward -20.2/-21.0, min/max reward -21.0/-19.0, entropy 1.78917, value loss 0.02213, policy loss -0.04501

 rollout returns 
(0 ,.,.) = 
 -2.4243
 -2.6621
 -2.3607
 -2.8268
 -1.8754
 -2.3720
 -1.7995
 -1.7291
 -2.0694
 -1.6498
 -1.8553
 -2.1746
 -2.1123
 -2.7903
 -1.9840
 -1.4599

(1 ,.,.) = 
 -2.4488
 -2.6890
 -2.3846
 -2.8553
 -1.8944
 -2.3960
 -1.8177
 -1.7466
 -2.0903
 -1.6665
 -1.8740
 -2.1966
 -2.1337
 -1.8083
 -2.0040
 -1.4746

(2 ,.,.) = 
 -2.4735
 -1.7061
 -2.4087
 -2.8842
 -1.9135
 -2.4202
 -1.8361
 -1.7642
 -2.1115
 -1.6833
 -1.8930
 -2.2187
 -2.1552
 -1.8266
 -2.0242
 -1.4895

(3 ,.,.) = 
 -2.4985
 -1.7233
 -2.4330
 -2.9133
 -1.9328
 -2.4446
 -1.8546
 -1.7821
 -2.1328
 -1.7003
 -1.9121
 -2.2412
 -2.1770
 -1.8451
 -2.0447
 -1.5046

(4 ,.,.) = 
 -2.5238
 -1.7407
 -2.4576
 -1.9326
 -1.9524
 -2.4693
 -1.8733
 -1.8001
 -2.1543
 -1.7175
 -1.9314
 -2.2638
 -2.1990
 -1.8637
 -2.0653
 -1.5198

(5 ,.,.) = 
 -2.5492
 -1.7583
 -2

Updates 3500, num timesteps 280080, FPS 161, mean/median reward -20.2/-21.0, min/max reward -21.0/-19.0, entropy 1.78923, value loss 0.02582, policy loss 0.02565

 rollout returns 
(0 ,.,.) = 
 -2.1826
 -1.8429
 -2.1766
 -1.5322
 -1.7829
 -1.7153
 -1.9968
 -1.8595
 -1.6121
 -1.5169
 -2.3552
 -1.7565
 -2.3366
 -1.7660
 -2.6398
 -1.7051

(1 ,.,.) = 
 -2.2046
 -1.8615
 -2.1986
 -1.5477
 -1.8010
 -1.7326
 -2.0169
 -1.8783
 -1.6284
 -1.5323
 -2.3790
 -1.7742
 -2.3603
 -1.7838
 -2.6665
 -1.7223

(2 ,.,.) = 
 -2.2269
 -1.8803
 -2.2208
 -1.5633
 -1.8191
 -1.7501
 -2.0373
 -1.8972
 -1.6449
 -1.5477
 -2.4030
 -1.7922
 -2.3841
 -1.8019
 -2.6934
 -1.7397

(3 ,.,.) = 
 -2.2494
 -1.8993
 -2.2433
 -1.5791
 -1.8375
 -1.7678
 -2.0579
 -1.9164
 -1.6615
 -1.5634
 -2.4273
 -1.8103
 -2.4082
 -1.8201
 -2.7206
 -1.7573

(4 ,.,.) = 
 -2.2721
 -1.9185
 -2.2659
 -1.5951
 -1.8561
 -1.7856
 -2.0787
 -1.9357
 -1.6783
 -1.5792
 -2.4518
 -1.8285
 -2.4325
 -1.8385
 -2.7481
 -1.7750

(5 ,.,.) = 
 -2.2950
 -1.9378
 -2.

Updates 4000, num timesteps 320080, FPS 165, mean/median reward -20.2/-20.0, min/max reward -21.0/-18.0, entropy 1.79030, value loss 0.02046, policy loss 0.04898

 rollout returns 
(0 ,.,.) = 
 -1.5701
 -0.5283
 -2.3433
 -1.8661
 -1.8724
 -1.6344
 -1.7413
 -1.5934
 -1.6906
 -1.7152
 -2.1644
 -2.0368
 -1.5139
 -2.5696
 -1.6503
 -2.4827

(1 ,.,.) = 
 -1.5859
 -0.5336
 -2.3669
 -1.8850
 -1.8913
 -1.6509
 -1.7589
 -1.6095
 -1.7077
 -1.7325
 -2.1862
 -2.0573
 -1.5292
 -1.5854
 -1.6670
 -2.5078

(2 ,.,.) = 
 -1.6019
 -0.5390
 -2.3908
 -1.9040
 -1.9104
 -1.6676
 -1.7766
 -1.6258
 -1.7250
 -1.7500
 -2.2083
 -2.0781
 -1.5447
 -1.6014
 -1.6838
 -1.5230

(3 ,.,.) = 
 -1.6181
 -0.5445
 -2.4150
 -1.9233
 -1.9297
 -1.6845
 -1.7946
 -1.6422
 -1.7424
 -1.7677
 -2.2306
 -2.0991
 -1.5603
 -1.6176
 -1.7008
 -1.5384

(4 ,.,.) = 
 -1.6345
 -0.5500
 -2.4394
 -1.9427
 -1.9492
 -1.7015
 -1.8127
 -1.6588
 -1.7600
 -1.7855
 -2.2532
 -2.1203
 -1.5760
 -1.6339
 -1.7180
 -1.5540

(5 ,.,.) = 
 -1.6510
 -0.5555
 -2.

Updates 4500, num timesteps 360080, FPS 168, mean/median reward -20.2/-20.0, min/max reward -21.0/-19.0, entropy 1.78917, value loss 0.01578, policy loss -0.02135

 rollout returns 
(0 ,.,.) = 
 -1.8482
 -1.7875
 -1.5799
 -1.4889
 -0.4845
 -1.3506
 -1.8010
 -1.5788
 -1.8718
 -1.4806
 -1.7664
 -1.9714
 -1.9848
 -1.6201
 -1.3452
 -1.4073

(1 ,.,.) = 
 -1.8669
 -1.8055
 -1.5958
 -1.5039
 -0.4894
 -1.3643
 -1.8192
 -1.5947
 -1.8907
 -1.4955
 -1.7843
 -1.9914
 -2.0049
 -1.6364
 -1.3588
 -1.4215

(2 ,.,.) = 
 -1.8857
 -1.8238
 -1.6119
 -1.5191
 -0.4944
 -1.3780
 -1.8376
 -1.6108
 -1.9098
 -1.5106
 -1.8023
 -2.0115
 -2.0251
 -1.6530
 -1.3725
 -1.4359

(3 ,.,.) = 
 -1.9048
 -1.8422
 -1.6282
 -1.5345
 -0.4994
 -1.3920
 -1.8562
 -1.6271
 -1.9291
 -1.5259
 -1.8205
 -2.0318
 -2.0456
 -1.6697
 -1.3864
 -1.4504

(4 ,.,.) = 
 -1.9240
 -1.8608
 -1.6447
 -1.5500
 -0.5044
 -1.4060
 -1.8749
 -1.6436
 -1.9486
 -1.5413
 -1.8389
 -2.0523
 -2.0663
 -1.6865
 -1.4004
 -1.4650

(5 ,.,.) = 
 -1.9434
 -1.8796
 -1

Updates 5000, num timesteps 400080, FPS 170, mean/median reward -20.2/-20.0, min/max reward -21.0/-18.0, entropy 1.78760, value loss 0.01703, policy loss 0.02600

 rollout returns 
(0 ,.,.) = 
 -1.6781
 -1.5357
 -2.5685
 -1.3373
 -1.2185
 -1.6365
 -1.4943
 -1.6490
 -1.7048
 -2.5038
 -1.5191
 -2.4672
 -1.8831
 -1.4641
 -1.1716
 -1.8149

(1 ,.,.) = 
 -1.6950
 -1.5512
 -1.5844
 -1.3508
 -1.2308
 -1.6530
 -1.5094
 -1.6657
 -1.7220
 -1.5190
 -1.5345
 -2.4921
 -1.9021
 -1.4789
 -1.1834
 -1.8333

(2 ,.,.) = 
 -1.7122
 -1.5668
 -1.6004
 -1.3644
 -1.2432
 -1.6697
 -1.5247
 -1.6825
 -1.7394
 -1.5343
 -1.5500
 -2.5173
 -1.9213
 -1.4938
 -1.1954
 -1.8518

(3 ,.,.) = 
 -1.7295
 -1.5827
 -1.6165
 -1.3782
 -1.2557
 -1.6866
 -1.5401
 -1.6995
 -1.7570
 -1.5498
 -1.5656
 -2.5427
 -1.9407
 -1.5089
 -1.2074
 -1.8705

(4 ,.,.) = 
 -1.7469
 -1.5987
 -1.6329
 -1.3921
 -1.2684
 -1.7036
 -1.5556
 -1.7167
 -1.7747
 -1.5655
 -1.5814
 -2.5684
 -1.9603
 -1.5242
 -1.2196
 -1.8894

(5 ,.,.) = 
 -1.7646
 -1.6148
 -1.

Updates 5500, num timesteps 440080, FPS 172, mean/median reward -19.8/-20.0, min/max reward -21.0/-18.0, entropy 1.78641, value loss 0.01940, policy loss 0.00513

 rollout returns 
(0 ,.,.) = 
 -2.5623
 -2.0537
 -2.3545
 -2.4806
 -1.8894
 -2.3464
 -1.7919
 -2.8114
 -2.3162
 -2.4419
 -2.9903
 -1.7358
 -2.0309
 -2.3333
 -1.9145
 -1.8026

(1 ,.,.) = 
 -2.5882
 -2.0744
 -2.3783
 -2.5057
 -1.9085
 -2.3701
 -1.8100
 -2.8398
 -2.3396
 -2.4666
 -2.0105
 -1.7533
 -2.0514
 -2.3569
 -1.9338
 -1.8208

(2 ,.,.) = 
 -2.6143
 -2.0954
 -2.4023
 -2.5310
 -1.9277
 -2.3940
 -1.8283
 -2.8685
 -2.3632
 -2.4915
 -2.0308
 -1.7710
 -2.0721
 -2.3807
 -1.9533
 -1.8392

(3 ,.,.) = 
 -2.6407
 -2.1166
 -2.4266
 -2.5565
 -1.9472
 -2.4182
 -1.8468
 -1.8874
 -2.3871
 -2.5167
 -2.0513
 -1.7889
 -2.0930
 -2.4048
 -1.9731
 -1.8578

(4 ,.,.) = 
 -2.6674
 -2.1379
 -2.4511
 -2.5824
 -1.9669
 -2.4426
 -1.8655
 -1.9064
 -2.4112
 -2.5421
 -2.0720
 -1.8070
 -2.1142
 -2.4291
 -1.9930
 -1.8765

(5 ,.,.) = 
 -2.6943
 -2.1595
 -2.

Updates 6000, num timesteps 480080, FPS 174, mean/median reward -20.2/-20.0, min/max reward -21.0/-19.0, entropy 1.78898, value loss 0.01312, policy loss 0.01435

 rollout returns 
(0 ,.,.) = 
 -2.1129
 -2.0546
 -2.0393
 -1.6005
 -1.4341
 -1.4518
 -1.9662
 -1.7569
 -1.9525
 -2.1254
 -1.7428
 -1.9703
 -2.1238
 -1.3450
 -2.2688
 -0.9129

(1 ,.,.) = 
 -2.1342
 -2.0753
 -2.0599
 -1.6166
 -1.4486
 -1.4665
 -1.9860
 -1.7746
 -1.9722
 -2.1469
 -1.7604
 -1.9902
 -2.1452
 -1.3586
 -2.2917
 -0.9222

(2 ,.,.) = 
 -2.1558
 -2.0963
 -2.0807
 -1.6330
 -1.4632
 -1.4813
 -2.0061
 -1.7925
 -1.9921
 -2.1685
 -1.7782
 -2.0103
 -2.1669
 -1.3723
 -2.3149
 -0.9315

(3 ,.,.) = 
 -2.1776
 -2.1175
 -2.1017
 -1.6495
 -1.4780
 -1.4963
 -2.0263
 -1.8106
 -2.0122
 -2.1904
 -1.7961
 -2.0306
 -2.1888
 -1.3862
 -2.3383
 -0.9409

(4 ,.,.) = 
 -2.1996
 -2.1388
 -2.1229
 -1.6661
 -1.4929
 -1.5114
 -2.0468
 -1.8289
 -2.0325
 -2.2126
 -1.8143
 -2.0511
 -2.2109
 -1.4002
 -2.3619
 -0.9504

(5 ,.,.) = 
 -2.2218
 -2.1604
 -2.

Updates 6500, num timesteps 520080, FPS 175, mean/median reward -20.1/-21.0, min/max reward -21.0/-18.0, entropy 1.78934, value loss 0.01730, policy loss -0.03145

 rollout returns 
(0 ,.,.) = 
 -2.2569
 -2.2388
 -2.5193
 -1.9255
 -0.9191
 -2.5857
 -2.5182
 -1.6705
 -2.4846
 -2.2788
 -2.2030
 -1.9803
 -1.7855
 -2.8639
 -1.4438
 -2.7859

(1 ,.,.) = 
 -2.2797
 -2.2614
 -2.5448
 -1.9449
 -0.9284
 -2.6118
 -2.5436
 -1.6874
 -2.5097
 -2.3018
 -2.2252
 -2.0003
 -1.8036
 -1.8828
 -1.4584
 -1.8039

(2 ,.,.) = 
 -2.3027
 -2.2842
 -2.5705
 -1.9646
 -0.9378
 -2.6382
 -2.5693
 -1.7044
 -2.5350
 -2.3251
 -2.2477
 -2.0205
 -1.8218
 -1.9018
 -1.4732
 -1.8221

(3 ,.,.) = 
 -2.3260
 -2.3073
 -2.5964
 -1.9844
 -0.9472
 -2.6649
 -2.5952
 -1.7216
 -2.5606
 -2.3485
 -2.2704
 -2.0409
 -1.8402
 -1.9210
 -1.4880
 -1.8405

(4 ,.,.) = 
 -2.3495
 -2.3306
 -2.6227
 -2.0045
 -0.9568
 -2.6918
 -2.6215
 -1.7390
 -2.5865
 -2.3723
 -2.2933
 -2.0615
 -1.8588
 -1.9404
 -1.5031
 -1.8591

(5 ,.,.) = 
 -2.3732
 -2.3542
 -2

Updates 7000, num timesteps 560080, FPS 176, mean/median reward -20.2/-20.0, min/max reward -21.0/-19.0, entropy 1.79071, value loss 0.01480, policy loss 0.00532

 rollout returns 
(0 ,.,.) = 
 -0.8767
 -2.1591
 -2.0209
 -2.0290
 -1.8264
 -1.9611
 -2.7282
 -2.7885
 -2.3077
 -3.0269
 -1.8887
 -1.9348
 -2.0055
 -2.7755
 -0.9761
 -2.0839

(1 ,.,.) = 
 -0.8856
 -2.1809
 -2.0413
 -2.0495
 -1.8448
 -1.9809
 -2.7558
 -2.8167
 -2.3310
 -2.0474
 -1.9078
 -1.9543
 -2.0258
 -2.8036
 -0.9860
 -2.1049

(2 ,.,.) = 
 -0.8945
 -2.2029
 -2.0620
 -2.0702
 -1.8634
 -2.0009
 -2.7836
 -2.8452
 -2.3546
 -2.0680
 -1.9271
 -1.9741
 -2.0463
 -2.8319
 -0.9959
 -2.1262

(3 ,.,.) = 
 -0.9036
 -2.2252
 -2.0828
 -2.0912
 -1.8823
 -2.0212
 -2.8117
 -1.8638
 -2.3784
 -2.0889
 -1.9465
 -1.9940
 -2.0669
 -2.8605
 -1.0060
 -2.1477

(4 ,.,.) = 
 -0.9127
 -2.2477
 -2.1038
 -2.1123
 -1.9013
 -2.0416
 -2.8401
 -1.8826
 -2.4024
 -2.1100
 -1.9662
 -2.0142
 -2.0878
 -2.8894
 -1.0162
 -2.1694

(5 ,.,.) = 
 -0.9219
 -2.2704
 -2.

Updates 7500, num timesteps 600080, FPS 176, mean/median reward -19.9/-20.0, min/max reward -21.0/-18.0, entropy 1.78944, value loss 0.02150, policy loss 0.00573

 rollout returns 
(0 ,.,.) = 
 -1.7943
 -2.3148
 -1.4565
 -1.1255
 -2.0831
 -1.9920
 -2.2976
 -2.3584
 -1.9098
 -1.7917
 -2.8301
 -1.9991
 -1.5492
 -2.7185
 -1.8709
 -2.3679

(1 ,.,.) = 
 -1.8124
 -2.3381
 -1.4712
 -1.1368
 -2.1041
 -2.0122
 -2.3208
 -2.3822
 -1.9291
 -1.8098
 -2.8587
 -2.0193
 -1.5648
 -2.7459
 -1.8898
 -2.3918

(2 ,.,.) = 
 -1.8307
 -2.3618
 -1.4860
 -1.1483
 -2.1253
 -2.0325
 -2.3442
 -1.3962
 -1.9486
 -1.8280
 -2.8876
 -2.0397
 -1.5806
 -2.7737
 -1.9089
 -1.4058

(3 ,.,.) = 
 -1.8492
 -2.3856
 -1.5010
 -1.1599
 -2.1468
 -2.0530
 -2.3679
 -1.4103
 -1.9683
 -1.8465
 -2.9168
 -2.0603
 -1.5966
 -2.8017
 -1.9282
 -1.4200

(4 ,.,.) = 
 -1.8679
 -2.4097
 -1.5162
 -1.1716
 -2.1685
 -2.0738
 -2.3918
 -1.4246
 -1.9882
 -1.8651
 -1.9361
 -2.0811
 -1.6127
 -2.8300
 -1.9477
 -1.4344

(5 ,.,.) = 
 -1.8867
 -2.4340
 -1.

Updates 8000, num timesteps 640080, FPS 177, mean/median reward -20.1/-20.0, min/max reward -21.0/-19.0, entropy 1.78889, value loss 0.01327, policy loss -0.00484

 rollout returns 
(0 ,.,.) = 
 -1.5674
 -2.2691
 -1.6677
 -1.4568
 -2.1417
 -1.7016
 -2.3719
 -1.6724
 -2.1394
 -1.7535
 -2.0226
 -2.0233
 -1.6700
 -1.8272
 -1.4391
 -1.6982

(1 ,.,.) = 
 -1.5833
 -2.2920
 -1.6845
 -1.4715
 -2.1633
 -1.7187
 -2.3959
 -1.6893
 -2.1611
 -1.7712
 -2.0430
 -2.0438
 -1.6868
 -1.8456
 -1.4537
 -1.7154

(2 ,.,.) = 
 -1.5992
 -2.3151
 -1.7016
 -1.4863
 -2.1852
 -1.7361
 -2.4201
 -1.7064
 -2.1829
 -1.7891
 -2.0636
 -2.0644
 -1.7039
 -1.8643
 -1.4683
 -1.7327

(3 ,.,.) = 
 -1.6154
 -2.3385
 -1.7187
 -1.5014
 -2.2073
 -1.7536
 -2.4445
 -1.7236
 -2.2049
 -1.8071
 -2.0845
 -2.0853
 -1.7211
 -1.8831
 -1.4832
 -1.7502

(4 ,.,.) = 
 -1.6317
 -2.3622
 -1.7361
 -1.5165
 -2.2296
 -1.7714
 -2.4692
 -1.7410
 -2.2272
 -1.8254
 -2.1055
 -2.1063
 -1.7385
 -1.9021
 -1.4982
 -1.7679

(5 ,.,.) = 
 -1.6482
 -2.3860
 -1

Updates 8500, num timesteps 680080, FPS 178, mean/median reward -20.4/-21.0, min/max reward -21.0/-19.0, entropy 1.78924, value loss 0.01870, policy loss 0.03840

 rollout returns 
(0 ,.,.) = 
 -1.8439
 -1.8609
 -2.8801
 -2.2041
 -2.1132
 -2.2250
 -1.8674
 -2.2722
 -2.8800
 -2.2555
 -2.3142
 -2.2416
 -2.7983
 -3.0907
 -2.7219
 -2.1677

(1 ,.,.) = 
 -0.8525
 -1.8797
 -2.9092
 -2.2264
 -2.1345
 -2.2474
 -1.8862
 -2.2951
 -2.9091
 -2.2782
 -2.3376
 -2.2643
 -2.8265
 -3.1219
 -2.7494
 -2.1896

(2 ,.,.) = 
 -0.8611
 -1.8986
 -2.9386
 -2.2489
 -2.1561
 -2.2701
 -1.9053
 -2.3183
 -2.9385
 -2.3012
 -2.3612
 -2.2871
 -2.8551
 -3.1534
 -2.7771
 -2.2117

(3 ,.,.) = 
 -0.8698
 -1.9178
 -2.9683
 -2.2716
 -2.1778
 -2.2931
 -1.9245
 -2.3417
 -2.9682
 -2.3245
 -2.3851
 -2.3102
 -1.8738
 -2.1752
 -2.8052
 -2.2341

(4 ,.,.) = 
 -0.8786
 -1.9372
 -2.9982
 -2.2945
 -2.1998
 -2.3162
 -1.9440
 -2.3654
 -2.9981
 -2.3480
 -2.4092
 -2.3336
 -1.8927
 -2.1972
 -2.8335
 -2.2566

(5 ,.,.) = 
 -0.8874
 -1.9568
 -3.

Updates 9000, num timesteps 720080, FPS 179, mean/median reward -20.2/-20.0, min/max reward -21.0/-18.0, entropy 1.78872, value loss 0.00768, policy loss -0.00704

 rollout returns 
(0 ,.,.) = 
 -2.1245
 -1.3054
 -1.8643
 -1.7854
 -1.8896
 -1.9253
 -2.9167
 -1.9924
 -2.1774
 -2.6423
 -1.6723
 -1.3025
 -2.3257
 -2.1296
 -1.8313
 -1.7929

(1 ,.,.) = 
 -2.1460
 -1.3186
 -1.8831
 -1.8034
 -1.9087
 -1.9447
 -1.9361
 -2.0126
 -2.1994
 -2.6690
 -1.6892
 -1.3156
 -2.3492
 -2.1511
 -1.8498
 -1.8111

(2 ,.,.) = 
 -2.1677
 -1.3319
 -1.9021
 -1.8216
 -1.9280
 -1.9644
 -1.9556
 -2.0329
 -2.2216
 -2.6959
 -1.7062
 -1.3289
 -2.3729
 -2.1728
 -1.8685
 -1.8293

(3 ,.,.) = 
 -2.1896
 -1.3453
 -1.9213
 -1.8400
 -1.9474
 -1.9842
 -1.9754
 -2.0534
 -2.2441
 -2.7232
 -1.7235
 -1.3423
 -2.3969
 -2.1948
 -1.8874
 -1.8478

(4 ,.,.) = 
 -2.2117
 -1.3589
 -1.9408
 -1.8586
 -1.9671
 -2.0042
 -1.9954
 -2.0742
 -2.2667
 -2.7507
 -1.7409
 -1.3559
 -2.4211
 -2.2170
 -1.9065
 -1.8665

(5 ,.,.) = 
 -2.2340
 -1.3726
 -1

Updates 9500, num timesteps 760080, FPS 178, mean/median reward -20.1/-20.0, min/max reward -21.0/-18.0, entropy 1.78972, value loss 0.02493, policy loss -0.02421

 rollout returns 
(0 ,.,.) = 
 -1.3363
 -1.8917
 -1.1487
 -1.6967
 -2.0309
 -1.5093
 -2.2589
 -1.8022
 -1.8085
 -2.1270
 -2.0367
 -1.7431
 -2.2301
 -1.6315
 -1.6891
 -1.9827

(1 ,.,.) = 
 -1.3498
 -1.9108
 -1.1603
 -1.7139
 -2.0514
 -1.5245
 -2.2818
 -1.8204
 -1.8268
 -2.1485
 -2.0573
 -1.7607
 -2.2527
 -1.6480
 -1.7062
 -2.0027

(2 ,.,.) = 
 -1.3634
 -1.9301
 -1.1720
 -1.7312
 -2.0721
 -1.5399
 -2.3048
 -1.8387
 -1.8453
 -2.1702
 -2.0780
 -1.7785
 -2.2754
 -1.6646
 -1.7234
 -2.0230

(3 ,.,.) = 
 -1.3772
 -1.9496
 -1.1838
 -1.7487
 -2.0930
 -1.5555
 -2.3281
 -1.8573
 -1.8639
 -2.1921
 -2.0990
 -1.7964
 -2.2984
 -1.6814
 -1.7408
 -2.0434

(4 ,.,.) = 
 -1.3911
 -1.9693
 -1.1958
 -1.7663
 -2.1142
 -1.5712
 -2.3516
 -1.8761
 -1.8827
 -2.2143
 -2.1202
 -1.8146
 -2.3216
 -1.6984
 -1.7584
 -2.0640

(5 ,.,.) = 
 -1.4051
 -1.9892
 -1

Updates 10000, num timesteps 800080, FPS 178, mean/median reward -20.1/-21.0, min/max reward -21.0/-17.0, entropy 1.78954, value loss 0.02658, policy loss 0.01896

 rollout returns 
(0 ,.,.) = 
 -2.3492
 -1.4100
 -1.3710
 -1.2393
 -1.6037
 -1.4744
 -1.1056
 -1.7928
 -1.5700
 -1.6026
 -2.1597
 -2.1506
 -1.4848
 -2.0133
 -1.4844
 -1.8005

(1 ,.,.) = 
 -2.3729
 -1.4242
 -1.3848
 -1.2518
 -1.6199
 -1.4892
 -1.1168
 -1.8110
 -1.5859
 -1.6188
 -2.1815
 -2.1723
 -1.4998
 -2.0336
 -1.4994
 -1.8187

(2 ,.,.) = 
 -2.3969
 -1.4386
 -1.3988
 -1.2645
 -1.6363
 -1.5043
 -1.1281
 -1.8292
 -1.6019
 -1.6351
 -2.2035
 -2.1943
 -1.5150
 -2.0541
 -1.5145
 -1.8371

(3 ,.,.) = 
 -2.4211
 -1.4531
 -1.4129
 -1.2773
 -1.6528
 -1.5195
 -1.1395
 -1.8477
 -1.6181
 -1.6517
 -2.2258
 -2.2164
 -1.5303
 -2.0749
 -1.5298
 -1.8557

(4 ,.,.) = 
 -2.4455
 -1.4678
 -1.4272
 -1.2902
 -1.6695
 -1.5348
 -1.1510
 -1.8664
 -1.6344
 -1.6683
 -2.2482
 -2.2388
 -1.5457
 -2.0959
 -1.5453
 -1.8744

(5 ,.,.) = 
 -2.4702
 -1.4826
 -1

Updates 10500, num timesteps 840080, FPS 179, mean/median reward -19.9/-20.0, min/max reward -21.0/-17.0, entropy 1.78915, value loss 0.01412, policy loss 0.01778

 rollout returns 
(0 ,.,.) = 
 -0.9311
 -2.2630
 -1.3611
 -2.0069
 -1.4797
 -1.1876
 -2.4910
 -2.2704
 -1.2456
 -1.3911
 -0.8286
 -1.5179
 -1.3960
 -1.5299
 -1.9821
 -1.5121

(1 ,.,.) = 
 -0.9405
 -2.2859
 -1.3749
 -2.0271
 -1.4946
 -1.1996
 -2.5162
 -2.2933
 -1.2582
 -1.4052
 -0.8370
 -1.5333
 -1.4101
 -1.5454
 -2.0021
 -1.5273

(2 ,.,.) = 
 -0.9500
 -2.3089
 -1.3888
 -2.0476
 -1.5097
 -1.2117
 -2.5416
 -2.3165
 -1.2709
 -1.4194
 -0.8454
 -1.5488
 -1.4243
 -1.5610
 -2.0223
 -1.5428

(3 ,.,.) = 
 -0.9596
 -2.3323
 -1.4028
 -2.0683
 -1.5250
 -1.2239
 -1.5572
 -2.3399
 -1.2837
 -1.4337
 -0.8540
 -1.5644
 -1.4387
 -1.5768
 -2.0428
 -1.5583

(4 ,.,.) = 
 -0.9693
 -2.3558
 -1.4170
 -2.0892
 -1.5404
 -1.2363
 -1.5729
 -2.3635
 -1.2967
 -0.4381
 -0.8626
 -1.5802
 -1.4532
 -1.5927
 -2.0634
 -1.5741

(5 ,.,.) = 
 -0.9791
 -2.3796
 -1

Updates 11000, num timesteps 880080, FPS 179, mean/median reward -19.8/-20.0, min/max reward -21.0/-18.0, entropy 1.78763, value loss 0.02679, policy loss -0.06317

 rollout returns 
(0 ,.,.) = 
 -0.4992
 -1.1242
 -1.5672
 -2.3460
 -1.9115
 -1.7315
 -0.9900
 -1.1755
 -1.1011
 -1.8326
 -2.3080
 -1.1543
 -0.4197
 -1.4592
 -1.3768
 -1.9426

(1 ,.,.) = 
 -0.5042
 -1.1356
 -1.5831
 -1.3596
 -1.9308
 -1.7490
 -1.0000
 -1.1874
 -1.1122
 -1.8512
 -2.3313
 -1.1659
 -0.4240
 -1.4739
 -1.3908
 -1.9622

(2 ,.,.) = 
 -0.5093
 -1.1471
 -1.5991
 -1.3733
 -1.9503
 -1.7667
 -1.3589
 -1.1994
 -1.1234
 -1.8699
 -2.3548
 -1.1777
 -0.4283
 -1.4888
 -1.4048
 -1.9820

(3 ,.,.) = 
 -0.5144
 -1.1587
 -1.6152
 -1.3872
 -1.9700
 -1.7845
 -1.3727
 -1.2115
 -1.1348
 -1.8887
 -2.3786
 -1.1896
 -0.4326
 -1.5038
 -1.4190
 -2.0020

(4 ,.,.) = 
 -0.5196
 -1.1704
 -1.6315
 -1.4012
 -1.9899
 -1.8025
 -1.3865
 -1.2237
 -1.1463
 -1.9078
 -1.3925
 -1.2016
 -0.4370
 -1.5190
 -1.4333
 -2.0223

(5 ,.,.) = 
 -0.5249
 -1.1822
 -

Updates 11500, num timesteps 920080, FPS 179, mean/median reward -19.2/-20.0, min/max reward -21.0/-17.0, entropy 1.78879, value loss 0.02407, policy loss -0.06111

 rollout returns 
(0 ,.,.) = 
 -1.3980
 -1.5586
 -1.6714
 -1.8771
 -1.3180
 -1.5901
 -1.2876
 -1.2350
 -1.2730
 -1.0087
 -1.4946
 -1.3209
 -1.1722
 -2.0767
 -1.8289
 -1.3241

(1 ,.,.) = 
 -1.4121
 -1.5743
 -1.6883
 -1.8960
 -1.3314
 -1.6062
 -1.3006
 -1.2475
 -1.2858
 -1.0189
 -1.5097
 -1.3342
 -1.1841
 -2.0976
 -1.8474
 -1.3375

(2 ,.,.) = 
 -1.4264
 -1.5902
 -1.7054
 -1.9152
 -1.3448
 -1.6224
 -1.3137
 -1.2601
 -1.2988
 -1.0292
 -1.5249
 -1.3477
 -1.1960
 -2.1188
 -1.8660
 -1.3510

(3 ,.,.) = 
 -1.4408
 -1.6063
 -1.7226
 -1.9345
 -1.3584
 -1.6388
 -1.3270
 -1.2728
 -1.3119
 -1.0396
 -1.5403
 -1.3613
 -1.2081
 -2.1402
 -1.8849
 -1.3647

(4 ,.,.) = 
 -1.4554
 -1.6225
 -1.7400
 -1.9541
 -1.3721
 -1.6553
 -1.3404
 -1.2856
 -1.3252
 -1.0501
 -1.5559
 -1.3750
 -1.2203
 -2.1618
 -1.9039
 -1.3785

(5 ,.,.) = 
 -1.4701
 -1.6389
 -

Updates 12000, num timesteps 960080, FPS 180, mean/median reward -19.6/-20.0, min/max reward -21.0/-17.0, entropy 1.78167, value loss 0.01988, policy loss 0.00217

 rollout returns 
(0 ,.,.) = 
 -1.5786
 -1.3962
 -1.0870
 -1.6320
 -1.3235
 -1.1054
 -1.1482
 -1.5884
 -1.3493
 -1.3468
 -1.3023
 -1.2876
 -1.0061
 -0.5091
 -1.7864
 -1.9707

(1 ,.,.) = 
 -1.5945
 -1.4103
 -1.0980
 -1.6485
 -1.3369
 -1.1166
 -1.1598
 -1.6045
 -1.3629
 -1.3604
 -1.3154
 -1.3006
 -1.0163
 -0.5143
 -1.8045
 -1.9906

(2 ,.,.) = 
 -1.6106
 -1.4245
 -1.1091
 -1.6651
 -1.3504
 -1.1279
 -1.1716
 -1.6207
 -1.3767
 -1.3741
 -1.3287
 -1.3138
 -1.0265
 -0.5195
 -1.8227
 -2.0107

(3 ,.,.) = 
 -1.6269
 -1.4389
 -1.1203
 -1.6820
 -1.3640
 -1.1393
 -1.1834
 -1.6371
 -1.3906
 -1.3880
 -1.3421
 -1.3270
 -1.0369
 -0.5247
 -1.8411
 -2.0310

(4 ,.,.) = 
 -1.6433
 -1.4535
 -1.1316
 -1.6989
 -1.3778
 -1.1508
 -1.1954
 -1.6536
 -1.4046
 -1.4020
 -1.3557
 -1.3404
 -1.0474
 -0.5300
 -0.8496
 -2.0515

(5 ,.,.) = 
 -1.6599
 -1.4682
 -1

Updates 12500, num timesteps 1000080, FPS 180, mean/median reward -19.8/-20.0, min/max reward -21.0/-18.0, entropy 1.78102, value loss 0.03556, policy loss 0.00114

 rollout returns 
(0 ,.,.) = 
 -1.3071
 -0.9324
 -2.1214
 -2.0471
 -1.3945
 -2.2536
 -0.9225
 -1.1226
 -1.2216
 -1.4079
 -1.4483
 -1.3217
 -2.1076
 -0.9561
 -0.9189
 -0.7702

(1 ,.,.) = 
 -1.3203
 -0.9418
 -1.1327
 -2.0677
 -1.4086
 -2.2764
 -0.9318
 -1.1339
 -1.2339
 -1.4221
 -1.4629
 -1.3351
 -2.1289
 -0.9658
 -0.9282
 -0.7780

(2 ,.,.) = 
 -1.3337
 -0.9514
 -1.1441
 -2.0886
 -1.4228
 -2.2994
 -0.9412
 -1.1454
 -1.2464
 -1.4365
 -1.4777
 -1.3486
 -2.1504
 -0.9756
 -0.9376
 -0.7858

(3 ,.,.) = 
 -1.3471
 -0.9610
 -1.1557
 -2.1097
 -1.4372
 -2.3226
 -0.9507
 -1.1570
 -1.2589
 -1.4510
 -1.4926
 -1.3622
 -2.1721
 -0.9854
 -0.9471
 -0.7938

(4 ,.,.) = 
 -1.3607
 -0.9707
 -1.1674
 -2.1310
 -1.4517
 -1.3360
 -0.9603
 -1.1686
 -1.2717
 -1.4657
 -1.5077
 -1.3760
 -2.1940
 -0.9954
 -0.9566
 -0.8018

(5 ,.,.) = 
 -1.3745
 -0.9805
 -

Updates 13000, num timesteps 1040080, FPS 179, mean/median reward -19.1/-19.0, min/max reward -21.0/-16.0, entropy 1.77770, value loss 0.03438, policy loss -0.00833

 rollout returns 
(0 ,.,.) = 
 -1.4516
 -1.7813
 -1.5762
 -2.0651
 -1.5555
 -1.0703
 -1.6770
 -2.1511
 -2.0931
 -2.4939
 -1.4590
 -1.6704
 -1.3351
 -1.2766
 -1.9923
 -1.2725

(1 ,.,.) = 
 -1.4663
 -1.7992
 -1.5921
 -2.0860
 -1.5713
 -1.0811
 -1.6939
 -2.1729
 -2.1142
 -2.5191
 -1.4737
 -1.6873
 -1.3486
 -1.2895
 -2.0125
 -1.2853

(2 ,.,.) = 
 -1.4811
 -1.8174
 -1.6082
 -2.1071
 -1.5871
 -1.0920
 -1.7110
 -2.1948
 -2.1356
 -2.5445
 -1.4886
 -1.7043
 -1.3622
 -1.3025
 -2.0328
 -1.2983

(3 ,.,.) = 
 -1.4960
 -1.8358
 -1.6244
 -2.1283
 -1.6032
 -1.1031
 -1.7283
 -2.2170
 -2.1572
 -2.5702
 -1.5036
 -1.7215
 -1.3760
 -1.3157
 -2.0533
 -1.3114

(4 ,.,.) = 
 -1.5112
 -1.8543
 -1.6408
 -2.1498
 -1.6194
 -1.1142
 -1.7458
 -2.2394
 -2.1790
 -2.5962
 -1.5188
 -1.7389
 -1.3899
 -1.3290
 -2.0741
 -1.3247

(5 ,.,.) = 
 -1.5264
 -1.8730
 

Updates 13500, num timesteps 1080080, FPS 179, mean/median reward -18.4/-19.0, min/max reward -21.0/-15.0, entropy 1.77275, value loss 0.03191, policy loss -0.01367

 rollout returns 
(0 ,.,.) = 
 -2.1498
 -2.0104
 -1.4278
 -1.2626
 -0.9629
 -1.0282
 -1.4757
 -1.3749
 -1.1461
 -0.8981
 -0.4674
 -1.5577
 -1.5768
 -1.6065
 -1.1240
 -0.9611

(1 ,.,.) = 
 -2.1715
 -2.0307
 -1.4422
 -1.2753
 -0.9726
 -1.0386
 -1.4906
 -1.3888
 -1.1576
 -0.9072
 -0.4721
 -1.5735
 -1.5928
 -1.6228
 -1.1354
 -0.9708

(2 ,.,.) = 
 -2.1934
 -2.0512
 -1.4568
 -1.2882
 -0.9824
 -1.0491
 -1.5057
 -1.4028
 -1.1693
 -0.9163
 -0.4769
 -1.5894
 -1.6089
 -1.6392
 -1.1469
 -0.9806

(3 ,.,.) = 
 -2.2156
 -2.0720
 -1.4715
 -1.3012
 -0.9923
 -1.0597
 -1.5209
 -1.4170
 -1.1811
 -0.9256
 -0.4817
 -1.6054
 -1.6251
 -1.6557
 -1.1584
 -0.9905

(4 ,.,.) = 
 -2.2379
 -2.0929
 -1.4863
 -1.3144
 -1.0024
 -1.0704
 -1.5362
 -1.4313
 -1.1931
 -0.9349
 -0.4866
 -1.6216
 -1.6415
 -1.6724
 -1.1701
 -1.0005

(5 ,.,.) = 
 -2.2605
 -1.1039
 

Updates 14000, num timesteps 1120080, FPS 179, mean/median reward -18.2/-19.0, min/max reward -21.0/-15.0, entropy 1.78432, value loss 0.03681, policy loss 0.09595

 rollout returns 
(0 ,.,.) = 
 -1.3236
 -1.2394
 -1.1569
 -1.2434
 -1.1768
 -1.0804
 -0.8890
 -1.1948
 -1.0725
 -1.1409
 -1.6369
 -1.6463
 -1.1834
 -0.5900
 -1.8491
 -1.1749

(1 ,.,.) = 
 -1.3369
 -1.2519
 -1.1686
 -1.2559
 -1.1886
 -1.0913
 -0.8980
 -1.2069
 -1.0834
 -1.1524
 -1.6535
 -1.6629
 -1.1954
 -0.5959
 -1.8678
 -1.1868

(2 ,.,.) = 
 -1.3504
 -1.2646
 -1.1804
 -1.2686
 -1.2006
 -1.1023
 -0.9071
 -1.2191
 -1.0943
 -1.1640
 -0.6601
 -1.6797
 -1.2075
 -0.6019
 -1.8866
 -1.1988

(3 ,.,.) = 
 -1.3641
 -1.2773
 -1.1923
 -1.2814
 -1.2128
 -1.1135
 -0.9163
 -1.2314
 -1.1054
 -1.1758
 -0.6667
 -1.6967
 -1.2197
 -0.6080
 -1.9057
 -1.2109

(4 ,.,.) = 
 -1.3779
 -1.2902
 -1.2043
 -1.2944
 -1.2250
 -1.1247
 -0.9255
 -1.2439
 -1.1165
 -1.1877
 -0.6735
 -1.7138
 -1.2320
 -0.6142
 -1.9250
 -1.2231

(5 ,.,.) = 
 -1.3918
 -1.3033
 -

Updates 14500, num timesteps 1160080, FPS 179, mean/median reward -18.9/-19.0, min/max reward -21.0/-16.0, entropy 1.77231, value loss 0.05110, policy loss -0.02389

 rollout returns 
(0 ,.,.) = 
 -1.3777
 -0.9551
 -2.2130
 -0.4138
 -1.5441
 -2.3002
 -1.7101
 -1.1420
 -1.1281
 -1.7713
 -1.0394
 -1.8191
 -1.3802
 -1.3457
 -2.0885
 -1.4962

(1 ,.,.) = 
 -1.3917
 -0.9647
 -2.2353
 -0.4180
 -1.5597
 -2.3234
 -1.7274
 -1.1536
 -1.1394
 -1.7892
 -1.0499
 -1.8375
 -1.3941
 -1.3593
 -2.1096
 -1.5113

(2 ,.,.) = 
 -1.4057
 -0.9745
 -1.2478
 -0.4222
 -1.5754
 -2.3469
 -1.7448
 -1.1652
 -1.1510
 -1.8073
 -1.0605
 -1.8560
 -1.4082
 -1.3730
 -2.1309
 -1.5266

(3 ,.,.) = 
 -1.4199
 -0.9843
 -1.2604
 -0.4265
 -1.5913
 -1.3605
 -1.7625
 -1.1770
 -1.1626
 -1.8255
 -1.0712
 -1.8748
 -1.4224
 -1.3869
 -2.1525
 -1.5420

(4 ,.,.) = 
 -1.4343
 -0.9943
 -1.2731
 -0.4308
 -1.6074
 -1.3742
 -1.7803
 -1.1889
 -1.1743
 -1.8440
 -1.0820
 -1.8937
 -1.4368
 -1.4009
 -2.1742
 -1.5576

(5 ,.,.) = 
 -1.4487
 -1.0043
 

Updates 15000, num timesteps 1200080, FPS 174, mean/median reward -17.8/-18.0, min/max reward -21.0/-14.0, entropy 1.74964, value loss 0.02224, policy loss 0.05909

 rollout returns 
(0 ,.,.) = 
 -1.1640
 -1.1367
 -1.4693
 -1.1412
 -2.0268
 -1.1322
 -1.6337
 -1.2517
 -1.4178
 -1.1669
 -1.1401
 -1.2984
 -1.2326
 -1.9292
 -1.3019
 -1.3120

(1 ,.,.) = 
 -1.1757
 -1.1482
 -1.4841
 -1.1527
 -2.0472
 -1.1437
 -1.6502
 -1.2644
 -1.4322
 -1.1787
 -1.1516
 -1.3115
 -1.2451
 -1.9487
 -1.3150
 -1.3252

(2 ,.,.) = 
 -1.1876
 -1.1598
 -1.4991
 -1.1643
 -2.0679
 -1.1552
 -1.6669
 -1.2771
 -1.4466
 -1.1906
 -1.1632
 -1.3247
 -1.2577
 -1.9684
 -1.3283
 -1.3386

(3 ,.,.) = 
 -1.1996
 -1.1715
 -1.5143
 -1.1761
 -2.0888
 -1.1669
 -1.6837
 -1.2900
 -1.4612
 -1.2026
 -1.1750
 -1.3381
 -1.2704
 -1.9883
 -1.3417
 -1.3522

(4 ,.,.) = 
 -1.2117
 -1.1834
 -1.5296
 -1.1880
 -2.1099
 -1.1787
 -1.7007
 -1.3031
 -1.4760
 -1.2148
 -1.1868
 -1.3516
 -1.2832
 -2.0083
 -1.3553
 -1.3658

(5 ,.,.) = 
 -1.2240
 -1.1953
 -

Updates 15500, num timesteps 1240080, FPS 174, mean/median reward -18.6/-19.0, min/max reward -21.0/-15.0, entropy 1.77533, value loss 0.06868, policy loss -0.19196

 rollout returns 
(0 ,.,.) = 
 -0.8619
 -2.0033
 -1.8951
 -0.8612
 -1.4807
 -1.8658
 -1.2487
 -0.9606
 -1.1575
 -0.2969
 -0.6030
 -0.9575
 -0.9913
 -1.0750
 -1.9329
 -1.0659

(1 ,.,.) = 
 -0.8706
 -2.0235
 -1.9143
 -0.8699
 -1.4957
 -1.8847
 -1.2613
 -0.9703
 -1.1692
 -0.2999
 -0.6091
 -0.9671
 -1.0013
 -1.0858
 -1.9525
 -1.0767

(2 ,.,.) = 
 -0.8794
 -2.0439
 -1.9336
 -0.8787
 -1.5108
 -1.9037
 -1.2741
 -0.9801
 -1.1810
 -0.3030
 -0.6153
 -0.9769
 -1.0114
 -1.0968
 -0.9621
 -1.0876

(3 ,.,.) = 
 -0.8883
 -2.0646
 -1.9532
 -0.8876
 -1.5260
 -1.9229
 -1.2869
 -0.9900
 -1.1930
 -0.3060
 -0.6215
 -0.9868
 -1.0216
 -1.1079
 -0.9718
 -1.0986

(4 ,.,.) = 
 -0.8973
 -1.0753
 -1.9729
 -0.8965
 -1.5414
 -1.9424
 -1.2999
 -1.0000
 -1.2050
 -0.3091
 -0.6278
 -0.9967
 -1.0319
 -1.1191
 -0.9816
 -1.1097

(5 ,.,.) = 
 -0.9063
 -1.0862
 

Updates 16000, num timesteps 1280080, FPS 174, mean/median reward -18.7/-19.0, min/max reward -21.0/-16.0, entropy 1.78228, value loss 0.01164, policy loss -0.03224

 rollout returns 
(0 ,.,.) = 
 -1.3313
 -0.6801
 -0.8587
 -0.6370
 -1.9971
 -1.0527
 -1.1471
 -0.9045
 -0.7227
 -1.2611
 -1.0945
 -1.0444
 -1.5294
 -1.2208
 -0.9899
 -1.1472

(1 ,.,.) = 
 -1.3448
 -0.6869
 -0.8674
 -0.6434
 -2.0173
 -1.0633
 -1.1587
 -0.9136
 -0.7300
 -1.2739
 -1.1056
 -1.0549
 -1.5448
 -1.2332
 -0.9999
 -1.1588

(2 ,.,.) = 
 -1.3584
 -0.6939
 -0.8761
 -0.6499
 -2.0377
 -1.0741
 -1.1704
 -0.9229
 -0.7373
 -1.2867
 -1.1167
 -1.0656
 -1.5604
 -1.2456
 -1.0100
 -1.1705

(3 ,.,.) = 
 -1.3721
 -0.7009
 -0.8850
 -0.6565
 -1.0482
 -1.0849
 -1.1823
 -0.9322
 -0.7448
 -1.2997
 -1.1280
 -1.0764
 -1.5762
 -1.2582
 -1.0202
 -1.1823

(4 ,.,.) = 
 -1.3859
 -0.7080
 -0.8939
 -0.6631
 -1.0587
 -1.0959
 -1.1942
 -0.9416
 -0.7523
 -1.3128
 -1.1394
 -1.0872
 -1.5921
 -1.2709
 -1.0305
 -1.1943

(5 ,.,.) = 
 -1.3999
 -0.7151
 

Updates 16500, num timesteps 1320080, FPS 174, mean/median reward -18.6/-19.0, min/max reward -21.0/-17.0, entropy 1.76449, value loss 0.00636, policy loss -0.00413

 rollout returns 
(0 ,.,.) = 
 -1.0510
 -1.0041
 -1.1044
 -1.1147
 -1.3309
 -1.2836
 -1.4300
 -1.0842
 -0.9838
 -0.9522
 -0.4781
 -1.1026
 -1.1906
 -0.9820
 -1.8695
 -1.1839

(1 ,.,.) = 
 -1.0616
 -1.0142
 -1.1156
 -1.1259
 -1.3443
 -1.2965
 -1.4444
 -1.0952
 -0.9937
 -0.9618
 -0.4829
 -1.1137
 -1.2027
 -0.9919
 -1.8884
 -1.1959

(2 ,.,.) = 
 -1.0724
 -1.0245
 -1.1269
 -1.1373
 -1.3579
 -1.3096
 -1.4590
 -1.1063
 -1.0038
 -0.9715
 -0.4878
 -1.1249
 -1.2148
 -1.0020
 -1.9075
 -1.2080

(3 ,.,.) = 
 -1.0832
 -1.0348
 -1.1382
 -1.1488
 -1.3716
 -1.3229
 -1.4738
 -1.1174
 -1.0139
 -0.9814
 -0.4927
 -1.1363
 -1.2271
 -1.0121
 -0.9166
 -1.2202

(4 ,.,.) = 
 -1.0941
 -1.0453
 -1.1497
 -1.1604
 -1.3855
 -1.3362
 -1.4886
 -1.1287
 -1.0241
 -0.9913
 -0.4977
 -1.1478
 -1.2395
 -1.0223
 -0.9259
 -1.2325

(5 ,.,.) = 
 -1.1052
 -1.0558
 

Process Process-22:
Process Process-25:
Process Process-24:
Process Process-31:
Process Process-29:
Process Process-30:
Process Process-32:
Process Process-18:
Process Process-27:
Process Process-19:
Process Process-28:
Process Process-21:
Process Process-20:
Process Process-23:
Process Process-26:
Process Process-17:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):


  File "/usr/lib/python3.5/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
KeyboardInterrupt
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
KeyboardInterrupt
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._re

KeyboardInterrupt: 

In [8]:
from utils import orthogonal

class Categorical(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(Categorical, self).__init__()
        self.linear = nn.Linear(num_inputs, num_outputs)

    def forward(self, x):
        x = self.linear(x)
        return x

    def sample(self, x, deterministic):
        x = self(x)

        probs = F.softmax(x)
        if deterministic is False:
            action = probs.multinomial()
        else:
            action = probs.max(1)[1]
        return action

    def logprobs_and_entropy(self, x, actions):
        x = self(x)

        log_probs = F.log_softmax(x)
        probs = F.softmax(x)

        action_log_probs = log_probs.gather(1, actions)

        dist_entropy = -(log_probs * probs).sum(-1).mean()
        return action_log_probs, dist_entropy
    

def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1 or classname.find('Linear') != -1:
        orthogonal(m.weight.data)
        if m.bias is not None:
            m.bias.data.fill_(0)


class CNNPolicy(nn.Module):
    def __init__(self, num_inputs, action_space):
        super(CNNPolicy, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)

        self.linear1 = nn.Linear(32 * 7 * 7, 512)

        self.critic_linear = nn.Linear(512, 1)

        num_outputs = action_space.n
        self.dist = Categorical(512, num_outputs)

        self.train() # training mode. Only affects dropout, batchnorm etc
        self.reset_parameters()
        
    def act(self, inputs, states, masks, deterministic=False):
        value, x, states = self(inputs, states, masks)
        action = self.dist.sample(x, deterministic=deterministic)
        action_log_probs, dist_entropy = self.dist.logprobs_and_entropy(x, action)
        return value, action, action_log_probs, states

    def evaluate_actions(self, inputs, states, masks, actions):
        value, x, states = self(inputs, states, masks)
        action_log_probs, dist_entropy = self.dist.logprobs_and_entropy(x, actions)
        return value, action_log_probs, dist_entropy, states

    @property
    def state_size(self):
        return 1

    def reset_parameters(self):
        self.apply(weights_init)

        relu_gain = nn.init.calculate_gain('relu')
        self.conv1.weight.data.mul_(relu_gain)
        self.conv2.weight.data.mul_(relu_gain)
        self.conv3.weight.data.mul_(relu_gain)
        self.linear1.weight.data.mul_(relu_gain)

    def forward(self, inputs, states, masks):
        x = self.conv1(inputs / 255.0)
        x = F.relu(x)

        x = self.conv2(x)
        x = F.relu(x)

        x = self.conv3(x)
        x = F.relu(x)

        x = x.view(-1, 32 * 7 * 7)
        x = self.linear1(x)
        x = F.relu(x)

        return self.critic_linear(x), x, states

In [9]:
class RolloutStorage(object):
    def __init__(self, num_steps, num_processes, obs_shape, action_space, state_size):
        self.observations = torch.zeros(num_steps + 1, num_processes, *obs_shape)
        self.states = torch.zeros(num_steps + 1, num_processes, state_size)
        self.rewards = torch.zeros(num_steps, num_processes, 1)
        self.value_preds = torch.zeros(num_steps + 1, num_processes, 1)
        self.returns = torch.zeros(num_steps + 1, num_processes, 1)
        self.action_log_probs = torch.zeros(num_steps, num_processes, 1)
        
        action_shape = 1

        self.actions = torch.zeros(num_steps, num_processes, action_shape)
            
        self.actions = self.actions.long()
        self.masks = torch.ones(num_steps + 1, num_processes, 1)

    def cuda(self):
        self.observations = self.observations.cuda()
        self.states = self.states.cuda()
        self.rewards = self.rewards.cuda()
        self.value_preds = self.value_preds.cuda()
        self.returns = self.returns.cuda()
        self.action_log_probs = self.action_log_probs.cuda()
        self.actions = self.actions.cuda()
        self.masks = self.masks.cuda()

    def insert(self, step, current_obs, state, action, action_log_prob, value_pred, reward, mask):
        self.observations[step + 1].copy_(current_obs)
        self.states[step + 1].copy_(state)
        self.actions[step].copy_(action)
        self.action_log_probs[step].copy_(action_log_prob)
        self.value_preds[step].copy_(value_pred)
        self.rewards[step].copy_(reward)
        self.masks[step + 1].copy_(mask)

    def after_update(self):
        self.observations[0].copy_(self.observations[-1])
        self.states[0].copy_(self.states[-1])
        self.masks[0].copy_(self.masks[-1])

    def compute_returns(self, next_value, use_gae, gamma, tau):
        self.returns[-1] = next_value
        for step in reversed(range(self.rewards.size(0))):
            self.returns[step] = self.returns[step + 1] * \
                gamma * self.masks[step + 1] + self.rewards[step]


In [10]:
from multiprocessing import Process, Pipe

def worker(remote, parent_remote, env_fn_wrapper):
    parent_remote.close()
    env = env_fn_wrapper.x()
    while True:
        cmd, data = remote.recv()
        if cmd == 'step':
            ob, reward, done, info = env.step(data)
            if done:
                ob = env.reset()
            remote.send((ob, reward, done, info))
        elif cmd == 'reset':
            ob = env.reset()
            remote.send(ob)
        elif cmd == 'reset_task':
            ob = env.reset_task()
            remote.send(ob)
        elif cmd == 'close':
            remote.close()
            break
        elif cmd == 'get_spaces':
            remote.send((env.action_space, env.observation_space))
        else:
            raise NotImplementedError


class CloudpickleWrapper(object):
    """
    Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
    """
    def __init__(self, x):
        self.x = x
    def __getstate__(self):
        import cloudpickle
        return cloudpickle.dumps(self.x)
    def __setstate__(self, ob):
        import pickle
        self.x = pickle.loads(ob)



class SubprocVecEnv(object):
    def __init__(self, env_fns):
        """
        envs: list of gym environments to run in subprocesses
        """
        self.closed = False
        nenvs = len(env_fns)
        self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
        self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
            for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
        for p in self.ps:
            p.daemon = True # if the main process crashes, we should not cause things to hang
            p.start()
        for remote in self.work_remotes:
            remote.close()

        self.remotes[0].send(('get_spaces', None))
        self.action_space, self.observation_space = self.remotes[0].recv()


    def step(self, actions):
        for remote, action in zip(self.remotes, actions):
            remote.send(('step', action))
        results = [remote.recv() for remote in self.remotes]
        obs, rews, dones, infos = zip(*results)
        return np.stack(obs), np.stack(rews), np.stack(dones), infos

    def reset(self):
        for remote in self.remotes:
            remote.send(('reset', None))
        return np.stack([remote.recv() for remote in self.remotes])

    def reset_task(self):
        for remote in self.remotes:
            remote.send(('reset_task', None))
        return np.stack([remote.recv() for remote in self.remotes])

    def close(self):
        if self.closed:
            return

        for remote in self.remotes:
            remote.send(('close', None))
        for p in self.ps:
            p.join()
        self.closed = True

    @property
    def num_envs(self):
        return len(self.remotes)

