In [1]:
import copy
import glob
import os
import time

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from utils import orthogonal

from all_stuff import *

from envs import make_env # had to manually add some files into directory for env to reference bc baselines 
# modules not working right

#from storage import RolloutStorage

In [9]:
class args:
    def __init__(self):
        self.env_name='PongNoFrameskip-v4'
        self.seed=1
        self.log_dir=''
        self.save_dir='saved_models'
        self.cuda=False
        self.algo='a2c'
        self.num_stack=4
        self.num_steps=5
        self.num_processes=2
        self.lr=7e-4
        self.eps=1e-5
        self.alpha=.99
        self.max_grad_norm=.5
        self.value_loss_coef=.5
        self.entropy_coef=.1
        self.num_frames=8e6
        self.use_gae=False
        self.gamma=.99
        self.tau=.95
        self.save_interval=1000
        self.log_interval=100
        self.from_saved=False
        
args = args()

SAVE_PATH = "saved_models/pong_112917.pt"

num_updates = int(args.num_frames) // args.num_steps // args.num_processes

torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

In [36]:

os.environ['OMP_NUM_THREADS'] = '1'

envs = [make_env(args.env_name, args.seed, i, args.log_dir) for i in range(args.num_processes)]

if args.num_processes > 1:
    envs = SubprocVecEnv(envs)
else:
    envs = DummyVecEnv(envs)

if len(envs.observation_space.shape) == 1:
    envs = VecNormalize(envs)

obs_shape = envs.observation_space.shape
obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:]) #4 x 84 x 84

actor_critic = CNNPolicy(obs_shape[0], envs.action_space)

if args.from_saved:
    print("loading saved model from " + SAVE_PATH)
    actor_critic.load_state_dict(torch.load(SAVE_PATH))

if envs.action_space.__class__.__name__ == "Discrete":
    action_shape = 1
else:
    action_shape = envs.action_space.shape[0]

if args.cuda:
    actor_critic.cuda()

optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)

rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space)
current_obs = torch.zeros(args.num_processes, *obs_shape)

def update_current_obs(obs):
    shape_dim0 = envs.observation_space.shape[0]
    obs = torch.from_numpy(obs).float()
    if args.num_stack > 1:
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
    current_obs[:, -shape_dim0:] = obs

obs = envs.reset()
update_current_obs(obs)

rollouts.observations[0].copy_(current_obs)

# These variables are used to compute average rewards for all processes.
episode_rewards = torch.zeros([args.num_processes, 1])
final_rewards = torch.zeros([args.num_processes, 1])

if args.cuda:
    current_obs = current_obs.cuda()
    rollouts.cuda()

start = time.time()
for j in range(num_updates):
    for step in range(args.num_steps):
        # Sample actions
        value, action, action_log_prob, dist_entropy = actor_critic(Variable(rollouts.observations[step], volatile=True),
                                                                  Variable(rollouts.masks[step], volatile=True))
        
        print("\n Stepping \nvalue", value, "\naction", action, "\naction_log_prob", action_log_prob,\
              "\ndist_entropy",dist_entropy)
        cpu_actions = action.data.squeeze(1).cpu().numpy()

        # Obser reward and next obs
        obs, reward, done, info = envs.step(cpu_actions)
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
        episode_rewards += reward

        # If done then clean the history of observations.
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
        final_rewards *= masks
        final_rewards += (1 - masks) * episode_rewards
        episode_rewards *= masks

        if args.cuda:
            masks = masks.cuda()

        if current_obs.dim() == 4:
            current_obs *= masks.unsqueeze(2).unsqueeze(2)
        else:
            current_obs *= masks

        update_current_obs(obs)
        rollouts.insert(step, current_obs, action.data, action_log_prob.data, value.data, reward, masks)
    
    print("################################\nDone stepping")

    next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                              Variable(rollouts.masks[-1], volatile=True))[0].data

    rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

    
    values, actions, action_log_probs, dist_entropy = actor_critic(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                                                           Variable(rollouts.masks[:-1].view(-1, 1)))
    #Variable(rollouts.actions.view(-1, action_shape))
    
    values = values.view(args.num_steps, args.num_processes, 1)
    action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)
    
    # this line gets out of "diff stoch variables req reward" error
    # action_log_probs = Variable(action_log_probs.data, requires_grad=False)

    advantages = Variable(rollouts.returns[:-1]) - values
    value_loss = advantages.pow(2).mean()

    #action_log_probs is the one causing the error
    action_loss = -(Variable(advantages.data) * action_log_probs).mean()
    print("\nnext value", next_value, "\nvalues", values, "\nactions", actions,\
          "\naction_log_probs", action_log_probs, "\nreturns", rollouts.returns[:-1],
          "\ndist_entropy", dist_entropy, "\nadvantages", advantages)

    optimizer.zero_grad()
    
    (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

    optimizer.step()

    rollouts.after_update()

    if j % args.save_interval == 0 and args.save_dir != "":

        # A really ugly way to save a model to CPU
        save_model = actor_critic
        if args.cuda:
            save_model = copy.deepcopy(actor_critic).cpu() # save THIS one

        torch.save(save_model.state_dict(), SAVE_PATH)

    if j % args.log_interval == 0:
        end = time.time()
        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
            format(j, total_num_steps,
                   int(total_num_steps / (end - start)),
                   final_rewards.mean(),
                   final_rewards.median(),
                   final_rewards.min(),
                   final_rewards.max(), dist_entropy.data[0],
                   value_loss.data[0], action_loss.data[0]))


 Stepping 
value Variable containing:
1.00000e-02 *
 -8.2926
 -8.0802
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 2
 1
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.9392
-1.3881
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7634
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
1.00000e-02 *
 -5.2210
 -6.2289
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 0
 3
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8058
-1.7931
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7549
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
1.00000e-02 *
 -5.2485
 -5.6389
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 3
 3
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.6338
-1.6338
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7407
[torch.FloatTensor of size 1]


 St

 
returns 
(0 ,.,.) = 
  0.3695
  0.3715

(1 ,.,.) = 
  0.3733
  0.3752

(2 ,.,.) = 
  0.3770
  0.3790

(3 ,.,.) = 
  0.3808
  0.3829

(4 ,.,.) = 
  0.3847
  0.3867
[torch.FloatTensor of size 5x2x1]
 
dist_entropy Variable containing:
 1.7827
[torch.FloatTensor of size 1]
 
advantages Variable containing:
(0 ,.,.) = 
1.00000e-02 *
  -1.9728
  -1.8750

(1 ,.,.) = 
1.00000e-02 *
  -1.7008
  -1.2465

(2 ,.,.) = 
1.00000e-02 *
  -1.5318
  -0.7725

(3 ,.,.) = 
1.00000e-02 *
  -1.0190
  -1.1809

(4 ,.,.) = 
1.00000e-02 *
  -0.4385
  -0.2199
[torch.FloatTensor of size 5x2x1]


 Stepping 
value Variable containing:
1.00000e-02 *
 -8.1458
 -8.2304
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 0
 3
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7798
-1.7923
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7909
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
1.00000e-02 *
 -8.1218
 -7.7435
[torch.FloatTensor of


 Stepping 
value Variable containing:
1.00000e-02 *
 -1.5834
 -2.1126
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 1
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8015
-1.7830
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7916
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
1.00000e-02 *
 -1.7552
 -1.5500
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 5
 1
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7832
-1.8003
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7916
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
1.00000e-02 *
 -1.8053
 -1.4120
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 4
 4
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8176
-1.8182
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7916
[torch.FloatTensor of size 1]


 St


 Stepping 
value Variable containing:
-0.5052
-0.5026
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 4
 3
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.9052
-1.8508
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7845
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.4993
-0.5026
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 0
 3
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.9017
-1.8548
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7845
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.5023
-0.5042
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 5
 2
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8676
-1.6320
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7846
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.5035
-0.50


 Stepping 
value Variable containing:
1.00000e-02 *
 -2.2998
 -1.9220
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 3
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7867
-1.7945
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7917
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
1.00000e-02 *
 -1.7172
 -1.8729
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 0
 4
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8099
-1.7702
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7917
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
1.00000e-02 *
 -2.0462
 -2.1888
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 4
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7691
-1.7968
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7917
[torch.FloatTensor of size 1]


 St


 Stepping 
value Variable containing:
1.00000e-02 *
  3.0706
  3.0135
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 0
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7945
-1.8018
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7917
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
1.00000e-02 *
  3.1330
  2.9184
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 5
 4
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7998
-1.7768
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7917
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
1.00000e-02 *
  2.9956
  2.8616
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 3
 2
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7914
-1.7914
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7917
[torch.FloatTensor of size 1]


 St


 Stepping 
value Variable containing:
1.00000e-02 *
 -6.8197
 -6.9703
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 1
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8029
-1.7811
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7917
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
1.00000e-02 *
 -6.8464
 -6.7445
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 0
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7895
-1.7828
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7917
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
1.00000e-02 *
 -6.8877
 -6.8177
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 2
 2
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8055
-1.8049
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7917
[torch.FloatTensor of size 1]


 St


 Stepping 
value Variable containing:
1.00000e-02 *
 -4.2581
 -4.1769
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 1
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8256
-1.7657
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7916
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
1.00000e-02 *
 -4.4027
 -4.2420
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 5
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7666
-1.7653
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7916
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
1.00000e-02 *
 -4.3729
 -4.1146
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 2
 2
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7876
-1.7878
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7916
[torch.FloatTensor of size 1]


 St

next value 
-0.1650
-0.1685
[torch.FloatTensor of size 2x1]
 
values Variable containing:
(0 ,.,.) = 
 -0.1666
 -0.1651

(1 ,.,.) = 
 -0.1645
 -0.1641

(2 ,.,.) = 
 -0.1671
 -0.1677

(3 ,.,.) = 
 -0.1656
 -0.1671

(4 ,.,.) = 
 -0.1668
 -0.1684
[torch.FloatTensor of size 5x2x1]
 
actions Variable containing:
    4
    5
    3
    5
    3
    4
    4
    5
    1
    1
[torch.LongTensor of size 10x1]
 
action_log_probs Variable containing:
(0 ,.,.) = 
 -1.8562
 -1.7632

(1 ,.,.) = 
 -1.7778
 -1.7636

(2 ,.,.) = 
 -1.7764
 -1.8580

(3 ,.,.) = 
 -1.8565
 -1.7639

(4 ,.,.) = 
 -1.8353
 -1.8325
[torch.FloatTensor of size 5x2x1]
 
returns 
(0 ,.,.) = 
 -0.1569
 -0.1603

(1 ,.,.) = 
 -0.1585
 -0.1619

(2 ,.,.) = 
 -0.1601
 -0.1635

(3 ,.,.) = 
 -0.1617
 -0.1652

(4 ,.,.) = 
 -0.1633
 -0.1668
[torch.FloatTensor of size 5x2x1]
 
dist_entropy Variable containing:
 1.7910
[torch.FloatTensor of size 1]
 
advantages Variable containing:
(0 ,.,.) = 
1.00000e-03 *
   9.6989
   4.8636

(1 ,.,.) = 
1.000


 Stepping 
value Variable containing:
-0.2481
-0.2467
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 3
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8153
-1.7225
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7901
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.2484
-0.2472
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 3
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8154
-1.7226
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7901
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.2489
-0.2482
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 0
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7439
-1.7220
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7901
[torch.FloatTensor of size 1]

################################
Done stepping

next


next value 
-0.2260
-0.2239
[torch.FloatTensor of size 2x1]
 
values Variable containing:
(0 ,.,.) = 
 -0.2289
 -0.2240

(1 ,.,.) = 
 -0.2258
 -0.2250

(2 ,.,.) = 
 -0.2297
 -0.2239

(3 ,.,.) = 
 -0.2272
 -0.2255

(4 ,.,.) = 
 -0.2270
 -0.2239
[torch.FloatTensor of size 5x2x1]
 
actions Variable containing:
    0
    2
    0
    4
    1
    0
    4
    0
    3
    5
[torch.LongTensor of size 10x1]
 
action_log_probs Variable containing:
(0 ,.,.) = 
 -1.7130
 -1.7573

(1 ,.,.) = 
 -1.7101
 -1.8940

(2 ,.,.) = 
 -1.9013
 -1.7128

(3 ,.,.) = 
 -1.8979
 -1.7135

(4 ,.,.) = 
 -1.7995
 -1.6979
[torch.FloatTensor of size 5x2x1]
 
returns 
(0 ,.,.) = 
 -0.2149
 -0.2130

(1 ,.,.) = 
 -0.2170
 -0.2151

(2 ,.,.) = 
 -0.2192
 -0.2173

(3 ,.,.) = 
 -0.2215
 -0.2195

(4 ,.,.) = 
 -0.2237
 -0.2217
[torch.FloatTensor of size 5x2x1]
 
dist_entropy Variable containing:
 1.7886
[torch.FloatTensor of size 1]
 
advantages Variable containing:
(0 ,.,.) = 
1.00000e-02 *
   1.4026
   1.1018

(1 ,.,.) = 
1.00

returns 
(0 ,.,.) = 
 -0.2687
 -0.2697

(1 ,.,.) = 
 -0.2714
 -0.2725

(2 ,.,.) = 
 -0.2741
 -0.2752

(3 ,.,.) = 
 -0.2769
 -0.2780

(4 ,.,.) = 
 -0.2797
 -0.2808
[torch.FloatTensor of size 5x2x1]
 
dist_entropy Variable containing:
 1.7865
[torch.FloatTensor of size 1]
 
advantages Variable containing:
(0 ,.,.) = 
1.00000e-02 *
   1.6731
   1.3949

(1 ,.,.) = 
1.00000e-02 *
   1.1592
   0.9986

(2 ,.,.) = 
1.00000e-02 *
   1.0112
   0.7769

(3 ,.,.) = 
1.00000e-02 *
   0.8399
   0.4149

(4 ,.,.) = 
1.00000e-02 *
   0.2368
   0.0134
[torch.FloatTensor of size 5x2x1]


 Stepping 
value Variable containing:
-0.2271
-0.2263
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 1
 2
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.9571
-1.7621
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7885
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.2276
-0.2275
[torch.FloatTensor of size 2x1]
 
action Variable conta


 Stepping 
value Variable containing:
1.00000e-02 *
 -9.9455
 -9.5769
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 3
 4
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7796
-1.7750
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7911
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
1.00000e-02 *
 -9.6665
 -9.6697
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 5
 1
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7628
-1.8584
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7911
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
1.00000e-02 *
 -9.6439
 -9.5591
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 2
 3
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8235
-1.7823
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7911
[torch.FloatTensor of size 1]


 St


 Stepping 
value Variable containing:
-0.1073
-0.1066
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 1
 2
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8771
-1.7547
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7910
[torch.FloatTensor of size 1]

################################
Done stepping

next value 
-0.1068
-0.1070
[torch.FloatTensor of size 2x1]
 
values Variable containing:
(0 ,.,.) = 
 -0.1064
 -0.1068

(1 ,.,.) = 
 -0.1068
 -0.1080

(2 ,.,.) = 
 -0.1077
 -0.1070

(3 ,.,.) = 
 -0.1075
 -0.1065

(4 ,.,.) = 
 -0.1073
 -0.1066
[torch.FloatTensor of size 5x2x1]
 
actions Variable containing:
    1
    0
    5
    5
    0
    0
    3
    4
    5
    2
[torch.LongTensor of size 10x1]
 
action_log_probs Variable containing:
(0 ,.,.) = 
 -1.8788
 -1.7864

(1 ,.,.) = 
 -1.7656
 -1.7630

(2 ,.,.) = 
 -1.7844
 -1.7859

(3 ,.,.) = 
 -1.7730
 -1.8025

(4 ,.,.) = 
 -1.7652
 -1.7547
[torch.FloatTensor of size 5x2x1]
 
retur


next value 
1.00000e-02 *
 -5.2393
 -5.1527
[torch.FloatTensor of size 2x1]
 
values Variable containing:
(0 ,.,.) = 
1.00000e-02 *
  -5.2991
  -5.0075

(1 ,.,.) = 
1.00000e-02 *
  -5.0844
  -5.5222

(2 ,.,.) = 
1.00000e-02 *
  -5.3268
  -4.8931

(3 ,.,.) = 
1.00000e-02 *
  -5.3772
  -5.1808

(4 ,.,.) = 
1.00000e-02 *
  -5.1410
  -5.3787
[torch.FloatTensor of size 5x2x1]
 
actions Variable containing:
    5
    5
    0
    4
    1
    5
    4
    3
    4
    1
[torch.LongTensor of size 10x1]
 
action_log_probs Variable containing:
(0 ,.,.) = 
 -1.7748
 -1.7739

(1 ,.,.) = 
 -1.7896
 -1.8075

(2 ,.,.) = 
 -1.8284
 -1.7732

(3 ,.,.) = 
 -1.8083
 -1.7774

(4 ,.,.) = 
 -1.8077
 -1.8297
[torch.FloatTensor of size 5x2x1]
 
returns 
(0 ,.,.) = 
1.00000e-02 *
  -4.9825
  -4.9002

(1 ,.,.) = 
1.00000e-02 *
  -5.0329
  -4.9497

(2 ,.,.) = 
1.00000e-02 *
  -5.0837
  -4.9997

(3 ,.,.) = 
1.00000e-02 *
  -5.1350
  -5.0502

(4 ,.,.) = 
1.00000e-02 *
  -5.1869
  -5.1012
[torch.FloatTensor of size 5x


 Stepping 
value Variable containing:
-0.3961
-0.3943
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 5
 4
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7123
-1.8064
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7900
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.3955
-0.3960
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 3
 4
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7975
-1.8062
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7900
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.3941
-0.3942
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 4
 3
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8072
-1.7975
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7900
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.3961
-0.39

values Variable containing:
(0 ,.,.) = 
 -0.2761
 -0.2740

(1 ,.,.) = 
 -0.2781
 -0.2759

(2 ,.,.) = 
 -0.2782
 -0.2771

(3 ,.,.) = 
 -0.2781
 -0.2774

(4 ,.,.) = 
 -0.2773
 -0.2759
[torch.FloatTensor of size 5x2x1]
 
actions Variable containing:
    4
    2
    1
    4
    1
    4
    1
    0
    3
    5
[torch.LongTensor of size 10x1]
 
action_log_probs Variable containing:
(0 ,.,.) = 
 -1.7967
 -1.7443

(1 ,.,.) = 
 -1.8517
 -1.7961

(2 ,.,.) = 
 -1.8522
 -1.7953

(3 ,.,.) = 
 -1.8516
 -1.8157

(4 ,.,.) = 
 -1.7843
 -1.7604
[torch.FloatTensor of size 5x2x1]
 
returns 
(0 ,.,.) = 
 -1.2433
 -0.2626

(1 ,.,.) = 
 -1.2559
 -0.2653

(2 ,.,.) = 
 -1.2686
 -0.2680

(3 ,.,.) = 
 -0.2713
 -0.2707

(4 ,.,.) = 
 -0.2740
 -0.2734
[torch.FloatTensor of size 5x2x1]
 
dist_entropy Variable containing:
 1.7911
[torch.FloatTensor of size 1]
 
advantages Variable containing:
(0 ,.,.) = 
 -0.9673
  0.0114

(1 ,.,.) = 
 -0.9778
  0.0106

(2 ,.,.) = 
 -0.9904
  0.0091

(3 ,.,.) = 
  0.0068
  0.0067

(4


 Stepping 
value Variable containing:
-0.4099
-0.4148
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 5
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7366
-1.7359
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7893
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.4097
-0.4154
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 2
 0
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7205
-1.8761
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7893
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.4116
-0.4129
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 2
 0
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7205
-1.8757
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7893
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.4088
-0.41


 Stepping 
value Variable containing:
-0.7029
-0.7048
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 5
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.6202
-1.6189
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7822
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.7036
-0.7025
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 5
 2
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.6196
-1.6406
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7822
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.7063
-0.7041
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 4
 0
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.9640
-1.8621
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7821
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.7053
-0.70

 
dist_entropy Variable containing:
 1.7859
[torch.FloatTensor of size 1]
 
advantages Variable containing:
(0 ,.,.) = 
1.00000e-02 *
   2.7844
   2.7684

(1 ,.,.) = 
1.00000e-02 *
   2.0983
   1.8213

(2 ,.,.) = 
1.00000e-02 *
   1.4084
   1.4246

(3 ,.,.) = 
1.00000e-02 *
   0.6864
   1.0774

(4 ,.,.) = 
1.00000e-02 *
   0.1732
   0.4505
[torch.FloatTensor of size 5x2x1]


 Stepping 
value Variable containing:
-0.5242
-0.5253
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 5
 3
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.6883
-1.7335
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7871
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.5245
-0.5258
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 4
 3
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8274
-1.7336
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7871
[torch.FloatTensor of 


 Stepping 
value Variable containing:
-0.7718
-0.7739
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 0
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8366
-1.6978
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7828
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.7745
-0.7705
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 2
 0
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7638
-1.8360
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7829
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.7740
-0.7726
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 3
 0
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.6509
-1.8362
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7829
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.7769
-0.77


 Stepping 
value Variable containing:
-0.6614
-0.6623
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 5
 4
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.6870
-1.8768
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7883
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.6627
-0.6614
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 2
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7242
-1.6879
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7883
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.6630
-0.6593
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 2
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7252
-1.6871
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7883
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.6633
-0.66

Process Process-69:
Process Process-70:
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):



 Stepping 
value Variable containing:
-0.3639
-0.3603
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 2
 3
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.7541
-1.7671
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7908
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.3637
-0.3611
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 1
 3
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8529
-1.7670
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7908
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.3639
-0.3611
[torch.FloatTensor of size 2x1]
 
action Variable containing:
 1
 5
[torch.LongTensor of size 2x1]
 
action_log_prob Variable containing:
-1.8526
-1.7409
[torch.FloatTensor of size 2x1]
 
dist_entropy Variable containing:
 1.7908
[torch.FloatTensor of size 1]


 Stepping 
value Variable containing:
-0.3638
-0.36

  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/home/beans/pytorch-a2c-ppo-acktr/all_stuff.py", line 8, in worker
    cmd, data = remote.recv()
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/beans/pytorch-a2c-ppo-acktr/all_stuff.py", line 8, in worker
    cmd, data = remote.recv()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt
  File 

KeyboardInterrupt: 

In [35]:
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1 or classname.find('Linear') != -1:
        orthogonal(m.weight.data)
        if m.bias is not None:
            m.bias.data.fill_(0)
    
    
class CNNPolicy(nn.Module):
    def __init__(self, num_inputs, action_space):
        super(CNNPolicy, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)

        self.linear1 = nn.Linear(32 * 7 * 7, 512)

        self.critic_linear = nn.Linear(512, 1)

        self.actor_linear = nn.Linear(512, action_space.n)

        self.train()
        self.reset_parameters()

    @property
    def state_size(self):
        return 1

    def forward(self, inputs, masks):
        x = self.conv1(inputs / 255.0)
        x = F.relu(x)

        x = self.conv2(x)
        x = F.relu(x)

        x = self.conv3(x)
        x = F.relu(x)

        x = x.view(-1, 32 * 7 * 7)
        x = self.linear1(x)
        
        x = F.relu(x)
        
        value = self.critic_linear(x)
        
        actions_dist = self.actor_linear(x)
        
        
        # calculates the probabilities of each action available
        probs = F.softmax(actions_dist)
        log_probs = F.log_softmax(actions_dist)
        
        # chooses randomly based on probabilities
        # never defauts to deterministic max value
        action = probs.multinomial() 
        
        # don't want to allow model to change action when differentiating
        action = Variable(action.data, requires_grad=False)
        
        # selects the probability for the action taken
        action_log_prob = log_probs.gather(1, action)
        
        dist_entropy = -(log_probs * probs).sum(-1).mean()
        
        return value, action, action_log_prob, dist_entropy
    
    
    def act(self, inputs, masks, deterministic=False):
        value, x = self(inputs, masks)
        action = self.action_dist_linear.sample(x, deterministic=deterministic)
        action_log_probs, dist_entropy = self.action_dist_linear.logprobs_and_entropy(x, action)
        return value, action, action_log_probs

    def evaluate_actions(self, inputs, masks, actions):
        value, x = self(inputs, masks)
        action_log_probs, dist_entropy = self.action_dist_linear.logprobs_and_entropy(x, actions)
        return value, action_log_probs, dist_entropy
    
    def reset_parameters(self):
        self.apply(weights_init)

        relu_gain = nn.init.calculate_gain('relu')
        self.conv1.weight.data.mul_(relu_gain)
        self.conv2.weight.data.mul_(relu_gain)
        self.conv3.weight.data.mul_(relu_gain)
        self.linear1.weight.data.mul_(relu_gain)
        
        
        


In [3]:
class RolloutStorage(object):
    def __init__(self, num_steps, num_processes, obs_shape, action_space):
        self.observations = torch.zeros(num_steps + 1, num_processes, *obs_shape)
        self.rewards = torch.zeros(num_steps, num_processes, 1)
        self.value_preds = torch.zeros(num_steps + 1, num_processes, 1)
        self.returns = torch.zeros(num_steps + 1, num_processes, 1)
        self.action_log_probs = torch.zeros(num_steps, num_processes, 1)
        if action_space.__class__.__name__ == 'Discrete':
            action_shape = 1
        else:
            action_shape = action_space.shape[0]
        self.actions = torch.zeros(num_steps, num_processes, action_shape)
        if action_space.__class__.__name__ == 'Discrete':
            self.actions = self.actions.long()
        self.masks = torch.ones(num_steps + 1, num_processes, 1)

    def cuda(self):
        self.observations = self.observations.cuda()
        self.rewards = self.rewards.cuda()
        self.value_preds = self.value_preds.cuda()
        self.returns = self.returns.cuda()
        self.action_log_probs = self.action_log_probs.cuda()
        self.actions = self.actions.cuda()
        self.masks = self.masks.cuda()

    def insert(self, step, current_obs, action, action_log_prob, value_pred, reward, mask):
        self.observations[step + 1].copy_(current_obs)
        self.actions[step].copy_(action)
        self.action_log_probs[step].copy_(action_log_prob)
        self.value_preds[step].copy_(value_pred)
        self.rewards[step].copy_(reward)
        self.masks[step + 1].copy_(mask)

    def after_update(self):
        self.observations[0].copy_(self.observations[-1])
        self.masks[0].copy_(self.masks[-1])

    def compute_returns(self, next_value, use_gae, gamma, tau):
        if use_gae:
            self.value_preds[-1] = next_value
            gae = 0
            for step in reversed(range(self.rewards.size(0))):
                delta = self.rewards[step] + gamma * self.value_preds[step + 1] * self.masks[step + 1] - self.value_preds[step]
                gae = delta + gamma * tau * self.masks[step + 1] * gae
                self.returns[step] = gae + self.value_preds[step]
        else:
            self.returns[-1] = next_value
            for step in reversed(range(self.rewards.size(0))):
                self.returns[step] = self.returns[step + 1] * \
                    gamma * self.masks[step + 1] + self.rewards[step]
