In [8]:
import copy
import glob
import os
import time

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


from distributions import Categorical, DiagGaussian
from utils import orthogonal

from all_stuff import *

from envs import make_env # had to manually add some files into directory for env to reference bc baselines 
# modules not working right

#from storage import RolloutStorage

In [11]:
class args:
    """Hyperparameters and run settings for the A2C Pong experiment.

    Every setting is a plain instance attribute, so the rest of the notebook
    reads it as ``args.<name>``.
    """

    def __init__(self):
        # Environment, seeding, and where logs / checkpoints go.
        self.env_name = 'PongNoFrameskip-v4'
        self.seed = 1
        self.log_dir = ''
        self.save_dir = 'saved_models'
        self.cuda = False
        self.algo = 'a2c'

        # Rollout geometry: stacked frames, steps per update, parallel envs.
        self.num_stack = 4
        self.num_steps = 5
        self.num_processes = 16

        # RMSprop settings (learning rate, epsilon, smoothing constant).
        self.lr = 7e-4
        self.eps = 1e-5
        self.alpha = 0.99

        # Gradient-norm clip and loss-term weights.
        self.max_grad_norm = 0.5
        self.value_loss_coef = 0.5
        self.entropy_coef = 0.1

        # Total frame budget; the number of updates is derived from it.
        self.num_frames = 8e6

        # Return computation: GAE switch, discount, and GAE lambda (tau).
        self.use_gae = False
        self.gamma = 0.99
        self.tau = 0.95

        # Checkpoint / logging cadence (in updates) and warm-start switch.
        self.save_interval = 1000
        self.log_interval = 100
        self.from_saved = False
        
# Swap the class for a configured instance. After this line the name `args`
# refers to the instance, so the class itself is no longer reachable by name.
args = args()

# Checkpoint file: loaded from when args.from_saved is set, written to
# periodically during training.
SAVE_PATH = "saved_models/pong_112917.pt"

# Each update consumes num_steps transitions from each of num_processes envs.
num_updates = (int(args.num_frames) // args.num_steps) // args.num_processes

# Seed PyTorch, and its CUDA generator as well when running on GPU.
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

In [13]:

# Request a single OpenMP thread via the environment before workers are spawned.
os.environ['OMP_NUM_THREADS'] = '1'

# One environment factory per worker; the worker index is forwarded to make_env.
envs = [
    make_env(args.env_name, args.seed, worker_idx, args.log_dir)
    for worker_idx in range(args.num_processes)
]

# Subprocess-backed vector env for several workers, in-process one otherwise.
envs = SubprocVecEnv(envs) if args.num_processes > 1 else DummyVecEnv(envs)

# Only 1-D (vector) observation spaces are wrapped in VecNormalize.
if len(envs.observation_space.shape) == 1:
    envs = VecNormalize(envs)

# Frame stacking multiplies the leading observation dimension by num_stack.
obs_shape = envs.observation_space.shape
obs_shape = (obs_shape[0] * args.num_stack,) + tuple(obs_shape[1:])

actor_critic = CNNPolicy(obs_shape[0], envs.action_space)

# Optional warm start from the checkpoint file.
if args.from_saved:
    print("loading saved model from " + SAVE_PATH)
    actor_critic.load_state_dict(torch.load(SAVE_PATH))

# Width of one stored action: a single index for Discrete spaces, otherwise
# the first dimension of the action space's shape.
action_shape = (
    1
    if envs.action_space.__class__.__name__ == "Discrete"
    else envs.action_space.shape[0]
)

if args.cuda:
    actor_critic.cuda()

optimizer = optim.RMSprop(
    actor_critic.parameters(),
    lr=args.lr,
    eps=args.eps,
    alpha=args.alpha,
)

# Rollout buffers plus the rolling stacked-frame tensor fed to the policy.
rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape, envs.action_space)
current_obs = torch.zeros(args.num_processes, *obs_shape)

def update_current_obs(obs):
    """Push the newest frames onto the module-level ``current_obs`` frame stack.

    ``current_obs`` is modified in place. When ``args.num_stack > 1`` the
    existing frames are shifted toward index 0 along dimension 1 by one
    observation's worth of channels (dropping the oldest), and ``obs`` is
    then written into the last ``shape_dim0`` channels.

    Args:
        obs: numpy array holding the latest observation for every
            environment; converted to a float tensor before being stored.
            Assumed to be shaped like
            ``(num_processes, *envs.observation_space.shape)`` -- TODO confirm
            against the vectorised env wrapper.
    """
    shape_dim0 = envs.observation_space.shape[0]
    obs = torch.from_numpy(obs).float()
    if args.num_stack > 1:
        # The source and destination slices share memory (they overlap as soon
        # as num_stack > 2). Copy from a clone so the shifted frames do not
        # depend on the order in which the in-place copy walks the buffer.
        current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:].clone()
    current_obs[:, -shape_dim0:] = obs

# Reset every environment and prime the frame stack / first rollout slot.
obs = envs.reset()
update_current_obs(obs)

rollouts.observations[0].copy_(current_obs)

# These variables are used to compute average rewards for all processes.
episode_rewards = torch.zeros([args.num_processes, 1])
final_rewards = torch.zeros([args.num_processes, 1])

if args.cuda:
    current_obs = current_obs.cuda()
    rollouts.cuda()

start = time.time()
for j in range(num_updates):
    # ---- Collect args.num_steps transitions from every environment. ----
    for step in range(args.num_steps):
        # Sample actions
        value, action, action_log_prob = actor_critic.act(Variable(rollouts.observations[step], volatile=True),
                                                                  Variable(rollouts.masks[step], volatile=True))
        cpu_actions = action.data.squeeze(1).cpu().numpy()

        # Observe reward and next obs
        obs, reward, done, info = envs.step(cpu_actions)
        reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
        episode_rewards += reward

        # If done then clean the history of observations.
        # masks is 0.0 for environments that just finished, 1.0 otherwise.
        masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
        # On episode end, move the accumulated return into final_rewards and
        # restart the running sum for that environment.
        final_rewards *= masks
        final_rewards += (1 - masks) * episode_rewards
        episode_rewards *= masks

        if args.cuda:
            masks = masks.cuda()

        # Zero the stacked frames of finished environments; image observations
        # are 4-D, so the mask needs two extra trailing dims to broadcast.
        if current_obs.dim() == 4:
            current_obs *= masks.unsqueeze(2).unsqueeze(2)
        else:
            current_obs *= masks

        update_current_obs(obs)
        rollouts.insert(step, current_obs, action.data, action_log_prob.data, value.data, reward, masks)

    # ---- Bootstrap from the value of the last stored observation. ----
    next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                              Variable(rollouts.masks[-1], volatile=True))[0].data

    rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

    # ---- Re-evaluate the whole rollout with gradients enabled. ----
    values, action_log_probs, dist_entropy = actor_critic.evaluate_actions(Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                                                                                   Variable(rollouts.masks[:-1].view(-1, 1)),
                                                                                   Variable(rollouts.actions.view(-1, action_shape)))

    values = values.view(args.num_steps, args.num_processes, 1)
    action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

    advantages = Variable(rollouts.returns[:-1]) - values
    value_loss = advantages.pow(2).mean()

    # Re-wrapping advantages.data cuts the graph, so the policy term does not
    # back-propagate into the value estimates.
    action_loss = -(Variable(advantages.data) * action_log_probs).mean()

    optimizer.zero_grad()
    (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

    nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

    optimizer.step()


    # Carry the last observation / mask over as the start of the next rollout.
    rollouts.after_update()

    if j % args.save_interval == 0 and args.save_dir != "":

        # A really ugly way to save a model to CPU
        save_model = actor_critic
        if args.cuda:
            save_model = copy.deepcopy(actor_critic).cpu() # save THIS one

        # torch.save does not create missing directories, and this branch runs
        # on the very first update (j == 0), so make sure the target exists.
        os.makedirs(os.path.dirname(SAVE_PATH) or os.curdir, exist_ok=True)
        torch.save(save_model.state_dict(), SAVE_PATH)

    if j % args.log_interval == 0:
        end = time.time()
        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
            format(j, total_num_steps,
                   int(total_num_steps / (end - start)),
                   final_rewards.mean(),
                   final_rewards.median(),
                   final_rewards.min(),
                   final_rewards.max(), dist_entropy.data[0],
                   value_loss.data[0], action_loss.data[0]))

Updates 0, num timesteps 80, FPS 66, mean/median reward 0.0/0.0, min/max reward 0.0/0.0, entropy 1.58978, value loss 0.04211, policy loss 0.22921


Process Process-41:
Process Process-34:
Process Process-33:
Process Process-47:
Process Process-36:
Process Process-43:
Traceback (most recent call last):
Process Process-38:
Process Process-44:
Process Process-39:
Process Process-42:
Process Process-46:
Process Process-48:
Process Process-35:
Process Process-40:
Traceback (most recent call last):
Process Process-45:
Traceback (most recent call last):
Process Process-37:
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.5/multiprocessing/process.py", l

  File "/home/beans/pytorch-a2c-ppo-acktr/all_stuff.py", line 8, in worker
    cmd, data = remote.recv()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/home/beans/pytorch-a2c-ppo-acktr/all_stuff.py", line 8, in worker
    cmd, data = remote.recv()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt
KeyboardInterrupt
  File "/usr/lib/python3.5/multiprocessing/connection.py", line 379, in _recv
    chunk = read(h

KeyboardInterrupt: 

In [6]:
def weights_init(m):
    """Orthogonally initialise conv / linear layers and zero their biases.

    Meant to be passed to ``nn.Module.apply``. Modules whose class name
    contains neither ``'Conv'`` nor ``'Linear'`` are left untouched.
    """
    layer_kind = m.__class__.__name__
    if 'Conv' not in layer_kind and 'Linear' not in layer_kind:
        return

    orthogonal(m.weight.data)
    if m.bias is not None:
        m.bias.data.fill_(0)


class CNNPolicy(nn.Module):
    """Convolutional actor-critic network.

    Three conv layers followed by one fully connected layer produce a
    512-dimensional feature vector. A linear critic head maps the features to
    a scalar value, and a distribution head (``Categorical`` for Discrete
    action spaces, ``DiagGaussian`` for Box) handles action sampling,
    log-probabilities, and entropy.
    """

    def __init__(self, num_inputs, action_space):
        """Build the layers for ``num_inputs`` input channels and ``action_space``.

        Raises:
            NotImplementedError: if the action space class is neither
                ``Discrete`` nor ``Box``.
        """
        super().__init__()

        # Feature extractor. forward() flattens the conv output to 32 * 7 * 7.
        self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)
        self.linear1 = nn.Linear(32 * 7 * 7, 512)

        # Value head.
        self.critic_linear = nn.Linear(512, 1)

        # Policy head, chosen by the action space's class name.
        space_kind = action_space.__class__.__name__
        if space_kind == "Discrete":
            self.dist = Categorical(512, action_space.n)
        elif space_kind == "Box":
            self.dist = DiagGaussian(512, action_space.shape[0])
        else:
            raise NotImplementedError

        self.train()
        self.reset_parameters()

    @property
    def state_size(self):
        """Always 1 for this policy."""
        return 1

    def act(self, inputs, masks, deterministic=False):
        """Sample an action; return ``(value, action, action_log_probs)``."""
        value, features = self(inputs, masks)
        action = self.dist.sample(features, deterministic=deterministic)
        # The entropy is computed by the head but not returned from act().
        action_log_probs, _ = self.dist.logprobs_and_entropy(features, action)
        return value, action, action_log_probs

    def evaluate_actions(self, inputs, masks, actions):
        """Score given ``actions``; return ``(value, action_log_probs, dist_entropy)``."""
        value, features = self(inputs, masks)
        action_log_probs, dist_entropy = self.dist.logprobs_and_entropy(features, actions)
        return value, action_log_probs, dist_entropy

    def reset_parameters(self):
        """Re-initialise all weights, then rescale the ReLU-fed layers."""
        self.apply(weights_init)

        # Layers followed by a ReLU get their weights scaled by the ReLU gain.
        gain = nn.init.calculate_gain('relu')
        for layer in (self.conv1, self.conv2, self.conv3, self.linear1):
            layer.weight.data.mul_(gain)

        # Shrink the Gaussian head's mean-layer weights.
        if self.dist.__class__.__name__ == "DiagGaussian":
            self.dist.fc_mean.weight.data.mul_(0.01)

    def forward(self, inputs, masks):
        """Return ``(value, features)``; ``masks`` is accepted but not used here."""
        hidden = F.relu(self.conv1(inputs / 255.0))
        hidden = F.relu(self.conv2(hidden))
        hidden = F.relu(self.conv3(hidden))

        hidden = hidden.view(-1, 32 * 7 * 7)
        hidden = F.relu(self.linear1(hidden))

        return self.critic_linear(hidden), hidden


In [5]:
class RolloutStorage(object):
    """Pre-allocated tensors holding one rollout for every parallel environment.

    Buffers indexed by "state" (observations, value_preds, returns, masks)
    have ``num_steps + 1`` rows; buffers indexed by "transition" (rewards,
    actions, action_log_probs) have ``num_steps`` rows.
    """

    def __init__(self, num_steps, num_processes, obs_shape, action_space):
        """Allocate zero-filled buffers (masks start at one)."""
        discrete = action_space.__class__.__name__ == 'Discrete'
        # A Discrete action is stored as a single index; otherwise the width
        # is the first dimension of the action space's shape.
        action_dim = 1 if discrete else action_space.shape[0]

        self.observations = torch.zeros(num_steps + 1, num_processes, *obs_shape)
        self.rewards = torch.zeros(num_steps, num_processes, 1)
        self.value_preds = torch.zeros(num_steps + 1, num_processes, 1)
        self.returns = torch.zeros(num_steps + 1, num_processes, 1)
        self.action_log_probs = torch.zeros(num_steps, num_processes, 1)

        action_buf = torch.zeros(num_steps, num_processes, action_dim)
        self.actions = action_buf.long() if discrete else action_buf

        self.masks = torch.ones(num_steps + 1, num_processes, 1)

    def cuda(self):
        """Replace every buffer with its CUDA copy."""
        for field in ('observations', 'rewards', 'value_preds', 'returns',
                      'action_log_probs', 'actions', 'masks'):
            setattr(self, field, getattr(self, field).cuda())

    def insert(self, step, current_obs, action, action_log_prob, value_pred, reward, mask):
        """Record one transition.

        The observation and mask describe the state reached *after* the
        action, so they are written to row ``step + 1``; everything else goes
        to row ``step``.
        """
        writes = (
            (self.observations, step + 1, current_obs),
            (self.actions, step, action),
            (self.action_log_probs, step, action_log_prob),
            (self.value_preds, step, value_pred),
            (self.rewards, step, reward),
            (self.masks, step + 1, mask),
        )
        for buffer, row, source in writes:
            buffer[row].copy_(source)

    def after_update(self):
        """Carry the final observation and mask over to row 0 for the next rollout."""
        for buffer in (self.observations, self.masks):
            buffer[0].copy_(buffer[-1])

    def compute_returns(self, next_value, use_gae, gamma, tau):
        """Fill ``self.returns`` by walking the rollout backwards.

        Args:
            next_value: value estimate for the state after the last step.
            use_gae: when true, use generalised advantage estimation
                (``returns = gae + value_preds``); otherwise use plain
                discounted returns bootstrapped from ``next_value``.
            gamma: discount factor.
            tau: GAE decay factor; unused when ``use_gae`` is false.
        """
        horizon = self.rewards.size(0)

        if not use_gae:
            self.returns[-1] = next_value
            for t in range(horizon - 1, -1, -1):
                # The mask zeroes the bootstrap term across episode boundaries.
                carried = self.returns[t + 1] * gamma * self.masks[t + 1]
                self.returns[t] = carried + self.rewards[t]
            return

        self.value_preds[-1] = next_value
        gae = 0
        for t in range(horizon - 1, -1, -1):
            keep = self.masks[t + 1]
            delta = self.rewards[t] + gamma * self.value_preds[t + 1] * keep - self.value_preds[t]
            gae = delta + gamma * tau * keep * gae
            self.returns[t] = gae + self.value_preds[t]


