In [1]:
import copy
import glob
import os
import time

import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

from arguments import get_args

#from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
#from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
#from baselines.common.vec_env.vec_normalize import VecNormalize
from all_stuff import * # this has the above modules consolidated into a single file. god this was a bitch

from envs import make_env # had to manually add some files into directory for env to reference bc baselines 
# modules not working right

from kfac import KFACOptimizer
from model import CNNPolicy, MLPPolicy
from storage import RolloutStorage
from visualize import visdom_plot

In [2]:
from visdom import Visdom

In [3]:
class Args:
    """Plain attribute container for every setting the notebook reads via ``args``.

    The class is deliberately named differently from the ``args`` instance
    created below: binding the instance to the class's own name would make
    this cell fail with ``TypeError: 'args' object is not callable`` the
    second time it is executed in the same kernel.
    """

    def __init__(self):
        # Environment / bookkeeping
        self.env_name = 'PongNoFrameskip-v4'  # gym id handed to make_env
        self.seed = 1                         # seeds torch and is forwarded to make_env
        self.log_dir = ''                     # monitor-file directory forwarded to make_env
        self.save_dir = 'saved_models'        # root directory for checkpoints ('' disables saving)
        self.cuda = False                     # move the model / rollout buffers to the GPU
        self.algo = 'a2c'                     # used for the checkpoint sub-directory and plot label

        # Rollout layout
        self.num_stack = 4                    # frames stacked along the first observation axis
        self.num_steps = 5                    # environment steps collected per update
        self.num_processes = 16               # number of parallel environments
        self.recurrent_policy = False         # forwarded to CNNPolicy; rejected for MLPPolicy
        self.vis = False                      # enable visdom plotting

        # Optimiser (RMSprop) and loss weights
        self.lr = 7e-4
        self.eps = 1e-5
        self.alpha = .99
        self.max_grad_norm = .5               # gradient-norm clipping threshold
        self.value_loss_coef = .5
        self.entropy_coef = .1

        # Training budget and return computation
        self.num_frames = 1e6                 # total frame budget; converted with int() when used
        self.use_gae = False                  # forwarded to RolloutStorage.compute_returns
        self.gamma = .99                      # forwarded to RolloutStorage.compute_returns
        self.tau = .95                        # forwarded to RolloutStorage.compute_returns

        # Intervals, all measured in updates
        self.save_interval = 1000
        self.log_interval = 100
        self.vis_interval = 100

        # Load the policy from SAVE_PATH instead of building a new one
        self.from_saved_model = True


args = Args()

# Checkpoints live at <save_dir>/<algo>/<env_name>.pt
save_path = os.path.join(args.save_dir, args.algo)
SAVE_PATH = os.path.join(save_path, args.env_name + ".pt")

In [4]:
# Number of optimiser updates: the frame budget floor-divided by the steps
# per rollout and then by the number of parallel environments.
num_updates = int(args.num_frames) // args.num_steps // args.num_processes

# The CPU generator is always seeded; the CUDA generator only when the GPU is in use.
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# Try to create the log directory. If that raises OSError, delete any
# '*.monitor.csv' files matched under args.log_dir instead.
try:
    os.makedirs(args.log_dir)
except OSError:
    for stale_monitor in glob.glob(os.path.join(args.log_dir, '*.monitor.csv')):
        os.remove(stale_monitor)

In [37]:
def main():
    """Train an A2C actor-critic on ``args.env_name`` and checkpoint it periodically.

    All settings come from the module-level ``args`` object; the number of
    updates comes from the module-level ``num_updates``. Every update collects
    ``args.num_steps`` transitions from each of ``args.num_processes``
    environments, asks the rollout buffer to compute returns, and takes one
    RMSprop step on
    ``value_loss * value_loss_coef + action_loss - dist_entropy * entropy_coef``.

    Side effects:
        * sets ``OMP_NUM_THREADS=1`` in ``os.environ``;
        * binds the rollout buffer to the module-level name ``rollouts``;
        * saves the model to ``SAVE_PATH`` every ``args.save_interval`` updates
          (update 0 included) unless ``args.save_dir`` is empty;
        * prints a progress line every ``args.log_interval`` updates;
        * closes the vectorised environments on exit, whether training
          finished or raised.

    If ``args.from_saved_model`` is set but ``SAVE_PATH`` does not exist, a
    freshly initialised policy is used and a notice is printed.

    Returns:
        None. Exceptions raised during training propagate to the caller after
        the environments have been closed.
    """
    # Exposed at module level so the buffer can be inspected from other cells.
    global rollouts

    print("#######")
    print("WARNING: All rewards are clipped or normalized so you need to use a monitor (see envs.py) or visdom plot to get true rewards")
    print("#######")

    os.environ['OMP_NUM_THREADS'] = '1'

    if args.vis:
        from visdom import Visdom
        viz = Visdom()
        win = None

    envs = [make_env(args.env_name, args.seed, i, args.log_dir)
            for i in range(args.num_processes)]

    if args.num_processes > 1:
        envs = SubprocVecEnv(envs)
    else:
        envs = DummyVecEnv(envs)

    # The vectorised env is live from here on. Everything below runs under
    # try/finally so the environments are released even when training raises;
    # otherwise every failed run of this cell leaves them open.
    # NOTE(review): relies on the VecEnv classes from all_stuff exposing
    # close(), as the baselines VecEnv interface does - confirm.
    try:
        if len(envs.observation_space.shape) == 1:
            envs = VecNormalize(envs)

        # Frames are stacked along the first observation axis.
        obs_shape = envs.observation_space.shape
        obs_shape = (obs_shape[0] * args.num_stack, *obs_shape[1:])

        if args.from_saved_model and os.path.isfile(SAVE_PATH):
            print("loading saved model from ", SAVE_PATH)
            # torch.load unpickles the file: only load checkpoints you trust.
            actor_critic = torch.load(SAVE_PATH)
        else:
            if args.from_saved_model:
                print("no saved model found at ", SAVE_PATH, "- starting from a new policy")
            if len(envs.observation_space.shape) == 3:
                actor_critic = CNNPolicy(obs_shape[0], envs.action_space, args.recurrent_policy)
            else:
                assert not args.recurrent_policy, \
                    "Recurrent policy is not implemented for the MLP controller"
                actor_critic = MLPPolicy(obs_shape[0], envs.action_space)

        if envs.action_space.__class__.__name__ == "Discrete":
            action_shape = 1
        else:
            action_shape = envs.action_space.shape[0]

        if args.cuda:
            actor_critic.cuda()

        optimizer = optim.RMSprop(actor_critic.parameters(), args.lr, eps=args.eps, alpha=args.alpha)

        rollouts = RolloutStorage(args.num_steps, args.num_processes, obs_shape,
                                  envs.action_space, actor_critic.state_size)
        current_obs = torch.zeros(args.num_processes, *obs_shape)

        def update_current_obs(obs):
            """Shift the frame stack in ``current_obs`` and write ``obs`` into the last slot(s)."""
            shape_dim0 = envs.observation_space.shape[0]
            obs = torch.from_numpy(obs).float()
            if args.num_stack > 1:
                # Drop the oldest frame(s) by sliding everything towards index 0.
                current_obs[:, :-shape_dim0] = current_obs[:, shape_dim0:]
            current_obs[:, -shape_dim0:] = obs

        obs = envs.reset()
        update_current_obs(obs)

        rollouts.observations[0].copy_(current_obs)

        # Per-process running episode reward, and the reward of the most
        # recently finished episode; the latter feeds the progress statistics.
        episode_rewards = torch.zeros([args.num_processes, 1])
        final_rewards = torch.zeros([args.num_processes, 1])

        if args.cuda:
            current_obs = current_obs.cuda()
            rollouts.cuda()

        start = time.time()
        for j in range(num_updates):
            for step in range(args.num_steps):
                # Query the policy without building an autograd graph
                # (volatile=True). The returned value prediction is stored in
                # the rollout buffer below; how it is consumed is decided by
                # RolloutStorage.compute_returns, which is not shown here.
                value, action, action_log_prob, dist_entropy, states = actor_critic.act(
                    Variable(rollouts.observations[step], volatile=True),
                    Variable(rollouts.states[step], volatile=True),
                    Variable(rollouts.masks[step], volatile=True))

                cpu_actions = action.data.squeeze(1).cpu().numpy()

                # Step every environment; obs carries the new observation for each actor.
                obs, reward, done, info = envs.step(cpu_actions)
                reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float()
                episode_rewards += reward

                # masks is (num_processes, 1): 0.0 where the episode just ended, else 1.0.
                # For finished actors, move the accumulated episode reward into
                # final_rewards and reset the running total to zero.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done])
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                if args.cuda:
                    masks = masks.cuda()

                # Clear the frame history of finished actors. For 4-D (image)
                # observations the mask needs two extra trailing axes to broadcast.
                if current_obs.dim() == 4:
                    current_obs *= masks.unsqueeze(2).unsqueeze(2)
                else:
                    current_obs *= masks

                update_current_obs(obs)

                rollouts.insert(step, current_obs, states.data, action.data, action_log_prob.data,
                                value.data, reward, masks, dist_entropy.data)

            # Value prediction for the observation after the last collected
            # step; handed to compute_returns as the starting value.
            next_value = actor_critic(Variable(rollouts.observations[-1], volatile=True),
                                      Variable(rollouts.states[-1], volatile=True),
                                      Variable(rollouts.masks[-1], volatile=True))[0].data

            rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

            # Re-evaluate the collected steps with gradients enabled. The
            # values and log-probs gathered while acting were inserted into the
            # buffer as raw .data tensors, so they carry no autograd graph and
            # cannot be used to build a loss that is back-propagated.
            values, action_log_probs, dist_entropy, states = actor_critic.evaluate_actions(
                Variable(rollouts.observations[:-1].view(-1, *obs_shape)),
                Variable(rollouts.states[0].view(-1, actor_critic.state_size)),
                Variable(rollouts.masks[:-1].view(-1, 1)),
                Variable(rollouts.actions.view(-1, action_shape)))

            values = values.view(args.num_steps, args.num_processes, 1)
            action_log_probs = action_log_probs.view(args.num_steps, args.num_processes, 1)

            rollout_returns = rollouts.returns[:-1]
            advantages = Variable(rollout_returns) - values

            # Value loss: mean squared advantage.
            value_loss = advantages.pow(2).mean()

            # Policy loss: the advantage is re-wrapped from .data so no
            # gradient flows through it into the value prediction.
            action_loss = -(Variable(advantages.data) * action_log_probs).mean()

            optimizer.zero_grad()

            # Back-propagate the combined loss; entropy enters with a negative sign.
            (value_loss * args.value_loss_coef + action_loss - dist_entropy * args.entropy_coef).backward()

            nn.utils.clip_grad_norm(actor_critic.parameters(), args.max_grad_norm)

            optimizer.step()

            rollouts.after_update()

            if j % args.save_interval == 0 and args.save_dir != "":
                try:
                    os.makedirs(save_path)
                except OSError:
                    # Best effort: any real problem with the directory will
                    # surface in torch.save below.
                    pass

                # When training on the GPU, save a CPU copy of the model.
                save_model = actor_critic
                if args.cuda:
                    save_model = copy.deepcopy(actor_critic).cpu()

                torch.save(save_model, SAVE_PATH)
                print("model saved to ", SAVE_PATH)

            if j % args.log_interval == 0:
                end = time.time()
                total_num_steps = (j + 1) * args.num_processes * args.num_steps
                print("Updates {}, num timesteps {}, FPS {}, mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, entropy {:.5f}, value loss {:.5f}, policy loss {:.5f}".
                      format(j, total_num_steps,
                             int(total_num_steps / (end - start)),
                             final_rewards.mean(),
                             final_rewards.median(),
                             final_rewards.min(),
                             final_rewards.max(), dist_entropy.data[0],
                             value_loss.data[0], action_loss.data[0]))

            if args.vis and j % args.vis_interval == 0:
                try:
                    # Sometimes monitor doesn't properly flush the outputs
                    win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo)
                except IOError:
                    pass
    finally:
        envs.close()


main()

#######
#######
loading saved model from  saved_models/a2c/PongNoFrameskip-v4.pt

############# NEW STEP ######################


these come from initial actor_critic.act
cpu_actions [4 3] 
value Variable containing:
-0.1040
-0.1040
[torch.FloatTensor of size 2x1]
 
states Variable containing:
 0
 0
[torch.FloatTensor of size 2x1]



these come after we've taken a step and observed reward and next obs
 obs shape (2, 1, 84, 84) 
reward 
 0
 0
[torch.FloatTensor of size 2x1]
 
episode_rewards 
 0
 0
[torch.FloatTensor of size 2x1]
 
done [False False]

############# NEW STEP ######################


these come from initial actor_critic.act
cpu_actions [0 0] 
value Variable containing:
-0.2085
-0.2397
[torch.FloatTensor of size 2x1]
 
states Variable containing:
 0
 0
[torch.FloatTensor of size 2x1]



these come after we've taken a step and observed reward and next obs
 obs shape (2, 1, 84, 84) 
reward 
 0
 0
[torch.FloatTensor of size 2x1]
 
episode_rewards 
 0
 0
[torch.FloatTensor of 

RuntimeError: there are no graph nodes that require computing gradients


 1
[torch.FloatTensor of size 1]

In [13]:
# NOTE(review): `action_log_probs_` is not assigned in any cell visible here, so this
# scratch cell presumably depends on leftover kernel state and would raise NameError
# after Restart Kernel -> Run All. Confirm where it was defined, or drop the cell.
action_log_probs_.view(args.num_steps, args.num_processes, 1)


-1.7887 -1.8077
-1.7896 -1.7960
-1.8077 -1.7887
-1.7960 -1.7960
[torch.FloatTensor of size 4x2]