# Implementing the Simplest Policy Gradient
https://github.com/openai/spinningup/blob/master/spinup/examples/pytorch/pg_math/1_simple_pg.py


In [2]:
import torch
import torch.nn as nn
from   torch.distributions.categorical import Categorical
from   torch.optim import Adam
import numpy as np
import gym
from   gym.spaces import Discrete, Box

def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
    """ Build a feedforward neural network. """
    layers = []
    for j in range(len(sizes)-1):
        act = activation if j < len(sizes)-2 else output_activation
        layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
    return nn.Sequential(*layers)

def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2, epochs=50, batch_size=5000, render=False):    
    env = gym.make(env_name)         # make environment, check spaces, get obs / act dims
    assert isinstance(env.observation_space, Box), "This example only works for envs with continuous state spaces."
    assert isinstance(env.action_space, Discrete), "This example only works for envs with discrete action spaces."

    obs_dim = env.observation_space.shape[0]
    n_acts  = env.action_space.n
    
    logits_net = mlp(sizes=[obs_dim]+hidden_sizes+[n_acts])  # make core of policy network
    
    def get_policy(obs):                                     
        """ make function to compute action distribution """
        logits = logits_net(obs)
        return Categorical(logits=logits)

    def get_action(obs):
        """ make action selection function (outputs int actions, sampled from policy) """
        return get_policy(obs).sample().item()
    
    def compute_loss(obs, act, weights):
        """ make loss function whose gradient, for the right data, is policy gradient """
        logp = get_policy(obs).log_prob(act)
        return -(logp * weights).mean()

    # make optimizer
    optimizer = Adam(logits_net.parameters(), lr=lr)

    # for training policy
    def train_one_epoch():
        # make some empty lists for logging.
        batch_obs = []          # for observations
        batch_acts = []         # for actions
        batch_weights = []      # for R(tau) weighting in policy gradient
        batch_rets = []         # for measuring episode returns
        batch_lens = []         # for measuring episode lengths

        # reset episode-specific variables
        obs = env.reset()       # first obs comes from starting distribution
        done = False            # signal from environment that episode is over
        ep_rews = []            # list for rewards accrued throughout ep

        # render first episode of each epoch
        finished_rendering_this_epoch = False

        max_x = obs[0]
        
        # collect experience by acting in the environment with current policy
        while True:            
            if (not finished_rendering_this_epoch) and render:   # rendering
                env.render()
            
            batch_obs.append(obs.copy()) # save obs

            # act in the environment
            act = get_action(torch.as_tensor(obs, dtype=torch.float32))
            obs, rew, done, _ = env.step(act)
                        
            # save action, reward
            batch_acts.append(act)
            ep_rews.append(rew)

            if obs[0] > max_x:
                max_x = obs[0]
            
            if done:  
                ep_rews[-1] += 10*max_x
                
                ep_ret, ep_len = sum(ep_rews), len(ep_rews)    # if episode is over, record info about episode
                batch_rets.append(ep_ret)
                batch_lens.append(ep_len)
                
                batch_weights += [ep_ret] * ep_len             # the weight for each logprob(a|s) is R(tau)

                # reset episode-specific variables
                obs, done, ep_rews = env.reset(), False, []

                # won't render again this epoch
                finished_rendering_this_epoch = True

                # end experience loop if we have enough of it
                if len(batch_obs) > batch_size:
                    break

        # take a single policy gradient update step
        optimizer.zero_grad()
        batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, dtype=torch.float32),
                                  act=torch.as_tensor(batch_acts, dtype=torch.int32),
                                  weights=torch.as_tensor(batch_weights, dtype=torch.float32)
                                  )
        batch_loss.backward()
        optimizer.step()
        return batch_loss, batch_rets, batch_lens

    # training loop
    for i in range(epochs):
        batch_loss, batch_rets, batch_lens = train_one_epoch()
        print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
                (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))

In [3]:
train(env_name='CartPole-v1', lr=0.01)

  batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, dtype=torch.float32),


epoch:   0 	 loss: 25.984 	 return: 31.919 	 ep_len: 21.987
epoch:   1 	 loss: 26.632 	 return: 32.143 	 ep_len: 24.529
epoch:   2 	 loss: 28.108 	 return: 32.619 	 ep_len: 26.005
epoch:   3 	 loss: 37.076 	 return: 45.403 	 ep_len: 26.935
epoch:   4 	 loss: 28.253 	 return: 32.839 	 ep_len: 30.246
epoch:   5 	 loss: 34.850 	 return: 37.292 	 ep_len: 33.033
epoch:   6 	 loss: 33.964 	 return: 41.074 	 ep_len: 36.312
epoch:   7 	 loss: 41.742 	 return: 45.293 	 ep_len: 40.960
epoch:   8 	 loss: 42.544 	 return: 52.826 	 ep_len: 43.803
epoch:   9 	 loss: 42.762 	 return: 55.361 	 ep_len: 44.946
epoch:  10 	 loss: 45.958 	 return: 57.418 	 ep_len: 48.835
epoch:  11 	 loss: 51.742 	 return: 64.364 	 ep_len: 55.374
epoch:  12 	 loss: 62.251 	 return: 79.835 	 ep_len: 64.410
epoch:  13 	 loss: 58.492 	 return: 75.949 	 ep_len: 61.096
epoch:  14 	 loss: 73.000 	 return: 93.179 	 ep_len: 73.912
epoch:  15 	 loss: 70.361 	 return: 86.301 	 ep_len: 67.676
epoch:  16 	 loss: 76.820 	 return: 97.8