# Policy Gradient

In [1]:
import gym
import numpy as np
from collections import deque
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
from torch.optim import Adam

# Environment

In [2]:
# Create the LunarLander-v2 environment and keep the object stored in the
# `.env` attribute of what gym.make returns (presumably the environment
# without its outermost wrapper, e.g. a time limit — confirm against the
# installed gym version).
env = gym.make('LunarLander-v2').env



In [3]:
# Inspect the observation space (the output below reports an 8-element Box).
env.observation_space

Box(8,)

In [4]:
# Inspect the action space (the output below reports 4 discrete actions).
env.action_space

Discrete(4)

# Neural Network

In [5]:
def network(*, n_input, n_output, n_hidden=32, hidden_layers=1,
            activation=nn.Tanh, output_activation=nn.Identity):
    """Assemble a fully connected feed-forward network.

    The stack is: one input projection (n_input -> n_hidden), then
    `hidden_layers` square layers (n_hidden -> n_hidden), then one output
    projection (n_hidden -> n_output). Every linear layer except the last is
    followed by `activation()`; the last is followed by `output_activation()`.

    Args:
        n_input: Number of input features.
        n_output: Number of output features.
        n_hidden: Width of every hidden layer.
        hidden_layers: Number of additional n_hidden -> n_hidden layers.
        activation: Zero-argument factory for the hidden activation module.
        output_activation: Zero-argument factory for the final activation.

    Returns:
        An `nn.Sequential` containing the layers in order.
    """
    # Feature width at every boundary between consecutive linear layers.
    widths = [n_input] + [n_hidden] * (max(hidden_layers, 0) + 1) + [n_output]
    last_index = len(widths) - 2

    modules = []
    for index, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
        modules.append(nn.Linear(fan_in, fan_out))
        modules.append(output_activation() if index == last_index else activation())
    return nn.Sequential(*modules)

# Buffer

In [6]:
class Buffer:
    """Rollout storage that attaches a reward-to-go to every transition.

    Transitions of the episode in progress are staged with `cache`. Calling
    `flush` closes that episode: each staged transition gets the undiscounted
    sum of the rewards from its own step to the end of the episode appended,
    and is moved into the training memory returned by `get_batch`.
    """

    def __init__(self, maxlen):
        """Create an empty buffer.

        Args:
            maxlen: Nominal capacity. It is only recorded; the buffer does not
                evict or reject samples, callers decide when to train and
                `clear` based on `size`.
        """
        self._maxlen = maxlen
        self._traj_rewards = []   # rewards of the episode being staged
        self._cache = []          # transitions of the episode being staged
        self._memory = []         # flushed transitions, each with reward-to-go

    @property
    def size(self):
        """Number of flushed transitions available to `get_batch`."""
        return len(self._memory)

    def cache(self, state, action, reward, next_state, done):
        """Stage one transition of the current episode (not yet trainable)."""
        self._traj_rewards.append(reward)
        self._cache.append([state, action, reward, next_state, done])

    def flush(self):
        """Close the current episode and move its transitions into memory.

        A reversed cumulative sum turns the per-step rewards into
        reward-to-go values; one is appended to each staged transition.
        Flushing with nothing staged is a no-op.
        """
        rewards_to_go = np.cumsum(np.array(self._traj_rewards)[::-1])[::-1]
        self._memory += [sample + [rtg] for sample, rtg in zip(self._cache, rewards_to_go)]
        self._traj_rewards = []
        self._cache = []

    def get_batch(self):
        """Return every flushed transition as column arrays.

        Returns:
            Tuple `(states, actions, rewards, next_states, dones,
            traj_rewards)`; `states`/`next_states` are stacked along a new
            leading axis, `actions` is an int array, `rewards` and
            `traj_rewards` are float arrays and `dones` is a bool array.

        Raises:
            ValueError: If no trajectory has been flushed yet.
        """
        if not self._memory:
            raise ValueError('Buffer is empty; flush at least one trajectory before get_batch().')
        # Split columns with zip instead of np.array(self._memory): the rows mix
        # arrays and scalars, and building one array from such ragged rows
        # raises ValueError on recent NumPy releases.
        states, actions, rewards, next_states, dones, traj_rewards = zip(*self._memory)
        states, next_states = np.stack(states), np.stack(next_states)
        actions = np.asarray(actions).astype(int)
        rewards = np.asarray(rewards, dtype=float)
        dones = np.asarray(dones, dtype=bool)
        traj_rewards = np.asarray(traj_rewards).astype(float)
        return states, actions, rewards, next_states, dones, traj_rewards

    def clear(self):
        """Drop all flushed transitions; staged (unflushed) ones are kept."""
        self._memory = []

# Agent

In [7]:
class Agent:
    """Policy-gradient agent with a categorical policy over discrete actions."""

    def __init__(self, env, *, lr=0.001, n_hidden=32, hidden_layers=2, buffer_size=5000):
        """Build the rollout buffer, the logits network and its optimizer.

        Args:
            env: Environment exposing `observation_space.shape` and
                `action_space.n`.
            lr: Learning rate passed to Adam.
            n_hidden: Width of the hidden layers of the logits network.
            hidden_layers: `hidden_layers` argument forwarded to `network`.
            buffer_size: `maxlen` argument forwarded to `Buffer`.
        """
        self._env = env
        self._buffer = Buffer(buffer_size)
        self._logits_net = network(n_input=env.observation_space.shape[0],
                                   n_hidden=n_hidden,
                                   n_output=env.action_space.n,
                                   hidden_layers=hidden_layers,
                                  )
        self._optimizer = Adam(self._logits_net.parameters(), lr=lr)

    def policy(self, state):
        """Return the action distribution for `state` (autograd stays enabled).

        `state` is converted to a float32 tensor and fed to the logits
        network; the result parameterises a `Categorical` distribution.
        """
        state = torch.as_tensor(state, dtype=torch.float32)
        return Categorical(logits=self._logits_net(state))

    def choose_action(self, state, *, epilson=0.5):
        """Pick an action for `state`.

        Args:
            state: Observation to act on.
            epilson: Threshold compared with a uniform draw from [0, 1). When
                the draw exceeds it, the action comes from
                `env.action_space.sample()`; otherwise it is sampled from the
                policy. So `epilson=1` always follows the policy — the
                opposite of the usual epsilon-greedy convention. The
                misspelled name is kept because callers pass it by keyword.

        Returns:
            The chosen action.
        """
        if np.random.random() > epilson:
            return self._env.action_space.sample()
        # Only the sampled index is needed here, so skip building an autograd
        # graph for the forward pass.
        with torch.no_grad():
            return self.policy(state).sample().item()

In [8]:
class Agent(Agent):
    def play(self, *, n_steps=500, render=False):
        """Run one evaluation episode, always sampling from the policy.

        Args:
            n_steps: Maximum number of environment steps.
            render: When true, render every step, print a summary at the end
                and close the environment.

        Returns:
            The summed reward when `render` is false; None when `render` is
            true (the summary is printed instead).
        """
        # Act in the environment this agent was built with, not in whatever
        # module-level `env` happens to exist.
        env = self._env
        state = env.reset()
        rewards = 0
        i_steps = 0  # keeps the summary printable when n_steps < 1
        for i_steps in range(1, n_steps+1):
            action = self.choose_action(state, epilson=1)
            next_state, reward, done, _ = env.step(action)
            rewards += reward
            state = next_state
            if render:
                env.render()
            if done:
                break
        if render:
            print(f'Steps taken: {i_steps}, rewards earned: {rewards}')
            env.close()
        else:
            return rewards

In [9]:
class Agent(Agent):
    def train(self):
        """Take one policy-gradient step on everything held by the buffer.

        Each log-probability of a taken action is weighted by that step's
        reward-to-go; the negated mean is minimised, which is gradient ascent
        on the weighted log-likelihood.

        Returns:
            The loss value of this step as a NumPy value detached from the
            graph.
        """
        obs, taken, _, _, _, weights = self._buffer.get_batch()
        obs = torch.as_tensor(obs, dtype=torch.float32)
        taken = torch.as_tensor(taken, dtype=torch.int32)
        weights = torch.as_tensor(weights, dtype=torch.float32)

        log_probs = self.policy(obs).log_prob(taken)
        loss = torch.mean(log_probs * weights).neg()

        self._optimizer.zero_grad()
        loss.backward()
        self._optimizer.step()
        return loss.detach().numpy()

In [10]:
class Agent(Agent):
    def run(self, n_eps=1000, n_steps=500, batch_size=3000):
        """Alternate between collecting rollouts and policy updates.

        Every iteration collects whole episodes (each capped at `n_steps`
        steps) until the buffer holds at least `batch_size` transitions, then
        performs one `train` step and clears the buffer. The printed
        "Episode" counter therefore counts training iterations, each of which
        may span several environment episodes.

        Every second iteration one evaluation episode is played; every tenth
        iteration the mean of the most recent (at most 10) evaluation scores
        is printed, and training stops early once that mean reaches 200.

        Args:
            n_eps: Maximum number of training iterations.
            n_steps: Step cap per collected episode; must be at least 1.
            batch_size: Minimum number of transitions per update.

        Raises:
            ValueError: If `n_steps` is smaller than 1, because no transition
                could ever be collected and the collection loop would not end.
        """
        if n_steps < 1:
            raise ValueError('n_steps must be at least 1')
        # Collect from the environment this agent was built with, not from a
        # module-level `env`.
        env = self._env
        scores = deque(maxlen=10)
        for i_eps in range(1, n_eps+1):
            while True:
                state = env.reset()
                for _ in range(n_steps):
                    action = self.choose_action(state, epilson=1)
                    next_state, reward, done, _info = env.step(action)
                    self._buffer.cache(state, action, reward, next_state, done)
                    state = next_state
                    if done:
                        break
                self._buffer.flush()
                if self._buffer.size >= batch_size:
                    break
            self.train()
            self._buffer.clear()
            if i_eps%2==0:
                scores.append(self.play())
            # One progress mark per training iteration.
            print('#', end='')
            if i_eps%10==0:
                mean_score = sum(scores)/len(scores)
                print(f' | Episode {i_eps:>4d} | mean rewards: {mean_score:.1f}')
                if mean_score>=200:
                    print(f'\nMean score of {mean_score:.1f} is considered solved.')
                    break

# Run

In [11]:
# Build the agent around the environment created earlier in the notebook.
agent = Agent(env)

In [12]:
# Train once, then keep training for as long as the user answers "y".
# NOTE: input() waits for the user, so this cell blocks an unattended run.
agent.run()
while True:
    answer = input('Continue training? (y/[N]) ')
    if answer.upper() != 'Y':
        break
    agent.run()

########## | Episode   10 | mean rewards: -123.2
########## | Episode   20 | mean rewards: -147.2
########## | Episode   30 | mean rewards: -154.3
########## | Episode   40 | mean rewards: -132.8
########## | Episode   50 | mean rewards: -151.8
########## | Episode   60 | mean rewards: -152.7
########## | Episode   70 | mean rewards: -117.2
########## | Episode   80 | mean rewards: -127.8
########## | Episode   90 | mean rewards: -122.8
########## | Episode  100 | mean rewards: -106.4
########## | Episode  110 | mean rewards: -105.8
########## | Episode  120 | mean rewards: -104.0
########## | Episode  130 | mean rewards: -100.5
########## | Episode  140 | mean rewards: -97.1
########## | Episode  150 | mean rewards: -106.1
########## | Episode  160 | mean rewards: -103.3
########## | Episode  170 | mean rewards: -108.9
########## | Episode  180 | mean rewards: -128.0
########## | Episode  190 | mean rewards: -122.0
########## | Episode  200 | mean rewards: -89.1
########## | Episode  

Continue training? (y/[N])  n


# Evaluation

In [15]:
# Play one rendered episode; with render=True, play() prints the step count
# and total reward and then closes the environment.
agent.play(render=True)

Steps taken: 500, rewards earned: 148.71544247161026


# Comment

* Reward-to-go policy gradient (RTG PG) converges only slightly faster than vanilla policy gradient (VPG).
* The maximum reward earned is slightly higher, but not high enough to guarantee that the environment is always solved.
* The lunar lander usually lands successfully but does not know when to stop firing its engine, and so loses some reward.