In [15]:
%config IPCompleter.greedy=True

import gym
import numpy as np
import scipy.signal
import torch
import torch.nn as nn
from gym.spaces import Box, Discrete
from torch.distributions.categorical import Categorical
from torch.optim import Adam

In [16]:
class Actor(nn.Module):
    """Policy network for discrete actions.

    A tanh MLP with two 64-unit hidden layers maps an observation to one
    logit per action; the logits parameterise a ``Categorical`` distribution.
    """

    def __init__(self, obs_dimensions, act_dimensions):
        super().__init__()
        hidden = 64
        layers = [
            nn.Linear(obs_dimensions, hidden),
            nn.Tanh(),
            nn.Linear(hidden, hidden),
            nn.Tanh(),
            nn.Linear(hidden, act_dimensions),
        ]
        self.actor = nn.Sequential(*layers)

    def get_distribution(self, obs):
        """Return the action distribution the network predicts for ``obs``."""
        logits = self.actor(obs)
        return Categorical(logits=logits)

    def forward(self, obs, act=None):
        """Return ``(distribution, log_prob)`` for ``obs``.

        ``log_prob`` is the log-likelihood of ``act`` under the distribution,
        or ``None`` when no actions are supplied.
        """
        dist = self.get_distribution(obs)
        if act is None:
            return dist, None
        return dist, dist.log_prob(act)
        
class Critic(nn.Module):
    """Value network: a tanh MLP with two 64-unit hidden layers and one output."""

    def __init__(self, obs_dimensions):
        super().__init__()
        width = 64
        self.critic = nn.Sequential(
            nn.Linear(obs_dimensions, width),
            nn.Tanh(),
            nn.Linear(width, width),
            nn.Tanh(),
            nn.Linear(width, 1),
        )

    def forward(self, obs):
        """Return the value estimate for ``obs`` with the trailing size-1 axis removed."""
        estimate = self.critic(obs)
        return estimate.squeeze(-1)

class ActorCritic(nn.Module):
    """Bundles a policy network (``pi``) and a value network (``v``)."""

    def __init__(self, obs_space, act_space):
        super().__init__()

        # Both networks consume the same flat observation vector.
        n_obs = obs_space.shape[0]
        self.pi = Actor(n_obs, act_space.n)
        self.v = Critic(n_obs)

    def step(self, obs):
        """Sample an action for ``obs`` without tracking gradients.

        Returns a tuple of NumPy arrays: the sampled action, the value
        estimate for ``obs``, and the log-probability of the sampled action.
        """
        with torch.no_grad():
            dist = self.pi.get_distribution(obs)
            action = dist.sample()
            logp_a = dist.log_prob(action)
            val = self.v(obs)

        outputs = (action, val, logp_a)
        return tuple(tensor.numpy() for tensor in outputs)

    def get_action(self, obs):
        """Return only the sampled action from ``step``."""
        action, _, _ = self.step(obs)
        return action
        

In [19]:
# Hyperparameter definitions
# Module-level settings read as globals by gae() and ppo().

# Learning rate of the Adam optimizer built over the policy network's parameters.
pi_lr = 3e-4
# Learning rate of the Adam optimizer built over the value network's parameters.
vf_lr = 1e-3
# Number of outer iterations of the training loop in ppo().
# NOTE(review): the trailing "50" looks like the intended full-run value -- confirm.
epochs = 10 # 50
# At most this many environment steps are taken per outer iteration.
# NOTE(review): the trailing "4000" looks like the intended full-run value -- confirm.
steps_per_epoch = 100 # 4000
# An episode is treated as finished once it reaches this many steps.
max_ep_len=1000
# Number of policy optimizer steps per update.
train_pi_iters = 80
# Number of value-function optimizer steps per update.
train_v_iters = 80
# Discount factor applied to value estimates in gae().
gamma = 0.99
# Multiplied with gamma to form the decay coefficient of the discounted sum in gae().
lam = 0.97

In [20]:
# Generalized Advantage Estimation
def gae(rewards, values):
    """Compute GAE advantages for a single trajectory.

    Reads the module-level ``gamma`` (discount) and ``lam`` (GAE lambda).

    Args:
        rewards: sequence of length ``T + 1``. Only ``rewards[:-1]`` is used;
            the last entry is ignored and exists so that ``rewards`` and
            ``values`` have the same length.
        values: sequence of length ``T + 1`` holding the value estimate of
            every visited state, followed by the value of the state reached
            after the final step (use 0 for a terminal state).

    Returns:
        A contiguous ``np.ndarray`` (float64) of length ``T`` where entry
        ``t`` is ``sum_k (gamma * lam)**k * delta_{t+k}`` and
        ``delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)``.

    Raises:
        ValueError: if ``rewards`` and ``values`` differ in shape.
    """
    # Accept plain Python lists as well as arrays.
    rewards = np.asarray(rewards, dtype=np.float64)
    values = np.asarray(values, dtype=np.float64)
    if rewards.shape != values.shape:
        raise ValueError(
            "rewards and values must have the same shape, got "
            f"{rewards.shape} and {values.shape}"
        )

    # TD residuals. The bootstrap term must use the *next* state's value
    # (values[1:]), not the current one.
    deltas = rewards[:-1] + gamma * values[1:] - values[:-1]
    if deltas.shape[0] == 0:
        # Nothing to accumulate for an empty trajectory.
        return deltas

    # RLlab -- https://github.com/rll/rllab/blob/ba78e4c16dc492982e648f117875b22af3965579/rllab/misc/special.py#L107
    # Computing discounted cumulative sums of vectors: running the IIR filter
    # y[n] = x[n] + gamma * lam * y[n - 1] over the reversed sequence and
    # reversing the result yields the discounted sum of future residuals.
    advantages = scipy.signal.lfilter(
        [1], [1, float(-gamma * lam)], deltas[::-1], axis=0
    )[::-1]
    # The reversed view has negative strides, which torch cannot wrap directly.
    return np.ascontiguousarray(advantages)

In [None]:
def ppo():
    """Train an ``ActorCritic`` on ``MountainCar-v0`` with clipped PPO.

    Each of the ``epochs`` iterations collects ``steps_per_epoch`` environment
    steps with the current policy. Whenever a trajectory ends (environment
    termination, ``max_ep_len`` reached, or the epoch's step budget is used
    up) its advantages are computed with ``gae`` and its discounted returns
    are stored as value targets. After collection the policy is optimised for
    ``train_pi_iters`` steps on the clipped surrogate objective and the value
    network for ``train_v_iters`` steps on the squared error against the
    returns. The buffers are then cleared so every update only sees data from
    the policy that generated it.

    Reads the module-level hyperparameters ``pi_lr``, ``vf_lr``, ``epochs``,
    ``steps_per_epoch``, ``max_ep_len``, ``train_pi_iters``, ``train_v_iters``
    and ``gamma``.

    Uses the classic Gym calling convention: ``env.reset()`` returns only the
    observation and ``env.step()`` returns a 4-tuple.

    Returns:
        The trained ``ActorCritic`` module.
    """
    # Half-width of the interval the probability ratio is clipped to.
    clip_ratio = 0.2

    env = gym.make('MountainCar-v0')
    obs_space = env.observation_space
    act_space = env.action_space

    actor_critic = ActorCritic(obs_space, act_space)

    pi_optim = Adam(actor_critic.pi.parameters(), lr=pi_lr)
    v_optim = Adam(actor_critic.v.parameters(), lr=vf_lr)

    # Data for the current epoch, one entry per environment step.
    obs_buf, action_buf, advantage_buf, return_buf, logp_buf = [], [], [], [], []
    # Rewards / value estimates of the trajectory currently being collected.
    path_rewards, path_values = [], []

    def to_obs_tensor(observation):
        return torch.as_tensor(observation, dtype=torch.float32)

    def discounted_cumsum(x, discount):
        # out[i] = x[i] + discount * x[i + 1] + discount**2 * x[i + 2] + ...
        out = np.zeros(len(x), dtype=np.float32)
        running = 0.0
        for i in reversed(range(len(x))):
            running = x[i] + discount * running
            out[i] = running
        return out

    def finish_path(last_val):
        # gae() expects both sequences extended by the bootstrap value of the
        # state following the final step.
        rewards = np.append(np.asarray(path_rewards, dtype=np.float32), last_val)
        values = np.append(np.asarray(path_values, dtype=np.float32), last_val)
        advantage_buf.extend(np.asarray(gae(rewards, values), dtype=np.float32))
        # Rewards-to-go (bootstrapped with last_val) are the value targets.
        return_buf.extend(discounted_cumsum(rewards, gamma)[:-1])
        path_rewards.clear()
        path_values.clear()

    def ppo_pi_loss(obs, action, advantage, old_logp):
        _, logp = actor_critic.pi(obs, action)
        pi_ratio = torch.exp(logp - old_logp)
        # Clip to a fixed interval around 1, not one derived from the ratio.
        clip = torch.clamp(pi_ratio, 1 - clip_ratio, 1 + clip_ratio)
        pi_loss = -torch.min(pi_ratio * advantage, clip * advantage).mean()
        return pi_loss

    def ppo_v_loss(obs, ret):
        return (actor_critic.v(obs) - ret).pow(2).mean()

    def ppo_update():
        if not obs_buf:
            return

        # Stack the epoch's data once; these tensors carry no gradient.
        obs_t = torch.as_tensor(np.array(obs_buf), dtype=torch.float32)
        action_t = torch.as_tensor(np.array(action_buf), dtype=torch.int64)
        advantage_t = torch.as_tensor(np.array(advantage_buf), dtype=torch.float32)
        return_t = torch.as_tensor(np.array(return_buf), dtype=torch.float32)
        old_logp_t = torch.as_tensor(np.array(logp_buf), dtype=torch.float32)

        for _ in range(train_pi_iters):
            pi_optim.zero_grad()
            pi_loss = ppo_pi_loss(obs_t, action_t, advantage_t, old_logp_t)
            pi_loss.backward()
            pi_optim.step()

        for _ in range(train_v_iters):
            v_optim.zero_grad()
            v_loss = ppo_v_loss(obs_t, return_t)
            v_loss.backward()
            v_optim.step()

        # Data from the old policy must not leak into the next update.
        for buf in (obs_buf, action_buf, advantage_buf, return_buf, logp_buf):
            buf.clear()

    try:
        obs, ep_len = env.reset(), 0

        for ep in range(epochs):
            for t in range(steps_per_epoch):
                action, val, logp_a = actor_critic.step(to_obs_tensor(obs))

                new_obs, reward, done, _ = env.step(int(action))
                ep_len += 1

                obs_buf.append(obs)
                action_buf.append(int(action))
                logp_buf.append(float(logp_a))
                path_rewards.append(float(reward))
                path_values.append(float(val))

                obs = new_obs
                timeout = ep_len == max_ep_len
                terminal = done or timeout
                epoch_ended = t == steps_per_epoch - 1

                if terminal or epoch_ended:
                    if done:
                        # The environment ended the episode: no future reward.
                        last_val = 0.0
                    else:
                        # Trajectory was cut short: bootstrap from the critic.
                        with torch.no_grad():
                            last_val = float(actor_critic.v(to_obs_tensor(obs)))
                    finish_path(last_val)

                    if terminal:
                        obs, ep_len = env.reset(), 0

            ppo_update()
    finally:
        env.close()

    return actor_critic
                
    
# Run PPO training when this cell executes.
ppo()