## Install, import and utilities

In [0]:
!pip install gym > /dev/null 2>&1

In [0]:
!pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1

In [0]:
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
!pip install --upgrade setuptools 2>&1
!pip install ez_setup > /dev/null 2>&1

Requirement already up-to-date: setuptools in /usr/local/lib/python3.6/dist-packages (45.2.0)


In [0]:
import gym
from gym import logger as gymlogger
from gym.wrappers import Monitor
gymlogger.set_level(40) #error only

import torch
import torch.nn as nn
import torch.nn.functional as F 
from torch import optim
import numpy as np
import pandas as pd

import seaborn as sns
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay
from IPython.display import clear_output
from pathlib import Path

import random, os.path, math, glob, csv, base64, itertools, sys
from pprint import pprint

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import io
from IPython.display import HTML

from copy import deepcopy


In [0]:
# The following code is will be used to visualize the environments.
def show_video(directory):
    html = []
    for mp4 in Path(directory).glob("*.mp4"):
        video_b64 = base64.b64encode(mp4.read_bytes())
        html.append('''<video alt="{}" autoplay 
                      loop controls style="height: 400px;">
                      <source src="data:video/mp4;base64,{}" type="video/mp4" />
                 </video>'''.format(mp4, video_b64.decode('ascii')))
    ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(html)))
    
def make_seed(seed):
    np.random.seed(seed=seed)
    torch.manual_seed(seed=seed)
  
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

xdpyinfo was not found, X start can not be checked! Please install xdpyinfo!


<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>

## A2C



for n = 1, ..., N //number of iterations<br>
> collect $K$ samples

> for i = 0,...,$K$:

>> execute action $a_i\sim\pi(s)$

>> observe reward $r_i$ and next state $s'_i$

>> store $(s_i,a_i,r_i,s'_i)$

>> if done: reset

> compute target $y_i$ for each sample $i \in [0, K]$

> compute $\delta_i$ for each sample $i \in [0,K]$

> Compute estimate of $V$ by gradient descent on MSE

$$L_{MSE}(\omega|data) =\sum_{i=1}^{K} (V_{\omega}(s_i) - y_i)^2$$

> Update policy by gradient descent on policy loss

$$L_\pi(\theta|data) = \frac{1}{K} \sum_{i=1}^K 
      \Big[ 
                                \log \pi_\theta( a_i|s_i) \delta_i + \lambda_e \Omega(\pi_{\theta}(\cdot| s_i))
                        \Big]$$







In [0]:
class ActorNetwork(nn.Module):
    """
     This network represents the policy
    """

    def __init__(self, input_size, hidden_size, action_size):
        super(ActorNetwork, self).__init__()
        self.n_actions = action_size
        self.dim_observation = input_size
        
        self.net = nn.Sequential(
            nn.Linear(in_features=self.dim_observation, out_features=hidden_size),
            nn.ReLU(),
            nn.Linear(in_features=hidden_size, out_features=hidden_size),
            nn.ReLU(),
            nn.Linear(in_features=hidden_size, out_features=self.n_actions),
            nn.Softmax(dim=-1)
        )
        
    def policy(self, state):
        state = torch.tensor(state, dtype=torch.float)
        return self.net(state)
    
    def sample_action(self, state):
        state = torch.tensor(state, dtype=torch.float)
        action = torch.multinomial(self.policy(state), 1)
        return action.item()

In [0]:
class ValueNetwork(nn.Module):
  """
   This class represents the value function
  """

  def __init__(self, input_size, hidden_size, output_size):
      super(ValueNetwork, self).__init__()
      self.fc1 = nn.Linear(input_size, hidden_size)
      self.fc2 = nn.Linear(hidden_size, hidden_size)
      self.fc3 = nn.Linear(hidden_size, output_size)

  def forward(self, x):
      out = F.relu(self.fc1(x))
      out = F.relu(self.fc2(out))
      out = self.fc3(out)
      return out
  
  def value(self, state):
      state = torch.tensor(state, dtype=torch.float)
      return self.forward(state)


In [0]:
# You can select your environment here
env_id = 'CartPole-v1'  #@param ["CartPole-v1", "Acrobot-v1", "MountainCar-v0"]
env = gym.make(env_id)

eval_env = gym.make(env_id) # environment to evaluate the policy

In [0]:
# Define you networks
value_network = ValueNetwork(env.observation_space.shape[0], 16, 1)
actor_network = ActorNetwork(env.observation_space.shape[0], 16, env.action_space.n)
print(value_network)
print(actor_network)

# Define your optimizers
value_network_optimizer = torch.optim.RMSprop(value_network.parameters(), lr=0.01)
actor_network_optimizer = torch.optim.RMSprop(actor_network.parameters(), lr=0.01)


num_iterations = 250   # How many update step do we perform   
batch_size = 1024      # How many samples to collect
gamma = 1
lambda_entropy = 0.001 # regularization coefficient for entropy
compute_objective= torch.nn.MSELoss()


for iteration in range(num_iterations):
    # Initialize batch storage
    batch_losses = torch.zeros(batch_size)
    batch_returns = np.zeros(batch_size)


    states = np.empty((batch_size,) + env.observation_space.shape, dtype=np.float)        # shape (batch_size, state_dim)
    rewards = np.empty((batch_size,), dtype=np.float)                                     # shape (batch_size, )                                 
    next_states = np.empty((batch_size,) + env.observation_space.shape, dtype=np.float)   # shape (batch_size, state_dim)
    dones = np.empty((batch_size,), dtype=np.bool)                                        # shape (batch_size, ) 
    proba = torch.empty((batch_size,), dtype=np.float)                                    # shape (batch_size, ), store pi(a_t|s_t)
    next_value = 0                                
  
    # Intialize environment
    state = env.reset()

    # Generate batch
    for i in range(batch_size):
        action = actor_network.sample_action(state)
        next_state, reward, done, _ = env.step(action)

        states[i] = state
        rewards[i] = reward
        next_states[i] = next_state
        dones[i] = done
        proba[i] = actor_network.policy(state)[action]

        state = next_state
        if done:
          state = env.reset()

    if not done:
        next_value = value_network.value(next_states[-1]).detach().numpy()[0]

    # compute returns (with bootstrapping)
    returns = np.zeros((batch_size,), dtype=np.float)
   
    for t in range(batch_size):
      s = 0
      for k in range(t, batch_size):
        T = k
        s+= gamma**(k-t)*rewards[k]

        if dones[T]:
          break
      returns[t] = s

      if not dones[T]:
        returns[t] += gamma**(T-t+1)*value_network.value(next_states[T]).detach().numpy()[0]
        

    # compute advantage
    values = value_network.value(states)
    advantages = returns - values.detach().numpy().squeeze()

    # Compute MSE
    value_network_optimizer.zero_grad()
    loss_value = compute_objective(values, torch.tensor(returns, requires_grad=True))
    loss_value.backward()
    value_network_optimizer.step()

    # compute entropy term
    probs = actor_network.policy(states)
    crossEntropy=-torch.sum(probs*torch.log(probs))

    # Compute Actor Gradient
    actor_network_optimizer.zero_grad()
    loss_policy = -torch.sum(torch.log(proba)*torch.tensor(advantages, requires_grad=True))- lambda_entropy * crossEntropy
    loss_policy.backward()
    actor_network_optimizer.step()

    # this code is to evaluate the policy every 10 iterations
    if( (iteration+1)%10 == 0 ):
        eval_rewards = np.zeros(5)
        for sim in range(5):
            eval_done = False
            eval_state = eval_env.reset()
            while not eval_done:
                eval_action = actor_network.sample_action(eval_state)
                eval_next_state, eval_reward, eval_done, _ = eval_env.step(eval_action)
                eval_rewards[sim] += eval_reward
                eval_state = eval_next_state
        
        print("it, rewards = ", 
              iteration +1, eval_rewards.mean())


ValueNetwork(
  (fc1): Linear(in_features=4, out_features=16, bias=True)
  (fc2): Linear(in_features=16, out_features=16, bias=True)
  (fc3): Linear(in_features=16, out_features=1, bias=True)
)
ActorNetwork(
  (net): Sequential(
    (0): Linear(in_features=4, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=16, bias=True)
    (3): ReLU()
    (4): Linear(in_features=16, out_features=2, bias=True)
    (5): Softmax(dim=-1)
  )
)


  return F.mse_loss(input, target, reduction=self.reduction)


it, rewards =  10 66.4
it, rewards =  20 100.8
it, rewards =  30 113.2
it, rewards =  40 423.0
it, rewards =  50 500.0
it, rewards =  60 500.0
it, rewards =  70 500.0
it, rewards =  80 500.0
it, rewards =  90 490.2
it, rewards =  100 500.0
it, rewards =  110 500.0
it, rewards =  120 500.0
it, rewards =  130 500.0
it, rewards =  140 500.0
it, rewards =  150 500.0
it, rewards =  160 200.4
it, rewards =  170 500.0
it, rewards =  180 500.0
it, rewards =  190 500.0
it, rewards =  200 500.0
it, rewards =  210 500.0
it, rewards =  220 500.0
it, rewards =  230 500.0
it, rewards =  240 500.0
it, rewards =  250 500.0


In [0]:
env = Monitor(env, "./gym-results", force=True, video_callable=lambda episode: True)
for episode in range(4):
    done = False
    state = env.reset()
    while not done:
        action = actor_network.sample_action(state)
        state, reward, done, info = env.step(action)
env.close()
show_video("./gym-results")

