In [1]:
import warnings
# Silence every Python warning for the rest of the session. This keeps the
# notebook output short, but it also hides deprecation warnings emitted by
# the libraries used below.
warnings.filterwarnings('ignore')

### Run in Colab
<a href="https://colab.research.google.com/github/racousin/rl_introduction/blob/master/notebooks/5_policy_gradient-reinforce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Uncomment the two lines below when running in Colab
#!git clone https://github.com/racousin/rl_introduction.git
#from rl_introduction.rl_introduction.tools import discount_cumsum, run_experiment_episode_train, DeepAgent

### Run locally

In [3]:
from rl_introduction.tools import discount_cumsum, run_experiment_episode_train, DeepAgent




### Objective
Here we present an alternative to Q-learning: the policy gradient algorithm.

# Policy gradient
In policy gradient, we parametrize the policy $\pi_\theta$ directly. This is especially welcome when the action space is continuous: in that case, a greedy policy based on Q-learning needs to compute $\arg\max_a Q(s,a)$, which can be pretty tedious. More generally, policy gradient algorithms are better suited to exploring large state-action spaces.

$J(\pi_{\theta}) = E_{\tau \sim \pi_{\theta}}[{G(\tau)}]$

We can prove that:


$\nabla_{\theta} J(\pi_{\theta}) = E_{\tau \sim \pi_{\theta}}[{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) G(\tau)}]$ 

In discrete action space

we parametrize $\pi$ with $\theta$, such as $\pi_\theta : S \rightarrow [0,1]^{dim(A)}$ and $\forall s$ $\sum \pi_\theta(s) = 1$.

In continuous action space

we parametrize $\pi$ with $\theta$, such as $\pi_\theta : S \rightarrow \mu^{dim(A)} \times \sigma^{dim(A)} =  \mathbb{R}^{dim(A)} \times \mathbb{R}_{+,*}^{dim(A)}$



It is possible to show that the loss for discrete actions ($1,...,N$) with a softmax policy is the return-weighted negative binary cross-entropy:
$-G\sum_{j=1}^N[a^j\log(\hat{a}^j) + (1-a^j)\log(1 - \hat{a}^j)]$

with:
$a^j=1$ if $a_t = j$, $0$ otherwise.

$\hat{a}^j = \pi_\theta(s_t)^j$.

$G$ is the discounted empirical return $G_t = \sum_{k=0}^{T-t-1} \gamma^k R_{t+k+1}$ obtained from state $s_t$ after taking action $a_t$.


It is possible to show that the loss for continuous actions (of dimension $N$) with a multivariate Gaussian policy (identity covariance) is given by:

$-G\sum_{j=1}^N[(a^j - \hat{a}^j)^2]$

$\hat{a}^j = \pi_\theta(s_t)^j$.



see https://aleksispi.github.io/assets/pg_autodiff.pdf for more explanation

# Reinforce - discrete action

In [4]:
import numpy as np
import gym
import copy
import matplotlib.pyplot as plt
import tensorflow.keras.backend as K
import tensorflow as tf

2022-03-06 12:44:03.955231: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/raphael/rl_introduction/venv/lib/python3.7/site-packages/cv2/../../lib64:
2022-03-06 12:44:03.955256: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [5]:
#TODO: write custom loss for policy gradient
def policy_gradient_loss(returns):
    """Exercise scaffold: build a loss closure weighted by `returns`.

    `returns` is captured by the inner function and multiplied with the
    per-sample log-probabilities before averaging. Computing `log_probs` is
    left to the reader; while it is still None the inner loss cannot be
    evaluated.
    """
    def modified_crossentropy(one_hot_action, action_probs):
        # To complete: log-probability term built from the one-hot action and
        # the predicted action probabilities.
        log_probs = None #to complete
        # Negated so that minimising the loss increases the weighted log-probability.
        loss = -K.mean(returns * log_probs)
        return loss
    return modified_crossentropy

In [6]:
#Done: write custom loss for policy gradient
def policy_gradient_loss(returns):
    """Build the return-weighted binary cross-entropy used as policy gradient loss.

    Args:
        returns: per-sample weights (the discounted returns), one entry per
            row of the batch the returned loss will be evaluated on.

    Returns:
        A function `(one_hot_action, action_probs) -> scalar` with the
        `(y_true, y_pred)` signature expected by `Model.compile(loss=...)`.
    """
    def modified_crossentropy(one_hot_action, action_probs):
        # A saturated softmax can output exactly 0 or 1, which would make the
        # logarithms below -inf and turn the loss (and then the weights) into
        # NaN. Keep the probabilities strictly inside (0, 1).
        eps = K.epsilon()
        safe_probs = K.clip(action_probs, eps, 1.0 - eps)
        log_probs = K.sum(
            one_hot_action * K.log(safe_probs)
            + (1 - one_hot_action) * K.log(1 - safe_probs),
            axis=1,
        )
        # Negated so that minimising the loss increases the weighted log-probability.
        loss = -K.mean(returns * log_probs)
        return loss
    return modified_crossentropy

In [7]:
def build_model(state_sim, action_dim):
    """Build the softmax policy network (two hidden ReLU layers of 32 units).

    Args:
        state_sim: size of the state vector fed to the network. The original
            (misspelled) parameter name is kept so existing calls keep working.
        action_dim: number of discrete actions, i.e. the size of the softmax
            output.

    Returns:
        An uncompiled Keras `Model` mapping a state to action probabilities.
    """
    # Use the argument itself: the previous body read a global `state_dim`,
    # so the value passed by the caller was silently ignored.
    input_state = Input(name='input_state', shape=(state_sim,), dtype='float32')
    x = Dense(32, activation='relu')(input_state)
    x = Dense(32, activation='relu')(x)
    x = Dense(action_dim, activation='softmax')(x)
    model = Model(inputs=input_state, outputs=x)
    return model

In [8]:
#TODO: complete training of vanilla policy gradient
import gym
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, multiply, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop, Adam


class ReinforceAgent(DeepAgent):
    """Exercise scaffold for a REINFORCE (vanilla policy gradient) agent.

    Transitions are buffered in `self.episode` until the episode ends; the
    policy update at the end of `train` is left to the reader.
    """
    def __init__(self, env, compiled_model, gamma = .99, epsilon = .01):
        # env, gamma and epsilon are forwarded to DeepAgent; self.gamma is read
        # in `train` (presumably set by DeepAgent -- TODO confirm).
        super().__init__(env,gamma, epsilon)
        
        self.model = compiled_model
        self.model.summary()
        
        # (state, action, reward) entries of the episode in progress
        self.episode = []

    def act(self, state):
        """Sample an action from the probabilities the model predicts for `state`."""
        # The model is fed a batch containing this single state.
        state = state.reshape(1, -1)
        prob = self.model.predict(state, batch_size=1).flatten()
        # self.action_dim is not set in this class -- presumably provided by
        # DeepAgent (TODO confirm).
        action = np.random.choice(self.action_dim, 1, p=prob)[0]
        return action

    def train(self, current_state, action, reward, next_state, done):
        """Record one transition; when `done` is True, compute the discounted returns.

        `next_state` is accepted but not used here.
        """
        self.episode.append(np.array([current_state, action, reward]))
        if done is True:
            episode = np.asarray(self.episode)
            # Column 2 holds the rewards appended above.
            discounted_return = discount_cumsum(episode[:,2], self.gamma)
            pass # complete here

    def save_model(self, path):
        """Delegate to `self.model.save(path)`."""
        self.model.save(path)

In [9]:
#Done: complete training of vanilla policy gradient
import gym
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, multiply, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop, Adam


class ReinforceAgent(DeepAgent):
    """REINFORCE (vanilla policy gradient) agent for discrete action spaces.

    Transitions are buffered for the duration of an episode. When the episode
    ends, the discounted returns are computed and the policy network takes one
    gradient step on the return-weighted loss built by `policy_gradient_loss`.

    `self.gamma` and `self.action_dim` are not set in this class; they are
    presumably provided by `DeepAgent` (TODO confirm).
    """

    def __init__(self, env, compiled_model, gamma = .99, epsilon = .01):
        """Store the policy network and create the optimizer used for all updates.

        Args:
            env: environment, forwarded to `DeepAgent`.
            compiled_model: Keras model mapping a state to action probabilities.
            gamma: discount factor, forwarded to `DeepAgent`.
            epsilon: forwarded to `DeepAgent`; not used by this class.
        """
        super().__init__(env,gamma, epsilon)

        self.model = compiled_model
        self.model.summary()

        # A single optimizer instance is reused across updates. Creating a new
        # Adam at every compile would reset its moment estimates each episode.
        self._optimizer = Adam(learning_rate=1e-3)

        # (state, action, reward) tuples of the episode in progress
        self.episode = []

    def act(self, state):
        """Sample an action from the probabilities the model predicts for `state`."""
        state = state.reshape(1, -1)
        prob = self.model.predict(state, batch_size=1).flatten()
        action = np.random.choice(self.action_dim, 1, p=prob)[0]
        return action

    def train(self, current_state, action, reward, next_state, done):
        """Record one transition and update the policy when the episode ends.

        Args:
            current_state: state in which `action` was taken.
            action: index of the action taken.
            reward: reward received for that action.
            next_state: accepted for interface compatibility; not used.
            done: truthy when the episode is over.
        """
        # Plain tuples: packing a state vector and two scalars into one
        # np.array creates a ragged array, which NumPy >= 1.24 rejects.
        self.episode.append((current_state, action, reward))
        # Truthiness instead of `is True`, so numpy booleans also end the episode.
        if not done:
            return

        states_seq, actions_seq, rewards_seq = zip(*self.episode)
        self.episode = []

        states = np.vstack(states_seq)
        rewards = np.asarray(rewards_seq, dtype=float)
        discounted_return = discount_cumsum(rewards, self.gamma)

        # One-hot encode the actions taken (targets of the cross-entropy loss).
        n_steps = len(rewards)
        actions = np.zeros((n_steps, self.action_dim))
        actions[np.arange(n_steps), np.asarray(actions_seq, dtype=int)] = 1

        # The loss closes over this episode's returns, hence the re-compile.
        loss = policy_gradient_loss(discounted_return)
        self.model.compile(loss=loss, optimizer=self._optimizer)
        self.model.train_on_batch(states, actions)

    def save_model(self, path):
        """Delegate to `self.model.save(path)`."""
        self.model.save(path)

In [None]:
# Build the environment and read the sizes the policy network needs.
env = gym.make('CartPole-v0')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
model = build_model(state_dim, action_dim)
# NOTE: despite the `q_agent` name, this is the REINFORCE agent defined above.
q_agent = ReinforceAgent(env, model)
# The last argument (300) looks like the number of training episodes, judging
# by the printed log -- TODO confirm in rl_introduction.tools.
rewards = run_experiment_episode_train(env, q_agent, 300)
# One '+' marker per entry of `rewards`.
fig,ax = plt.subplots(figsize=(10,10))
ax.plot(rewards,'+')
ax.set_title('cumulative reward per episode - vpg_agent')

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_state (InputLayer)    [(None, 4)]               0         
                                                                 
 dense (Dense)               (None, 32)                160       
                                                                 
 dense_1 (Dense)             (None, 32)                1056      
                                                                 
 dense_2 (Dense)             (None, 2)                 66        
                                                                 
Total params: 1,282
Trainable params: 1,282
Non-trainable params: 0
_________________________________________________________________


2022-03-06 12:44:10.519125: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /home/raphael/rl_introduction/venv/lib/python3.7/site-packages/cv2/../../lib64:
2022-03-06 12:44:10.519154: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-03-06 12:44:10.519175: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (raphael-XPS-13-9370): /proc/driver/nvidia/version does not exist
2022-03-06 12:44:10.519346: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


episode: 0 - cum reward 15.0
episode: 1 - cum reward 28.0
episode: 2 - cum reward 12.0
episode: 3 - cum reward 24.0
episode: 4 - cum reward 21.0
episode: 5 - cum reward 23.0
episode: 6 - cum reward 18.0
episode: 7 - cum reward 31.0
episode: 8 - cum reward 22.0
episode: 9 - cum reward 40.0
episode: 10 - cum reward 36.0
episode: 11 - cum reward 18.0
episode: 12 - cum reward 39.0
episode: 13 - cum reward 20.0
episode: 14 - cum reward 41.0
episode: 15 - cum reward 31.0
episode: 16 - cum reward 54.0
episode: 17 - cum reward 20.0
episode: 18 - cum reward 45.0
episode: 19 - cum reward 29.0
episode: 20 - cum reward 26.0
episode: 21 - cum reward 43.0
episode: 22 - cum reward 33.0
episode: 23 - cum reward 19.0
episode: 24 - cum reward 26.0
episode: 25 - cum reward 15.0
episode: 26 - cum reward 25.0
episode: 27 - cum reward 22.0
episode: 28 - cum reward 47.0
episode: 29 - cum reward 12.0
episode: 30 - cum reward 27.0
episode: 31 - cum reward 11.0
episode: 32 - cum reward 19.0
episode: 33 - cum re

episode: 221 - cum reward 107.0
episode: 222 - cum reward 52.0
episode: 223 - cum reward 35.0
episode: 224 - cum reward 44.0
episode: 225 - cum reward 85.0
episode: 226 - cum reward 48.0
episode: 227 - cum reward 83.0
episode: 228 - cum reward 36.0
episode: 229 - cum reward 81.0
episode: 230 - cum reward 31.0
episode: 231 - cum reward 41.0
episode: 232 - cum reward 86.0
episode: 233 - cum reward 33.0
episode: 234 - cum reward 46.0
episode: 235 - cum reward 66.0
episode: 236 - cum reward 50.0
episode: 237 - cum reward 36.0
episode: 238 - cum reward 44.0
episode: 239 - cum reward 21.0
episode: 240 - cum reward 38.0
episode: 241 - cum reward 33.0
episode: 242 - cum reward 59.0
episode: 243 - cum reward 57.0
episode: 244 - cum reward 64.0
episode: 245 - cum reward 33.0
episode: 246 - cum reward 53.0
episode: 247 - cum reward 74.0
episode: 248 - cum reward 29.0
episode: 249 - cum reward 25.0
episode: 250 - cum reward 26.0
episode: 251 - cum reward 47.0
episode: 252 - cum reward 53.0
episode

# Reinforce with memory - discrete action

In [None]:
import gym
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, multiply, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop, Adam


class ReinforceAgentWithMemory(DeepAgent):
    """REINFORCE agent that updates the policy on batches of several episodes.

    Finished episodes are stored until `memory_size` of them are available.
    Their discounted returns are then concatenated and standardised, and the
    policy network takes one gradient step on the whole batch.

    `self.gamma` and `self.action_dim` are not set in this class; they are
    presumably provided by `DeepAgent` (TODO confirm).
    """

    def __init__(self, env, compiled_model, gamma = .99, epsilon = .01, memory_size = 3):
        """Store the policy network and set up the episode memory.

        Args:
            env: environment, forwarded to `DeepAgent`.
            compiled_model: Keras model mapping a state to action probabilities.
            gamma: discount factor, forwarded to `DeepAgent`.
            epsilon: forwarded to `DeepAgent`; not used by this class.
            memory_size: number of finished episodes gathered per update.
        """
        super().__init__(env,  gamma, epsilon)

        self.model = compiled_model

        self.model.summary()

        # A single optimizer instance is reused across updates. Creating a new
        # Adam at every compile would reset its moment estimates each time.
        self._optimizer = Adam(learning_rate=1e-2)

        # (state, action, reward) tuples of the episode in progress
        self.episode = []
        self.memory_size = memory_size
        # finished episodes waiting for the next update: [states, one_hot_actions, returns]
        self.episodes = []

    def act(self, state):
        """Sample an action from the probabilities the model predicts for `state`."""
        state = state.reshape(1, -1)
        prob = self.model.predict(state, batch_size=1).flatten()
        action = np.random.choice(self.action_dim, 1, p=prob)[0]

        return action

    def train(self, current_state, action, reward, next_state, done):
        """Record one transition; update the policy once enough episodes are stored.

        Args:
            current_state: state in which `action` was taken.
            action: index of the action taken.
            reward: reward received for that action.
            next_state: accepted for interface compatibility; not used.
            done: truthy when the episode is over.
        """
        # Plain tuples: packing a state vector and two scalars into one
        # np.array creates a ragged array, which NumPy >= 1.24 rejects.
        self.episode.append((current_state, action, reward))
        # Truthiness instead of `is True`, so numpy booleans also end the episode.
        if not done:
            return

        states_seq, actions_seq, rewards_seq = zip(*self.episode)
        self.episode = []

        X = np.vstack(states_seq)
        rewards = np.asarray(rewards_seq, dtype=float)
        discounted_return = np.asarray(discount_cumsum(rewards, self.gamma), dtype=float)

        # One-hot encode the actions taken (targets of the cross-entropy loss).
        n_steps = len(rewards)
        Y = np.zeros((n_steps, self.action_dim))
        Y[np.arange(n_steps), np.asarray(actions_seq, dtype=int)] = 1

        # Store the episode BEFORE checking the memory, so the episode that
        # completes the batch is trained on instead of being discarded.
        self.episodes.append([X, Y, discounted_return])
        if len(self.episodes) < self.memory_size:
            return

        Xs = np.vstack([ep[0] for ep in self.episodes])
        Ys = np.vstack([ep[1] for ep in self.episodes])
        discounted_returns = np.hstack([ep[2] for ep in self.episodes])
        self.episodes = []

        # Standardise the returns over the batch; skip the scaling when they
        # are all equal to avoid dividing by zero.
        discounted_returns = discounted_returns - discounted_returns.mean()
        spread = discounted_returns.std()
        if spread > 0:
            discounted_returns = discounted_returns / spread

        # The loss closes over this batch's returns, hence the re-compile.
        loss = policy_gradient_loss(discounted_returns)
        self.model.compile(loss=loss, optimizer=self._optimizer)
        self.model.train_on_batch(Xs,Ys)

In [None]:
env = gym.make('CartPole-v0')
# Derive the network sizes from this environment so the cell does not depend
# on variables left over from the previous experiment cell.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
model = build_model(state_dim, action_dim)
# NOTE: despite the `q_agent` name, this is the batched REINFORCE agent.
q_agent = ReinforceAgentWithMemory(env, model)
rewards = run_experiment_episode_train(env, q_agent, 300)
# Single explicit figure; the former bare `plt.plot(rewards)` drew the same
# data a second time on an extra implicit figure.
fig,ax = plt.subplots(figsize=(10,10))
ax.plot(rewards,'+')
ax.set_title('cumulative reward per episode - vpg_agent large memory')

# other improvements 

### GAE (generalized advantage estimation) actor critic
We can rewrite the policy gradient

$\nabla_{\theta} J(\pi_{\theta}) = E_{\tau \sim \pi_{\theta}}[{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Phi_t}]$,

where $\Phi_t$ can be any of:
- $\Phi_t =  G_t$
- $\Phi_t = \sum_{t'=t}^{T-1} R_{t'+1} - V(s_t)$
- $\Phi_t = \sum_{t'=t}^{T-1} R_{t'+1} - Q(s_t,a_t)$


For the last 2 cases we need to estimate V or Q (the critics). We do it the same way as in deep Q-learning.
https://arxiv.org/pdf/1506.02438.pdf

$\phi_k = \arg \min_{\phi} E_{s_t, G_t \sim \pi_k}[{\left( V_{\phi}(s_t) - G_t \right)^2}]$

### off policy
To build an experience replay for policy gradient, it is necessary to unbias the experiences.
https://arxiv.org/pdf/1205.4839.pdf

### clipping