In [None]:
import warnings
warnings.filterwarnings('ignore')

### Run in Colab
<a href="https://colab.research.google.com/github/racousin/rl_introduction/blob/master/notebooks/5_policy_gradient-reinforce.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!apt-get install swig build-essential python-dev python3-dev > /dev/null 2>&1
!pip install pygame==2.1.0 > /dev/null 2>&1
!pip install gym==0.23.1 > /dev/null 2>&1
!git clone https://github.com/racousin/rl_introduction.git > /dev/null 2>&1
from rl_introduction.rl_introduction.tools import Agent, DeepAgent, plot_values_lake, policy_improvement, discount_cumsum, run_experiment_episode_train

In [None]:
import gym
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, multiply, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop, Adam

In [None]:
# We will experiment our algo with CartPole
env = gym.make('CartPole-v0')

### Objective
Here we present an alternative to Q-learning: the policy gradient algorithm.

**Complete the TODO steps! Good luck!**

# Policy gradient
In policy gradient, we directly parametrize the policy $\pi_\theta$. This is especially useful when the action space is continuous: in that case a greedy policy based on Q-learning needs to compute $\arg\max_a Q(s,a)$, which can be quite tedious. More generally, policy gradient algorithms are better suited to exploring large state-action spaces.

$J(\pi_{\theta}) = E_{\tau \sim \pi_{\theta}}[{G(\tau)}]$

We can prove that:


$\nabla_{\theta} J(\pi_{\theta}) = E_{\tau \sim \pi_{\theta}}[{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) G(\tau)}]$ 

1. In discrete action space

we parametrize $\pi$ with $\theta$, such as $\pi_\theta : S \rightarrow [0,1]^{dim(A)}$ and $\forall s$ $\sum \pi_\theta(s) = 1$.


2. In continuous action space

we parametrize $\pi$ with $\theta$, such as $\pi_\theta : S \rightarrow \mu^{dim(A)} \times \sigma^{dim(A)} =  \mathbb{R}^{dim(A)} \times \mathbb{R}_{+,*}^{dim(A)}$



In keras, it is easier to pass the loss than the gradient.
1. It is possible to show that the loss for discrete action ($1,...,N$) with softmax policy is weighted negative binary crossentropy:
$-G\sum_{j=1}^N[a^j\log(\hat{a}^j) + (1-a^j)\log(1 - \hat{a}^j)]$

with:
$a^j=1$ if $a_t = j$, $0$ otherwise.

$\hat{a}^j = \pi_\theta(s_t)^j$.

$G$ is the discounted empirical return $G_t = \sum_{k=0}^{T-t-1} \gamma^k R_{t+k+1}$ from state $s_t$ and $a_t$


2. It is possible to show that the loss for continuous action ($1,...,N$) with a multivariate Gaussian (identity covariance) policy is given by:

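$G\sum_{j=1}^N[(a^j - \hat{a}^j)^2]$ (up to a constant factor: this is $-G\log \pi_\theta(a_t|s_t)$, and the Gaussian log-density is itself $-\frac{1}{2}\sum_j (a^j - \hat{a}^j)^2$ plus a constant, so the two minus signs cancel)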

$\hat{a}^j = \pi_\theta(s_t)^j$.



see https://aleksispi.github.io/assets/pg_autodiff.pdf for more explanation

# Reinforce - discrete action

In [None]:
import numpy as np
import gym
import copy
import matplotlib.pyplot as plt
import tensorflow.keras.backend as K
import tensorflow as tf

### TODO 0): write policy gradient interaction with the environment

In [None]:
#TODO: write a keras model that represent our parametrized pi function
# We should be able to run pi.predict([s]) and it should return [[P(a_0|s), P(a_1|s) .. P(a_m|s)]] where m is action size
def build_model(state_dim, action_dim):
    """Exercise stub: build the Keras model that represents the policy pi.

    Per the TODO comment above this function, the finished model should
    support ``pi.predict([s])`` and return one probability per action.

    Args:
        state_dim: size of the state vector fed to the model.
        action_dim: number of actions, i.e. number of output probabilities.

    NOTE: nothing is built here yet. The body returns the name ``model``,
    which is not assigned inside this function, so the call fails unless a
    global ``model`` already exists. Complete the TODO before running it.
    """
    return model
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
model = build_model(state_dim, action_dim)
model.predict(np.random.rand(1,state_dim))

In [None]:
#TODO: write the action chosen by our initial policy gradient function.
# It should be a ~ P(.|s) = U(pi_function(s))
class ReinforceAgent(DeepAgent):
    """Exercise stub of a REINFORCE agent wrapping a Keras policy model.

    Only ``act`` has to be completed in this cell.
    """
    def __init__(self, env, compiled_model, gamma = .99, epsilon = .01):
        # env, gamma and epsilon are forwarded unchanged to DeepAgent.
        super().__init__(env,gamma, epsilon)
        
        self.model = compiled_model  # the Keras model passed in by the caller
        self.model.summary()  # print the architecture once at construction
        
        self.episode = []  # starts empty; not used anywhere else in this stub

    def act(self, state):
        """Return the action to take in ``state`` (to be completed).

        NOTE: ``action`` is never assigned in this stub, so calling it fails
        (unless a global of that name exists) until the TODO is completed.
        """
        # complete here
        return action

In [None]:
def run_experiment_episode(env, agent, nb_episode):
    """Play ``nb_episode`` episodes of ``agent`` in ``env`` without training.

    Args:
        env: environment exposing ``reset() -> state`` and
            ``step(action) -> (state, reward, done, info)``.
        agent: object exposing ``act(state) -> action``.
        nb_episode: number of episodes to play.

    Returns:
        ``np.ndarray`` of shape ``(nb_episode,)`` holding the undiscounted
        sum of rewards collected in each episode.
    """
    rewards = np.zeros(nb_episode)
    for i in range(nb_episode):
        state = env.reset()
        done = False
        episode_return = 0
        # Test truthiness rather than identity: `done is False` is wrong for
        # a numpy bool (np.False_ is not the False singleton), which would
        # end the episode after a single step.
        while not done:
            action = agent.act(state)
            state, reward, done, info = env.step(action)
            episode_return += reward
        rewards[i] = episode_return
        print('episode: {} - cum reward {}'.format(i, rewards[i]))
    return rewards

In [None]:
#TODO: interact with the environment through episode and display the return

### TODO 1): write custom loss for policy gradient

weighted negative binary crossentropy:
$-G\sum_{j=1}^N[a^j\log(\hat{a}^j) + (1-a^j)\log(1 - \hat{a}^j)]$

In [None]:
#TODO: write custom loss for policy gradient
def policy_gradient_loss(returns):
    """Build a Keras-style loss closure that weights log-probabilities by ``returns``.

    Args:
        returns: weights multiplied with ``log_probs`` inside the loss
            (presumably one discounted return per sample of the batch --
            verify against the caller).

    Returns:
        A function ``(one_hot_action, action_probs) -> loss`` that closes
        over ``returns``.
    """
    def modified_crossentropy(one_hot_action, action_probs):
        # Exercise: derive log_probs from one_hot_action and action_probs.
        # While it is still None, the multiplication below cannot work.
        log_probs = None #to complete
        # Negated mean of the return-weighted log-probabilities.
        loss = -K.mean(returns * log_probs)
        return loss
    return modified_crossentropy

### TODO 2): complete training of vanilla policy gradient

In [None]:
#TODO: complete training of vanilla policy gradient
# 
import gym
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, multiply, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop, Adam


class ReinforceAgent(DeepAgent):
    """REINFORCE agent; the actual gradient step is left as an exercise.

    ``act`` samples an action from the probabilities predicted by the model.
    ``train`` records every step and, once the episode ends, computes the
    discounted returns that the (still to be written) update should use.
    """
    def __init__(self, env, compiled_model, gamma = .99, epsilon = .01):
        # env, gamma and epsilon are forwarded unchanged to DeepAgent.
        super().__init__(env, gamma, epsilon)

        self.model = compiled_model
        self.model.summary()

        # Steps of the episode currently being played.
        self.episode = []

    def act(self, state):
        """Sample an action index from the model's output for ``state``."""
        state = state.reshape(1, -1)
        prob = self.model.predict(state, batch_size=1, verbose=0).flatten()
        # self.action_dim is not set in this class (presumably by DeepAgent).
        action = np.random.choice(self.action_dim, 1, p=prob)[0]
        return action

    def train(self, current_state, action, reward, next_state, done):
        """Record one transition; prepare the update when the episode ends.

        ``next_state`` is accepted for interface compatibility but unused.
        """
        # (state, action, reward) is ragged, so it needs an explicit object
        # array: np.array([...]) on ragged input raises on NumPy >= 1.24.
        step = np.empty(3, dtype=object)
        step[0], step[1], step[2] = current_state, action, reward
        self.episode.append(step) # save the trajectory
        # Truthiness, not `is True`, so numpy booleans are handled as well.
        if done: # Compute and use the discounted reward at the end of the episode and train
            episode = np.asarray(self.episode)
            # Start the next episode from an empty buffer so trajectories
            # do not pile up across episodes.
            self.episode = []
            # Cast the rewards out of object dtype before the numeric helper.
            discounted_return = discount_cumsum(episode[:,2].astype(np.float64), self.gamma)
            pass # complete here
            # Compute the custom loss
            # train

    def save_model(self, path):
        """Save the Keras model to ``path``."""
        self.model.save(path)

In [None]:
# Train the REINFORCE agent on CartPole for 300 episodes, then plot what
# run_experiment_episode_train returned (presumably one return per episode).
env = gym.make('CartPole-v0')
state_dim, action_dim = env.observation_space.shape[0], env.action_space.n
model = build_model(state_dim, action_dim)
r_agent = ReinforceAgent(env, model)
rewards = run_experiment_episode_train(env, r_agent, 300)

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(rewards, '+')
ax.set_title('cumulative reward per episode - vpg_agent')

### TODO 3): Try different model hyperparameters (number of layers, nodes) and compare learning

In [None]:
# Example of our parametrize policy function discrete action space
def your_build_model(state_sim, action_dim):
    """Build a small softmax policy network for a discrete action space.

    Args:
        state_sim: size of the state vector (input width). The name is a
            typo for ``state_dim`` and is kept only so existing keyword
            callers keep working.
        action_dim: number of actions (width of the softmax output).

    Returns:
        An uncompiled Keras ``Model`` mapping a state to action probabilities.
    """
    # Use the argument: the body used to read the global `state_dim`,
    # silently ignoring whatever the caller passed in.
    input_state = Input(name='input_state', shape=(state_sim,), dtype='float32')
    x = Dense(1, activation='relu')(input_state)
    x = Dense(1, activation='relu')(x)
    x = Dense(action_dim, activation='softmax')(x)
    model = Model(inputs=input_state, outputs=x)
    return model

In [None]:
# Train an agent built from your_build_model and compare its curve with the
# `rewards` produced by the initial model in the earlier training cell.
env = gym.make('CartPole-v0')
state_dim, action_dim = env.observation_space.shape[0], env.action_space.n
your_model = your_build_model(state_dim, action_dim)
your_r_agent = ReinforceAgent(env, your_model)
your_rewards = run_experiment_episode_train(env, your_r_agent, 300)

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(rewards, label='initial_model')
ax.plot(your_rewards, label='your_model')
ax.set_title('cumulative reward per episode - deep_reinforce_agent')
ax.legend()

# Reinforce with memory - discrete action

Unlike Q-learning, policy optimization is an on-policy algorithm: we train directly on the outputs of the current policy, so we need to compute them first.

In [None]:
import gym
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, multiply, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers import RMSprop, Adam


class ReinforceAgentWithMemory(DeepAgent):
    """REINFORCE agent that batches several episodes into each update.

    Finished episodes are stored until ``memory_size`` of them are
    available; their normalized discounted returns then weight one
    ``train_on_batch`` call, and the buffer is cleared.
    """
    def __init__(self, env, compiled_model, gamma = .99, epsilon = .01, memory_size = 3):
        # env, gamma and epsilon are forwarded unchanged to DeepAgent.
        super().__init__(env,  gamma, epsilon)

        self.model = compiled_model

        self.model.summary()

        self.episode = []               # steps of the episode in progress
        self.memory_size = memory_size  # episodes gathered per update
        self.episodes = []              # finished episodes awaiting an update

    def act(self, state):
        """Sample an action index from the model's output for ``state``."""
        state = state.reshape(1, -1)
        prob = self.model.predict(state, batch_size=1, verbose=0).flatten()
        # self.action_dim is not set in this class (presumably by DeepAgent).
        action = np.random.choice(self.action_dim, 1, p=prob)[0]

        return action

    def train(self, current_state, action, reward, next_state, done):
        """Record one transition; update the model once enough episodes ended.

        ``next_state`` is accepted for interface compatibility but unused.
        """
        # (state, action, reward) is ragged, so it needs an explicit object
        # array: np.array([...]) on ragged input raises on NumPy >= 1.24.
        step = np.empty(3, dtype=object)
        step[0], step[1], step[2] = current_state, action, reward
        self.episode.append(step)
        # Truthiness, not `is True`, so numpy booleans are handled as well.
        if not done:
            return

        episode = np.asarray(self.episode)
        self.episode = []
        # Cast the rewards out of object dtype so the mean/std below operate
        # on a plain float array.
        discounted_return = np.asarray(
            discount_cumsum(episode[:,2].astype(np.float64), self.gamma),
            dtype=np.float64,
        )
        X = np.vstack(episode[:,0])
        # One-hot encoding of the actions that were taken.
        Y = np.zeros((len(episode), self.action_dim))
        Y[np.arange(len(episode)), episode[:,1].astype(int)] = 1

        # Store the finished episode BEFORE checking the buffer size: the
        # episode that found the buffer full used to be thrown away.
        self.episodes.append([X,Y,discounted_return])
        if len(self.episodes) < self.memory_size:
            return

        Xs = np.vstack([ep[0] for ep in self.episodes])
        Ys = np.vstack([ep[1] for ep in self.episodes])
        discounted_returns = np.hstack([ep[2] for ep in self.episodes])
        self.episodes = []
        # Standardize the returns; the small constant avoids dividing by
        # zero when every return in the batch is identical.
        discounted_returns = discounted_returns - discounted_returns.mean()
        discounted_returns = discounted_returns / (discounted_returns.std() + 1e-8)
        loss = policy_gradient_loss(discounted_returns)
        # The loss closes over this batch's returns, hence the recompile.
        # NOTE(review): a fresh Adam per update also discards the optimizer's
        # accumulated state -- consider reusing a single optimizer instance.
        self.model.compile(loss=loss, optimizer=Adam(learning_rate=1e-2))
        self.model.train_on_batch(Xs,Ys)

In [None]:
# Train the memory-based REINFORCE agent on CartPole and plot the result.
env = gym.make('CartPole-v0')
# Derive the dimensions from this env instead of relying on variables left
# over from earlier cells, so the cell also works on its own.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n
model = build_model(state_dim, action_dim)
q_agent = ReinforceAgentWithMemory(env, model)
rewards = run_experiment_episode_train(env, q_agent, 300)
# Single figure only: the extra plt.plot(rewards) call drew the same data a
# second time in a separate, untitled figure.
fig,ax = plt.subplots(figsize=(10,10))
ax.plot(rewards,'+')
ax.set_title('cumulative reward per episode - vpg_agent large memory')

# other improvements 

### GAE (generalized advantage estimation) actor critic
We can rewrite the policy gradient

$\nabla_{\theta} J(\pi_{\theta}) = E_{\tau \sim \pi_{\theta}}[{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Phi_t}]$,

where $\Phi_t$ can be any of:
- $\Phi_t =  G_t$
- $\Phi_t = \sum_{t'=t}^T R_{t'+1} - V(s_t)$
- $\Phi_t = Q(s_t,a_t)$


For the last 2 cases we need to estimate V or Q (the critics). We do it the same way as in deep Q-learning.
https://arxiv.org/pdf/1506.02438.pdf

$\phi_k = \arg \min_{\phi} E_{s_t, G_t \sim \pi_k}[{\left( V_{\phi}(s_t) - G_t \right)^2}]$

### off policy
To build an experience replay for policy gradient, it is necessary to unbias the experiences.
https://arxiv.org/pdf/1205.4839.pdf

### clipping