In [1]:
import warnings
warnings.filterwarnings('ignore')

### Run in Colab
<a href="https://colab.research.google.com/github/racousin/data_science_practice/blob/master/website/public/modules/data-science-practice/module13/exercise/module13_exercise5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!apt-get install swig build-essential python-dev python3-dev > /dev/null 2>&1
!pip install gym==0.23.1 > /dev/null 2>&1

In [4]:
import gymnasium as gym
import numpy as np

In [None]:
# We will experiment with our algorithm on CartPole.
# Note: the training cells below re-create `env` with 'CartPole-v1'.
env = gym.make('CartPole-v0')

### Objective
Here we present an alternative to Q-learning: the policy gradient algorithm.

**Complete the TODO steps! Good luck!**

# Policy gradient
In policy gradient methods, we parametrize the policy $\pi_\theta$ directly. This is especially useful when the action space is continuous: in that case, a greedy policy based on Q-learning needs to compute $\arg\max_a Q(s,a)$, which can be tedious. More generally, policy gradient algorithms are better suited to exploring large state-action spaces.

$J(\pi_{\theta}) = E_{\tau \sim \pi_{\theta}}[{G(\tau)}]$

We can prove that:


$\nabla_{\theta} J(\pi_{\theta}) = E_{\tau \sim \pi_{\theta}}[{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) G(\tau)}]$

1. In discrete action space

we parametrize $\pi$ with $\theta$, such as $\pi_\theta : S \rightarrow [0,1]^{dim(A)}$ and $\forall s$ $\sum \pi_\theta(s) = 1$.


2. In continuous action space

we parametrize $\pi$ with $\theta$, such as $\pi_\theta : S \rightarrow \mu^{dim(A)} \times \sigma^{dim(A)} =  \mathbb{R}^{dim(A)} \times \mathbb{R}_{+,*}^{dim(A)}$



In deep learning frameworks such as PyTorch (used below) or Keras, it is easier to specify a loss than a gradient.
1. It is possible to show that the loss for discrete actions ($1,...,N$) with a softmax policy is the return-weighted categorical cross-entropy (i.e. the negative log-likelihood of the chosen action):
$-G\sum_{j=1}^N a^j\log(\hat{a}^j) = -G \log \pi_\theta(a_t | s_t)$

with:
$a^j=1$ if $a_t = j$, $0$ otherwise.

$\hat{a}^j = \pi_\theta(s_t)^j$.

$G$ is the discounted empirical return $G_t = \sum_{k=0}^{T-t-1} \gamma^k R_{t+k+1}$ from state $s_t$ and $a_t$


2. It is possible to show that the loss for continuous actions (of dimension $N$) with a multivariate Gaussian policy (identity covariance) is, up to an additive constant, the return-weighted squared error:

$\frac{G}{2}\sum_{j=1}^N (a^j - \hat{a}^j)^2$

$\hat{a}^j = \pi_\theta(s_t)^j$.



see https://aleksispi.github.io/assets/pg_autodiff.pdf for more explanation

# Reinforce - discrete action

In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

class PolicyNetwork(nn.Module):
    """Small MLP policy: two ReLU hidden layers followed by a softmax head."""

    def __init__(self, input_size, hidden_size, output_size):
        """
        Build the three linear layers of the policy.

        Args:
            input_size (int): Dimension of state space
            hidden_size (int): Number of hidden units
            output_size (int): Dimension of action space
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """
        Map a state (or batch of states) to action probabilities.

        Args:
            x (torch.Tensor): Input state

        Returns:
            torch.Tensor: Action probabilities (softmax over the last dimension)
        """
        hidden = x
        # Both hidden layers share the same linear -> ReLU pattern.
        for layer in (self.fc1, self.fc2):
            hidden = torch.relu(layer(hidden))
        logits = self.fc3(hidden)
        return logits.softmax(dim=-1)

class REINFORCE:
    """
    Monte-Carlo policy-gradient (REINFORCE) agent for discrete action spaces.

    One full episode is collected with the current policy, the discounted
    (and normalized) returns are computed, and a single gradient step is taken
    on the loss ``sum_t -log pi(a_t | s_t) * G_t``.
    """

    def __init__(self, env, hidden_size=128, learning_rate=1e-3, gamma=0.99):
        """
        Initialize the REINFORCE agent.

        Args:
            env: Gymnasium environment
            hidden_size (int): Number of hidden units in the policy network
            learning_rate (float): Learning rate for optimization
            gamma (float): Discount factor
        """
        self.env = env
        self.gamma = gamma

        # Initialize policy network
        self.policy = PolicyNetwork(
            input_size=env.observation_space.shape[0],
            hidden_size=hidden_size,
            output_size=env.action_space.n
        )

        self.optimizer = optim.Adam(self.policy.parameters(), lr=learning_rate)

        # Storage for the current trajectory
        self.states = []
        self.actions = []
        self.rewards = []
        # Initialized here (not only in `train`) so `update_policy` can be
        # called on a freshly constructed agent without an AttributeError.
        self.saved_log_probs = []
        # Log-probability of the most recently sampled action.
        self.log_probs = None

    def select_action(self, state):
        """
        Sample an action from the policy network's categorical distribution.

        The log-probability of the sampled action is kept in ``self.log_probs``
        so the caller can record it for the policy update.

        Args:
            state (numpy.ndarray): Current state

        Returns:
            int: Selected action
        """
        state = torch.as_tensor(state, dtype=torch.float32)
        probs = self.policy(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample()

        # Store log probability for training
        self.log_probs = action_dist.log_prob(action)

        return action.item()

    def store_transition(self, state, action, reward):
        """
        Store state, action, and reward for the current transition.

        Args:
            state (numpy.ndarray): Current state
            action (int): Selected action
            reward (float): Received reward
        """
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)

    def calculate_returns(self):
        """
        Calculate discounted returns for the stored episode rewards.

        Returns:
            torch.Tensor: 1-D float tensor of discounted returns, one per stored
            reward; normalized to zero mean / unit std when it has more than
            one element.
        """
        discounted = []
        running_return = 0.0

        # Accumulate backwards (G_t = r_t + gamma * G_{t+1}); append + reverse
        # keeps this linear instead of the quadratic `insert(0, ...)`.
        for reward in reversed(self.rewards):
            running_return = reward + self.gamma * running_return
            discounted.append(running_return)
        discounted.reverse()

        returns = torch.tensor(discounted, dtype=torch.float32)

        # Normalize returns for stability; std is undefined for a single value.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        return returns

    def update_policy(self):
        """
        Update the policy network using the REINFORCE algorithm.

        Does nothing except clear the trajectory storage when no
        (log-probability, return) pairs are available.
        """
        returns = self.calculate_returns()

        # One loss term per time step: -log pi(a_t | s_t) * G_t
        loss_terms = [
            -log_prob * G for log_prob, G in zip(self.saved_log_probs, returns)
        ]

        # An empty episode has nothing to back-propagate through.
        if loss_terms:
            policy_loss = torch.stack(loss_terms).sum()

            # Optimize the policy
            self.optimizer.zero_grad()
            policy_loss.backward()
            self.optimizer.step()

        # Clear trajectory storage
        self.states = []
        self.actions = []
        self.rewards = []
        self.saved_log_probs = []

    def train(self, num_episodes, max_steps=1000):
        """
        Train the agent for a specified number of episodes.

        Args:
            num_episodes (int): Number of episodes to train
            max_steps (int): Maximum steps per episode

        Returns:
            list: Total (undiscounted) reward of each episode
        """
        episode_rewards = []

        for episode in range(num_episodes):
            state, _ = self.env.reset()
            episode_reward = 0
            self.saved_log_probs = []

            for step in range(max_steps):
                # Select action and record its log-probability
                action = self.select_action(state)
                self.saved_log_probs.append(self.log_probs)

                # Take action in environment
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated

                # Store transition
                self.store_transition(state, action, reward)
                episode_reward += reward

                if done:
                    break

                state = next_state

            # Update policy after episode
            self.update_policy()
            episode_rewards.append(episode_reward)

            # Print episode statistics
            if (episode + 1) % 10 == 0:
                avg_reward = np.mean(episode_rewards[-10:])
                print(f"Episode {episode + 1}, Average Reward (last 10): {avg_reward:.2f}")

        return episode_rewards


In [9]:
# Create the CartPole-v1 environment (replaces any earlier `env`).
env = gym.make('CartPole-v1')

# Initialize the agent with its default hyperparameters and train it;
# `rewards` holds the total reward of each training episode.
agent = REINFORCE(env)
rewards = agent.train(num_episodes=500)

# Roll out one test episode with the trained policy.
state, _ = env.reset()  # Gymnasium's reset returns (state, info)
done = False
total_reward = 0

while not done:
    # Actions are still sampled from the policy's distribution (not argmax).
    action = agent.select_action(state)
    state, reward, terminated, truncated, _ = env.step(action)  # Gymnasium 5-tuple step API
    done = terminated or truncated  # stop on either termination or truncation
    total_reward += reward

print(f"Test Episode Reward: {total_reward}")
env.close()

Episode 10, Average Reward (last 10): 18.50
Episode 20, Average Reward (last 10): 19.80
Episode 30, Average Reward (last 10): 24.40
Episode 40, Average Reward (last 10): 23.90
Episode 50, Average Reward (last 10): 28.10
Episode 60, Average Reward (last 10): 43.20
Episode 70, Average Reward (last 10): 32.20
Episode 80, Average Reward (last 10): 34.40
Episode 90, Average Reward (last 10): 67.80
Episode 100, Average Reward (last 10): 69.30
Episode 110, Average Reward (last 10): 79.70
Episode 120, Average Reward (last 10): 103.10
Episode 130, Average Reward (last 10): 108.50
Episode 140, Average Reward (last 10): 135.10
Episode 150, Average Reward (last 10): 166.30
Episode 160, Average Reward (last 10): 247.10
Episode 170, Average Reward (last 10): 323.90
Episode 180, Average Reward (last 10): 197.60
Episode 190, Average Reward (last 10): 199.00
Episode 200, Average Reward (last 10): 243.50
Episode 210, Average Reward (last 10): 320.00
Episode 220, Average Reward (last 10): 324.70
Episode 

### TODO: Try different model hyperparameters (number of layers, nodes) and compare learning speed and stability

In [None]:
# Create the CartPole-v1 environment.
env = gym.make('CartPole-v1')

# Initialize and train the agent. REINFORCE accepts `hidden_size`,
# `learning_rate` and `gamma` keyword arguments (defaults: 128, 1e-3, 0.99);
# pass other values here to compare learning speed and stability.
agent = REINFORCE(env)
rewards = agent.train(num_episodes=500)

# Roll out one test episode with the trained policy.
state, _ = env.reset()  # Gymnasium's reset returns (state, info)
done = False
total_reward = 0

while not done:
    # Actions are still sampled from the policy's distribution (not argmax).
    action = agent.select_action(state)
    state, reward, terminated, truncated, _ = env.step(action)  # Gymnasium 5-tuple step API
    done = terminated or truncated  # stop on either termination or truncation
    total_reward += reward

print(f"Test Episode Reward: {total_reward}")
env.close()

# other improvements

### GAE (Generalized Advantage Estimation) actor-critic
We can rewrite the policy gradient

$\nabla_{\theta} J(\pi_{\theta}) = E_{\tau \sim \pi_{\theta}}[{\sum_{t=0}^{T} \nabla_{\theta} \log \pi_{\theta}(a_t |s_t) \Phi_t}]$,

where $\Phi_t$ can be any of:
- $\Phi_t =  G_t$
- $\Phi_t = \sum_{t'=t}^T R_{t'+1} - V(s_t)$
- $\Phi_t = Q(s_t,a_t) - V(s_t)$ (the advantage $A(s_t,a_t)$)


For the last 2 cases we need to estimate V or Q (the critics). We do it the same way as in deep Q-learning.
https://arxiv.org/pdf/1506.02438.pdf

$\phi_k = \arg \min_{\phi} E_{s_t, G_t \sim \pi_k}[{\left( V_{\phi}(s_t) - G_t \right)^2}]$

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import gymnasium as gym

class ActorNetwork(nn.Module):
    """Policy head of the actor-critic agent: an MLP ending in a softmax."""

    def __init__(self, input_size, hidden_size, output_size):
        """
        Create the actor's three linear layers.

        Args:
            input_size (int): State dimension
            hidden_size (int): Number of hidden units
            output_size (int): Action dimension
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, state):
        """
        Compute the action distribution for a state.

        Args:
            state (torch.Tensor): Input state

        Returns:
            torch.Tensor: Action probabilities (softmax over the last dimension)
        """
        first_hidden = self.fc1(state).relu()
        second_hidden = self.fc2(first_hidden).relu()
        logits = self.fc3(second_hidden)
        return logits.softmax(dim=-1)

class CriticNetwork(nn.Module):
    """State-value estimator: an MLP with a single linear output unit."""

    def __init__(self, input_size, hidden_size):
        """
        Create the critic's three linear layers.

        Args:
            input_size (int): State dimension
            hidden_size (int): Number of hidden units
        """
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, 1)

    def forward(self, state):
        """
        Estimate the value of a state.

        Args:
            state (torch.Tensor): Input state

        Returns:
            torch.Tensor: Estimated state value (last dimension of size 1)
        """
        # Two ReLU hidden layers, then an unbounded linear output.
        features = F.relu(self.fc2(F.relu(self.fc1(state))))
        return self.fc3(features)

class ActorCritic:
    """
    One-step (TD(0)) actor-critic agent for discrete action spaces.

    After every environment step the critic is regressed towards the TD target
    ``r + gamma * V(s')`` and the actor takes a policy-gradient step weighted by
    the TD error (used as the advantage estimate).
    """

    def __init__(self, env, hidden_size=128, actor_lr=3e-4, critic_lr=1e-3, gamma=0.99):
        """
        Initialize the Actor-Critic agent.

        Args:
            env: Gymnasium environment
            hidden_size (int): Number of hidden units in networks
            actor_lr (float): Learning rate for actor network
            critic_lr (float): Learning rate for critic network
            gamma (float): Discount factor
        """
        self.env = env
        self.gamma = gamma

        # Initialize actor network
        self.actor = ActorNetwork(
            input_size=env.observation_space.shape[0],
            hidden_size=hidden_size,
            output_size=env.action_space.n
        )

        # Initialize critic network
        self.critic = CriticNetwork(
            input_size=env.observation_space.shape[0],
            hidden_size=hidden_size
        )

        # Setup optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)

    def select_action(self, state):
        """
        Sample an action from the actor's categorical distribution.

        Args:
            state (numpy.ndarray): Current state

        Returns:
            tuple: Selected action (int) and its log probability (torch.Tensor)
        """
        state = torch.as_tensor(state, dtype=torch.float32)
        action_probs = self.actor(state)
        dist = torch.distributions.Categorical(action_probs)
        action = dist.sample()
        log_prob = dist.log_prob(action)
        return action.item(), log_prob

    def get_value(self, state):
        """
        Estimate the value of a state using the critic network.

        Args:
            state (numpy.ndarray): Input state

        Returns:
            torch.Tensor: Estimated state value
        """
        state = torch.as_tensor(state, dtype=torch.float32)
        value = self.critic(state)
        return value

    def update(self, state, action_log_prob, reward, next_state, done):
        """
        Update both actor and critic networks from a single transition.

        Args:
            state (numpy.ndarray): Current state
            action_log_prob (torch.Tensor): Log probability of taken action
            reward (float): Received reward
            next_state (numpy.ndarray): Next state
            done (bool): Whether ``next_state`` is terminal. When True the TD
                target is just ``reward`` (no bootstrap). Pass False for
                time-limit truncation so the target still bootstraps.

        Returns:
            tuple: (actor loss, critic loss) as Python floats
        """
        # Convert to tensors
        state = torch.as_tensor(state, dtype=torch.float32)
        next_state = torch.as_tensor(next_state, dtype=torch.float32)
        reward = torch.tensor([reward], dtype=torch.float32)

        value = self.critic(state)

        # TD target; it is a fixed regression target, so no gradient is tracked
        # through the next-state value, and it is skipped entirely when terminal.
        if done:
            expected_value = reward
        else:
            with torch.no_grad():
                expected_value = reward + self.gamma * self.critic(next_state)

        # TD error doubles as the advantage estimate; detached so the actor
        # loss does not back-propagate into the critic.
        advantage = (expected_value - value).detach()
        critic_loss = F.mse_loss(value, expected_value)

        # Calculate actor loss
        actor_loss = -action_log_prob * advantage

        # Update critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        return actor_loss.item(), critic_loss.item()

    def train(self, num_episodes, max_steps=1000):
        """
        Train the agent for a specified number of episodes.

        Args:
            num_episodes (int): Number of episodes to train
            max_steps (int): Maximum steps per episode

        Returns:
            list: Episode rewards
        """
        episode_rewards = []

        for episode in range(num_episodes):
            state, _ = self.env.reset()
            episode_reward = 0
            actor_losses = []
            critic_losses = []

            for step in range(max_steps):
                # Select and perform action
                action, action_log_prob = self.select_action(state)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                done = terminated or truncated

                # Update networks. Only a true termination removes the
                # bootstrap term: a truncated episode (e.g. time limit) did not
                # reach a terminal state, so V(next_state) is still meaningful.
                actor_loss, critic_loss = self.update(
                    state, action_log_prob, reward, next_state, terminated
                )
                actor_losses.append(actor_loss)
                critic_losses.append(critic_loss)

                episode_reward += reward

                # The rollout itself stops on termination or truncation.
                if done:
                    break

                state = next_state

            episode_rewards.append(episode_reward)

            # Print episode statistics
            if (episode + 1) % 10 == 0:
                avg_reward = np.mean(episode_rewards[-10:])
                avg_actor_loss = np.mean(actor_losses)
                avg_critic_loss = np.mean(critic_losses)
                print(f"Episode {episode + 1}")
                print(f"Average Reward (last 10): {avg_reward:.2f}")
                print(f"Average Actor Loss: {avg_actor_loss:.4f}")
                print(f"Average Critic Loss: {avg_critic_loss:.4f}\n")

        return episode_rewards

### TODO: Try different model hyperparameters (number of layers, nodes) and compare learning speed and stability

In [11]:
# Create the CartPole-v1 environment (replaces any earlier `env`).
env = gym.make('CartPole-v1')

# Initialize and train the agent. ActorCritic accepts `hidden_size`, `actor_lr`,
# `critic_lr` and `gamma` keyword arguments (defaults: 128, 3e-4, 1e-3, 0.99).
agent = ActorCritic(env)
rewards = agent.train(num_episodes=500)

# Roll out one test episode with the trained policy.
state, _ = env.reset()
done = False
total_reward = 0

while not done:
    # select_action returns (action, log_prob); the log-prob is not needed here.
    action, _ = agent.select_action(state)
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated  # stop on either termination or truncation
    total_reward += reward

print(f"\nTest Episode Reward: {total_reward}")
env.close()

Episode 10
Average Reward (last 10): 18.70
Average Actor Loss: 0.1906
Average Critic Loss: 10.4676

Episode 20
Average Reward (last 10): 18.20
Average Actor Loss: -0.1193
Average Critic Loss: 17.5052

Episode 30
Average Reward (last 10): 20.90
Average Actor Loss: 0.0156
Average Critic Loss: 11.0133

Episode 40
Average Reward (last 10): 16.70
Average Actor Loss: -0.1793
Average Critic Loss: 14.3520

Episode 50
Average Reward (last 10): 28.30
Average Actor Loss: -0.0947
Average Critic Loss: 8.5526

Episode 60
Average Reward (last 10): 47.20
Average Actor Loss: 0.0546
Average Critic Loss: 12.8212

Episode 70
Average Reward (last 10): 49.40
Average Actor Loss: -0.2119
Average Critic Loss: 18.3696

Episode 80
Average Reward (last 10): 65.60
Average Actor Loss: -0.0920
Average Critic Loss: 19.0536

Episode 90
Average Reward (last 10): 70.20
Average Actor Loss: 0.0877
Average Critic Loss: 17.7627

Episode 100
Average Reward (last 10): 80.20
Average Actor Loss: -0.4060
Average Critic Loss: 10.

# Actor Critic with other improvements
Architecture Improvements:

- Layer normalization for better stability
- Orthogonal weight initialization
- Separate target network for critic
- PPO-style clipping for more stable updates


Advanced Features:

- Generalized Advantage Estimation (GAE)
- Entropy regularization for exploration
- Gradient clipping to prevent exploding gradients
- Mini-batch updates for better sample efficiency
- Experience replay buffer with proper advantage computation


Training Stabilizers:

- Advantage normalization
- Multiple update epochs per batch
- Proper handling of episode termination
- Target network periodic updates
- Proper PPO-style policy updates with clipping

### TODO: Implement the improvements and try them on other environments

In [None]:
# LunarLander requires the Box2D extra of Gymnasium to be installed.
env = gym.make("LunarLander-v2")

agent = ActorCritic(env)
rewards = agent.train(num_episodes=500)

# Test the trained policy
state, _ = env.reset()
done = False
total_reward = 0

while not done:
    # ActorCritic.select_action(state) returns (action, log_prob). If your
    # improved agent changes that signature (e.g. adds an `evaluate` flag or
    # extra return values), adapt this call accordingly.
    action, _ = agent.select_action(state)
    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    total_reward += reward

print(f"\nTest Episode Reward: {total_reward}")
env.close()