In [8]:
import gym
import matplotlib.pyplot as plt
import torch
import numpy as np
from tqdm import tqdm 

from src2.ValueNetwork import ValueNetwork
from src2.PolicyNetwork import PolicyNetwork
from src2.Trajectories import Trajectories

In [None]:
class PPOAgent:
    """Proximal Policy Optimization (clipped surrogate) agent.

    The agent owns three networks: a value network, the policy network that is
    being optimised, and a frozen snapshot of the policy (``policy_net_OLD``)
    that supplies the denominator of the probability ratio. ``train`` samples a
    batch of trajectories, computes GAE advantages per trajectory, and then
    performs one optimiser step per (trajectory, time step) pair for the policy
    and for the value network.
    """

    def __init__(self,
                 env: "gym.Env",
                 state_space_size: int,
                 action_space_size: int,
                 batch_size: int,
                 gamma: float,
                 lmbda: float,
                 epsilon: float,
                 smooting_const: float,
                 normalize_advantages: bool = True,
                 dtype: torch.dtype = torch.float32,
                 device: str = 'cpu'):
        """Store the hyper-parameters and build the three networks.

        Args:
            env: Environment handed to ``Trajectories`` when sampling.
            state_space_size: Size passed to both networks as ``state_space_size``.
            action_space_size: Size passed to the policy networks.
            batch_size: Number of trajectories sampled per training iteration.
            gamma: Discount factor used in TD residuals, GAE and value targets.
            lmbda: GAE decay factor (multiplied with ``gamma`` in the recursion).
            epsilon: Half-width of the clipping interval ``[1 - eps, 1 + eps]``.
            smooting_const: Constant added to the standard deviation when
                normalising advantages. The misspelt keyword is kept so that
                existing callers keep working.
            normalize_advantages: Whether ``compute_GAE`` normalises its result.
            dtype: dtype passed to the networks and used for helper tensors.
            device: Device passed to the networks and used for helper tensors.
        """
        self.env = env

        self.state_space_size = state_space_size
        self.action_space_size = action_space_size
        self.gamma = gamma
        self.lmbda = lmbda
        self.epsilon = epsilon
        self.smoothing_constant = smooting_const
        self.normalize_advantages = normalize_advantages
        self.batch_size = batch_size

        self.dtype = dtype
        self.device = device

        self.value_net = ValueNetwork(state_space_size=self.state_space_size,
                                      dtype=self.dtype,
                                      device=self.device)

        self.policy_net = PolicyNetwork(state_space_size=self.state_space_size,
                                        action_space_size=self.action_space_size,
                                        dtype=self.dtype,
                                        device=self.device)

        self.policy_net_OLD = PolicyNetwork(state_space_size=self.state_space_size,
                                            action_space_size=self.action_space_size,
                                            dtype=self.dtype,
                                            device=self.device)
        # Start the snapshot from the same weights as the live policy.
        self.policy_net_OLD.load_state_dict(self.policy_net.state_dict())

    def get_normalized_advantages(self, advantages: torch.Tensor) -> torch.Tensor:
        """Return ``advantages`` shifted to zero mean and scaled by their std.

        Args:
            advantages: Advantage estimates of one trajectory.

        Returns:
            ``(advantages - mean) / (std + smoothing_constant)``. With fewer
            than two elements only the mean is subtracted, because the sample
            standard deviation of a single value is NaN and would otherwise
            poison the policy update.
        """
        centered = advantages - advantages.mean()
        if advantages.numel() < 2:
            return centered
        return centered / (torch.std(advantages) + self.smoothing_constant)

    def compute_GAE(self, deltas: torch.Tensor) -> torch.Tensor:
        """Turn per-step TD residuals into generalised advantage estimates.

        Uses the backward recursion
        ``A_t = delta_t + gamma * lmbda * A_{t+1}`` with ``A_T = 0``.

        Args:
            deltas: TD residuals of one trajectory, indexed by time step.

        Returns:
            Tensor shaped like ``deltas``; normalised when
            ``self.normalize_advantages`` is set.
        """
        advantages = torch.zeros_like(deltas)
        advantage = 0.0
        for t in reversed(range(len(deltas))):
            advantage = deltas[t] + self.gamma * self.lmbda * advantage
            advantages[t] = advantage

        if self.normalize_advantages:
            return self.get_normalized_advantages(advantages=advantages)
        return advantages

    def compute_TD_residual(self, reward_t: float, next_value_t: float, value_t: float) -> float:
        """Return the one-step TD residual ``r_t + gamma * V(s_{t+1}) - V(s_t)``."""
        return reward_t + self.gamma * next_value_t - value_t

    def _trajectory_advantages(self, states, next_states, rewards, game_length: int) -> torch.Tensor:
        """Compute the advantages of a single trajectory.

        The whole computation runs under ``torch.no_grad()``: advantages are
        constants in the policy objective, so building an autograd graph
        through the value network for every step only costs time and memory.

        Args:
            states: States of the trajectory, indexable by time step.
            next_states: Successor states, indexable by time step.
            rewards: Rewards, indexable by time step.
            game_length: Number of time steps to process.

        Returns:
            Advantages for steps ``0 .. game_length - 1`` (see ``compute_GAE``).
        """
        deltas = torch.zeros(game_length, dtype=self.dtype, device=self.device)
        with torch.no_grad():
            for t in range(game_length):
                value_t = self.value_net(states[t])
                if t == game_length - 1:
                    # No bootstrapping after the final step. Shaping the zero
                    # like value_t keeps it on the network's device and dtype.
                    next_value_t = torch.zeros_like(value_t)
                else:
                    next_value_t = self.value_net(next_states[t])

                delta = self.compute_TD_residual(reward_t=rewards[t],
                                                 next_value_t=next_value_t,
                                                 value_t=value_t)
                # The value network is expected to emit one element per state.
                deltas[t] = delta.reshape(())

            return self.compute_GAE(deltas=deltas)

    def get_policy_loss(self, state: torch.Tensor, action: int, advantage: float):
        """Return the PPO-Clip loss for a single (state, action) sample.

        Both policy networks are assumed to return a 1-D tensor of action
        probabilities for ``state``; indexing with ``action`` relies on that.

        Args:
            state: Observation the action was taken in.
            action: Index of the action that was taken.
            advantage: Advantage estimate for this step.

        Returns:
            ``-min(r * A, clip(r, 1 - eps, 1 + eps) * A)`` where ``r`` is the
            probability ratio between the live policy and the snapshot.
        """
        # The snapshot is a fixed reference: evaluate it without autograd so
        # no gradients pile up on its parameters.
        with torch.no_grad():
            pi_old = self.policy_net_OLD(state)[action]

        pi_new = self.policy_net(state)[action]

        # Probability ratio r(theta)
        r = pi_new / pi_old

        surrogate_obj = r * advantage
        clipped_obj = torch.clamp(r, 1 - self.epsilon, 1 + self.epsilon) * advantage

        return -torch.min(surrogate_obj, clipped_obj).mean()

    def get_value_loss(self, state: torch.Tensor, next_state: torch.Tensor, reward: torch.Tensor, is_last_step: bool):
        """Return the MSE between ``V(state)`` and its one-step TD target.

        Args:
            state: Observation whose value is being fitted.
            next_state: Successor observation, used for bootstrapping.
            reward: Reward received for the transition (tensor or number).
            is_last_step: When true the target is the reward alone.

        Returns:
            Scalar mean-squared-error loss.
        """
        if is_last_step:
            target_value = reward
        else:
            # The bootstrap term is detached so it acts as a constant target
            # and only V(state) receives gradient.
            target_value = reward + self.gamma * self.value_net(next_state).detach()

        value_estimate = self.value_net(state)

        # Align the target with the estimate's dtype, device and shape so that
        # plain-number rewards and differently typed reward tensors both work.
        target_value = torch.as_tensor(target_value,
                                       dtype=value_estimate.dtype,
                                       device=value_estimate.device).reshape(value_estimate.shape)

        return torch.nn.functional.mse_loss(value_estimate, target_value)

    def train(self, episodes: int, policy_lr: float, value_lr: float, num_policy_epochs: int, num_value_epochs: int):
        """Run the PPO training loop.

        Each iteration samples ``batch_size`` trajectories with the current
        policy, computes advantages, refreshes the policy snapshot and then
        runs the policy and value update epochs.

        Args:
            episodes: Number of sample-and-update iterations.
            policy_lr: Adam learning rate for the policy network.
            value_lr: Adam learning rate for the value network.
            num_policy_epochs: Passes over the batch for the policy update.
            num_value_epochs: Passes over the batch for the value update.

        Returns:
            List with one float per iteration: the reward summed over time and
            averaged over the trajectories of that iteration's batch.
        """
        policy_optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        value_optimizer = torch.optim.Adam(self.value_net.parameters(), lr=value_lr)

        avg_accumulated_reward = []

        for _ in tqdm(range(episodes)):
            # Sample a fresh batch of trajectories with the current policy.
            trajectories = Trajectories(batch_size=self.batch_size, env=self.env, policy_network=self.policy_net)
            states_batch, actions_batch, rewards_batch, next_states_batch = trajectories.get_batch()

            # Number of time steps per trajectory in this batch.
            game_length = actions_batch.shape[1]
            last_step = game_length - 1

            advantages_batch = [
                self._trajectory_advantages(states=states_batch[batch],
                                            next_states=next_states_batch[batch],
                                            rewards=rewards_batch[batch],
                                            game_length=game_length)
                for batch in range(self.batch_size)
            ]

            # Freeze the pre-update policy as the reference for the ratio.
            self.policy_net_OLD.load_state_dict(self.policy_net.state_dict())

            # Policy update: one optimiser step per (trajectory, time step).
            for _ in range(num_policy_epochs):
                for batch in range(self.batch_size):
                    states = states_batch[batch]
                    actions = actions_batch[batch]
                    advantages = advantages_batch[batch]

                    for t in range(game_length):
                        state_t, action_t, advantage_t = states[t].detach(), actions[t].detach(), advantages[t].detach()

                        policy_loss = self.get_policy_loss(state=state_t, action=action_t, advantage=advantage_t)

                        policy_optimizer.zero_grad()
                        policy_loss.backward()
                        policy_optimizer.step()

            # Value update: one optimiser step per (trajectory, time step).
            for _ in range(num_value_epochs):
                for batch in range(self.batch_size):
                    states = states_batch[batch]
                    rewards = rewards_batch[batch]
                    next_states = next_states_batch[batch]

                    for t in range(game_length):
                        value_loss = self.get_value_loss(state=states[t],
                                                         next_state=next_states[t],
                                                         reward=rewards[t],
                                                         is_last_step=(t == last_step))

                        value_optimizer.zero_grad()
                        value_loss.backward()
                        value_optimizer.step()

            # .item() works for tensors on any device, unlike .numpy().
            avg_accumulated_reward.append(torch.sum(rewards_batch, dim=1).mean().item())
        return avg_accumulated_reward




In [9]:
# THE GAME: https://www.gymlibrary.dev/environments/classic_control/cart_pole/#rewards
# Create the CartPole-v1 environment that the cells below sample from.
env = gym.make('CartPole-v1')

# Network sizes are read off the environment: the first dimension of the
# observation shape and `action_space.n`. They are passed below as
# state_space_size / action_space_size.
input_dim = env.observation_space.shape[0]
output_dim = env.action_space.n

In [11]:
# Stand-alone networks sized for the environment, built outside PPOAgent.
# policy_net drives the Trajectories sampler in the next cell; value_net is
# not referenced by any other visible cell.
policy_net = PolicyNetwork(state_space_size=input_dim,
                           action_space_size=output_dim)
value_net = ValueNetwork(state_space_size=input_dim)

In [12]:
# Trajectory sampler bound to env and policy_net.
# Presumably batch_size=3 means three trajectories per get_batch() call — confirm in src2.Trajectories.
trajectories = Trajectories(env=env,batch_size=3,policy_network=policy_net)

In [21]:
# Sample one batch; the four results are unpacked in the same order PPOAgent.train uses.
states_batch, actions_batch, rewards_batch, next_states_batch = trajectories.get_batch()

In [44]:
# Sample a single trajectory, for comparison with the batched output.
# NOTE(review): the recorded output of states[0] is a plain list of floats, so
# get_trajectory() appears to return Python lists rather than tensors — confirm.
states, actions, rewards, next_states = trajectories.get_trajectory()

In [46]:
# Inspect the first state of the single trajectory sampled above.
states[0]

[-0.03350643068552017,
 -0.018029730767011642,
 -0.041487984359264374,
 0.04232301935553551]

In [35]:
# Inspect every recorded state of the first trajectory in the batch.
states_batch[0]

tensor([[-0.0328, -0.0125,  0.0177, -0.0261],
        [-0.0331, -0.2079,  0.0172,  0.2721],
        [-0.0373, -0.0130,  0.0226, -0.0151],
        [-0.0375, -0.2085,  0.0223,  0.2846],
        [-0.0417, -0.0137,  0.0280, -0.0009],
        [-0.0420, -0.2092,  0.0280,  0.3004],
        [-0.0461, -0.0145,  0.0340,  0.0167],
        [-0.0464,  0.1802,  0.0343, -0.2651],
        [-0.0428, -0.0154,  0.0290,  0.0383],
        [-0.0431, -0.2110,  0.0298,  0.3400]])

In [36]:
# Inspect every recorded state of the second trajectory in the batch.
states_batch[1]

tensor([[ 0.0422, -0.0323, -0.0363,  0.0437],
        [ 0.0415,  0.1633, -0.0354, -0.2602],
        [ 0.0448, -0.0313, -0.0406,  0.0211],
        [ 0.0442,  0.1644, -0.0402, -0.2841],
        [ 0.0475,  0.3600, -0.0458, -0.5892],
        [ 0.0547,  0.5558, -0.0576, -0.8960],
        [ 0.0658,  0.7516, -0.0755, -1.2062],
        [ 0.0808,  0.9476, -0.0997, -1.5216],
        [ 0.0998,  1.1438, -0.1301, -1.8436],
        [ 0.1226,  1.3401, -0.1670, -2.1737]])

In [42]:
# Merge the first two axes of states_batch so that every state in the batch
# becomes one row of a 2-D tensor.
# NOTE(review): this cell ends in an assignment, which displays nothing, so the
# tensor recorded below presumably comes from an earlier version of the cell.
a = states_batch.reshape((states_batch.shape[0]*states_batch.shape[1],states_batch.shape[2]))

tensor([[-3.2849e-02, -1.2517e-02,  1.7686e-02, -2.6089e-02],
        [-3.3099e-02, -2.0789e-01,  1.7165e-02,  2.7212e-01],
        [-3.7257e-02, -1.3015e-02,  2.2607e-02, -1.5099e-02],
        [-3.7517e-02, -2.0845e-01,  2.2305e-02,  2.8463e-01],
        [-4.1686e-02, -1.3657e-02,  2.7998e-02, -9.3572e-04],
        [-4.1959e-02, -2.0917e-01,  2.7979e-02,  3.0045e-01],
        [-4.6142e-02, -1.4456e-02,  3.3988e-02,  1.6718e-02],
        [-4.6432e-02,  1.8016e-01,  3.4322e-02, -2.6505e-01],
        [-4.2828e-02, -1.5433e-02,  2.9021e-02,  3.8258e-02],
        [-4.3137e-02, -2.1096e-01,  2.9786e-02,  3.3995e-01],
        [ 4.2175e-02, -3.2334e-02, -3.6253e-02,  4.3656e-02],
        [ 4.1528e-02,  1.6329e-01, -3.5380e-02, -2.6024e-01],
        [ 4.4794e-02, -3.1311e-02, -4.0584e-02,  2.1077e-02],
        [ 4.4168e-02,  1.6437e-01, -4.0163e-02, -2.8413e-01],
        [ 4.7455e-02,  3.6004e-01, -4.5845e-02, -5.8920e-01],
        [ 5.4656e-02,  5.5577e-01, -5.7629e-02, -8.9597e-01],
        

In [43]:
# Scratch copy of the TD-residual loop from PPOAgent.train.
# NOTE(review): `self` and `game_length` are not defined by any visible cell at
# notebook scope, so this cell looks like it cannot run on a fresh kernel. The
# recorded output (a column of 30 values with a grad_fn) also does not match a
# loop that ends in an assignment. Confirm, then either delete the cell or
# rewrite it against an actual PPOAgent instance.
deltas = torch.zeros(size=(game_length,))
for t in range(game_length):

    # Retrieve data for current time step
    state_t, next_state_t, reward_t = states[t], next_states[t], rewards[t]

    # Compute value estimates
    value_t = self.value_net(state_t)
    if t == game_length - 1:  # If it's the last step in the episode
        next_value_t = torch.tensor([[0.0]])  # The value is 0 at the end of the episode
    else:
        next_value_t = self.value_net(next_state_t)

    # Compute the TD residual (delta)
    deltas[t] = self.compute_TD_residual(reward_t=reward_t, next_value_t=next_value_t, value_t=value_t)

tensor([[0.2931],
        [0.3156],
        [0.2940],
        [0.3170],
        [0.2954],
        [0.3177],
        [0.2971],
        [0.3391],
        [0.2950],
        [0.3157],
        [0.2764],
        [0.2969],
        [0.2755],
        [0.2985],
        [0.3439],
        [0.3861],
        [0.4255],
        [0.4547],
        [0.4681],
        [0.4798],
        [0.2869],
        [0.3105],
        [0.2916],
        [0.3146],
        [0.2953],
        [0.3152],
        [0.2939],
        [0.3177],
        [0.3033],
        [0.3176]], grad_fn=<AddmmBackward0>)